In [1]:
# Importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import clear_output,display

from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
# importing the train and test datasets

train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")
sample_sub=pd.read_csv("sample_submission.csv")

In [3]:
train["Week_No"]=np.ceil(train["Day_No"]/7)
train["Year_No"]=np.ceil(train["Week_No"]/52)
train["Week_No"]=(train["Week_No"]%52)

train.loc[train["Week_No"]==0,"Week_No"]=train.loc[train["Week_No"]==0,"Week_No"].apply(lambda x: x+1)

In [4]:
test["Week_No"]=np.ceil(test["Day_No"]/7)
test["Year_No"]=np.ceil(test["Week_No"]/52)
test["Week_No"]=(test["Week_No"]%52)

test.loc[test["Week_No"]==0,"Week_No"]=test.loc[test["Week_No"]==0,"Week_No"].apply(lambda x: x+1)

In [5]:
train.drop(columns=["ID","Year_No","User_Traffic"],inplace=True)
test.drop(columns=["ID","Year_No"],inplace=True)

In [6]:
def export_submission(predictions,filename="Sample.csv"):
    
    ss=sample_sub.copy()
    ss["Sales"]=predictions
    ss.to_csv(f"{filename}.csv",index=False)

In [7]:
# Imputing Missing Values

train.loc[(train["Course_ID"]==354) | (train["Course_ID"]==176),"Competition_Metric"]=train.groupby("Course_ID")["Competition_Metric"].mean().mean()
test.loc[(test["Course_ID"]==354) | (test["Course_ID"]==176),"Competition_Metric"]=train.groupby("Course_ID")["Competition_Metric"].mean().mean()

In [8]:
train.dtypes

Day_No                  int64
Course_ID               int64
Course_Domain          object
Course_Type            object
Short_Promotion         int64
Public_Holiday          int64
Long_Promotion          int64
Competition_Metric    float64
Sales                   int64
Week_No               float64
dtype: object

In [9]:
train=pd.get_dummies(train,drop_first=True)
test=pd.get_dummies(test,drop_first=True)

In [10]:
X_train=train.drop(columns="Sales")
y_train=train["Sales"]

## Randomized Search For XGBoost

In [11]:
parameters={'n_estimators' : [100,200,300,400,500],
            'max_depth' : range(1,30,1),
            'learning_rate' : np.arange(0.001,10,0.01)}

In [12]:
search=RandomizedSearchCV(XGBRegressor(random_state=10),parameters,scoring='neg_mean_squared_log_error',cv=5,n_jobs=-1,verbose=5,n_iter=100)

In [None]:
search.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


In [14]:
search.best_params_

{'max_depth': 6}

In [15]:
search.best_score_

-0.17376247324263985

In [11]:
boost=XGBRegressor()
boost.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [12]:
y_train_pred=boost.predict(X_train)
y_test_pred=boost.predict(test)

In [13]:
mean_squared_log_error(y_train,y_train_pred)

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [15]:
export_submission(y_test_pred,"XGBRegressor")

In [None]:
path=tree.cost_complexity_pruning_path(X_train,y_train)