In [59]:
# Importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import clear_output,display

from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [60]:
# importing the train and test datasets

train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")
sample_sub=pd.read_csv("sample_submission.csv")

In [61]:
train["Week_No"]=np.ceil(train["Day_No"]/7)
train["Year_No"]=np.ceil(train["Week_No"]/52)
train["Week_No"]=(train["Week_No"]%52)

train.loc[train["Week_No"]==0,"Week_No"]=train.loc[train["Week_No"]==0,"Week_No"].apply(lambda x: x+1)

In [62]:
test["Week_No"]=np.ceil(test["Day_No"]/7)
test["Year_No"]=np.ceil(test["Week_No"]/52)
test["Week_No"]=(test["Week_No"]%52)

test.loc[test["Week_No"]==0,"Week_No"]=test.loc[test["Week_No"]==0,"Week_No"].apply(lambda x: x+1)

In [63]:
train.drop(columns=["ID","Year_No","User_Traffic"],inplace=True)
test.drop(columns=["ID","Year_No"],inplace=True)

In [64]:
def export_submission(predictions,filename="Sample.csv"):
    
    ss=sample_sub.copy()
    ss["Sales"]=predictions
    ss.to_csv(f"{filename}.csv",index=False)

In [65]:
train[['Day_No','Week_No']]=train[['Day_No','Week_No']].astype('int')
test[['Day_No','Week_No']]=test[['Day_No','Week_No']].astype('int')

In [66]:
train.dtypes

Day_No                  int32
Course_ID               int64
Course_Domain          object
Course_Type            object
Short_Promotion         int64
Public_Holiday          int64
Long_Promotion          int64
Competition_Metric    float64
Sales                   int64
Week_No                 int32
dtype: object

In [67]:
X_train=train.drop(columns="Sales")
y_train=train["Sales"]

In [20]:
# train[["Course_ID","Course_Domain","Course_Type","Short_Promotion","Public_Holiday","Long_promotion"]]=train[["Course_ID","Course_Domain","Course_Type","Short_Promotion","Public_Holiday","Long_Promotion"]].astype("category")

In [21]:
# test[["Course_ID","Course_Domain","Course_Type","Short_Promotion","Public_Holiday","Long_promotion"]]=test[["Course_ID","Course_Domain","Course_Type","Short_Promotion","Public_Holiday","Long_Promotion"]].astype("category")

In [39]:
train.columns

Index(['Day_No', 'Course_ID', 'Course_Domain', 'Course_Type',
       'Short_Promotion', 'Public_Holiday', 'Long_Promotion',
       'Competition_Metric', 'Sales', 'Week_No'],
      dtype='object')

In [80]:
boost=CatBoostRegressor(cat_features=['Course_ID', 'Course_Domain', 'Course_Type',
       'Short_Promotion', 'Public_Holiday', 'Long_Promotion'])

In [81]:
boost.fit(X_train,y_train)

0:	learn: 50.9703258	total: 309ms	remaining: 5m 9s
1:	learn: 48.0203251	total: 479ms	remaining: 3m 59s
2:	learn: 45.4718927	total: 753ms	remaining: 4m 10s
3:	learn: 43.2675812	total: 923ms	remaining: 3m 49s
4:	learn: 41.3727234	total: 1.18s	remaining: 3m 54s
5:	learn: 39.7209822	total: 1.34s	remaining: 3m 42s
6:	learn: 38.3298381	total: 1.58s	remaining: 3m 44s
7:	learn: 37.1153534	total: 1.76s	remaining: 3m 38s
8:	learn: 36.0894800	total: 1.94s	remaining: 3m 34s
9:	learn: 35.1875089	total: 2.15s	remaining: 3m 33s
10:	learn: 34.4283670	total: 2.33s	remaining: 3m 29s
11:	learn: 33.8054031	total: 2.51s	remaining: 3m 26s
12:	learn: 33.2736728	total: 2.65s	remaining: 3m 21s
13:	learn: 32.7943973	total: 2.84s	remaining: 3m 20s
14:	learn: 32.3992367	total: 2.97s	remaining: 3m 15s
15:	learn: 32.0581042	total: 3.15s	remaining: 3m 13s
16:	learn: 31.7549313	total: 3.37s	remaining: 3m 15s
17:	learn: 31.4852407	total: 3.59s	remaining: 3m 15s
18:	learn: 31.2531208	total: 3.77s	remaining: 3m 14s
19:	

KeyboardInterrupt: 

In [70]:
y_train_pred=boost.predict(X_train)
y_test_pred=boost.predict(test)

In [71]:
mean_squared_log_error(y_train,y_train_pred)

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [74]:
pd.Series(y_test_pred).value_counts()

231.839782    18
228.472691    18
222.542873    18
112.884303    12
122.026869    12
              ..
129.182897     1
154.800163     1
166.041378     1
215.606410     1
115.340009     1
Length: 10489, dtype: int64

In [75]:
export_submission(y_test_pred,"CatBoost dayy")

## Randomized Search For XGBoost

In [11]:
parameters={'n_estimators' : [100,200,300,400,500],
            'max_depth' : range(1,30,1),
            'learning_rate' : np.arange(0.001,10,0.01)}

In [12]:
search=RandomizedSearchCV(XGBRegressor(random_state=10),parameters,scoring='neg_mean_squared_log_error',cv=5,n_jobs=-1,verbose=5,n_iter=100)

In [None]:
search.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


In [14]:
search.best_params_

{'max_depth': 6}

In [15]:
search.best_score_

-0.17376247324263985

In [11]:
boost=XGBRegressor()
boost.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [12]:
y_train_pred=boost.predict(X_train)
y_test_pred=boost.predict(test)

In [13]:
mean_squared_log_error(y_train,y_train_pred)

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [15]:
export_submission(y_test_pred,"XGBRegressor")

In [None]:
path=tree.cost_complexity_pruning_path(X_train,y_train)