In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the training data, the test data and the submission template
# from CSV files located in the current working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales
0,1,1,05/02/2010,42.31,2.572,,,,,,8.106,False,1643690.9
1,2,1,12/02/2010,38.51,2.548,,,,,,8.106,True,1641957.44
2,3,1,19/02/2010,39.93,2.514,,,,,,8.106,False,1611968.17
3,4,1,26/02/2010,46.63,2.561,,,,,,8.106,False,1409727.59
4,5,1,05/03/2010,46.5,2.625,,,,,,8.106,False,1554806.68


In [4]:
# Shift the five promotion columns of both frames by a constant offset.
# Columns are selected by name rather than by position (5:10) so the shift
# cannot silently land on the wrong columns if the CSV layout changes.
# Missing promotion values stay missing (NaN + constant is NaN).
# NOTE(review): the reason for the offset 266 is not stated in the notebook;
# presumably it moves the promotion values into a non-negative range - confirm.
# NOTE(review): assumes test.csv uses the same promotion column names as
# train.csv - confirm.
# NOTE: this cell is not idempotent; re-running it shifts the values again.
PROMO_COLS = [f'Promotion{i}' for i in range(1, 6)]
PROMO_SHIFT = 266

train[PROMO_COLS] = train[PROMO_COLS] + PROMO_SHIFT
test[PROMO_COLS] = test[PROMO_COLS] + PROMO_SHIFT

In [5]:
# Parse the day/month/year strings in 'Date' into datetime values.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%d/%m/%Y')

In [6]:
# ISO week number of each date.
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is its replacement. It returns a nullable UInt32
# series, so it is cast back to the plain int64 dtype `weekofyear` produced.
# NOTE(review): the cast raises if 'Date' contains missing values - the
# visible rows have none, but confirm for the full files.
train['Week'] = train['Date'].dt.isocalendar().week.astype('int64')
test['Week'] = test['Date'].dt.isocalendar().week.astype('int64')

In [7]:
# Calendar year of each observation, taken from the parsed 'Date' column.
for frame in (train, test):
    frame['Year'] = frame['Date'].dt.year

In [8]:
# Calendar month (1-12) of each observation.
for frame in (train, test):
    frame['Month'] = frame['Date'].dt.month

In [9]:
# Day of the month of each observation.
for frame in (train, test):
    frame['Day'] = frame['Date'].dt.day

In [10]:
# Replace every remaining missing value with 0, modifying both frames in place.
for frame in (train, test):
    frame.fillna(0, inplace=True)

In [11]:
# Row-wise total of the five promotion columns.
# Columns are picked by name instead of by position (5:10) so the sum stays
# correct even if columns are added, removed or reordered upstream.
# NOTE(review): assumes test.csv uses the same promotion column names as
# train.csv - confirm.
promo_cols = [f'Promotion{i}' for i in range(1, 6)]

train['promotion_sum'] = train[promo_cols].sum(axis=1)
test['promotion_sum'] = test[promo_cols].sum(axis=1)

In [12]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from ngboost import NGBRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [13]:
# 15-fold cross-validation splitter. Rows are shuffled before splitting and
# the seed is fixed so the fold assignment is reproducible.
N_SPLITS = 15
SEED = 42
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

In [24]:
# Model inputs: the store id plus the calendar features derived from 'Date'.
feature_cols = ['Store', 'Year', 'Week', 'Month', 'Day']
X = train[feature_cols]

# Regression target.
y = train['Weekly_Sales']

In [25]:
# Test-set feature matrix with the same columns, in the same order, as X.
feature_names = list(X.columns)
target = test[feature_names]

In [26]:
# K-fold cross-validation of a CatBoost regressor.
# For every fold: fit on the training part, early-stop on the held-out part,
# record the held-out RMSE, and add this fold's share of the test-set
# prediction so that `cb_pred` ends up as the mean prediction over all folds.
#
# NOTE(review): the held-out fold drives early stopping / best-model selection
# and is also the data the RMSE is reported on, so the printed scores are
# optimistic estimates.
# NOTE(review): `kf` shuffles rows at random although every row carries a
# date; if the test set lies in the future, a time-ordered split may give a
# more honest estimate - confirm against the task definition.
cb_rmse = []                         # held-out RMSE of each fold
cb_pred = np.zeros(target.shape[0])  # accumulates the fold-averaged test prediction

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    cb = CatBoostRegressor(
        random_state=42,
        max_depth=4,
        learning_rate=0.03,
        n_estimators=10000,
        use_best_model=True,
    )

    # Only the held-out fold is given as eval_set, so early stopping and
    # use_best_model are unambiguously driven by validation loss. Listing the
    # training fold as a second eval set would add per-iteration evaluation
    # work and leave the choice of the deciding set to eval-set ordering.
    cb.fit(
        tr_x, tr_y,
        eval_set=(val_x, val_y),
        verbose=0,
        early_stopping_rounds=1000,
    )

    val_pred = cb.predict(val_x)
    val_rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    print(f"{fold} Fold RMSE = {val_rmse}")
    cb_rmse.append(val_rmse)

    # Each fold contributes 1 / n_splits of the final test prediction.
    fold_pred = cb.predict(target) / kf.n_splits
    cb_pred += fold_pred

print(f"\n##### {cb.__class__.__name__} AVG of RMSE = {np.mean(cb_rmse)} #####")

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

1 Fold RMSE = 72649.60268710609
2 Fold RMSE = 78098.41376270179
3 Fold RMSE = 56085.627612549804
4 Fold RMSE = 68344.70053713185
5 Fold RMSE = 76421.96260843344
6 Fold RMSE = 67277.78144600712
7 Fold RMSE = 57493.806048520026
8 Fold RMSE = 62159.401184799906
9 Fold RMSE = 57330.75544973902
10 Fold RMSE = 63952.8873181059
11 Fold RMSE = 58142.036605667374
12 Fold RMSE = 65857.92215417988
13 Fold RMSE = 73655.19315122526
14 Fold RMSE = 56947.85080726911
15 Fold RMSE = 63054.53793725812

##### CatBoostRegressor AVG of RMSE = 65164.831954046305 #####


In [27]:
# Write the fold-averaged CatBoost test predictions into the submission frame.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as the rows of test.csv - confirm.
submission['Weekly_Sales'] = cb_pred

In [28]:
# Save the submission to disk; the DataFrame index is not written.
submission_path = '1st.csv'
submission.to_csv(submission_path, index=False)