In [9]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [81]:
# Load the training table, the test table and the sample-submission template
# (paths are relative to the current working directory).
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [82]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales
0,1,1,05/02/2010,42.31,2.572,,,,,,8.106,False,1643690.9
1,2,1,12/02/2010,38.51,2.548,,,,,,8.106,True,1641957.44
2,3,1,19/02/2010,39.93,2.514,,,,,,8.106,False,1611968.17
3,4,1,26/02/2010,46.63,2.561,,,,,,8.106,False,1409727.59
4,5,1,05/03/2010,46.5,2.625,,,,,,8.106,False,1554806.68


In [83]:
# The raw Date strings look like DD/MM/YYYY in the preview; characters 3-4 are the month.
for frame in (train, test):
    frame['month'] = frame['Date'].str.slice(3, 5)

In [84]:
# Everything from character 6 onward of the raw Date string is the year.
for frame in (train, test):
    frame['year'] = frame['Date'].str.slice(start = 6)

In [85]:
# The first two characters of the raw Date string are the day of the month.
for frame in (train, test):
    frame['day'] = frame['Date'].str.slice(stop = 2)

In [86]:
# Reassemble the string pieces as 'year-month-day' and overwrite the Date column.
for frame in (train, test):
    year_month = frame['year'] + '-' + frame['month']
    frame['Date'] = year_month + '-' + frame['day']

In [87]:
# Turn the Date strings into real datetime values so the .dt accessor can be used.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [88]:
# ISO-8601 week number of each date.
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `dt.isocalendar().week` is its replacement. It yields a nullable UInt32 column,
# so cast back to int64, the dtype `weekofyear` used to produce.
# NOTE(review): the cast assumes every Date parsed successfully (no NaT) - confirm.
train['week'] = train['Date'].dt.isocalendar().week.astype('int64')
test['week'] = test['Date'].dt.isocalendar().week.astype('int64')

In [92]:
# Re-derive the calendar features from the parsed dates so they are integers.
# `month` and `year` were first built by slicing the raw date string, which left
# them as text; `year` is later used as a model feature, so it must be numeric
# rather than relying on the estimators converting strings implicitly.
train['month'] = train['Date'].dt.month
test['month'] = test['Date'].dt.month
train['year'] = train['Date'].dt.year
test['year'] = test['Date'].dt.year

In [89]:
# Replace missing values with the per-column mean of the same frame.
# `numeric_only = True` is required: the frames still hold text columns (for
# example `day`), and pandas >= 2.0 raises a TypeError when asked for their
# mean instead of silently skipping them as older versions did.
# Plain assignment replaces `inplace = True` so the cell can be re-run safely.
# NOTE(review): the test frame is imputed with its own means, not the training
# means - confirm this is intended.
train = train.fillna(train.mean(numeric_only = True))
test = test.fillna(test.mean(numeric_only = True))

In [149]:
# Feature matrix: every training column except the identifier, the raw date,
# the day-of-month helper and the target itself.
X = train.drop(columns = ['id', 'Date', 'day', 'Weekly_Sales'])

In [150]:
# Regression target.
y = train.loc[:, 'Weekly_Sales']

In [151]:
# Test features, restricted to the training feature columns and in the same order.
target = test.loc[:, X.columns]

In [153]:
# 10-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)

In [154]:
# K-fold evaluation of CatBoost. Each fold's model also predicts the test
# features, and those predictions are averaged across folds.
cb_pred = np.zeros(target.shape[0])  # running average of the per-fold test predictions
cb_rmse = []                         # validation RMSE of every fold
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), start = 1) :
    # KFold yields positional indices, so rows must be selected with .iloc on
    # both X and y; .loc would only be correct for a default RangeIndex.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    cb = CatBoostRegressor(random_state = 42, max_depth = 4, learning_rate = 0.02, iterations = 10000)

    # At most 10000 iterations; training stops early after 100 rounds without
    # improvement on the evaluation data.
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 100, verbose = 0)

    val_pred = cb.predict(val_x)
    fold_rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    print(f'{fold} Fold RMSE = {fold_rmse}')
    cb_rmse.append(fold_rmse)

    # Each fold contributes 1 / n_splits of the final test prediction.
    cb_pred += cb.predict(target) / kf.n_splits
print(f'{cb.__class__.__name__} AVG of RMSE = {np.mean(cb_rmse)}')

1 Fold RMSE = 102137.89188911566
2 Fold RMSE = 96594.59887021086
3 Fold RMSE = 84667.89180159249
4 Fold RMSE = 88589.0946749238
5 Fold RMSE = 81951.73279609733
6 Fold RMSE = 85596.18464407457
7 Fold RMSE = 76809.67720558363
8 Fold RMSE = 73514.51202411878
9 Fold RMSE = 80383.76416604749
10 Fold RMSE = 78963.70119633518
CatBoostRegressor AVG of RMSE = 84920.90492680998


In [155]:
# K-fold evaluation of scikit-learn's GradientBoostingRegressor. Each fold's
# model also predicts the test features, and those predictions are averaged
# across folds.
gbr_pred = np.zeros(target.shape[0])  # running average of the per-fold test predictions
gbr_rmse = []                         # validation RMSE of every fold
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), start = 1) :
    # KFold yields positional indices, so rows must be selected with .iloc on
    # both X and y; .loc would only be correct for a default RangeIndex.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    gbr = GradientBoostingRegressor(random_state = 42, max_depth = 4, n_estimators = 1000, learning_rate = 0.03)

    gbr.fit(tr_x, tr_y)

    val_pred = gbr.predict(val_x)
    fold_rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    print(f'{fold} Fold RMSE = {fold_rmse}')
    gbr_rmse.append(fold_rmse)

    # Each fold contributes 1 / n_splits of the final test prediction.
    gbr_pred += gbr.predict(target) / kf.n_splits
print(f'{gbr.__class__.__name__} AVG of RMSE = {np.mean(gbr_rmse)}')

1 Fold RMSE = 115837.98585618714
2 Fold RMSE = 107242.0354979529
3 Fold RMSE = 95187.60082954555
4 Fold RMSE = 104892.02995472575
5 Fold RMSE = 102311.84554041711
6 Fold RMSE = 91500.8329599822
7 Fold RMSE = 96942.60543736364
8 Fold RMSE = 91944.10504120908
9 Fold RMSE = 98434.39887139996
10 Fold RMSE = 94489.62057509988
GradientBoostingRegressor AVG of RMSE = 99878.30605638832


In [156]:
# Weighted blend of the two models' fold-averaged test predictions.
cb_weight, gbr_weight = 0.6, 0.4
submission['Weekly_Sales'] = cb_weight * cb_pred + gbr_weight * gbr_pred

In [157]:
# Write the blended predictions to disk, leaving out the DataFrame index.
output_name = '2nd.csv'
submission.to_csv(output_name, index = False)