In [4]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [99]:
# Read the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [100]:
# Add a constant offset to the columns at positions 5-9 of both tables.
# NOTE(review): 266 is not explained anywhere in the notebook - presumably it
# makes these columns non-negative ahead of the later log1p; confirm.
for frame in (train, test):
    frame.iloc[:, 5:10] += 266

In [101]:
def _dmy_to_iso(dates):
    """Rearrange fixed-width day/month/year strings into 'YYYY-MM-DD'.

    The last four characters are taken as the year, characters 3-4 as the
    month and the first two as the day; whatever separator the input used is
    discarded and replaced with '-'.
    """
    text = dates.str
    return text[-4:] + '-' + text[3:5] + '-' + text[:2]


train['Date'] = _dmy_to_iso(train['Date'])
test['Date'] = _dmy_to_iso(test['Date'])

In [102]:
# Convert the 'Date' strings of both tables into pandas datetime values.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [104]:
# Calendar year of each row's date, stored as its own column in both tables.
train['Year'], test['Year'] = train['Date'].dt.year, test['Date'].dt.year

In [105]:
# ISO week number of each row's date, stored as its own column.
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the supported replacement. It yields a nullable
# UInt32 column, so cast to a plain int64 to keep the dtype `weekofyear` used
# to produce.
# NOTE(review): the cast assumes no 'Date' value is missing (NaT) - confirm.
train['Week'] = train['Date'].dt.isocalendar().week.astype('int64')
test['Week'] = test['Date'].dt.isocalendar().week.astype('int64')

In [106]:
# Row-wise total of the five columns at positions 5-9, stored as an extra
# column. Missing values are skipped by DataFrame.sum's default behaviour.
promo_slice = slice(5, 10)
train['promotion_sum'] = train.iloc[:, promo_slice].sum(axis = 'columns')
test['promotion_sum'] = test.iloc[:, promo_slice].sum(axis = 'columns')

In [107]:
# Replace every remaining missing value with 0 in both tables.
train = train.fillna(0)
test = test.fillna(0)

In [115]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from ngboost import NGBRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [109]:
# 15-fold splitter shared by the cross-validation loops; rows are shuffled
# with a fixed seed so the fold assignment is the same on every run.
kf = KFold(
    n_splits = 15,
    shuffle = True,
    random_state = 42,
)

In [110]:
# Label: the 'Weekly_Sales' column.
# Features: every training column except the row id, the date and the label.
y = train['Weekly_Sales']
X = train.drop(columns = ['id', 'Date', 'Weekly_Sales'])

In [111]:
# Test-set feature matrix: the same columns, in the same order, as the
# training features. `.copy()` makes it an explicitly independent frame, so
# the column assignments performed on `target` afterwards are unambiguous and
# do not trigger pandas' SettingWithCopyWarning.
target = test[X.columns].copy()

In [112]:
# Apply log(1 + x) to the same three promotion columns of the training
# features and of the test features.
log_cols = ['Promotion2', 'Promotion4', 'Promotion5']
for frame in (X, target):
    frame[log_cols] = np.log1p(frame[log_cols])

In [121]:
# K-fold cross-validation of a CatBoost regressor.
# For every fold a fresh model is fitted on the training rows, scored (RMSE)
# on the held-out rows, and used to predict the test features. Each fold's
# test prediction is divided by the number of folds before being accumulated,
# so `cb_pred` ends up as the mean prediction over all fold models.
cb_rmse = []
cb_pred = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    # KFold.split yields *positional* row indices, so both X and y must be
    # sliced with .iloc; .loc would only agree while X keeps a default
    # RangeIndex.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    cb = CatBoostRegressor(random_state = 42, max_depth = 4, learning_rate = 0.03, n_estimators = 10000, use_best_model = True, cat_features = ['Store'])
    
    # Both the training and the held-out rows are passed as evaluation sets.
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 0, early_stopping_rounds = 1000)
    
    val_pred = cb.predict(val_x)
    val_rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    print(f"{i + 1} Fold RMSE = {val_rmse}")
    cb_rmse.append(val_rmse)
    
    # This fold contributes 1/n_splits of the final test prediction.
    fold_pred = cb.predict(target) / kf.n_splits
    cb_pred += fold_pred
    
print(f"\n##### {cb.__class__.__name__} AVG of RMSE = {np.mean(cb_rmse)} #####")

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

1 Fold RMES = 80230.77811439162
2 Fold RMES = 70638.17509570885
3 Fold RMES = 63342.76071211906
4 Fold RMES = 64700.84117797445
5 Fold RMES = 69275.57683155508
6 Fold RMES = 75606.52901636912
7 Fold RMES = 74310.80212071408
8 Fold RMES = 75148.71813157207
9 Fold RMES = 70273.06349818582
10 Fold RMES = 63909.776374097244
11 Fold RMES = 65711.76267621954
12 Fold RMES = 66988.66131109039
13 Fold RMES = 69071.98026163268
14 Fold RMES = 67725.49256444232
15 Fold RMES = 67105.6162382926
CatBoostRegressor AVG of RMSE = 69602.70227495766


In [None]:
# K-fold cross-validation of an NGBoost regressor.
# For every fold a fresh model is fitted on the training rows (with the
# held-out rows passed as the validation set), scored (RMSE) on the held-out
# rows, and used to predict the test features. Each fold's test prediction is
# divided by the number of folds before being accumulated, so `ngb_pred` ends
# up as the mean prediction over all fold models.
ngb_rmse = []
ngb_pred = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :
    # KFold.split yields *positional* row indices, so both X and y must be
    # sliced with .iloc; .loc would only agree while X keeps a default
    # RangeIndex.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    ngb = NGBRegressor(random_state = 42, n_estimators = 10000, verbose = 0)
    
    ngb.fit(tr_x, tr_y, val_x, val_y, early_stopping_rounds = 1000)
    
    val_pred = ngb.predict(val_x)
    val_rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    print(f"{i + 1} Fold RMSE = {val_rmse}")
    ngb_rmse.append(val_rmse)
    
    # This fold contributes 1/n_splits of the final test prediction.
    fold_pred = ngb.predict(target) / kf.n_splits
    ngb_pred += fold_pred
    
print(f"\n##### {ngb.__class__.__name__} AVG of RMSE = {np.mean(ngb_rmse)} #####")

In [126]:
# Load the sample submission file; it serves as the frame that is filled in
# and written out below.
submission = pd.read_csv(filepath_or_buffer = 'sample_submission.csv')

In [127]:
# Fill the 'Weekly_Sales' column with the fold-averaged CatBoost predictions.
# NOTE(review): assumes the rows of the sample submission are in the same
# order as the rows of test.csv - confirm.
submission['Weekly_Sales'] = cb_pred

In [128]:
# Write the filled-in submission to disk, leaving out the DataFrame index.
submission.to_csv(path_or_buf = '1st.csv', index = False)