In [6]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [7]:
data = pd.read_csv('DATA/prepared_short.csv')
data.drop(['Unnamed: 0'], axis=1,inplace = True)
data.head()

Unnamed: 0.1,Unnamed: 0,SALES,day,id
0,0,1171.82,0,74
1,1,1221.74,0,155
2,2,1488.56,0,195
3,3,1571.74,0,258
4,4,1264.39,0,310


In [8]:
data.drop(['Unnamed: 0'], axis=1,inplace = True)

In [9]:
data2 = data.copy()
data2['prev_day_sales'] = data2.groupby(['id'])['SALES'].shift()
data2['prev_day_diff'] = data2.groupby(['id'])['prev_day_sales'].diff()
data2 = data2.dropna()
data2.head()

Unnamed: 0,SALES,day,id,prev_day_sales,prev_day_diff
23901,3854.22,1,243043,1114.59,-1133.76
52586,593.22,2,216724,434.59,-195.71
78056,4068.0,2,619798,51864.48,50919.56
78057,110105.93,2,619798,4068.0,-47796.48
78058,10974.57,2,619798,110105.93,106037.93


In [10]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [11]:
mean_error = []
for day in range(20,30):
    train = data2[data2['day'] < day]
    val = data2[data2['day'] == day]
    
    p = val['prev_day_sales'].values
    
    error = rmsle(val['SALES'].values, p)
    print('day %d - Error %.5f' % (day, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

day 20 - Error 0.90927
day 21 - Error 0.93331
day 22 - Error 0.89107
day 23 - Error 0.95020
day 24 - Error 0.92590
day 25 - Error 0.96722
day 26 - Error 0.88408
day 27 - Error 0.96149
day 28 - Error 2.98483
day 29 - Error 0.86401
Mean Error = 1.12714


In [12]:
mean_error = []
for day in range(20,29):
    train = data2[data2['day'] < day]
    val = data2[data2['day'] == day]
    
    xtr, xts = train.drop(['SALES'], axis=1), val.drop(['SALES'], axis=1)
    ytr, yts = train['SALES'].values, val['SALES'].values
    
    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)
    
    p = mdl.predict(xts)
    
    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (day, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 20 - Error 0.68678
Week 21 - Error 0.68578
Week 22 - Error 0.65277
Week 23 - Error 0.66918
Week 24 - Error 0.72147
Week 25 - Error 0.76588
Week 26 - Error 0.66867
Week 27 - Error 0.76558
Week 28 - Error 2.69364
Mean Error = 0.92331


In [13]:
%%time
data4 = data.copy()
data4['prev_day_sales'] = data4.groupby(['id'])['SALES'].shift()
data4['prev_day_diff'] = data4.groupby(['id'])['prev_day_sales'].diff()
data4['prev-1_day_sales'] = data4.groupby(['id'])['SALES'].shift(2)
data4['prev-1_day_diff'] = data4.groupby(['id'])['prev-1_day_sales'].diff()
data4['prev-2_day_sales'] = data4.groupby(['id'])['SALES'].shift(3)
data4['prev-2_day_diff'] = data4.groupby(['id'])['prev-2_day_sales'].diff()
data4['prev-3_day_sales'] = data4.groupby(['id'])['SALES'].shift(4)
data4['prev-3_day_diff'] = data4.groupby(['id'])['prev-3_day_sales'].diff()
data4['prev-4_day_sales'] = data4.groupby(['id'])['SALES'].shift(5)
data4['prev-4_day_diff'] = data4.groupby(['id'])['prev-4_day_sales'].diff()
data4['prev-5_day_sales'] = data4.groupby(['id'])['SALES'].shift(6)
data4['prev-5_day_diff'] = data4.groupby(['id'])['prev-5_day_sales'].diff()
data4['prev-6_day_sales'] = data4.groupby(['id'])['SALES'].shift(7)
data4['prev-6_day_diff'] = data4.groupby(['id'])['prev-6_day_sales'].diff()
data4['prev-7_day_sales'] = data4.groupby(['id'])['SALES'].shift(8)
data4['prev-7_day_diff'] = data4.groupby(['id'])['prev-7_day_sales'].diff()
data4 = data4.dropna()
data4.head()

Unnamed: 0,SALES,day,id,prev_day_sales,prev_day_diff,prev-1_day_sales,prev-1_day_diff,prev-2_day_sales,prev-2_day_diff,prev-3_day_sales,prev-3_day_diff,prev-4_day_sales,prev-4_day_diff,prev-5_day_sales,prev-5_day_diff,prev-6_day_sales,prev-6_day_diff,prev-7_day_sales,prev-7_day_diff
251335,9966.1,7,619798,2712.0,-55677.76,58389.76,52287.76,6102.0,4527.2,1574.8,-9399.77,10974.57,-99131.36,110105.93,106037.93,4068.0,-47796.48,51864.48,50919.56
251336,14238.0,7,619798,9966.1,7254.1,2712.0,-55677.76,58389.76,52287.76,6102.0,4527.2,1574.8,-9399.77,10974.57,-99131.36,110105.93,106037.93,4068.0,-47796.48
251337,885.75,7,619798,14238.0,4271.9,9966.1,7254.1,2712.0,-55677.76,58389.76,52287.76,6102.0,4527.2,1574.8,-9399.77,10974.57,-99131.36,110105.93,106037.93
317413,2051661.99,10,619799,244599.47,-680083.03,924682.5,-303159.93,1227842.43,-3462334.89,4690177.32,1883947.1,2806230.22,885696.39,1920533.83,484842.78,1435691.05,-2936018.19,4371709.24,-1148806.53
317414,5549101.22,10,619799,2051661.99,1807062.52,244599.47,-680083.03,924682.5,-303159.93,1227842.43,-3462334.89,4690177.32,1883947.1,2806230.22,885696.39,1920533.83,484842.78,1435691.05,-2936018.19


In [14]:
%%time
mean_error = []
for day in (20,21,22,23,24,25,27,28,29):
    train = data4[data4['day'] < day]
    val = data4[data4['day'] == day]
    
    xtr, xts = train.drop(['SALES'], axis=1), val.drop(['SALES'], axis=1)
    ytr, yts = train['SALES'].values, val['SALES'].values
    
    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)
    print (day)
    p = mdl.predict(xts)
    
    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (day, error))
    mean_error.append(error)

print('Mean Error = %.5f' % np.mean(mean_error))

20
Week 20 - Error 1.97795
21
Week 21 - Error 0.25954
22
Week 22 - Error 1.30774
23
Week 23 - Error 0.84721
24
Week 24 - Error 1.07058
25
Week 25 - Error 1.82388
27
Week 27 - Error 1.22312
28
Week 28 - Error 2.32128
29
Week 29 - Error 0.95665
Mean Error = 1.30977
