In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [2]:
data = pd.read_csv('DATA/prepared_short.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,SALES,day,id
0,0,1171.82,0,74
1,1,1221.74,0,155
2,2,1488.56,0,195
3,3,1571.74,0,258
4,4,1264.39,0,310


In [3]:
data2 = data.copy()
data2['prev_day_sales'] = data2.groupby(['id'])['SALES'].shift()
data2['prev_day_diff'] = data2.groupby(['id'])['prev_day_sales'].diff()
data2 = data2.dropna()
data2.head()

Unnamed: 0.1,Unnamed: 0,SALES,day,id,prev_day_sales,prev_day_diff
23901,23901,3854.22,1,243043,1114.59,-1133.76
52586,52586,593.22,2,216724,434.59,-195.71
78056,78056,4068.0,2,619798,51864.48,50919.56
78057,78057,110105.93,2,619798,4068.0,-47796.48
78058,78058,10974.57,2,619798,110105.93,106037.93


In [4]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [41]:
mean_error = []
for day in range(20,30):
    train = data2[data2['day'] < day]
    val = data2[data2['day'] == day]
    
    p = val['prev_day_sales'].values
    
    error = rmsle(val['SALES'].values, p)
    print('day %d - Error %.5f' % (day, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

day 20 - Error 0.90927
day 21 - Error 0.93331
day 22 - Error 0.89107
day 23 - Error 0.95020
day 24 - Error 0.92590
day 25 - Error 0.96722
day 26 - Error 0.88408
day 27 - Error 0.96149
day 28 - Error 2.98483
day 29 - Error 0.86401
Mean Error = 1.12714


In [25]:
mean_error = []
for day in range(20,29):
    train = data2[data2['day'] < day]
    val = data2[data2['day'] == day]
    
    xtr, xts = train.drop(['SALES'], axis=1), val.drop(['SALES'], axis=1)
    ytr, yts = train['SALES'].values, val['SALES'].values
    
    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)
    
    p = mdl.predict(xts)
    
    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (day, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 20 - Error 0.72022
Week 21 - Error 0.76008
Week 22 - Error 0.70992
Week 23 - Error 0.76282
Week 24 - Error 0.83087
Week 25 - Error 3.69956
Week 26 - Error 0.76375
Week 27 - Error 0.80742
Week 28 - Error 2.64420
Mean Error = 1.29987


In [28]:
data4 = data.copy()
data4['prev_day_sales'] = data4.groupby(['id'])['SALES'].shift()
data4['prev_day_diff'] = data4.groupby(['id'])['prev_day_sales'].diff()
data4['prev-1_day_sales'] = data4.groupby(['id'])['SALES'].shift(2)
data4['prev-1_day_diff'] = data4.groupby(['id'])['prev-1_day_sales'].diff()
data4['prev-2_day_sales'] = data4.groupby(['id'])['SALES'].shift(3)
data4['prev-2_day_diff'] = data4.groupby(['id'])['prev-2_day_sales'].diff()
data4['prev-3_day_sales'] = data4.groupby(['id'])['SALES'].shift(4)
data4['prev-3_day_diff'] = data4.groupby(['id'])['prev-3_day_sales'].diff()
data4['prev-4_day_sales'] = data4.groupby(['id'])['SALES'].shift(5)
data4['prev-4_day_diff'] = data4.groupby(['id'])['prev-4_day_sales'].diff()
data4['prev-5_day_sales'] = data4.groupby(['id'])['SALES'].shift(6)
data4['prev-5_day_diff'] = data4.groupby(['id'])['prev-5_day_sales'].diff()
data4['prev-6_day_sales'] = data4.groupby(['id'])['SALES'].shift(7)
data4['prev-6_day_diff'] = data4.groupby(['id'])['prev-6_day_sales'].diff()
data4['prev-7_day_sales'] = data4.groupby(['id'])['SALES'].shift(8)
data4['prev-7_day_diff'] = data4.groupby(['id'])['prev-7_day_sales'].diff()
data4 = data4.dropna()
data4.head()

Unnamed: 0.1,Unnamed: 0,SALES,day,id,prev_day_sales,prev_day_diff,prev-1_day_sales,prev-1_day_diff,prev-2_day_sales,prev-2_day_diff,prev-3_day_sales,prev-3_day_diff,prev-4_day_sales,prev-4_day_diff,prev-5_day_sales,prev-5_day_diff,prev-6_day_sales,prev-6_day_diff,prev-7_day_sales,prev-7_day_diff
251335,251335,9966.1,7,619798,2712.0,-55677.76,58389.76,52287.76,6102.0,4527.2,1574.8,-9399.77,10974.57,-99131.36,110105.93,106037.93,4068.0,-47796.48,51864.48,50919.56
251336,251336,14238.0,7,619798,9966.1,7254.1,2712.0,-55677.76,58389.76,52287.76,6102.0,4527.2,1574.8,-9399.77,10974.57,-99131.36,110105.93,106037.93,4068.0,-47796.48
251337,251337,885.75,7,619798,14238.0,4271.9,9966.1,7254.1,2712.0,-55677.76,58389.76,52287.76,6102.0,4527.2,1574.8,-9399.77,10974.57,-99131.36,110105.93,106037.93
317413,317413,2051661.99,10,619799,244599.47,-680083.03,924682.5,-303159.93,1227842.43,-3462334.89,4690177.32,1883947.1,2806230.22,885696.39,1920533.83,484842.78,1435691.05,-2936018.19,4371709.24,-1148806.53
317414,317414,5549101.22,10,619799,2051661.99,1807062.52,244599.47,-680083.03,924682.5,-303159.93,1227842.43,-3462334.89,4690177.32,1883947.1,2806230.22,885696.39,1920533.83,484842.78,1435691.05,-2936018.19


In [38]:
data4[data4['day'] == 27]

Unnamed: 0.1,Unnamed: 0,SALES,day,id,prev_day_sales,prev_day_diff,prev-1_day_sales,prev-1_day_diff,prev-2_day_sales,prev-2_day_diff,prev-3_day_sales,prev-3_day_diff,prev-4_day_sales,prev-4_day_diff,prev-5_day_sales,prev-5_day_diff,prev-6_day_sales,prev-6_day_diff,prev-7_day_sales,prev-7_day_diff
753302,753302,128059.45,27,0,771463.05,-265797.01,1037260.06,375871.04,661389.02,422267.47,239121.55,173103.11,66018.44,-372851.87,438870.31,249736.6,189133.71,-109920.9,299054.61,224457.84
753835,753835,2953.58,27,7364,1105.17,-209.29,1314.46,-1739.96,3054.42,-843.56,3897.98,943.51,2954.47,1889.94,1064.53,-1645.7,2710.23,658.36,2051.87,-653.85
754341,754341,6539.2,27,13737,2492.01,-2179.02,4671.03,1826.34,2844.69,1701.31,1143.38,-1149.34,2292.72,-370.36,2663.08,-2766.16,5429.24,4226.53,1202.71,-706.53
754342,754342,5081.46,27,13737,6539.2,4047.19,2492.01,-2179.02,4671.03,1826.34,2844.69,1701.31,1143.38,-1149.34,2292.72,-370.36,2663.08,-2766.16,5429.24,4226.53
757811,757811,1394.69,27,55987,1975.27,-646.7,2621.97,-575.45,3197.42,1264.41,1933.01,569.14,1363.87,-653.06,2016.93,-330.21,2347.14,-3173.94,5521.08,3353.4
757812,757812,1941.39,27,55987,1394.69,-580.58,1975.27,-646.7,2621.97,-575.45,3197.42,1264.41,1933.01,569.14,1363.87,-653.06,2016.93,-330.21,2347.14,-3173.94
760490,760490,1168.91,27,89020,1120.37,-6298.56,7418.93,6004.51,1414.42,-1564.54,2978.96,-3360.85,6339.81,4373.31,1966.5,-3695.04,5661.54,3773.4,1888.14,-1458.9
763203,763203,619.25,27,121069,522.18,-3766.37,4288.55,2927.02,1361.53,910.82,450.71,324.34,126.37,-339.02,465.39,-2120.31,2585.7,492.29,2093.41,1081.15
769369,769369,839.72,27,196345,6067.08,2858.33,3208.75,280.8,2927.95,-1295.67,4223.62,1872.85,2350.77,-3314.29,5665.06,-174.97,5840.03,1979.27,3860.76,708.25
796011,796011,961.48,27,483119,1036.98,-98.49,1135.47,-1200.08,2335.55,-108.77,2444.32,-5336.74,7781.06,1795.89,5985.17,5023.64,961.53,-869.44,1830.97,-5067.93


In [39]:
mean_error = []
for day in (20,21,22,23,24,25,27,28,29):
    train = data4[data4['day'] < day]
    val = data4[data4['day'] == day]
    
    xtr, xts = train.drop(['SALES'], axis=1), val.drop(['SALES'], axis=1)
    ytr, yts = train['SALES'].values, val['SALES'].values
    
    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)
    print (day)
    p = mdl.predict(xts)
    
    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (day, error))
    mean_error.append(error)

print('Mean Error = %.5f' % np.mean(mean_error))

20
Week 20 - Error 2.67258
21
Week 21 - Error 0.27081
22
Week 22 - Error 1.31417
23
Week 23 - Error 0.85183
24
Week 24 - Error 1.07269
25
Week 25 - Error 2.69869
27
Week 27 - Error 1.21468
28
Week 28 - Error 2.32020
29
Week 29 - Error 1.00495
Mean Error = 1.49118
