# XGBoost Enhanced Features
This notebook is being created after the addition of Promotion feature to the dataset

In [17]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector, promotionAggregation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
from datetime import datetime

NUMBER_OF_LAGS = 4

sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


## Preparing our dataset
These steps were already seen on ```../pre-processing-features``` notebooks.

In [18]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [14]:
# Changing our time signatures, 
# adding our promotion feature 
# and aggregating our data by weeks...
process_time(orders)
orders = promo_detector(orders)
df = promotionAggregation(orders, items)

In [19]:
# This cell lags and diffs our features 'orderSum' and "promotion"

shifting = df.copy()

for i in range(NUMBER_OF_LAGS):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)
    shifting[f'promotion_mean_{i}'] = shifting.groupby('itemID')['promotion_mean'].shift(i)
    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    shifting[f'promotion_mean_diff_{i}'] = shifting.groupby('itemID')[f'promotion_mean_{i}'].diff()
    

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [22]:
worst_possible_prediction = shifting.loc[shifting.group_backwards < 13]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 13]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 13]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 235.28159190179878


## Dataset Splitting
All my experiments will use weeks 1 to 11 as a train set, week 12 as our validation set and week 13 as a test set.

In [40]:
train = shifting.loc[shifting.group_backwards < 11]
val = shifting.loc[shifting.group_backwards == 11]
test = shifting.loc[shifting.group_backwards > 11]

In [41]:
y_train = train.pop('orderSum').values
y_val = val.pop('orderSum').values
X_train = train.values
X_val = val.values

In [25]:
dtrain = xgb.DMatrix(X_train, y_train)
dval = xgb.DMatrix(X_val, y_val)

param = {'max_depth':4, 'eta':0.01, 'objective':'reg:squarederror' }
num_round = 900
bst = xgb.train(param, dtrain,
                num_round, early_stopping_rounds = 5,
                evals = [(dtrain, 'train'), (dval, 'val')])

[0]	train-rmse:193.24292	val-rmse:218.67925
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 5 rounds.
[1]	train-rmse:191.35835	val-rmse:216.52889
[2]	train-rmse:189.49429	val-rmse:214.38393
[3]	train-rmse:187.64760	val-rmse:212.27712
[4]	train-rmse:185.81903	val-rmse:210.19272
[5]	train-rmse:184.00839	val-rmse:208.12697
[6]	train-rmse:182.21555	val-rmse:206.07704
[7]	train-rmse:180.44130	val-rmse:204.05148
[8]	train-rmse:178.68352	val-rmse:202.07361
[9]	train-rmse:176.94406	val-rmse:200.06737
[10]	train-rmse:175.22070	val-rmse:198.12380
[11]	train-rmse:173.51454	val-rmse:196.17062
[12]	train-rmse:171.82591	val-rmse:194.23361
[13]	train-rmse:170.15289	val-rmse:192.34837
[14]	train-rmse:168.49582	val-rmse:190.41505
[15]	train-rmse:166.85645	val-rmse:188.56923
[16]	train-rmse:165.23209	val-rmse:186.67175
[17]	train-rmse:163.62380	val-rmse:184.80371
[18]	train-rmse:162.03119	val-rmse:183.01158
[19]	train-rmse

[183]	train-rmse:33.18339	val-rmse:36.07999
[184]	train-rmse:32.87609	val-rmse:35.73727
[185]	train-rmse:32.57172	val-rmse:35.39682
[186]	train-rmse:32.27070	val-rmse:35.06039
[187]	train-rmse:31.97158	val-rmse:34.72483
[188]	train-rmse:31.67562	val-rmse:34.38979
[189]	train-rmse:31.38349	val-rmse:34.07127
[190]	train-rmse:31.09323	val-rmse:33.74410
[191]	train-rmse:30.80584	val-rmse:33.42056
[192]	train-rmse:30.52192	val-rmse:33.10274
[193]	train-rmse:30.24053	val-rmse:32.78209
[194]	train-rmse:29.96153	val-rmse:32.46603
[195]	train-rmse:29.68530	val-rmse:32.15300
[196]	train-rmse:29.41244	val-rmse:31.85336
[197]	train-rmse:29.14154	val-rmse:31.54449
[198]	train-rmse:28.87352	val-rmse:31.24329
[199]	train-rmse:28.60867	val-rmse:30.94771
[200]	train-rmse:28.34584	val-rmse:30.65452
[201]	train-rmse:28.08553	val-rmse:30.35731
[202]	train-rmse:27.82781	val-rmse:30.06300
[203]	train-rmse:27.57297	val-rmse:29.77014
[204]	train-rmse:27.32010	val-rmse:29.48098
[205]	train-rmse:27.07011	val-rm

[372]	train-rmse:6.57894	val-rmse:5.96596
[373]	train-rmse:6.52923	val-rmse:5.91166
[374]	train-rmse:6.47999	val-rmse:5.86071
[375]	train-rmse:6.43116	val-rmse:5.81010
[376]	train-rmse:6.38276	val-rmse:5.75422
[377]	train-rmse:6.33481	val-rmse:5.70447
[378]	train-rmse:6.28726	val-rmse:5.65520
[379]	train-rmse:6.24015	val-rmse:5.60640
[380]	train-rmse:6.19347	val-rmse:5.55664
[381]	train-rmse:6.14722	val-rmse:5.50881
[382]	train-rmse:6.10135	val-rmse:5.46168
[383]	train-rmse:6.05589	val-rmse:5.41506
[384]	train-rmse:6.01086	val-rmse:5.36900
[385]	train-rmse:5.96616	val-rmse:5.31875
[386]	train-rmse:5.92195	val-rmse:5.27429
[387]	train-rmse:5.87810	val-rmse:5.22954
[388]	train-rmse:5.83465	val-rmse:5.18590
[389]	train-rmse:5.79159	val-rmse:5.14253
[390]	train-rmse:5.74891	val-rmse:5.09936
[391]	train-rmse:5.70638	val-rmse:5.05372
[392]	train-rmse:5.66444	val-rmse:5.01137
[393]	train-rmse:5.62289	val-rmse:4.96966
[394]	train-rmse:5.58171	val-rmse:4.92818
[395]	train-rmse:5.54078	val-rmse:

[568]	train-rmse:1.82028	val-rmse:1.61267
[569]	train-rmse:1.80971	val-rmse:1.60682
[570]	train-rmse:1.79934	val-rmse:1.60101
[571]	train-rmse:1.78899	val-rmse:1.59519
[572]	train-rmse:1.77875	val-rmse:1.58953
[573]	train-rmse:1.76855	val-rmse:1.58418
[574]	train-rmse:1.75845	val-rmse:1.57843
[575]	train-rmse:1.74840	val-rmse:1.57292
[576]	train-rmse:1.73843	val-rmse:1.56746
[577]	train-rmse:1.72851	val-rmse:1.56228
[578]	train-rmse:1.71866	val-rmse:1.55675
[579]	train-rmse:1.70889	val-rmse:1.55145
[580]	train-rmse:1.69916	val-rmse:1.54624
[581]	train-rmse:1.68951	val-rmse:1.54105
[582]	train-rmse:1.67990	val-rmse:1.53629
[583]	train-rmse:1.67038	val-rmse:1.53124
[584]	train-rmse:1.66092	val-rmse:1.52619
[585]	train-rmse:1.65149	val-rmse:1.52154
[586]	train-rmse:1.64215	val-rmse:1.51658
[587]	train-rmse:1.63286	val-rmse:1.51182
[588]	train-rmse:1.62355	val-rmse:1.50700
[589]	train-rmse:1.61437	val-rmse:1.50217
[590]	train-rmse:1.60527	val-rmse:1.49410
[591]	train-rmse:1.59621	val-rmse:

[764]	train-rmse:0.63160	val-rmse:0.97786
[765]	train-rmse:0.62836	val-rmse:0.97708
[766]	train-rmse:0.62513	val-rmse:0.97633
[767]	train-rmse:0.62193	val-rmse:0.97553
[768]	train-rmse:0.61874	val-rmse:0.97481
[769]	train-rmse:0.61557	val-rmse:0.97402
[770]	train-rmse:0.61241	val-rmse:0.97321
[771]	train-rmse:0.60927	val-rmse:0.97253
[772]	train-rmse:0.60615	val-rmse:0.97178
[773]	train-rmse:0.60305	val-rmse:0.97100
[774]	train-rmse:0.59996	val-rmse:0.97028
[775]	train-rmse:0.59690	val-rmse:0.96956
[776]	train-rmse:0.59384	val-rmse:0.96883
[777]	train-rmse:0.59081	val-rmse:0.96806
[778]	train-rmse:0.58779	val-rmse:0.96738
[779]	train-rmse:0.58478	val-rmse:0.96669
[780]	train-rmse:0.58179	val-rmse:0.96600
[781]	train-rmse:0.57882	val-rmse:0.96530
[782]	train-rmse:0.57586	val-rmse:0.96462
[783]	train-rmse:0.57292	val-rmse:0.96394
[784]	train-rmse:0.57000	val-rmse:0.96336
[785]	train-rmse:0.56709	val-rmse:0.96267
[786]	train-rmse:0.56420	val-rmse:0.96200
[787]	train-rmse:0.56132	val-rmse:

### Utilities

Predicting at test time

In [42]:
y_test = test.pop('orderSum').values
X_test = xgb.DMatrix(test.values)
bst.predict(X_test)

array([  2.0037785,  94.97699  ,   1.02636  , ...,   2.0037785,
         6.992962 , 199.9598   ], dtype=float32)

Saving our model in disk

In [50]:
now = datetime.now().strftime("%d-%m-%Y-%Hh%Mm%Ss")
modelName = 'xgb-' + now
bst.save_model(modelName)