# XGBoost Enhanced Features
This notebook is being created after the addition of Promotion feature to the dataset

In [37]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector, promotionAggregation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
from datetime import datetime

NUMBER_OF_LAGS = 4

sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


## Preparing our dataset
These steps were already seen on ```../pre-processing-features``` notebooks.

In [8]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [9]:
# Changing our time signatures, 
# adding our promotion feature 
# and aggregating our data by weeks...
process_time(orders)
orders = promo_detector(orders)
df = promotionAggregation(orders, items)

In [10]:
# This cell lags and diffs our features 'orderSum' and "promotion"

shifting = df.copy()

for i in range(NUMBER_OF_LAGS):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)
    shifting[f'promotion_mean_{i}'] = shifting.groupby('itemID')['promotion_mean'].shift(i)
    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    shifting[f'promotion_mean_diff_{i}'] = shifting.groupby('itemID')[f'promotion_mean_{i}'].diff()
    

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [26]:
worst_possible_prediction = shifting.loc[shifting.group_backwards < 13]['orderSum'].mean()
prediction = np.full(target.shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 13]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target": mse(target, prediction) ** 0.5)

235.28159190179878

## Dataset Splitting
All my experiments will use weeks 1 to 11 as a train set, week 12 as our validation set and week 13 as a test set.

In [29]:
train = shifting.loc[shifting.group_backwards < 11]
val = shifting.loc[shifting.group_backwards == 12]
test = shifting.loc[shifting.group_backwards == 13]

In [30]:
y_train = train.pop('orderSum').values
y_val = val.pop('orderSum').values
X_train = train.values
X_val = val.values

In [54]:
dtrain = xgb.DMatrix(X_train, y_train)
dval = xgb.DMatrix(X_val, y_val)

param = {'max_depth':4, 'eta':0.01, 'objective':'reg:squarederror' }
num_round = 900
bst = xgb.train(param, dtrain,
                num_round, early_stopping_rounds = 5,
                evals = [(dtrain, 'train'), (dval, 'val')])

[0]	train-rmse:193.24292	val-rmse:179.95975
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 5 rounds.
[1]	train-rmse:191.35835	val-rmse:178.18346
[2]	train-rmse:189.49429	val-rmse:176.40239
[3]	train-rmse:187.64760	val-rmse:174.66232
[4]	train-rmse:185.81903	val-rmse:172.93318
[5]	train-rmse:184.00839	val-rmse:171.22330
[6]	train-rmse:182.21555	val-rmse:169.52477
[7]	train-rmse:180.44130	val-rmse:167.85873
[8]	train-rmse:178.68352	val-rmse:166.25211
[9]	train-rmse:176.94406	val-rmse:164.62921
[10]	train-rmse:175.22070	val-rmse:163.05054
[11]	train-rmse:173.51454	val-rmse:161.42334
[12]	train-rmse:171.82590	val-rmse:159.85251
[13]	train-rmse:170.15289	val-rmse:158.31361
[14]	train-rmse:168.49583	val-rmse:156.75145
[15]	train-rmse:166.85643	val-rmse:155.25818
[16]	train-rmse:165.23209	val-rmse:153.72446
[17]	train-rmse:163.62380	val-rmse:152.19991
[18]	train-rmse:162.03119	val-rmse:150.73775
[19]	train-rmse

[183]	train-rmse:33.18339	val-rmse:29.35600
[184]	train-rmse:32.87609	val-rmse:29.06320
[185]	train-rmse:32.57172	val-rmse:28.76795
[186]	train-rmse:32.27070	val-rmse:28.48613
[187]	train-rmse:31.97158	val-rmse:28.19509
[188]	train-rmse:31.67562	val-rmse:27.90706
[189]	train-rmse:31.38349	val-rmse:27.63124
[190]	train-rmse:31.09323	val-rmse:27.34771
[191]	train-rmse:30.80584	val-rmse:27.06687
[192]	train-rmse:30.52192	val-rmse:26.79484
[193]	train-rmse:30.24053	val-rmse:26.53472
[194]	train-rmse:29.96153	val-rmse:26.26214
[195]	train-rmse:29.68530	val-rmse:25.99266
[196]	train-rmse:29.41244	val-rmse:25.73464
[197]	train-rmse:29.14154	val-rmse:25.46897
[198]	train-rmse:28.87352	val-rmse:25.20726
[199]	train-rmse:28.60867	val-rmse:24.95645
[200]	train-rmse:28.34584	val-rmse:24.70217
[201]	train-rmse:28.08553	val-rmse:24.44802
[202]	train-rmse:27.82781	val-rmse:24.19511
[203]	train-rmse:27.57297	val-rmse:23.96018
[204]	train-rmse:27.32010	val-rmse:23.71520
[205]	train-rmse:27.07011	val-rm

[373]	train-rmse:6.52923	val-rmse:4.25914
[374]	train-rmse:6.47999	val-rmse:4.21564
[375]	train-rmse:6.43116	val-rmse:4.17228
[376]	train-rmse:6.38276	val-rmse:4.12888
[377]	train-rmse:6.33481	val-rmse:4.08724
[378]	train-rmse:6.28726	val-rmse:4.04604
[379]	train-rmse:6.24015	val-rmse:4.00526
[380]	train-rmse:6.19347	val-rmse:3.96404
[381]	train-rmse:6.14722	val-rmse:3.92433
[382]	train-rmse:6.10135	val-rmse:3.88505
[383]	train-rmse:6.05589	val-rmse:3.84597
[384]	train-rmse:6.01086	val-rmse:3.80786
[385]	train-rmse:5.96616	val-rmse:3.76865
[386]	train-rmse:5.92195	val-rmse:3.73123
[387]	train-rmse:5.87810	val-rmse:3.69419
[388]	train-rmse:5.83465	val-rmse:3.65756
[389]	train-rmse:5.79159	val-rmse:3.62124
[390]	train-rmse:5.74891	val-rmse:3.58510
[391]	train-rmse:5.70638	val-rmse:3.54789
[392]	train-rmse:5.66444	val-rmse:3.51242
[393]	train-rmse:5.62289	val-rmse:3.47731
[394]	train-rmse:5.58171	val-rmse:3.44257
[395]	train-rmse:5.54078	val-rmse:3.40799
[396]	train-rmse:5.50030	val-rmse:

[569]	train-rmse:1.80971	val-rmse:0.72884
[570]	train-rmse:1.79934	val-rmse:0.72527
[571]	train-rmse:1.78899	val-rmse:0.72178
[572]	train-rmse:1.77875	val-rmse:0.71831
[573]	train-rmse:1.76855	val-rmse:0.71487
[574]	train-rmse:1.75845	val-rmse:0.71221
[575]	train-rmse:1.74840	val-rmse:0.70890
[576]	train-rmse:1.73843	val-rmse:0.70567
[577]	train-rmse:1.72851	val-rmse:0.70262
[578]	train-rmse:1.71866	val-rmse:0.69954
[579]	train-rmse:1.70889	val-rmse:0.69649
[580]	train-rmse:1.69916	val-rmse:0.69375
[581]	train-rmse:1.68951	val-rmse:0.69087
[582]	train-rmse:1.67990	val-rmse:0.68810
[583]	train-rmse:1.67038	val-rmse:0.68529
[584]	train-rmse:1.66092	val-rmse:0.68259
[585]	train-rmse:1.65149	val-rmse:0.67999
[586]	train-rmse:1.64215	val-rmse:0.67735
[587]	train-rmse:1.63286	val-rmse:0.67503
[588]	train-rmse:1.62355	val-rmse:0.67249
[589]	train-rmse:1.61437	val-rmse:0.67079
[590]	train-rmse:1.60527	val-rmse:0.66841
[591]	train-rmse:1.59621	val-rmse:0.66624
[592]	train-rmse:1.58723	val-rmse:

In [50]:
now = datetime.now().strftime("%d-%m-%Y-%Hh%Mm%Ss")
modelName = 'xgb-' + now
bst.save_model(modelName)