# Boosting Models
This notebook trains a gradient-boosting (XGBoost) regression model on the orders data.

## Loading our data...

In [1]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import xgboost as xgb

# Add the datasets directory to the module search path.
# NOTE(review): this runs after `from utils import ...` in this same cell,
# so it cannot influence that import — confirm whether it is still needed.
sys.path.append("../../main/datasets/")

# Flag that decides whether our model will be based
# on weeks or on days...
WEEKS_BASED = True

In [2]:
# List the contents of the datasets directory (shell command).
!ls  ../../main/datasets/

1.0v.zip


In [3]:
# Load the three tables from the datasets directory and print their shapes
# as a quick sanity check.
data_dir = "../../main/datasets/"
infos, items, orders = read_data(data_dir)
print("Sanity checks...", *(table.shape for table in (infos, items, orders)))

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [4]:
# Peek at the first rows of the orders table.
orders.head()

Unnamed: 0,time,transactID,itemID,order,salesPrice
0,2018-01-01 00:01:56,2278968,450,1,17.42
1,2018-01-01 00:01:56,2278968,83,1,5.19
2,2018-01-01 00:07:11,2255797,7851,2,20.47
3,2018-01-01 00:09:24,2278968,450,1,17.42
4,2018-01-01 00:09:24,2278968,83,1,5.19


## Preprocessing our orders

These cells were taken from Bruno's "1.1-First Model" Notebook, which can be found in this repository.

In [5]:
# Called for its effect on `orders`; the return value is not captured.
# Presumably adds the 'days', 'days_backwards' and 'group_backwards' columns
# that appear in later cells — TODO confirm in utils.process_time.
process_time(orders)

In [6]:
# The raw timestamp is not useful anymore, since we have 'days_backwards'
# and 'group_backwards'. Rebinding instead of mutating in place, together
# with errors="ignore", keeps this cell safe to re-run: a second execution
# is a no-op rather than a KeyError.
orders = orders.drop(columns="time", errors="ignore")

In [7]:
# Trying to 'guesstimate' whether an item is on sale.
# Presumably this adds the 'salesPriceMode' and 'promotion' columns that show
# up in the following cell's output — TODO confirm in utils.promo_detector.
orders = promo_detector(orders)

In [8]:
# Sanity check: show only the rows that were flagged as promotions.
promo_mask = orders['promotion'].eq(1)
orders[promo_mask]

Unnamed: 0,transactID,itemID,order,salesPrice,days,days_backwards,group_backwards,salesPriceMode,promotion
574,3330,7851,1,18.72,49,132,10,20.47,1
575,2293923,7851,1,18.20,50,131,10,20.47,1
813,2263766,7851,1,16.38,120,61,5,20.47,1
1256,2278968,19,1,77.64,1,180,13,79.68,1
1257,2278968,19,1,77.64,1,180,13,79.68,1
...,...,...,...,...,...,...,...,...,...
2180253,2291000,7367,1,6.33,180,1,1,7.92,1
2180259,2260280,7367,1,6.33,180,1,1,7.92,1
2180260,2260280,7367,1,6.33,180,1,1,7.92,1
2180546,2256386,7367,2,6.33,180,1,1,7.92,1


In [9]:
# Attach each item's attributes to its order rows. `validate` makes pandas
# raise if 'itemID' is not unique on the items side of the join.
orders = orders.merge(items, how="inner", on="itemID", validate="many_to_one")

In [10]:
# Choose which backwards-counting column becomes the index (indexScale) and
# which one is discarded (timeScale), according to WEEKS_BASED.
if WEEKS_BASED:
    timeScale, indexScale = 'days_backwards', 'group_backwards'
else:
    timeScale, indexScale = 'group_backwards', 'days_backwards'

# Drop the columns that are not used as features, index the frame by the
# selected time scale and order the rows by that index.
unused_columns = ['transactID', 'salesPriceMode', timeScale, 'itemID', 'days']
orders = (
    orders
    .drop(columns=unused_columns)
    .set_index(indexScale)
    .sort_index()
)
orders

Unnamed: 0_level_0,order,salesPrice,promotion,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
group_backwards,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,159.82,0,243,252,0.0,8,44,8,80.54
1,1,48.11,0,0,39,5.0,2,10,2,21.97
1,1,48.11,0,0,39,5.0,2,10,2,21.97
1,1,48.11,0,0,39,5.0,2,10,2,21.97
1,1,48.11,0,0,39,5.0,2,10,2,21.97
...,...,...,...,...,...,...,...,...,...,...
13,1,38.79,0,93,137,0.0,7,23,6,26.50
13,1,38.79,0,93,137,0.0,7,23,6,26.50
13,1,38.79,0,93,137,0.0,7,23,6,26.50
13,1,38.79,0,93,137,0.0,7,23,6,26.50


In [13]:
# Hold out the period with index label 1 (the most recent one, since the
# index counts backwards) as the test set and train on every older period.
# The index is sorted, so a label slice selects labels 2 and up.
TEST_PERIOD = 1

train = orders.loc[TEST_PERIOD + 1:]
x_train = train.drop(columns="order")
y_train = train["order"]

# Selecting with a list always yields a DataFrame. A scalar label returns a
# Series when only one row carries it, and the column drop below would fail.
test = orders.loc[[TEST_PERIOD]]
x_test = test.drop(columns="order")
y_test = test["order"]

In [17]:
# Report the dimensions of the test and train feature matrices.
print(f"x_test shape {x_test.shape} x_train shape {x_train.shape}")

x_test shape (234546, 10) x_train shape (1947409, 10)


In [14]:
# Wrap both splits (features plus target) in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Booster settings: only the objective is set explicitly. A variant tried
# earlier was {'max_depth': 2, 'eta': 0.3, 'objective': 'reg:squarederror'}.
param = {'objective': 'reg:squarederror'}
num_round = 100

# Early stopping is driven by the last entry of the watchlist ('test').
# NOTE(review): the held-out split therefore also selects the stopping round,
# so the reported test-rmse is optimistic — consider a separate validation set.
watchlist = [(dtrain, 'train'), (dtest, 'test')]
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=5,
)

[0]	train-rmse:0.82920	test-rmse:0.86932
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 5 rounds.
[1]	train-rmse:0.73332	test-rmse:0.76870
[2]	train-rmse:0.68017	test-rmse:0.71503
[3]	train-rmse:0.65214	test-rmse:0.68477
[4]	train-rmse:0.63677	test-rmse:0.67022
[5]	train-rmse:0.62843	test-rmse:0.66171
[6]	train-rmse:0.62402	test-rmse:0.65821
[7]	train-rmse:0.62149	test-rmse:0.65791
[8]	train-rmse:0.62012	test-rmse:0.65679
[9]	train-rmse:0.61892	test-rmse:0.65687
[10]	train-rmse:0.61809	test-rmse:0.65657
[11]	train-rmse:0.61766	test-rmse:0.65638
[12]	train-rmse:0.61710	test-rmse:0.65653
[13]	train-rmse:0.61672	test-rmse:0.65692
[14]	train-rmse:0.61635	test-rmse:0.65634
[15]	train-rmse:0.61565	test-rmse:0.65581
[16]	train-rmse:0.61539	test-rmse:0.65603
[17]	train-rmse:0.61498	test-rmse:0.65597
[18]	train-rmse:0.61450	test-rmse:0.65589
[19]	train-rmse:0.61420	test-rmse:0.65561
[20]	train-rmse:0.61390	test

In [15]:
# Standard deviation of the target in each split, as a reference point for
# the RMSE values logged during training.
tuple(target.std() for target in (y_train, y_test))

(0.6633578168435429, 0.6915112826663401)

In [None]:
# Booster.predict needs the DMatrix to score; calling it without arguments
# raises a TypeError. Predict the order quantities for the held-out split.
bst.predict(dtest)