# Boosting Models
This notebook follows the orders

## Loading our data...

In [2]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import xgboost as xgb

sys.path.append("../../main/datasets/")

# Flag that decides if our model will be based on
# on weeks or on days...
WEEKS_BASED = True

In [3]:
!ls  ../../main/datasets/

1.0v.zip


In [4]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [5]:
orders.head()

Unnamed: 0,time,transactID,itemID,order,salesPrice
0,2018-01-01 00:01:56,2278968,450,1,17.42
1,2018-01-01 00:01:56,2278968,83,1,5.19
2,2018-01-01 00:07:11,2255797,7851,2,20.47
3,2018-01-01 00:09:24,2278968,450,1,17.42
4,2018-01-01 00:09:24,2278968,83,1,5.19


## Preprocessing our orders

These cells were taken from Bruno's "1.1-First Model" Notebook, which can be found in this repository.

In [6]:
process_time(orders)

In [7]:
# This column is just not useful anymore, since we have 'days_backwards' and "group_backwards"
orders.drop('time', axis=1, inplace=True)

In [8]:
# Trying to 'guesstimate' whether an item is on sale.
orders = promo_detector(orders)

In [9]:
# Sanity checking...
orders.loc[orders['promotion'] == 1]

Unnamed: 0,transactID,itemID,order,salesPrice,days,days_backwards,group_backwards,salesPriceMode,promotion
574,3330,7851,1,18.72,49,132,10,20.47,1
575,2293923,7851,1,18.20,50,131,10,20.47,1
813,2263766,7851,1,16.38,120,61,5,20.47,1
1256,2278968,19,1,77.64,1,180,13,79.68,1
1257,2278968,19,1,77.64,1,180,13,79.68,1
...,...,...,...,...,...,...,...,...,...
2180253,2291000,7367,1,6.33,180,1,1,7.92,1
2180259,2260280,7367,1,6.33,180,1,1,7.92,1
2180260,2260280,7367,1,6.33,180,1,1,7.92,1
2180546,2256386,7367,2,6.33,180,1,1,7.92,1


In [10]:
orders = pd.merge(orders, items, on='itemID', validate="m:1")

In [11]:
# Removing useless columns, setting the index
# according to the time scale that we want and
# ordering according to our index.
timeScale, indexScale = ('days_backwards', 'group_backwards') if WEEKS_BASED else (
    'group_backwards', 'days_backwards')
orders.drop(columns=['transactID', 'salesPriceMode',
                     timeScale, 'itemID', 'days'], inplace=True)
orders.set_index(indexScale, inplace=True)
orders.sort_index(inplace=True)
orders

Unnamed: 0_level_0,order,salesPrice,promotion,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
group_backwards,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,159.82,0,243,252,0.0,8,44,8,80.54
1,1,48.11,0,0,39,5.0,2,10,2,21.97
1,1,48.11,0,0,39,5.0,2,10,2,21.97
1,1,48.11,0,0,39,5.0,2,10,2,21.97
1,1,48.11,0,0,39,5.0,2,10,2,21.97
...,...,...,...,...,...,...,...,...,...,...
13,1,38.79,0,93,137,0.0,7,23,6,26.50
13,1,38.79,0,93,137,0.0,7,23,6,26.50
13,1,38.79,0,93,137,0.0,7,23,6,26.50
13,1,38.79,0,93,137,0.0,7,23,6,26.50


In [12]:
train = orders.loc[2:]
x_train = train.drop(columns="order")
y_train = train["order"]

test = orders.loc[1]
x_test = test.drop(columns="order")
y_test = test["order"]

In [13]:
print(f"x_test shape {x_test.shape} x_train shape {x_train.shape}")

x_test shape (234546, 9) x_train shape (1947409, 9)


In [18]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)
# specify parameters via map
#param = {'max_depth':2, 'eta':0.3, 'objective': 'reg:squarederror'}
param = {'objective': 'reg:squarederror'}
num_round = 100

bst = xgb.train(param, dtrain,
                num_round, early_stopping_rounds=5,
                evals = [(dtrain, 'train'), (dtest, 'test')], )

XGBoostError: [17:20:17] /workspace/src/metric/rank_metric.cc:212: Check failed: dat[1] > 0.0f (0 vs. 0) : AUC: the dataset only contains pos or neg samples
Stack trace:
  [bt] (0) /home/joaopedromattos/.local/lib/python3.8/site-packages/xgboost/./lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x54) [0x7f8643d52614]
  [bt] (1) /home/joaopedromattos/.local/lib/python3.8/site-packages/xgboost/./lib/libxgboost.so(+0x25bda6) [0x7f8643e65da6]
  [bt] (2) /home/joaopedromattos/.local/lib/python3.8/site-packages/xgboost/./lib/libxgboost.so(xgboost::LearnerImpl::EvalOneIter(int, std::vector<xgboost::DMatrix*, std::allocator<xgboost::DMatrix*> > const&, std::vector<std::string, std::allocator<std::string> > const&)+0x419) [0x7f8643e3efb9]
  [bt] (3) /home/joaopedromattos/.local/lib/python3.8/site-packages/xgboost/./lib/libxgboost.so(XGBoosterEvalOneIter+0x363) [0x7f8643d45a23]
  [bt] (4) /usr/lib/libffi.so.6(ffi_call_unix64+0x4c) [0x7f869017369a]
  [bt] (5) /usr/lib/libffi.so.6(ffi_call+0x196) [0x7f8690172fb6]
  [bt] (6) /usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0xf084) [0x7f868f8ec084]
  [bt] (7) /usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0x1370d) [0x7f868f8f070d]
  [bt] (8) /usr/lib/libpython3.8.so.1.0(_PyObject_MakeTpCall+0x442) [0x7f8690e81f42]



In [None]:
y_train.std(), y_test.std()

In [None]:
bst.predict()