# XGBoost Enhanced Features
This notebook is being created after the addition of Promotion feature to the dataset and the main goal is to submit the predictions of this notebook in our private Kaggle Leaderboard

In [18]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector, promotionAggregation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

NUMBER_OF_LAGS = 1

sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


## Preparing our dataset
These steps were already seen on ```../pre-processing-features``` notebooks.

In [2]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [3]:
# Changing our time signatures, 
# adding our promotion feature 
# and aggregating our data by weeks...
process_time(orders)
orders = promo_detector(orders)
df = promotionAggregation(orders, items)

In [4]:
def prepareOrders(orders, items):
    """This function is responsible for adding in our 'orders' dataframe
    the items that were not sold. THIS IS NOT MODULARIZED, THUS YOU
    SHOULD CHANGE THE CODE TO BETTER SUIT YOUR DATASET FEATURES
    """
    
    df = orders.copy()
    
    # Getting the IDs that were never sold
    not_sold_items = items[np.logical_not(
        items.itemID.isin(sorted(orders['itemID'].unique())))]

    new_rows = []
    weeks_database = orders['group_backwards'].unique()

    for idd in df['itemID'].unique():
        orders_id = df[df.itemID == idd]
        example = orders_id.iloc[0]

        # finding weeks without itemID sales
        weeks_id = orders_id['group_backwards'].unique()
        weeks_without_id = np.setdiff1d(weeks_database, weeks_id)

        # creating new row
        for w in weeks_without_id:
            new_rows.append({'itemID': idd,
                             'group_backwards': w,
                             'salesPrice_mean': 0,
                             'customerRating': example['customerRating'],
                             'category1': example['category1'],
                             'category2': example['category2'],
                             'category3': example['category3'],
                             'recommendedRetailPrice': example['recommendedRetailPrice'],
                             'orderSum': 0,
                             'manufacturer': example['manufacturer'],
                             'brand': example['brand'],
                             'promotion_mean': 0
                             })
    #  Adding rows in every week with the IDs of the
    # items that were never sold.
    df = df.append(new_rows)
    not_sold_orders = pd.DataFrame()
    for i in range(1, 14):
        aux = not_sold_items.copy()
        aux['group_backwards'] = i
        aux['salesPrice_mean'] = 0
        aux['promotion_mean'] = 0
        aux['orderSum'] = 0
        not_sold_orders = pd.concat([not_sold_orders, aux], axis=0)
    df = pd.concat([df, not_sold_orders], axis=0).sort_values(
        ['group_backwards', 'itemID'], ascending=[False, True], ignore_index=True)
    return df

In [5]:
df = prepareOrders(df, items)

In [19]:
# This cell lags and diffs our features 'orderSum' and "promotion"

shifting = df.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)
    shifting[f'promotion_mean_{i}'] = shifting.groupby('itemID')['promotion_mean'].shift(i)
    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    shifting[f'promotion_mean_diff_{i}'] = shifting.groupby('itemID')[f'promotion_mean_{i}'].diff()
shifting.fillna(np.inf)

Unnamed: 0,group_backwards,itemID,orderSum,promotion_mean,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,orderSum_1,promotion_mean_1,orderSum_diff_1,promotion_mean_diff_1
0,13,1,0,0.0,0.00,0.0,1.0,4.38,1.0,1.0,1.0,8.84,inf,inf,inf,inf
1,13,2,0,0.0,0.00,0.0,2.0,3.00,1.0,2.0,1.0,16.92,inf,inf,inf,inf
2,13,3,1,0.0,14.04,0.0,3.0,5.00,1.0,3.0,1.0,15.89,inf,inf,inf,inf
3,13,4,0,0.0,0.00,0.0,2.0,4.44,1.0,2.0,1.0,40.17,inf,inf,inf,inf
4,13,5,2,0.0,7.84,0.0,2.0,2.33,1.0,1.0,1.0,17.04,inf,inf,inf,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0,0.0,0.00,180.0,253.0,0.00,8.0,44.0,8.0,56.57,0.0,0.0,0.0,0.0
136015,1,10460,0,0.0,0.00,0.0,253.0,0.00,8.0,44.0,8.0,163.81,0.0,0.0,0.0,0.0
136016,1,10461,0,0.0,0.00,0.0,253.0,0.00,8.0,44.0,8.0,128.01,0.0,0.0,0.0,0.0
136017,1,10462,0,0.0,0.00,180.0,253.0,0.00,8.0,44.0,8.0,166.97,0.0,0.0,0.0,0.0


## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [20]:
worst_possible_prediction = shifting.loc[shifting.group_backwards < 13]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 13]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 13]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 90.29706562119341


## Dataset Splitting
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

In [21]:
train = shifting.loc[shifting.group_backwards >= 3]
val = shifting.loc[shifting.group_backwards == 2]
test = shifting.loc[shifting.group_backwards == 1]

In [22]:
# I recommend to the other members of the team keeping the
# datatypes of our datasets as Pandas DataFrames instead of Numpy,
# since It will easier to use Boosting Analysis frameworks
y_train = train['orderSum']
y_val = val['orderSum']
X_train = train.drop(columns=["orderSum"])
X_val = val.drop(columns=["orderSum"])

In [23]:
dtrain = xgb.DMatrix(X_train, y_train, missing=np.inf)
dval = xgb.DMatrix(X_val, y_val, missing=np.inf)

param = {'max_depth':8, 'eta':0.01, 'objective':'reg:squarederror' }
num_round = 1000
bst = xgb.train(param, dtrain,
                num_round, early_stopping_rounds = 5,
                evals = [(dtrain, 'train'), (dval, 'val')])

[0]	train-rmse:104.32712	val-rmse:110.88815
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 5 rounds.
[1]	train-rmse:104.01231	val-rmse:110.49053
[2]	train-rmse:103.71067	val-rmse:110.09895
[3]	train-rmse:103.39635	val-rmse:109.67645
[4]	train-rmse:103.09214	val-rmse:109.25549
[5]	train-rmse:102.80353	val-rmse:108.88404
[6]	train-rmse:102.50948	val-rmse:108.48144
[7]	train-rmse:102.21883	val-rmse:108.08079
[8]	train-rmse:101.93532	val-rmse:107.72778
[9]	train-rmse:101.65281	val-rmse:107.34274
[10]	train-rmse:101.38764	val-rmse:107.00610
[11]	train-rmse:101.11524	val-rmse:106.64477
[12]	train-rmse:100.84586	val-rmse:106.28210
[13]	train-rmse:100.58027	val-rmse:105.93525
[14]	train-rmse:100.32457	val-rmse:105.62874
[15]	train-rmse:100.06397	val-rmse:105.27467
[16]	train-rmse:99.82569	val-rmse:104.99045
[17]	train-rmse:99.57445	val-rmse:104.69459
[18]	train-rmse:99.34520	val-rmse:104.41607
[19]	train-rmse:99

[185]	train-rmse:78.75133	val-rmse:87.31165
[186]	train-rmse:78.70046	val-rmse:87.30790
[187]	train-rmse:78.62048	val-rmse:87.28355
[188]	train-rmse:78.56756	val-rmse:87.28603
[189]	train-rmse:78.48887	val-rmse:87.26333
[190]	train-rmse:78.43616	val-rmse:87.25157
[191]	train-rmse:78.38422	val-rmse:87.25124
[192]	train-rmse:78.32304	val-rmse:87.24373
[193]	train-rmse:78.25006	val-rmse:87.22503
[194]	train-rmse:78.19811	val-rmse:87.22256
[195]	train-rmse:78.14976	val-rmse:87.22456
[196]	train-rmse:78.08677	val-rmse:87.21282
[197]	train-rmse:78.03642	val-rmse:87.20534
[198]	train-rmse:77.97231	val-rmse:87.20055
[199]	train-rmse:77.92088	val-rmse:87.19482
[200]	train-rmse:77.87418	val-rmse:87.19683
[201]	train-rmse:77.82885	val-rmse:87.20520
[202]	train-rmse:77.77353	val-rmse:87.17487
[203]	train-rmse:77.71316	val-rmse:87.16915
[204]	train-rmse:77.66768	val-rmse:87.17066
[205]	train-rmse:77.61461	val-rmse:87.17286
[206]	train-rmse:77.56091	val-rmse:87.14408
[207]	train-rmse:77.51360	val-rm

In [24]:
params = {
          "objective" : "poisson",
          "metric" :"rmse",
          "learning_rate" : 0.1,
          'verbosity': 2,
          'max_depth': 8
         }

lgbtrain = lgb.Dataset(X_train, label = y_train)
lgbvalid = lgb.Dataset(X_val, label = y_val)

num_round = 1000
model = lgb.train(params, lgbtrain, num_round, valid_sets = [lgbtrain, lgbvalid], 
                  verbose_eval=20, early_stopping_rounds=20)

Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 83.0187	valid_1's rmse: 91.6425
[40]	training's rmse: 77.7523	valid_1's rmse: 91.787
Early stopping, best iteration is:
[24]	training's rmse: 81.5195	valid_1's rmse: 91.1929


### Utilities

**Predicting at test time**

In [22]:
y_test = test['orderSum']
X_test = xgb.DMatrix(test.drop(columns=["orderSum"]))
final_predictions = bst.predict(X_test)

**Creating our Kaggle CSV**

In [30]:
final = pd.Series(0, index=np.arange(1, len(items)+1))
final[items.itemID] = np.rint(final_predictions)

final.to_csv("kaggle_df.csv", header=["demandPrediction"],
            index_label="itemID")

**Saving our model in disk**

In [50]:
now = datetime.now().strftime("%d-%m-%Y-%Hh%Mm%Ss")
modelName = 'xgb-' + now
bst.save_model(modelName)