# Viewing Shapley values from the dora XGBoost 2 model with the data leak removed

## Note: I removed the rows for items with zero total sales

In [1]:
import numpy as np
import pandas as pd
import sys

sys.path.append("../dora/models/")
from utils import read_data, process_time, merge_data, promo_detector, promotionAggregation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import xgboost as xgb
from datetime import datetime

NUMBER_OF_LAGS = 4

## Preparing our dataset
These steps were already covered in the ```../pre-processing-features``` notebooks.

In [2]:
# Load the three raw tables and print their shapes as a quick sanity check.
infos, items, orders = read_data("../main/datasets/")
shapes = [frame.shape for frame in (infos, items, orders)]
print("Sanity checks...", *shapes)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [3]:
# Change our time signatures, add our promotion feature
# and aggregate our data by weeks.
# NOTE(review): process_time's return value is discarded, so it presumably
# modifies 'orders' in place -- confirm in utils. The three calls below must
# stay in this order because each one consumes the result of the previous one.
process_time(orders)
orders = promo_detector(orders)
df = promotionAggregation(orders, items)

In [4]:
def prepareOrders(orders, items):
    """Add a zero-sales row for every (item, week) pair missing from 'orders'.

    THIS IS NOT MODULARIZED, THUS YOU SHOULD CHANGE THE CODE TO BETTER SUIT
    YOUR DATASET FEATURES: the per-item columns copied into the filler rows
    are hard-coded in ``static_columns`` below.

    Parameters
    ----------
    orders : pandas.DataFrame
        Sales rows with at least 'itemID', 'group_backwards' and the columns
        listed in ``static_columns``. It is not modified.
    items : any
        Unused; kept so that existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of 'orders' plus the filler rows, sorted by 'group_backwards'
        (descending) and 'itemID' (ascending), with a fresh 0..n-1 index.
        Filler rows have 'orderSum', 'salesPrice_mean' and 'promotion_mean'
        set to 0; the static columns are copied from the item's first row.
    """
    static_columns = ['customerRating', 'category1', 'category2', 'category3',
                      'recommendedRetailPrice', 'manufacturer', 'brand']

    df = orders.copy()
    weeks_database = df['group_backwards'].unique()

    new_rows = []
    # A single groupby pass replaces re-filtering the whole frame for each item.
    for idd, orders_id in df.groupby('itemID', sort=False):
        example = orders_id.iloc[0]

        # finding weeks without itemID sales
        weeks_without_id = np.setdiff1d(weeks_database,
                                        orders_id['group_backwards'].unique())

        # creating one filler row per missing week
        for w in weeks_without_id:
            row = {'itemID': idd,
                   'group_backwards': w,
                   'salesPrice_mean': 0,
                   'orderSum': 0,
                   'promotion_mean': 0}
            row.update({col: example[col] for col in static_columns})
            new_rows.append(row)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # Skip the concat when there is nothing to add so dtypes stay untouched.
    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True, sort=False)

    return df.sort_values(['group_backwards', 'itemID'],
                          ascending=[False, True], ignore_index=True)

In [5]:
# Row count before and after adding the weeks in which an item had no sales.
rows_before = len(df)
df = prepareOrders(df, items)
print(rows_before)
print(len(df))


39515
127920


In [6]:
# Rich preview of the completed weekly frame (long frames are shown truncated).
display(df)

Unnamed: 0,group_backwards,itemID,orderSum,promotion_mean,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,13,1,0,0.0000,0.000000,0.0,1.0,4.38,1.0,1.0,1.0,8.84
1,13,2,0,0.0000,0.000000,0.0,2.0,3.00,1.0,2.0,1.0,16.92
2,13,3,1,0.0000,14.040000,0.0,3.0,5.00,1.0,3.0,1.0,15.89
3,13,4,0,0.0000,0.000000,0.0,2.0,4.44,1.0,2.0,1.0,40.17
4,13,5,2,0.0000,7.840000,0.0,2.0,2.33,1.0,1.0,1.0,17.04
...,...,...,...,...,...,...,...,...,...,...,...,...
127915,1,10450,34,0.1875,53.555625,182.0,227.0,0.00,8.0,44.0,8.0,36.78
127916,1,10459,0,0.0000,0.000000,180.0,253.0,0.00,8.0,44.0,8.0,56.57
127917,1,10460,0,0.0000,0.000000,0.0,253.0,0.00,8.0,44.0,8.0,163.81
127918,1,10462,0,0.0000,0.000000,180.0,253.0,0.00,8.0,44.0,8.0,166.97


In [7]:
# Build lagged copies of 'orderSum' and 'promotion_mean' per item, plus the
# row-to-row difference of each lagged column.
# shift(lag) reads from `lag` rows earlier within the same item; prepareOrders
# sorts the frame by 'group_backwards' descending, so those rows carry larger
# 'group_backwards' values.
# NOTE(review): range(1, NUMBER_OF_LAGS) produces NUMBER_OF_LAGS - 1 lags
# (1, 2, 3 when NUMBER_OF_LAGS is 4) -- confirm this is intended.

shifting = df.copy()
lagged_features = ('orderSum', 'promotion_mean')

for lag in range(1, NUMBER_OF_LAGS):

    # Value of each feature `lag` rows earlier for the same item
    for feature in lagged_features:
        shifting[f'{feature}_{lag}'] = shifting.groupby('itemID')[feature].shift(lag)

    # Change of each lagged feature with respect to the item's previous row
    for feature in lagged_features:
        shifting[f'{feature}_diff_{lag}'] = shifting.groupby('itemID')[f'{feature}_{lag}'].diff()
    

## Maximum error
A useful model should never do worse than simply guessing, for every item in week 13, the mean of our sales from weeks 1 to 12. We treat the error of that naive guess as the worst acceptable error, and that's what the cell below is computing.

In [8]:
# Naive baseline: predict the mean 'orderSum' of the rows with
# group_backwards < 13 for every row of group_backwards == 13, then report RMSE.
is_week_13 = shifting.group_backwards == 13
target = shifting.loc[is_week_13, 'orderSum']
worst_possible_prediction = shifting.loc[shifting.group_backwards < 13, 'orderSum'].mean()
prediction = np.full(target.shape, worst_possible_prediction)  # constant array filled with the mean
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 93.14493040998501


## Dataset Splitting
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

In [9]:
# Split by week number: 3 and above -> train, 2 -> validation, 1 -> test.
weeks = shifting['group_backwards']
train = shifting[weeks >= 3]
val = shifting[weeks == 2]
test = shifting[weeks == 1]

In [10]:
# Recommendation for the rest of the team: keep the datasets as pandas
# DataFrames rather than NumPy arrays, since that makes the boosting
# analysis frameworks easier to use.
X_train, y_train = train.drop("orderSum", axis="columns"), train["orderSum"]
X_val, y_val = val.drop("orderSum", axis="columns"), val["orderSum"]

In [11]:
# Wrap the pandas frames in XGBoost's DMatrix containers.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

param = {
    'max_depth': 32,
    'eta': 0.01,
    'objective': 'reg:squarederror',
}
num_round = 600

# 'val' is the last entry of the watch list; the training log reports that
# 'val-rmse' is the metric used for early stopping.
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=5,
)

[0]	train-rmse:107.15226	val-rmse:114.37853
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 5 rounds.
[1]	train-rmse:106.40302	val-rmse:114.00581
[2]	train-rmse:105.66099	val-rmse:113.64217
[3]	train-rmse:104.92551	val-rmse:113.28590
[4]	train-rmse:104.19418	val-rmse:112.92827
[5]	train-rmse:103.47201	val-rmse:112.59444
[6]	train-rmse:102.75311	val-rmse:112.27349
[7]	train-rmse:102.04147	val-rmse:111.95448
[8]	train-rmse:101.33382	val-rmse:111.64027
[9]	train-rmse:100.63448	val-rmse:111.33965
[10]	train-rmse:99.94049	val-rmse:111.05751
[11]	train-rmse:99.25253	val-rmse:110.77846
[12]	train-rmse:98.56931	val-rmse:110.51157
[13]	train-rmse:97.89185	val-rmse:110.25754
[14]	train-rmse:97.21864	val-rmse:110.00870
[15]	train-rmse:96.55033	val-rmse:109.76379
[16]	train-rmse:95.88757	val-rmse:109.53152
[17]	train-rmse:95.22805	val-rmse:109.28062
[18]	train-rmse:94.57433	val-rmse:109.04216
[19]	train-rmse:93.92550

### Utilities

**Predicting at test time**

In [12]:
# Keep the test features as a pandas DataFrame: the SHAP cells further down
# pass X_test as the feature matrix for their plots and need its values and
# column names, which an xgb.DMatrix does not expose to them.
y_test = test['orderSum']
X_test = test.drop(columns=["orderSum"])

# The DMatrix is only needed for Booster.predict.
dtest = xgb.DMatrix(X_test)
final_predictions = bst.predict(dtest)

**Saving our model in disk**

In [14]:
# Name the saved model after the current local time, e.g. xgb-31-12-2020-23h59m59s.
timestamp_format = "%d-%m-%Y-%Hh%Mm%Ss"
now = format(datetime.now(), timestamp_format)
modelName = 'xgb-' + now
bst.save_model(modelName)

## Shapley values

Too slow to run here; I will try it on Google Colab later.

In [None]:
import shap

# Explain the trained booster and compute one SHAP value per feature for
# every row of the test week.
explainer = shap.TreeExplainer(model=bst)
shap_values = explainer.shap_values(X=X_test)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.


### Average impact of every feature

In [None]:
# Bar chart summarising the SHAP values of each feature.
shap.summary_plot(shap_values, features=X_test, plot_type="bar")

### Overview of every feature's influence

In [None]:
# Default summary plot of the SHAP values for every feature.
shap.summary_plot(shap_values, features=X_test)

In [None]:
# The lag features are generated for lags 1 and above only, so there is no
# 'orderSum_0' column any more; plot the smallest available lag instead.
shap.dependence_plot("orderSum_1", shap_values, X_test)

## conclusions:

1- `orderSum_0` is basically the only important feature, and that is because its value is the same as the answer: the model has a data leak

2- Be careful with `print(database)`: it doesn't show all the columns; the print below does not show `orderSum_0`

In [None]:
# Plain-text print of the frame followed by its full column index.
print(X_train, X_train.columns, sep="\n")

In [None]:
# Remove the column limit so the rich display shows every column.
pd.set_option("display.max_columns", None)
display(X_train)