# LGBM - Item Relevance Feature

In [203]:
import numpy as np
import pandas as pd
from utils import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

NUMBER_OF_LAGS = 2

sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


<hr>

## Defining metrics

Baseline_score function

In [204]:
def baseline_score(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice)

Evaluation Metric

In [205]:
def feval(prediction, dtrain):
    
    prediction = prediction.astype(int)
    target = dtrain.get_label()

    simulatedPrice = dtrain.get_weight()
    
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), True

Objective Metric

In [206]:
def gradient(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight()
    return -2 * (predt - np.maximum(predt - y, 0) * 1.6) * (1 - (predt > y) * 1.6) * sp

def hessian(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight() 
    return -2 * ((1 - (predt > y) * 1.6) ** 2) * sp

def objective(predt, dtrain):
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

<hr>

## Building our dataset
This notebook makes this step cleaner than the previous versions. So It'll be tidier and shorter than before!

In [207]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [208]:
# Changing our time signatures
process_time(orders)

In [209]:
df = dataset_builder(orders, items)

<hr>

## Feature building

In [210]:
# percentage_accum_cat_3 feature...
df = cumulative_sale_by_category(df)

In [211]:
# Encoding our weeks as a series of sines and cosines...
# This function will consider our period as a semester in a year,
# so we can try other types of time encoding later!
df = time_encoder(df, 'group_backwards', 26)

In [212]:
shifting = df.copy()

In [213]:
# This cell creates rolling-window features based on 'orderSum' in our dataset!
item_group = shifting.groupby(["itemID", "group_backwards"]).agg({'orderSum':'sum'})

# We'll .shift(-1) because it sorts our "group_backwards", 
# so doing .shift(1) would cause a HUGE dataleak.
aux_shifting = item_group.groupby('itemID')[['orderSum']].shift(-1)

aux_shifting.sort_values(['itemID', 'group_backwards'], ascending=[True, False], inplace=True)

for i in range(3):
    rolled_window = aux_shifting.groupby(['itemID'], as_index=False)[['orderSum']].rolling(2 ** i).mean()
    rolled_window.rename(columns={'orderSum':f"orderSum_mean_rolled_{i}"}, inplace=True)
    shifting = pd.merge(shifting, rolled_window, left_on=['itemID', 'group_backwards'], right_on=['itemID', 'group_backwards'])

In [214]:
# This cell adds to our dataset two features: "itemRelevance" and "soldUntilThisWeek"
# 'soldUntilThisWeek' = weeksThatAGivenItemSold / numberOfPastWeeks
# 'itemRelevance' = 'soldUntilThisWeek' * 'recomendedRetailPrice'
df_copy = df.copy()

df_copy['soldUntilThisWeek'] = 0
df_copy.loc[df.orderSum > 0, 'soldUntilThisWeek'] = 1

group_item_sold = df_copy.groupby(['group_backwards', 'itemID']).agg({'soldUntilThisWeek':'sum'})

# Taking infos from the past to the present date with shifting...
# We'll .shift(-1) because it sorts our "group_backwards", 
# so doing .shift(1) would cause a HUGE dataleak.
test = group_item_sold.groupby('itemID')[['soldUntilThisWeek']].shift(-1)
test.sort_values(['itemID', 'group_backwards'], ascending=[True, False], inplace=True)
test.reset_index(inplace=True)
aux = pd.DataFrame()
for i in range(13, 0, -1):
    accum = test.loc[test.group_backwards >= i].groupby(['itemID'], as_index=False).agg({'soldUntilThisWeek':'sum'})
    accum['group_backwards'] = i
    accum['soldUntilThisWeek'] = accum['soldUntilThisWeek'] / (13 - i)
    aux = pd.concat([aux, accum])
    
aux.reset_index(0, drop=True, inplace=True)

new_feature = pd.merge(shifting, aux, left_on=['group_backwards', 'itemID'], right_on=['group_backwards', 'itemID'])
new_feature['itemRelevance'] = new_feature['recommendedRetailPrice'] * new_feature['soldUntilThisWeek']

In [215]:
shifting = new_feature

In [216]:
# This cell lags and diffs our feature 'orderSum'
for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1 and t-2
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)
    shifting[f'soldUntilThisWeek_{-i}'] = shifting.groupby('itemID')['soldUntilThisWeek'].shift(i)
    shifting[f'itemRelevance_{-i}'] = shifting.groupby('itemID')['itemRelevance'].shift(i)
    
    # Getting the difference between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    shifting[f'soldUntilThisWeek_diff_{-i}'] = shifting.groupby('itemID')[f'soldUntilThisWeek_{-i}'].shift(i)
    shifting[f'itemRelevance_{-i}'] = shifting.groupby('itemID')[f'itemRelevance_{-i}'].shift(i)


In [217]:
shifting

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,...,orderSum_1,soldUntilThisWeek_-1,itemRelevance_-1,orderSum_diff_1,soldUntilThisWeek_diff_-1,orderSum_2,soldUntilThisWeek_-2,itemRelevance_-2,orderSum_diff_2,soldUntilThisWeek_diff_-2
0,13,1,0.0,0,1,4.38,1,1,1,8.84,...,,,,,,,,,,
1,13,2,0.0,0,2,3.00,1,2,1,16.92,...,,,,,,,,,,
2,13,3,1.0,0,3,5.00,1,3,1,15.89,...,,,,,,,,,,
3,13,4,0.0,0,2,4.44,1,2,1,40.17,...,,,,,,,,,,
4,13,5,2.0,0,2,2.33,1,1,1,17.04,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0.0,180,253,0.00,8,44,8,56.57,...,0.0,0.090909,5.657,0.0,0.1,0.0,0.1,0.00000,-1.0,0.000
136015,1,10460,0.0,0,253,0.00,8,44,8,163.81,...,0.0,0.090909,16.381,0.0,0.1,0.0,0.1,20.47625,0.0,0.125
136016,1,10461,0.0,0,253,0.00,8,44,8,128.01,...,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.00000,0.0,0.000
136017,1,10462,0.0,180,253,0.00,8,44,8,166.97,...,0.0,0.090909,16.697,0.0,0.1,0.0,0.1,0.00000,0.0,0.000


In [218]:
aux = pd.DataFrame()
for i in range(13, 0, -1):
    accum = shifting.loc[df.group_backwards >= i].groupby(['itemID']).agg(orderSum_median=('orderSum','median'))
    accum['group_backwards'] = i
    aux = pd.concat([aux, accum])
aux.reset_index(inplace=True)
aux['orderSum_median'] = aux.groupby('itemID')['orderSum_median'].shift(1)
shifting = pd.merge(shifting, aux, left_on=['itemID', 'group_backwards'], right_on=['itemID', 'group_backwards'])
shifting['median_gap'] = shifting['orderSum_1'] - shifting['orderSum_median']

In [219]:
# LGBM Says on docs that it automatically handles zero values as NaN,
# so we'll keep this standard...
shifting.fillna(0, inplace=True)

<hr>

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [220]:
worst_possible_prediction = shifting.loc[shifting.group_backwards < 13]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 13]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 13]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 90.29706562119341


<hr>

## Dataset Splitting (Train until week 3 / Val. week 2/ Test week 1)
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

In [221]:
train = shifting.loc[shifting.group_backwards >= 3]
val = shifting.loc[shifting.group_backwards == 2]
test = shifting.loc[shifting.group_backwards == 1]

weights = infos.set_index('itemID')['simulationPrice'].to_dict()

w_train = train['itemID'].map(weights)
w_val = val['itemID'].map(weights)

In [307]:
# I recommend to the other members of the team keeping the
# datatypes of our datasets as Pandas DataFrames instead of Numpy,
# since It will easier to use Boosting Analysis frameworks
y_train = train['orderSum']
y_val = val['orderSum']
X_train = train.drop(columns=["orderSum"])
X_val = val.drop(columns=["orderSum"])

In [308]:
params = {
#           "objective" : "poisson",
          "objective" : "l1",
          "metric" :"rmse",
          "learning_rate" : 0.5,
          'verbosity': 1,
          'max_depth': 76,
          'num_leaves': 30,
          "min_data_in_leaf":1000,
         }


lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train, categorical_feature=[6,7,8])
lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val, categorical_feature=[6,7,8])

num_round = 1000
model = lgb.train(params,
                  lgbtrain,
                  num_round,
                  valid_sets = [lgbtrain, lgbvalid], 
                  verbose_eval=5,
                  early_stopping_rounds=5,
#                   fobj=objective,
                  feval=feval,
#                   callbacks=[lgb.reset_parameter(learning_rate=lambda current_round: alpha * np.e**(k * current_round))]
                  
                 )

Training until validation scores don't improve for 5 rounds
[5]	training's rmse: 40.3085	training's feval: 602433	valid_1's rmse: 43.6388	valid_1's feval: 74006.1
[10]	training's rmse: 40.2735	training's feval: 771823	valid_1's rmse: 43.5891	valid_1's feval: 97000.8
Early stopping, best iteration is:
[7]	training's rmse: 40.2786	training's feval: 720139	valid_1's rmse: 43.5979	valid_1's feval: 98986.6


In [281]:
X_train.columns[list(reversed(model.feature_importance().argsort()))]

Index(['category2', 'group_backwards', 'itemID', 'manufacturer',
       'orderSum_mean_rolled_2', 'orderSum_mean_rolled_1',
       'group_backwards_sin', 'percentage_accum_cat_3', 'soldUntilThisWeek',
       'brand', 'itemRelevance', 'customerRating', 'orderSum_diff_1',
       'orderSum_median', 'orderSum_diff_2', 'orderSum_mean_rolled_0',
       'soldUntilThisWeek_-1', 'itemRelevance_-1', 'soldUntilThisWeek_diff_-2',
       'orderSum_2', 'soldUntilThisWeek_diff_-1', 'category1', 'median_gap',
       'category3', 'itemRelevance_-2', 'soldUntilThisWeek_-2',
       'recommendedRetailPrice', 'orderSum_1', 'group_backwards_cos'],
      dtype='object')

<hr>

## Dataset Splitting (Train until week 2 and test with week 1)
All my experiments will use weeks 13 to 2 as a train set and week 1 as test

In [272]:
train = shifting.loc[shifting.group_backwards >= 2]
val = shifting.loc[shifting.group_backwards == 1]
test = shifting.loc[shifting.group_backwards == 1]

weights = infos.set_index('itemID')['simulationPrice'].to_dict()

w_train = train['itemID'].map(weights)
w_val = val['itemID'].map(weights)

In [273]:
# I recommend to the other members of the team keeping the
# datatypes of our datasets as Pandas DataFrames instead of Numpy,
# since It will easier to use Boosting Analysis frameworks
y_train = train['orderSum']
y_val = val['orderSum']
X_train = train.drop(columns=["orderSum"])
X_val = val.drop(columns=["orderSum"])

In [306]:
params = {
#           "objective" : "poisson",
          "objective" : "l1",
          "metric" :"rmse",
          "learning_rate" : 0.5,
          'verbosity': 1,
          'max_depth': 5,
          'num_leaves': 19,
          "min_data_in_leaf":3000,
         }

lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train)
lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val)

num_round = 1000
model = lgb.train(params,
                  lgbtrain,
                  num_round,
                  valid_sets = [lgbtrain, lgbvalid], 
                  verbose_eval=5,
                  early_stopping_rounds=5,
#                   fobj=objective,
                  feval=feval,
                  
                 )

Training until validation scores don't improve for 5 rounds
[5]	training's rmse: 40.3084	training's feval: 567297	valid_1's rmse: 43.6407	valid_1's feval: 72622.2
[10]	training's rmse: 40.2874	training's feval: 649069	valid_1's rmse: 43.6131	valid_1's feval: 83109.1
[15]	training's rmse: 40.2864	training's feval: 653121	valid_1's rmse: 43.6138	valid_1's feval: 82690
Early stopping, best iteration is:
[10]	training's rmse: 40.2874	training's feval: 649069	valid_1's rmse: 43.6131	valid_1's feval: 83109.1


<hr>

### Utilities

**Predicting at test time**

In [309]:
y_test = test['orderSum']
X_test = test.drop(columns=["orderSum"])
final_predictions = model.predict(X_test)

In [310]:
final_predictions

array([0.5 , 0.  , 0.75, ..., 1.  , 0.  , 1.  ])

In [311]:
final_predictions[final_predictions < 0] = 0

**Baseline calculation**

In [312]:
baseline_score(final_predictions, y_test.values, infos['simulationPrice'])

98986.63799999999

**Creating our Kaggle CSV**

In [None]:
final = pd.Series(0, index=np.arange(1, len(items)+1))
final[items.itemID] = final_predictions.astype(int)

final.to_csv("lgbm_kaggle_df.csv", header=["demandPrediction"],
            index_label="itemID", sep="|")

**Saving our model in disk**

In [None]:
now = datetime.now().strftime("%d-%m-%Y-%Hh%Mm%Ss")
modelName = 'lgbm-' + now
xgb.save_model(modelName)