# LGBM - Rolling Windows

In [1]:
import numpy as np
import pandas as pd
from utils import *
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

NUMBER_OF_LAGS = 4

sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


<hr>

## Defining metrics

Baseline_score function

In [2]:
def baseline_score(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice)

Evaluation Metric

In [3]:
def feval(prediction, dtrain):
    
    prediction = prediction.astype(int)
    target = dtrain.get_label()

    simulatedPrice = dtrain.get_weight()
    
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), True

Objective Metric

In [4]:
def gradient(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight()
    return -2 * (predt - np.maximum(predt - y, 0) * 1.6) * (1 - (predt > y) * 1.6) * sp

def hessian(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight() 
    return -2 * ((1 - (predt > y) * 1.6) ** 2) * sp

def objective(predt, dtrain):
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

<hr>

## Building our dataset
This notebook makes this step cleaner than the previous versions. So It'll be tidier and shorter than before!

In [5]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [6]:
orders.head()

Unnamed: 0,time,transactID,itemID,order,salesPrice
0,2018-01-01 00:01:56,2278968,450,1,17.42
1,2018-01-01 00:01:56,2278968,83,1,5.19
2,2018-01-01 00:07:11,2255797,7851,2,20.47
3,2018-01-01 00:09:24,2278968,450,1,17.42
4,2018-01-01 00:09:24,2278968,83,1,5.19


In [7]:
# Changing our time signatures
process_time(orders)

In [8]:
orders.head()

Unnamed: 0,time,transactID,itemID,order,salesPrice,days,days_backwards,group_backwards
0,2018-01-01 00:01:56,2278968,450,1,17.42,1,180,13
1,2018-01-01 00:01:56,2278968,83,1,5.19,1,180,13
2,2018-01-01 00:07:11,2255797,7851,2,20.47,1,180,13
3,2018-01-01 00:09:24,2278968,450,1,17.42,1,180,13
4,2018-01-01 00:09:24,2278968,83,1,5.19,1,180,13


In [9]:
df = dataset_builder(orders, items)

In [10]:
print(df.shape)
df.head()

(136019, 10)


Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,13,1,0.0,0,1,4.38,1,1,1,8.84
1,13,2,0.0,0,2,3.0,1,2,1,16.92
2,13,3,1.0,0,3,5.0,1,3,1,15.89
3,13,4,0.0,0,2,4.44,1,2,1,40.17
4,13,5,2.0,0,2,2.33,1,1,1,17.04


<hr>

## Feature building

In [11]:
# Is in utils.py
def cumulative_sale_by_category(df):
    """
    This function add the percentage_acum_cat_3 in our dataset, which tries to describe how 
    important a certain item is inside Its group on category 3.

    Parameters: orders -> Orders DataFrame after "process_time" and "dataset_builder"

    Returns: our orders Dataframe with a new column (percentage_acum_cat_3)
    """
    acum = pd.DataFrame()
    for i in range(12, 0, -1):

        orders_per_item = df.loc[df.group_backwards > i].groupby(
            ['itemID', 'category3'], as_index=False).agg({'orderSum': 'sum'})
        orders_per_cat = df.loc[df.group_backwards > i].groupby(
            ['category3'], as_index=False).agg({'orderSum': 'sum'})

        # Mergin' the amount of sales by category
        # with the accumulated sales
        # of an item grouped by category
        # of the previous weeks
        cum_sum_mean = pd.merge(orders_per_item, orders_per_cat,
                                left_on='category3', right_on='category3', validate="m:1")

        # Calculating the mean of the accumulated sales...
        cum_sum_mean['percentage_accum_cat_3'] = cum_sum_mean['orderSum_x'] / \
            cum_sum_mean['orderSum_y'] * 100

        # These columns won't be useful anymore,
        # since they were used just to calculate our mean
        cum_sum_mean.drop(columns=['orderSum_x', 'orderSum_y'], inplace=True)

        feature_merge = pd.merge(df.loc[df.group_backwards == i], cum_sum_mean.drop(
            columns=['category3']), left_on='itemID', right_on='itemID')
        acum = pd.concat([acum, feature_merge])

    week_13 = df.loc[df.group_backwards == 13].copy()
    week_13['percentage_accum_cat_3'] = 0
    acum = pd.concat([week_13, acum])

    assert (acum.loc[acum.group_backwards == 13]['percentage_accum_cat_3'].sum(
    ) == 0), ("The values on week 13 should all be zero. Verify your inputs")
    
    acum.reset_index(drop=True, inplace=True)

    return acum

In [12]:
# percentage_accum_cat_3 feature...
df = cumulative_sale_by_category(df)

In [13]:
df.tail()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,percentage_accum_cat_3
136014,1,10459,0.0,180,253,0.0,8,44,8,56.57,0.001074
136015,1,10460,0.0,0,253,0.0,8,44,8,163.81,0.001074
136016,1,10461,0.0,0,253,0.0,8,44,8,128.01,0.0
136017,1,10462,0.0,180,253,0.0,8,44,8,166.97,0.001074
136018,1,10463,0.0,0,253,0.0,8,44,8,154.82,0.001074


In [14]:
# Is in utils.py
def time_encoder(data, col, max_val):
    """This function aims to encode a time series in function sines and cosines.
    
    Parameters
    -------------
    data : A pandas DataFrame with all the dataset
    col : A string corresponding to the name of the column that will be encoded
    max_val : Size of the time-window of encoding
                    
    Return
    -------------
    A new pandas DataFrame with two new columns, one encoded as sin and other as cosine.
    """
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [15]:
# Encoding our weeks as a series of sines and cosines...
# This function will consider our period as a semester in a year,
# so we can try other types of time encoding later!
df = time_encoder(df, 'group_backwards', 26)

In [16]:
df.loc[df['group_backwards'] == 12]

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,percentage_accum_cat_3,group_backwards_sin,group_backwards_cos
10463,12,1,2.0,0,1,4.38,1,1,1,8.84,0.000000,0.239316,-0.970942
10464,12,2,0.0,0,2,3.00,1,2,1,16.92,0.000000,0.239316,-0.970942
10465,12,3,95.0,0,3,5.00,1,3,1,15.89,0.008972,0.239316,-0.970942
10466,12,4,1.0,0,2,4.44,1,2,1,40.17,0.000000,0.239316,-0.970942
10467,12,5,1.0,0,2,2.33,1,1,1,17.04,0.017944,0.239316,-0.970942
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20921,12,10459,0.0,180,253,0.00,8,44,8,56.57,0.000000,0.239316,-0.970942
20922,12,10460,0.0,0,253,0.00,8,44,8,163.81,0.000000,0.239316,-0.970942
20923,12,10461,0.0,0,253,0.00,8,44,8,128.01,0.000000,0.239316,-0.970942
20924,12,10462,0.0,180,253,0.00,8,44,8,166.97,0.000000,0.239316,-0.970942


In [17]:
# This cell lags and diffs our feature 'orderSum'
shifting = df.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)

    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    

In [18]:
# This cell creates rolling-window features based on 'orderSum' in our dataset!

item_group = shifting.groupby(["itemID", "group_backwards"]).agg({'orderSum':'sum'})

# We'll .shift(-1) because it sorts our "group_backwards", 
# so doing .shift(1) would cause a HUGE dataleak.
aux_shifting = item_group.groupby('itemID')[['orderSum']].shift(-1)

aux_shifting.sort_values(['itemID', 'group_backwards'], ascending=[True, False], inplace=True)

for i in range(3):
    rolled_window = aux_shifting.groupby(['itemID'], as_index=False)[['orderSum']].rolling(2 ** i).mean()
    rolled_window.rename(columns={'orderSum':f"orderSum_mean_rolled_{i}"}, inplace=True)
    shifting = pd.merge(shifting, rolled_window, left_on=['itemID', 'group_backwards'], right_on=['itemID', 'group_backwards'])

In [19]:
# LGBM Says on docs that it automatically handles zero values as NaN,
# so we'll keep this standard...
shifting.fillna(0, inplace=True)

<hr>

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from pair-weeks -13 to -2, and that's what the cell below is computing.

In [20]:
worst_possible_prediction = shifting.loc[shifting.group_backwards > 1]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 1]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 1]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 118.20105838913783


<hr>

## Dataset Splitting (Train until week 3 / Val. week 2/ Test week 1)
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

In [21]:
train = shifting.loc[shifting.group_backwards >= 3]
val = shifting.loc[shifting.group_backwards == 2]
test = shifting.loc[shifting.group_backwards == 1]

weights = infos.set_index('itemID')['simulationPrice'].to_dict()

w_train = train['itemID'].map(weights)
w_val = val['itemID'].map(weights)

In [22]:
w_train.head()

0     3.43
1     9.15
2    14.04
3    14.10
4     7.48
Name: itemID, dtype: float64

In [23]:
# I recommend to the other members of the team keeping the
# datatypes of our datasets as Pandas DataFrames instead of Numpy,
# since It will easier to use Boosting Analysis frameworks
y_train = train['orderSum']
y_val = val['orderSum']
X_train = train.drop(columns=["orderSum"])
X_val = val.drop(columns=["orderSum"])

In [24]:
y_val.head()

115093    0.0
115094    0.0
115095    1.0
115096    0.0
115097    2.0
Name: orderSum, dtype: float64

In [25]:
X_val.head()

Unnamed: 0,group_backwards,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,percentage_accum_cat_3,...,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2
115093,2,1,0,1,4.38,1,1,1,8.84,0.369181,...,28.0,3.0,-296.0,299.0,297.0,2.0,1.0,31.0,17.0,83.75
115094,2,2,0,2,3.0,1,2,1,16.92,0.002675,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.5,0.75
115095,2,3,0,3,5.0,1,3,1,15.89,0.101659,...,-3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.5,0.75
115096,2,4,0,2,4.44,1,2,1,40.17,0.028357,...,4.0,1.0,-42.0,43.0,41.0,2.0,2.0,5.0,3.0,12.75
115097,2,5,0,2,2.33,1,1,1,17.04,0.094703,...,-36.0,39.0,35.0,4.0,-123.0,127.0,127.0,3.0,21.0,43.25


In [26]:
params = {
              "objective" : "l1",
              "metric" :"rmse",
              "learning_rate" : 0.1,
              'verbosity': 1,
              'max_depth': 6,
              'num_leaves': 15,
              "min_data_in_leaf": 2000,
         }

lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train)
lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val)

num_round = 1000

model = lgb.train(
                    params,
                    lgbtrain,
                    num_round,
                    valid_sets = [lgbtrain, lgbvalid], 
                    verbose_eval=5,
                    early_stopping_rounds=5,
                    feval=feval,   
                 )

Training until validation scores don't improve for 5 rounds
[5]	training's rmse: 39.9271	training's feval: 73304.1	valid_1's rmse: 44.9012	valid_1's feval: 14943.3
[10]	training's rmse: 39.9159	training's feval: 147112	valid_1's rmse: 44.884	valid_1's feval: 25599.5
[15]	training's rmse: 39.8983	training's feval: 254284	valid_1's rmse: 44.8611	valid_1's feval: 45427.1
[20]	training's rmse: 39.8859	training's feval: 350212	valid_1's rmse: 44.8481	valid_1's feval: 58560.5
[25]	training's rmse: 39.8759	training's feval: 408437	valid_1's rmse: 44.836	valid_1's feval: 69395.5
[30]	training's rmse: 39.8711	training's feval: 429720	valid_1's rmse: 44.8281	valid_1's feval: 72359.6
[35]	training's rmse: 39.8663	training's feval: 454300	valid_1's rmse: 44.8238	valid_1's feval: 77421.1
[40]	training's rmse: 39.8629	training's feval: 480205	valid_1's rmse: 44.8202	valid_1's feval: 79878.7
[45]	training's rmse: 39.8611	training's feval: 493234	valid_1's rmse: 44.8178	valid_1's feval: 82474.9
[50]	t

<hr>

<hr>

### Utilities

**Predicting at test time**

In [27]:
y_test = test['orderSum']
X_test = test.drop(columns=["orderSum"])
final_predictions = model.predict(X_test)

In [28]:
final_predictions

array([1.81794004, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ])

In [29]:
final_predictions[final_predictions < 0] = 0

**Baseline calculation**

In [30]:
baseline_score(final_predictions, y_test.values, infos['simulationPrice'])

81616.33600000001

**Creating our Kaggle CSV**

In [31]:
final = pd.Series(0, index=np.arange(1, len(items)+1))
final[items.itemID] = final_predictions.astype(int)

final.to_csv("lgbm_kaggle_df.csv", header=["demandPrediction"],
            index_label="itemID", sep="|")

**Saving our model in disk**

In [32]:
print(dir(xgb))

['Booster', 'DMatrix', 'VERSION_FILE', 'XGBClassifier', 'XGBModel', 'XGBRFClassifier', 'XGBRFRegressor', 'XGBRanker', 'XGBRegressor', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'absolute_import', 'callback', 'compat', 'core', 'cv', 'f', 'libpath', 'os', 'plot_importance', 'plot_tree', 'plotting', 'rabit', 'sklearn', 'to_graphviz', 'train', 'training']


In [33]:
now = datetime.now().strftime("%d-%m-%Y-%Hh%Mm%Ss")
modelName = 'lgbm-' + now
xgb.save_model(modelName)

AttributeError: module 'xgboost' has no attribute 'save_model'