# LGBM - New Feature + Baseline Rolling Window

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from utils import *
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

NUMBER_OF_LAGS = 4

sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


<hr>

## Defining metrics

Baseline_score function

In [2]:
def evaluate(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice)

Evaluation Metric

In [3]:
def feval_lgbm(prediction, dtrain):
    
    prediction = prediction.astype(int)
    target = dtrain.get_label()

    simulatedPrice = dtrain.get_weight()
    
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), True

In [4]:
def feval_xgb(prediction, dtrain):
    prediction = prediction.astype(int)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [5]:
class feval_cat(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, simulationPrice):
        prediction = np.array(approxes[0]).astype(int)
        target = np.array(target).astype(int)
        simulationPrice = np.array(simulationPrice)
        score = np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)
        return score, 0

<hr>

## Building our dataset
This notebook makes this step cleaner than the previous versions. So It'll be tidier and shorter than before!

In [6]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [7]:
# Changing our time signatures
process_time(orders)

In [8]:
df = dataset_builder(orders, items)

In [9]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,13,1,0.0,0,1,4.38,1,1,1,8.84
1,13,2,0.0,0,2,3.0,1,2,1,16.92
2,13,3,1.0,0,3,5.0,1,3,1,15.89
3,13,4,0.0,0,2,4.44,1,2,1,40.17
4,13,5,2.0,0,2,2.33,1,1,1,17.04


<hr>

## Feature building

**Adding 'is_new'**

In [10]:
# This cell adds a feature responsible for indicating if in the current week
# a given item has its first appearance.
orders_sorted_by_week = orders.sort_values('group_backwards', ascending=False)
weeks_grouped_by_items = orders_sorted_by_week.groupby('itemID', as_index=False)
items_first_appearance = weeks_grouped_by_items.first()[['itemID', 'group_backwards']]
items_first_appearance.rename(columns={'group_backwards':'first_appearance'}, inplace=True)
df['is_new'] = 0
df = pd.merge(df, items_first_appearance, left_on=['itemID'], right_on=['itemID'], how='left', validate='m:1')
df.loc[df['first_appearance'] == df['group_backwards'], 'is_new'] = 1
df.drop(columns=['first_appearance'], inplace=True)

In [11]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0
1,13,2,0.0,0,2,3.0,1,2,1,16.92,0
2,13,3,1.0,0,3,5.0,1,3,1,15.89,1
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1


**Cumulative sale by category**

In [12]:
# percentage_accum_cat_3 feature...
df = cumulative_sale_by_category(df, category='category3')

In [13]:
df.tail()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3
136014,1,10459,0.0,180,253,0.0,8,44,8,56.57,0,0.001074
136015,1,10460,0.0,0,253,0.0,8,44,8,163.81,0,0.001074
136016,1,10461,0.0,0,253,0.0,8,44,8,128.01,0,0.0
136017,1,10462,0.0,180,253,0.0,8,44,8,166.97,0,0.001074
136018,1,10463,0.0,0,253,0.0,8,44,8,154.82,0,0.001074


**Time Encoding**

In [14]:
# Encoding our weeks as a series of sines and cosines...
# This function will consider our period as a semester in a year,
# so we can try other types of time encoding later!
df = time_encoder(df, 'group_backwards', 26)

In [15]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.216245e-16,-1.0
1,13,2,0.0,0,2,3.0,1,2,1,16.92,0,0.0,-3.216245e-16,-1.0
2,13,3,1.0,0,3,5.0,1,3,1,15.89,1,0.0,-3.216245e-16,-1.0
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0,0.0,-3.216245e-16,-1.0
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0,-3.216245e-16,-1.0


**Lags and diffs**

In [16]:
# This cell lags and diffs our feature 'orderSum'
shifting = df.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)

    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    

In [17]:
shifting.tail()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,...,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4
136014,1,10459,0.0,180,253,0.0,8,44,8,56.57,...,0.239316,0.970942,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0
136015,1,10460,0.0,0,253,0.0,8,44,8,163.81,...,0.239316,0.970942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
136016,1,10461,0.0,0,253,0.0,8,44,8,128.01,...,0.239316,0.970942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136017,1,10462,0.0,180,253,0.0,8,44,8,166.97,...,0.239316,0.970942,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0
136018,1,10463,0.0,0,253,0.0,8,44,8,154.82,...,0.239316,0.970942,0.0,-1.0,1.0,1.0,0.0,0.0,0.0,0.0


**Rolling window "orderSum"**

In [18]:
%%time
# This cell creates rolling-window features based on 'orderSum' in our dataset!
item_group = shifting.groupby(["itemID", "group_backwards"]).agg({'orderSum':'sum'})

# We'll .shift(-1) because it sorts our "group_backwards", 
# so doing .shift(1) would cause a HUGE dataleak.
aux_shifting = item_group.groupby('itemID')[['orderSum']].shift(-1)

aux_shifting.sort_values(['itemID', 'group_backwards'], ascending=[True, False], inplace=True)

for i in range(3):
    rolled_window = aux_shifting.groupby(['itemID'], as_index=False)[['orderSum']].rolling(2 ** i).mean()
    rolled_window.rename(columns={'orderSum':f"orderSum_mean_rolled_{i}"}, inplace=True)
    shifting = pd.merge(shifting, rolled_window, left_on=['itemID', 'group_backwards'], right_on=['itemID', 'group_backwards'])

CPU times: user 37.9 s, sys: 242 ms, total: 38.1 s
Wall time: 37.9 s


In [19]:
# LGBM Says on docs that it automatically handles zero values as NaN,
# so we'll keep this standard...
shifting.fillna(0, inplace=True)

In [20]:
shifting.query('itemID == 5')

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,...,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2
4,13,5,2.0,0,2,2.33,1,1,1,17.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10467,12,5,1.0,0,2,2.33,1,1,1,17.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
20930,11,5,0.0,0,2,2.33,1,1,1,17.04,...,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.5,0.0
31393,10,5,1.0,0,2,2.33,1,1,1,17.04,...,-1.0,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.5,0.0
41856,9,5,0.0,0,2,2.33,1,1,1,17.04,...,1.0,0.0,-1.0,1.0,-1.0,2.0,0.0,1.0,0.5,1.0
52319,8,5,0.0,0,2,2.33,1,1,1,17.04,...,-1.0,1.0,1.0,0.0,-1.0,1.0,-1.0,0.0,0.5,0.5
62782,7,5,0.0,0,2,2.33,1,1,1,17.04,...,0.0,0.0,-1.0,1.0,1.0,0.0,-1.0,0.0,0.0,0.25
73245,6,5,127.0,0,2,2.33,1,1,1,17.04,...,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,0.25
83708,5,5,4.0,0,2,2.33,1,1,1,17.04,...,127.0,0.0,0.0,0.0,0.0,0.0,-1.0,127.0,63.5,31.75
94171,4,5,39.0,0,2,2.33,1,1,1,17.04,...,-123.0,127.0,127.0,0.0,0.0,0.0,0.0,4.0,65.5,32.75


### Tobias features

In [21]:
def gbtransf(data, group_cols, targeted_cols, out_names, function, params = dict()):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = np.zeros((len(data), len(out_names)))
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval, **params)
        new_feat[i:j] = output

        # go to next group
        i = j
    
    out_df = pd.DataFrame(new_feat, columns = out_names, index = data.index)
        
    return out_df

In [22]:
shifting.sort_values(['itemID', 'group_backwards'], inplace = True, ascending=[True, False])

In [162]:
pd.set_option('display.max_columns', 50)
pd.set_option("display.precision", 4)

In [24]:
shifting.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10463,12,1,2.0,0,1,4.38,1,1,1,8.84,1,0.0,0.2393157,-0.970942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20926,11,1,313.0,0,1,4.38,1,1,1,8.84,0,0.006728,0.4647232,-0.885456,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
31389,10,1,35.0,0,1,4.38,1,1,1,8.84,0,0.746587,0.6631227,-0.748511,313.0,311.0,2.0,2.0,0.0,0.0,0.0,0.0,313.0,157.5,0.0
41852,9,1,3.0,0,1,4.38,1,1,1,8.84,0,0.608717,0.8229839,-0.568065,35.0,-278.0,313.0,311.0,2.0,2.0,0.0,0.0,35.0,174.0,87.5


In [25]:
data = shifting.copy()

In [26]:
data.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10463,12,1,2.0,0,1,4.38,1,1,1,8.84,1,0.0,0.2393157,-0.970942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20926,11,1,313.0,0,1,4.38,1,1,1,8.84,0,0.006728,0.4647232,-0.885456,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
31389,10,1,35.0,0,1,4.38,1,1,1,8.84,0,0.746587,0.6631227,-0.748511,313.0,311.0,2.0,2.0,0.0,0.0,0.0,0.0,313.0,157.5,0.0
41852,9,1,3.0,0,1,4.38,1,1,1,8.84,0,0.608717,0.8229839,-0.568065,35.0,-278.0,313.0,311.0,2.0,2.0,0.0,0.0,35.0,174.0,87.5


In [27]:
data.groupby("group_backwards")["is_new"].sum().to_dict()

{1: 728,
 2: 727,
 3: 794,
 4: 671,
 5: 785,
 6: 661,
 7: 716,
 8: 909,
 9: 785,
 10: 533,
 11: 371,
 12: 729,
 13: 1431}

In [28]:
the_cat = "manufacturer"

In [29]:
sla = data.groupby(["group_backwards", the_cat])["is_new"].sum().reset_index()

In [30]:
sla.sort_values(['group_backwards', the_cat], inplace = True, ascending=[False, True])

In [31]:
sla.head()

Unnamed: 0,group_backwards,manufacturer,is_new
3036,13,1,5
3037,13,2,59
3038,13,3,18
3039,13,4,4
3040,13,5,21


In [32]:
sla = sla.rename(columns={"is_new" : "leak_cat3"})

In [33]:
sla.head()

Unnamed: 0,group_backwards,manufacturer,leak_cat3
3036,13,1,5
3037,13,2,59
3038,13,3,18
3039,13,4,4
3040,13,5,21


In [34]:
data = pd.merge(data, sla, on = ["group_backwards", the_cat])

In [35]:
data.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2,leak_cat3
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,13,31,0.0,0,1,5.0,1,1,1,18.87,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2,13,32,1.0,0,1,4.6,1,1,1,26.49,1,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
3,13,37,1.0,0,1,5.0,1,1,1,17.1,1,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,13,108,0.0,0,1,5.0,1,1,1,13.35,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [36]:
data["total_new"] = data["group_backwards"].map(data.groupby("group_backwards")["is_new"].sum().to_dict())

In [37]:
data.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2,leak_cat3,total_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
1,13,31,0.0,0,1,5.0,1,1,1,18.87,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
2,13,32,1.0,0,1,4.6,1,1,1,26.49,1,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
3,13,37,1.0,0,1,5.0,1,1,1,17.1,1,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
4,13,108,0.0,0,1,5.0,1,1,1,13.35,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431


In [38]:
data.fillna(0, inplace=True)

In [39]:
# checking if we got what we wanted
data.query('itemID == 1290')

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2,leak_cat3,total_new
26329,13,1290,0.0,0,34,0.0,1,3,1,42.29,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1431
26347,12,1290,0.0,0,34,0.0,1,3,1,42.29,0,0.0,0.2393157,-0.970942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,729
26365,11,1290,0.0,0,34,0.0,1,3,1,42.29,0,0.0,0.4647232,-0.885456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,371
26383,10,1290,0.0,0,34,0.0,1,3,1,42.29,0,0.0,0.6631227,-0.748511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,533
26401,9,1290,0.0,0,34,0.0,1,3,1,42.29,0,0.0,0.8229839,-0.568065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,785
26419,8,1290,0.0,0,34,0.0,1,3,1,42.29,0,0.0,0.9350162,-0.354605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,909
26437,7,1290,0.0,0,34,0.0,1,3,1,42.29,0,0.0,0.9927089,-0.120537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,716
26455,6,1290,0.0,0,34,0.0,1,3,1,42.29,0,0.0,0.9927089,0.120537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,661
26473,5,1290,21.0,0,34,0.0,1,3,1,42.29,1,0.0,0.9350162,0.354605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14,785
26491,4,1290,3.0,0,34,0.0,1,3,1,42.29,0,0.013564,0.8229839,0.568065,21.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,10.5,5.25,2,671


In [135]:
data.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2,leak_cat3,total_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
1,13,31,0.0,0,1,5.0,1,1,1,18.87,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
2,13,32,1.0,0,1,4.6,1,1,1,26.49,1,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
3,13,37,1.0,0,1,5.0,1,1,1,17.1,1,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
4,13,108,0.0,0,1,5.0,1,1,1,13.35,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431


In [136]:
data.sort_values(['itemID', 'group_backwards'], inplace = True, ascending=[True, False])

In [137]:
data.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2,leak_cat3,total_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
40,12,1,2.0,0,1,4.38,1,1,1,8.84,1,0.0,0.2393157,-0.970942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,729
80,11,1,313.0,0,1,4.38,1,1,1,8.84,0,0.006728,0.4647232,-0.885456,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,6,371
120,10,1,35.0,0,1,4.38,1,1,1,8.84,0,0.746587,0.6631227,-0.748511,313.0,311.0,2.0,2.0,0.0,0.0,0.0,0.0,313.0,157.5,0.0,3,533
160,9,1,3.0,0,1,4.38,1,1,1,8.84,0,0.608717,0.8229839,-0.568065,35.0,-278.0,313.0,311.0,2.0,2.0,0.0,0.0,35.0,174.0,87.5,0,785


<hr>

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [40]:
worst_possible_prediction = shifting.loc[shifting.group_backwards > 1]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 1]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 1]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 118.20105838913783


<hr>

## Dataset Splitting (Train until week 3 / Val. week 2/ Test week 1)
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

In [138]:
dataset = data

In [139]:
train = dataset.query('3 <= group_backwards <= 13').reset_index(drop = True)
full_train = dataset.query('2 <= group_backwards <= 13').reset_index(drop = True)
val = dataset.query('group_backwards == 2').reset_index(drop = True)
sub = dataset.query('group_backwards == 1').reset_index(drop = True)

In [153]:
features = train.columns.values

In [154]:
features

array(['group_backwards', 'itemID', 'brand', 'manufacturer',
       'customerRating', 'category1', 'category2', 'category3',
       'recommendedRetailPrice', 'is_new', 'percentage_accum_category3',
       'group_backwards_sin', 'group_backwards_cos', 'orderSum_1',
       'orderSum_diff_1', 'orderSum_2', 'orderSum_diff_2', 'orderSum_3',
       'orderSum_diff_3', 'orderSum_4', 'orderSum_diff_4',
       'orderSum_mean_rolled_0', 'orderSum_mean_rolled_1',
       'orderSum_mean_rolled_2', 'leak_cat3', 'total_new'], dtype=object)

In [140]:
train.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2,leak_cat3,total_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
1,12,1,2.0,0,1,4.38,1,1,1,8.84,1,0.0,0.2393157,-0.970942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,729
2,11,1,313.0,0,1,4.38,1,1,1,8.84,0,0.006728,0.4647232,-0.885456,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,6,371
3,10,1,35.0,0,1,4.38,1,1,1,8.84,0,0.746587,0.6631227,-0.748511,313.0,311.0,2.0,2.0,0.0,0.0,0.0,0.0,313.0,157.5,0.0,3,533
4,9,1,3.0,0,1,4.38,1,1,1,8.84,0,0.608717,0.8229839,-0.568065,35.0,-278.0,313.0,311.0,2.0,2.0,0.0,0.0,35.0,174.0,87.5,0,785


In [141]:
len(train), len(val), len(sub)

(115093, 10463, 10463)

In [142]:
weights = infos.set_index('itemID')['simulationPrice'].to_dict()

In [143]:
y_train = train.pop('orderSum').values
y_full_train = full_train.pop('orderSum').values
y_val = val.pop('orderSum').values
y_sub = sub.pop('orderSum').values

X_train = train.values
X_full_train = full_train.values
X_val = val.values
X_sub = sub.values

w_train = train['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)

### XGBoost

In [144]:
xgb.__version__

'1.0.2'

In [145]:
# custom objective

def gradient(prediction, dtrain):
    y = dtrain.get_label()
    return -2 * (prediction - np.maximum(prediction - y, 0) * 1.6) * (1 - (prediction > y) * 1.6)

def hessian(prediction, dtrain):
    y = dtrain.get_label()
    return -2 * (1 - (prediction > y) * 1.6) ** 2

def objective(prediction, dtrain):
    w = dtrain.get_weight()
    grad = gradient(prediction, dtrain) * w
    hess = hessian(prediction, dtrain) * w
    return grad, hess

In [146]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)

# specify parameters via map
param = {
    'max_depth':10,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 3,

}

num_round = 400
bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
                feval = feval_xgb,
                verbose_eval=10,
                maximize = True,
                )

[0]	train-feval:122331.56006	val-feval:13554.07397
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[1]	train-feval:297010.44208	val-feval:29662.43797
[2]	train-feval:489268.65814	val-feval:53146.76200
[3]	train-feval:639702.21828	val-feval:67367.59198
[4]	train-feval:854979.40442	val-feval:93339.23799
[5]	train-feval:1005620.82864	val-feval:106548.24398
[6]	train-feval:1238294.17316	val-feval:132742.43610
[7]	train-feval:1361558.72914	val-feval:147099.41612
[8]	train-feval:1519893.15510	val-feval:159033.73017
[9]	train-feval:1660831.08120	val-feval:168436.91019
[10]	train-feval:1907373.68547	val-feval:187264.14417
[11]	train-feval:2040886.90374	val-feval:196033.63028
[12]	train-feval:2273850.80958	val-feval:207654.62431
[13]	train-feval:2397448.43156	val-feval:217688.21832
[14]	train-feval:2500121.30375	val-feval:223237.12435
[15]	train-feval:2637040.44781	val-feval:236291.99445
[16]	train-fe

In [225]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

417981.3259999999

In [229]:
# retrain!

In [230]:
bst_sub = xgb.train(
    param, 
    dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    feval = feval_xgb, 
    maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval = False,
)
bst_sub.best_ntree_limit

122

In [231]:
prediction = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

480858.03399999975

In [None]:
feat_v = bst.get_fscore().items()
feat_v = [imp for (trs, imp) in feat_v]
feat_imp = [p for p in zip(features, feat_v)]
feat_imp_df = pd.DataFrame(feat_imp, columns=['feature', 'importance'])
feat_imp_df = feat_imp_df.sort_values('importance', ascending=False)
feat_imp_df

In [237]:
feat_v_sub = bst_sub.get_fscore().items()
feat_v_sub = [imp for (trs, imp) in feat_v_sub]
feat_imp_sub = [p for p in zip(features, feat_v_sub)]
feat_imp_sub_df = pd.DataFrame(feat_imp_sub, columns=['feature', 'importance'])
feat_imp_sub_df = feat_imp_sub_df.sort_values('importance', ascending=False)
feat_imp_sub_df

In [242]:
diff = np.array(feat_v_sub) - np.array(feat_v)
diff_imp = [p for p in zip(features, diff)]
diff_imp_df = pd.DataFrame(diff_imp, columns=['feature', 'importance'])
diff_imp_df = diff_imp_df.sort_values('importance', ascending=False)
diff_imp_df

### LGBM

In [70]:
params = {
          "objective" : "l1",
          "metric" :"rmse",
          "learning_rate" : 0.5,
          'verbosity': 1,
          'max_depth': 7,
          'num_leaves': 15,
          "min_data_in_leaf":3500,
         }

lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train, categorical_feature=[2, 3, 5, 6, 7, 9])
lgbfulltrain = lgb.Dataset(X_full_train, label = y_full_train, weight=w_full_train, categorical_feature=[2, 3, 5, 6, 7, 9])
lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val, categorical_feature=[2, 3, 5, 6, 7, 9])
lgbsubmis = lgb.Dataset(X_sub, label = y_sub, weight=w_sub, categorical_feature=[2, 3, 5, 6, 7, 9])

num_round = 1000
lgb_model = lgb.train(params,
                  lgbtrain,
                  num_round,
                  valid_sets = [lgbtrain, lgbvalid],
                  valid_names = ['train', 'val'],
                  verbose_eval=5,
                  early_stopping_rounds=5,
                  feval=feval_lgbm,
                 )

Training until validation scores don't improve for 5 rounds
[5]	train's rmse: 39.4962	train's feval: 4.83791e+06	val's rmse: 44.5092	val's feval: 307293
[10]	train's rmse: 38.6102	train's feval: 7.3952e+06	val's rmse: 43.6229	val's feval: 568693
[15]	train's rmse: 38.7645	train's feval: 5.70617e+06	val's rmse: 43.7633	val's feval: 433730
Early stopping, best iteration is:
[10]	train's rmse: 38.6102	train's feval: 7.3952e+06	val's rmse: 43.6229	val's feval: 568693


In [147]:
prediction = lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration).astype(int)
evaluate(prediction, y_sub, w_sub)

554812.3219999999

In [72]:
lgb_model_sub = lgb.train(params,
                  lgbfulltrain,
                  lgb_model.best_iteration,
                  valid_sets = [lgbfulltrain],
                  valid_names = ['train'],
                  verbose_eval=5,
                  early_stopping_rounds=None,
                 feval = feval_lgbm,
                 )

[5]	train's rmse: 39.5943	train's feval: 6.28894e+06
[10]	train's rmse: 39.3364	train's feval: 6.63024e+06


In [73]:
prediction = lgb_model_sub.predict(X_sub, num_iteration=80).astype(int)
evaluate(prediction, y_sub, w_sub)

494132.524

In [None]:
feat_v = lgb_model.feature_importance()
feat_imp_ = [p for p in zip(features, feat_v)]
feat_imp_df = pd.DataFrame(feat_imp, columns=['feature', 'importance'])
feat_imp_df = feat_imp_df.sort_values('importance', ascending=False)
feat_imp_df

In [251]:
feat_v_sub = lgb_model_sub.feature_importance()
feat_imp_sub = [p for p in zip(features, feat_v_sub)]
feat_imp_sub_df = pd.DataFrame(feat_imp_sub, columns=['feature', 'importance'])
feat_imp_sub_df = feat_imp_sub_df.sort_values('importance', ascending=False)
feat_imp_sub_df

In [253]:
diff = [p for p in zip(features, feat_v_sub - feat_v)]
diff_df = pd.DataFrame(diff, columns=['feature', 'diff_importance_after_retrain'])
diff_df = diff_df.sort_values('diff_importance_after_retrain', ascending=False)
diff_df

Unnamed: 0,feature,diff_importance_after_retrain
22,orderSum_mean_rolled_1,8
25,total_new,4
3,manufacturer,4
15,orderSum_2,3
13,orderSum_1,2
0,group_backwards,1
16,orderSum_diff_2,0
24,leak_cat3,0
23,orderSum_mean_rolled_2,0
21,orderSum_mean_rolled_0,0


### CatBoost

In [76]:
from catboost import CatBoost, CatBoostRegressor, Pool

In [190]:
ds_params = {
#     'cat_features' : [8, 9, 10],
}
train_pool = Pool(X_train, label = y_train, weight = w_train, **ds_params)
trainfull_pool = Pool(X_full_train, label = y_full_train, weight = w_full_train, **ds_params)
val_pool = Pool(X_val, label = y_val, weight = w_sub, **ds_params)
sub_pool = Pool(X_sub, label = y_sub, weight = w_sub, **ds_params)

cat_model = CatBoostRegressor(
    depth=7, 
    learning_rate=0.1, 
    loss_function='MAE',
    early_stopping_rounds=5,
    eval_metric = feval_cat(),
    thread_count=-1,
    verbose=10, # show us the progress after every 10 iterations
)

cat_model.fit(
    train_pool,
    eval_set=[train_pool, val_pool],
);


0:	learn: 0.0000000	test: 0.0000000	test1: 0.0000000	best: 0.0000000 (0)	total: 224ms	remaining: 3m 44s
10:	learn: 1038131.4479152	test: 1038131.4479152	test1: 148444.3560021	best: 148444.3560021 (10)	total: 2.53s	remaining: 3m 47s
20:	learn: 2783334.4942974	test: 2783334.4942974	test1: 274451.2300601	best: 274493.7800674 (19)	total: 4.76s	remaining: 3m 41s
30:	learn: 4315956.6835821	test: 4315956.6835821	test1: 346366.4960583	best: 346366.4960583 (30)	total: 6.92s	remaining: 3m 36s
40:	learn: 4380640.4013933	test: 4380640.4013933	test1: 348092.3180717	best: 350468.5160748 (38)	total: 9.07s	remaining: 3m 32s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 350468.5161
bestIteration = 38

Shrink model to first 39 iterations.


In [191]:
prediction = cat_model.predict(X_sub, ntree_end = cat_model.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

472915.056

In [192]:
# retrain!

In [193]:
cat_model.best_iteration_

38

In [194]:
{**cat_model.get_params(), "iterations" : cat_model.best_iteration_}

{'learning_rate': 0.1,
 'depth': 7,
 'loss_function': 'MAE',
 'verbose': 10,
 'eval_metric': <__main__.feval_cat at 0x7efffc1e1210>,
 'early_stopping_rounds': 5,
 'iterations': 38}

In [195]:
cat_sub = CatBoostRegressor(**{**cat_model.get_params(), "iterations" : cat_model.best_iteration_})
cat_sub.fit(
    trainfull_pool,
    eval_set=[trainfull_pool],
);

0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 236ms	remaining: 8.73s
10:	learn: 1222396.9016862	test: 1222396.9016862	best: 1222396.9016862 (10)	total: 2.6s	remaining: 6.39s
20:	learn: 3074374.6702215	test: 3074374.6702215	best: 3074374.6702215 (20)	total: 4.91s	remaining: 3.98s
30:	learn: 4841309.5670845	test: 4841309.5670845	best: 4841309.5670845 (30)	total: 7.26s	remaining: 1.64s
37:	learn: 4890819.1130772	test: 4890819.1130772	best: 4890819.1130772 (37)	total: 8.87s	remaining: 0us

bestTest = 4890819.113
bestIteration = 37



In [196]:
prediction = cat_sub.predict(X_sub, ntree_end = cat_sub.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

483885.57999999996

In [197]:
feat_v = cat_model.get_feature_importance()
feat_imp = [p for p in zip(features, feat_v)]
feat_imp_df = pd.DataFrame(feat_imp, columns=['feature', 'importance'])
feat_imp_df = feat_imp_df.sort_values('importance', ascending=False)
feat_imp_df

Unnamed: 0,feature,importance
9,is_new,37.4993
8,recommendedRetailPrice,15.0845
12,group_backwards_cos,10.2735
21,orderSum_mean_rolled_0,3.462
15,orderSum_2,3.4355
16,orderSum_diff_2,3.3336
19,orderSum_4,3.2317
25,total_new,2.7805
18,orderSum_diff_3,2.0866
1,itemID,1.9976


In [198]:
feat_v_sub = cat_sub.get_feature_importance()
feat_imp_sub = [p for p in zip(features, feat_v_sub)]
feat_imp_sub_df = pd.DataFrame(feat_imp_sub, columns=['feature', 'importance'])
feat_imp_sub_df = feat_imp_sub_df.sort_values('importance', ascending=False)
feat_imp_sub_df

Unnamed: 0,feature,importance
9,is_new,42.508
8,recommendedRetailPrice,15.5035
12,group_backwards_cos,11.3877
16,orderSum_diff_2,3.4861
2,brand,2.9034
21,orderSum_mean_rolled_0,2.7843
1,itemID,2.6229
25,total_new,2.3368
20,orderSum_diff_4,2.0537
4,customerRating,1.9019


In [199]:
diff = [p for p in zip(features, feat_v_sub - feat_v)]
diff_df = pd.DataFrame(diff, columns=['feature', 'diff_importance_after_retrain'])
diff_df = diff_df.sort_values('diff_importance_after_retrain', ascending=False)
diff_df

Unnamed: 0,feature,diff_importance_after_retrain
9,is_new,5.0087
2,brand,1.6358
12,group_backwards_cos,1.1143
1,itemID,0.6254
20,orderSum_diff_4,0.5302
8,recommendedRetailPrice,0.4189
4,customerRating,0.1747
16,orderSum_diff_2,0.1525
5,category1,0.0686
6,category2,0.0284


### Ensemble

In [200]:
cat_w = 4
lgb_w = 0
xgb_w = 1
ensemble = cat_sub.predict(X_sub, ntree_end = cat_sub.best_iteration_) * cat_w
ensemble += lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration) * lgb_w
ensemble += bst_sub.predict(dsub, ntree_limit=bst.best_ntree_limit) * xgb_w
ensemble = ensemble / (cat_w + lgb_w + xgb_w)
evaluate(ensemble.astype(int), y_sub, w_sub)

595317.2259999999