In [215]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from utils import *
import sys
from datetime import datetime

In [4]:
sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


In [5]:
pd.set_option('display.max_columns', 50)
pd.set_option("display.precision", 4)

---

## Defining metrics

Baseline_score function

In [6]:
def evaluate(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.dot((prediction - np.maximum(prediction - target, 0) * 1.6), simulatedPrice)

Evaluation Metric

In [7]:
def feval_lgbm(prediction, dtrain):
    
    prediction = prediction.astype(int)
    target = dtrain.get_label()

    simulatedPrice = dtrain.get_weight()
    
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), True

In [8]:
def feval_xgb(prediction, dtrain):
    prediction = prediction.astype(int)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [9]:
class feval_cat(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, simulationPrice):
        prediction = np.array(approxes[0]).astype(int)
        target = np.array(target).astype(int)
        simulationPrice = np.array(simulationPrice)
        score = np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)
        return score, 0

<hr>

## Building our dataset

In [10]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [11]:
# Changing our time signatures
process_time(orders)

In [12]:
df = dataset_builder(orders, items)

In [13]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,13,1,0.0,0,1,4.38,1,1,1,8.84
1,13,2,0.0,0,2,3.0,1,2,1,16.92
2,13,3,1.0,0,3,5.0,1,3,1,15.89
3,13,4,0.0,0,2,4.44,1,2,1,40.17
4,13,5,2.0,0,2,2.33,1,1,1,17.04


<hr>

### Feature building

#### "Is New"

In [14]:
# This cell adds a feature responsible for indicating if in the current week
# a given item has its first appearance.
orders_sorted_by_week = orders.sort_values('group_backwards', ascending=False)
weeks_grouped_by_items = orders_sorted_by_week.groupby('itemID', as_index=False)
items_first_appearance = weeks_grouped_by_items.first()[['itemID', 'group_backwards']]
items_first_appearance.rename(columns={'group_backwards':'first_appearance'}, inplace=True)
df['is_new'] = 0
df = pd.merge(df, items_first_appearance, left_on=['itemID'], right_on=['itemID'], how='left', validate='m:1')
df.loc[df['first_appearance'] == df['group_backwards'], 'is_new'] = 1
df.drop(columns=['first_appearance'], inplace=True)

In [15]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0
1,13,2,0.0,0,2,3.0,1,2,1,16.92,0
2,13,3,1.0,0,3,5.0,1,3,1,15.89,1
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1


#### Cumulative sale by category

In [16]:
# percentage_accum_cat_3 feature...
df = cumulative_sale_by_category(df, category='category3')

In [17]:
df.tail()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3
136014,1,10459,0.0,180,253,0.0,8,44,8,56.57,0,0.0011
136015,1,10460,0.0,0,253,0.0,8,44,8,163.81,0,0.0011
136016,1,10461,0.0,0,253,0.0,8,44,8,128.01,0,0.0
136017,1,10462,0.0,180,253,0.0,8,44,8,166.97,0,0.0011
136018,1,10463,0.0,0,253,0.0,8,44,8,154.82,0,0.0011


#### Time Encoding

In [18]:
# Encoding our weeks as a series of sines and cosines...
# This function will consider our period as a semester in a year,
# so we can try other types of time encoding later!
df = time_encoder(df, 'group_backwards', 26)

In [19]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.2162e-16,-1.0
1,13,2,0.0,0,2,3.0,1,2,1,16.92,0,0.0,-3.2162e-16,-1.0
2,13,3,1.0,0,3,5.0,1,3,1,15.89,1,0.0,-3.2162e-16,-1.0
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0,0.0,-3.2162e-16,-1.0
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0,-3.2162e-16,-1.0


#### Lags and diffs

In [20]:
NUMBER_OF_LAGS = 4

In [21]:
# This cell lags and diffs our feature 'orderSum'
shifting = df.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)

    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    

In [22]:
shifting.tail()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4
136014,1,10459,0.0,180,253,0.0,8,44,8,56.57,0,0.0011,0.2393,0.9709,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0
136015,1,10460,0.0,0,253,0.0,8,44,8,163.81,0,0.0011,0.2393,0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
136016,1,10461,0.0,0,253,0.0,8,44,8,128.01,0,0.0,0.2393,0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136017,1,10462,0.0,180,253,0.0,8,44,8,166.97,0,0.0011,0.2393,0.9709,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0
136018,1,10463,0.0,0,253,0.0,8,44,8,154.82,0,0.0011,0.2393,0.9709,0.0,-1.0,1.0,1.0,0.0,0.0,0.0,0.0


#### Rolling window "orderSum"

In [23]:
NUMBER_OF_WINDOWS = 3

In [24]:
%%time
# This cell creates rolling-window features based on 'orderSum' in our dataset!
item_group = shifting.groupby(["itemID", "group_backwards"]).agg({'orderSum':'sum'})

# We'll .shift(-1) because it sorts our "group_backwards", 
# so doing .shift(1) would cause a HUGE dataleak.
aux_shifting = item_group.groupby('itemID')[['orderSum']].shift(-1)

aux_shifting.sort_values(['itemID', 'group_backwards'], ascending=[True, False], inplace=True)

for i in range(NUMBER_OF_WINDOWS):
    rolled_window = aux_shifting.groupby(['itemID'], as_index=False)[['orderSum']].rolling(2 ** i).mean()
    rolled_window.rename(columns={'orderSum':f"orderSum_mean_rolled_{2 ** i}"}, inplace=True)
    shifting = pd.merge(shifting, rolled_window, left_on=['itemID', 'group_backwards'], right_on=['itemID', 'group_backwards'])

CPU times: user 40.9 s, sys: 313 ms, total: 41.3 s
Wall time: 41.6 s


In [25]:
# LGBM Says on docs that it automatically handles zero values as NaN,
# so we'll keep this standard...
shifting.fillna(0, inplace=True)

In [26]:
shifting.query('itemID == 5')

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10467,12,5,1.0,0,2,2.33,1,1,1,17.04,0,0.0179,0.23932,-0.9709,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
20930,11,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0101,0.46472,-0.8855,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.5,0.0
31393,10,5,1.0,0,2,2.33,1,1,1,17.04,0,0.0071,0.66312,-0.7485,0.0,-1.0,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.5,0.0
41856,9,5,0.0,0,2,2.33,1,1,1,17.04,0,0.007,0.82298,-0.5681,1.0,1.0,0.0,-1.0,1.0,-1.0,2.0,0.0,1.0,0.5,1.0
52319,8,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0053,0.93502,-0.3546,0.0,-1.0,1.0,1.0,0.0,-1.0,1.0,-1.0,0.0,0.5,0.5
62782,7,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0043,0.99271,-0.1205,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,-1.0,0.0,0.0,0.25
73245,6,5,127.0,0,2,2.33,1,1,1,17.04,0,0.0037,0.99271,0.1205,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,0.25
83708,5,5,4.0,0,2,2.33,1,1,1,17.04,0,0.1061,0.93502,0.3546,127.0,127.0,0.0,0.0,0.0,0.0,0.0,-1.0,127.0,63.5,31.75
94171,4,5,39.0,0,2,2.33,1,1,1,17.04,0,0.0872,0.82298,0.5681,4.0,-123.0,127.0,127.0,0.0,0.0,0.0,0.0,4.0,65.5,32.75


#### Tobias features

In [27]:
def gbtransf(data, group_cols, targeted_cols, out_names, function, params = dict()):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = np.zeros((len(data), len(out_names)))
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval, **params)
        new_feat[i:j] = output

        # go to next group
        i = j
    
    out_df = pd.DataFrame(new_feat, columns = out_names, index = data.index)
        
    return out_df

In [28]:
shifting.sort_values(['itemID', 'group_backwards'], inplace=True, ascending=[True, False])

In [29]:
shifting.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10463,12,1,2.0,0,1,4.38,1,1,1,8.84,1,0.0,0.23932,-0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20926,11,1,313.0,0,1,4.38,1,1,1,8.84,0,0.0067,0.46472,-0.8855,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
31389,10,1,35.0,0,1,4.38,1,1,1,8.84,0,0.7466,0.66312,-0.7485,313.0,311.0,2.0,2.0,0.0,0.0,0.0,0.0,313.0,157.5,0.0
41852,9,1,3.0,0,1,4.38,1,1,1,8.84,0,0.6087,0.82298,-0.5681,35.0,-278.0,313.0,311.0,2.0,2.0,0.0,0.0,35.0,174.0,87.5


In [30]:
data = shifting.copy()

In [31]:
data.groupby("group_backwards")["is_new"].sum().to_dict()

{1: 728,
 2: 727,
 3: 794,
 4: 671,
 5: 785,
 6: 661,
 7: 716,
 8: 909,
 9: 785,
 10: 533,
 11: 371,
 12: 729,
 13: 1431}

In [32]:
the_cat = "manufacturer"

In [33]:
sla = data.groupby(["group_backwards", the_cat])["is_new"].sum().reset_index()

In [34]:
sla.sort_values(['group_backwards', the_cat], inplace = True, ascending=[False, True])

In [35]:
sla = sla.rename(columns={"is_new" : f"{the_cat}_new_items"})

In [36]:
data = pd.merge(data, sla, on = ["group_backwards", the_cat])

In [37]:
data["total_new"] = data["group_backwards"].map(data.groupby("group_backwards")["is_new"].sum().to_dict())

In [38]:
data.fillna(0, inplace=True)

In [39]:
# checking if we got what we wanted
data.query('itemID == 5')

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4,manufacturer_new_items,total_new
522,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59,1431
865,12,5,1.0,0,2,2.33,1,1,1,17.04,0,0.0179,0.23932,-0.9709,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,51,729
1208,11,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0101,0.46472,-0.8855,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.5,0.0,30,371
1551,10,5,1.0,0,2,2.33,1,1,1,17.04,0,0.0071,0.66312,-0.7485,0.0,-1.0,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.5,0.0,11,533
1894,9,5,0.0,0,2,2.33,1,1,1,17.04,0,0.007,0.82298,-0.5681,1.0,1.0,0.0,-1.0,1.0,-1.0,2.0,0.0,1.0,0.5,1.0,47,785
2237,8,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0053,0.93502,-0.3546,0.0,-1.0,1.0,1.0,0.0,-1.0,1.0,-1.0,0.0,0.5,0.5,19,909
2580,7,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0043,0.99271,-0.1205,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,-1.0,0.0,0.0,0.25,16,716
2923,6,5,127.0,0,2,2.33,1,1,1,17.04,0,0.0037,0.99271,0.1205,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,0.25,34,661
3266,5,5,4.0,0,2,2.33,1,1,1,17.04,0,0.1061,0.93502,0.3546,127.0,127.0,0.0,0.0,0.0,0.0,0.0,-1.0,127.0,63.5,31.75,40,785
3609,4,5,39.0,0,2,2.33,1,1,1,17.04,0,0.0872,0.82298,0.5681,4.0,-123.0,127.0,127.0,0.0,0.0,0.0,0.0,4.0,65.5,32.75,4,671


In [101]:
data.sort_values(['group_backwards', 'itemID'], inplace=True, ascending=[False, True])

In [102]:
data

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4,manufacturer_new_items,total_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0000,-3.2162e-16,-1.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,5,1431
520,13,2,0.0,0,2,3.00,1,2,1,16.92,0,0.0000,-3.2162e-16,-1.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,59,1431
4979,13,3,1.0,0,3,5.00,1,3,1,15.89,1,0.0000,-3.2162e-16,-1.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,18,1431
521,13,4,0.0,0,2,4.44,1,2,1,40.17,0,0.0000,-3.2162e-16,-1.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,59,1431
522,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0000,-3.2162e-16,-1.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,59,1431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0.0,180,253,0.00,8,44,8,56.57,0,0.0011,2.3932e-01,0.9709,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.25,0,728
136015,1,10460,0.0,0,253,0.00,8,44,8,163.81,0,0.0011,2.3932e-01,0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.00,0,728
136016,1,10461,0.0,0,253,0.00,8,44,8,128.01,0,0.0000,2.3932e-01,0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0,728
136017,1,10462,0.0,180,253,0.00,8,44,8,166.97,0,0.0011,2.3932e-01,0.9709,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,0.25,0,728


In [103]:
dataset = data.copy()

In [137]:
dataset.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4,manufacturer_new_items,total_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
520,13,2,0.0,0,2,3.0,1,2,1,16.92,0,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59,1431
4979,13,3,1.0,0,3,5.0,1,3,1,15.89,1,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18,1431
521,13,4,0.0,0,2,4.44,1,2,1,40.17,0,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59,1431
522,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59,1431


---

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [41]:
from sklearn.metrics import mean_squared_error as mse

In [89]:
# calculating rmse of worst prediction
worst_possible_prediction = dataset.loc[dataset.group_backwards > 1]['orderSum'].mean()
prediction = np.full(dataset.loc[dataset.group_backwards == 1]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = dataset.loc[dataset.group_backwards == 1]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 118.20105838913783


<hr>

## Dataset Splitting (Train until week 3 / Val. week 2/ Test week 1)

### Old items dataset

In [153]:
first_wp = 13

train = dataset.query('3 <= group_backwards <= @first_wp').reset_index(drop = True)
full_train = dataset.query('2 <= group_backwards <= @first_wp').reset_index(drop = True)
val = dataset.query('group_backwards == 2').reset_index(drop = True)
sub = dataset.query('group_backwards == 1').reset_index(drop = True)

In [154]:
len(train), len(full_train), len(val), len(sub)

(115093, 125556, 10463, 10463)

### New items dataset

In [155]:
# Weekpair is negative, so this works:
new_items = orders.sort_values("group_backwards", ascending=False).groupby(["itemID"])["group_backwards"].first()
new_items = new_items.reset_index()
new_items.head()

Unnamed: 0,itemID,group_backwards
0,1,12
1,2,9
2,3,13
3,4,12
4,5,13


In [209]:
train2 = pd.merge(new_items, train, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")
full_train2 = pd.merge(new_items, full_train, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")
val2 = pd.merge(new_items, val, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")
sub2 = pd.merge(new_items, sub, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")

# Check we didn't make mistakes...
assert len(train2) + len(val2) + len(sub2) == len(new_items)
assert len(new_items.query("group_backwards >= 3")) == len(train2)
assert len(new_items.query("group_backwards == 2")) == len(val2)
assert len(new_items.query("group_backwards == 1")) == len(sub2)

len(train2), len(val2), len(sub2)

(8385, 727, 728)

### Train/Full-train/Validation/Submission split

In [210]:
weights = infos.set_index('itemID')['simulationPrice'].to_dict()

In [211]:
y_train = train['orderSum']
y_train2 = train2['orderSum']
y_full_train = full_train['orderSum']
y_full_train2 = full_train2['orderSum']
y_val = val['orderSum']
y_val2 = val2['orderSum']
y_sub = sub['orderSum']
y_sub2 = sub2['orderSum']

X_train = train.drop(columns=['orderSum'])
X_train2 = train2.drop(columns=['orderSum', 'itemID', 'is_new'])
X_full_train = full_train.drop(columns=['orderSum'])
X_full_train2 = full_train2.drop(columns=['orderSum', 'itemID', 'is_new'])
X_val = val.drop(columns=['orderSum'])
X_val2 = val2.drop(columns=['orderSum', 'itemID', 'is_new'])
X_sub = sub.drop(columns=['orderSum'])
X_sub2 = sub2.drop(columns=['orderSum', 'itemID', 'is_new'])

# Mapping weights
w_train = train['itemID'].map(weights)
w_train2 = train2['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_full_train2 = full_train2['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_val2 = val2['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)
w_sub2 = sub2['itemID'].map(weights)

### Getting categorical features

In [212]:
features = X_train.columns.values
cat_feat = [2, 3, 5, 6, 7, 9]

In [213]:
features2 = X_train2.columns.values
cat_feat2 = [1, 2, 4, 5, 6]

---

## XGBoost

In [49]:
import xgboost as xgb

In [50]:
xgb.__version__

'1.0.2'

In [51]:
# custom objective

def gradient(prediction, dtrain):
    y = dtrain.get_label()
    return -2 * (prediction - np.maximum(prediction - y, 0) * 1.6) * (1 - (prediction > y) * 1.6)

def hessian(prediction, dtrain):
    y = dtrain.get_label()
    return -2 * (1 - (prediction > y) * 1.6) ** 2

def objective(prediction, dtrain):
    w = dtrain.get_weight()
    grad = gradient(prediction, dtrain) * w
    hess = hessian(prediction, dtrain) * w
    return grad, hess

In [52]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)

### Training the model (old items) - XGB

In [53]:
# specify parameters via map
param = {
    'max_depth':10,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 3,

}

num_round = 400

bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
                feval = feval_xgb,
                verbose_eval=25,
                maximize = True,
                )

[0]	train-feval:122331.56006	val-feval:13554.07397
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[25]	train-feval:4045713.45237	val-feval:315504.61481
[50]	train-feval:6496198.18996	val-feval:455168.78528
[75]	train-feval:8524369.89851	val-feval:543792.28432
[100]	train-feval:10129705.71744	val-feval:558981.43679
[125]	train-feval:11455965.00869	val-feval:531891.23698
Stopping. Best iteration:
[121]	train-feval:11410501.52750	val-feval:595515.76099



In [54]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

417981.32599999977

### Retrain! (old items)

In [55]:
bst_sub = xgb.train(
    param, 
    dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    feval = feval_xgb, 
    maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval=25,
)
bst_sub.best_ntree_limit

[0]	ftrain-feval:141452.75212
[25]	ftrain-feval:4423942.39844
[50]	ftrain-feval:7041687.50210
[75]	ftrain-feval:9252748.16756
[100]	ftrain-feval:10923820.56003
[121]	ftrain-feval:12326247.77054


122

In [56]:
prediction = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

480858.03399999987

### Training the model (only new items) - LGBM

In [235]:
lgbtrain2 = lgb.Dataset(X_train2, label=y_train2, weight=w_train2, categorical_feature=cat_feat)
lgbfulltrain2 = lgb.Dataset(X_full_train2, label=y_train2, weight=w_train2, categorical_feature=cat_feat)
lgbvalid2 = lgb.Dataset(X_val2, label=y_val2, weight=w_val2, categorical_feature=cat_feat)

In [236]:
params2 = {
          "objective" : "l2", # L2 works MUCH BETTER than L1
          "metric" :"rmse",
          #"learning_rate" : 0.5,
          'verbosity': 1,
          'max_depth': 5,
          #'num_leaves': 32,
          "min_data_in_leaf":3,
         }

num_round = 1000

model2_val = lgb.train(params2,
                  lgbtrain2,
                  num_round,
                  valid_sets = [lgbtrain2, lgbvalid2], 
                  verbose_eval=5,
                  early_stopping_rounds=5,
                  feval=lgb_feval,
                 )

NameError: name 'lgb_feval' is not defined

### Retrain! (only new items) - LGBM

In [None]:
lgb_new_sub = lgb.train(params2,
                  lgbfulltrain2,
                  lgb_new.best_iteration,
                  valid_sets = [lgbfulltrain2], 
                  verbose_eval=5,
                  early_stopping_rounds=5,
                  feval=feval,
                 )

In [None]:
prediction2 = lgb_new_sub.predict(X_sub2)

In [None]:
# Check to make sure itemIDs are in the correct order
if not new_items_id.is_monotonic_increasing:
    raise ValueError("ItemIDs not increasing! this will break the infos order!")

prices_new_items = infos.query("itemID in @new_items_id")["simulationPrice"]
assert len(y_test2) == len(prices_new_items)

In [None]:
evaluate(prediction2, y_sub2.values, prices_new_items)

<a href="#Table-of-Contents">GO TO BEGIN</a>

### Understanding feature importance

In [171]:
xgb_feat_v = bst.get_fscore().items()
xgb_feat_v = [imp for (trs, imp) in xgb_feat_v]
xgb_feat_imp = [p for p in zip(features, xgb_feat_v)]
xgb_feat_imp_df = pd.DataFrame(xgb_feat_imp, columns=['xgb_feature', 'importance'])
xgb_feat_imp_df = xgb_feat_imp_df.sort_values('importance', ascending=False)
xgb_feat_imp_df

Unnamed: 0,xgb_feature,importance
5,customerRating,10460
2,orderSum,5772
6,category1,5521
13,group_backwards_cos,3956
7,category2,3334
8,category3,3209
10,is_new,2478
12,group_backwards_sin,2077
3,brand,1409
1,itemID,1346


In [172]:
xgb_feat_v_sub = bst_sub.get_fscore().items()
xgb_feat_v_sub = [imp for (trs, imp) in xgb_feat_v_sub]
xgb_feat_imp_sub = [p for p in zip(features, xgb_feat_v_sub)]
xgb_feat_imp_sub_df = pd.DataFrame(xgb_feat_imp_sub, columns=['xgb_feature', 'importance_retrain'])
xgb_feat_imp_sub_df = xgb_feat_imp_sub_df.sort_values('importance_retrain', ascending=False)
xgb_feat_imp_sub_df

Unnamed: 0,xgb_feature,importance_retrain
5,customerRating,9009
6,category1,5787
2,orderSum,5506
9,recommendedRetailPrice,3340
7,category2,2577
8,category3,2559
13,group_backwards_cos,2411
11,percentage_accum_category3,2355
21,orderSum_diff_4,1101
3,brand,1043


In [174]:
# REMEMBER THAT XGBOOST'S SCALE OF FEATURE IMPORTANCE IS CONSIDERABLY LARGE THAN CATBOOST'S OR LGBM'S

xgb_diff = np.array(xgb_feat_v_sub) - np.array(xgb_feat_v)
xgb_diff_imp = [p for p in zip(features, xgb_diff)]
xgb_diff_imp_df = pd.DataFrame(xgb_diff_imp, columns=['xgb_feature', 'importance_gain_after_retrain'])
xgb_diff_imp_df = xgb_diff_imp_df.sort_values('importance_gain_after_retrain', ascending=False)
xgb_diff_imp_df

Unnamed: 0,xgb_feature,importance_gain_after_retrain
9,recommendedRetailPrice,2672
11,percentage_accum_category3,2068
21,orderSum_diff_4,809
19,orderSum_diff_3,600
6,category1,266
14,orderSum_1,72
23,orderSum_mean_rolled_2,7
0,group_backwards,-10
20,orderSum_4,-121
4,manufacturer,-139


---

## LGBM

In [216]:
import lightgbm as lgb

In [217]:
lgbtrain = lgb.Dataset(X_train, label=y_train, weight=w_train, categorical_feature=cat_feat)
lgbfulltrain = lgb.Dataset(X_full_train, label=y_full_train, weight=w_full_train, categorical_feature=cat_feat)
lgbvalid = lgb.Dataset(X_val, label=y_val, weight=w_val, categorical_feature=cat_feat)
lgbsubmis = lgb.Dataset(X_sub, label=y_sub, weight=w_sub, categorical_feature=cat_feat)

### Training the model (old items) - LGBM

In [218]:
params = {
          "objective": "l1",
          "metric": "rmse",
          "learning_rate": 0.5,
          'verbosity': 1,
          'max_depth': 6,
          'num_leaves': 32,
          "min_data_in_leaf": 3000,
         }

num_round = 1000

lgb_model = lgb.train(
                    params,
                    lgbtrain,
                    num_round,
                    valid_sets = [lgbtrain, lgbvalid],
                    valid_names = ['train', 'val'],
                    verbose_eval=5,
                    early_stopping_rounds=5,
                    feval=feval_lgbm,
                    )

Training until validation scores don't improve for 5 rounds
[5]	train's rmse: 39.4186	train's feval: 5.10801e+06	val's rmse: 44.4212	val's feval: 344630
[10]	train's rmse: 38.8189	train's feval: 7.33482e+06	val's rmse: 43.7995	val's feval: 572326
[15]	train's rmse: 38.8184	train's feval: 7.39799e+06	val's rmse: 43.7992	val's feval: 574303
Early stopping, best iteration is:
[14]	train's rmse: 38.8179	train's feval: 7.35266e+06	val's rmse: 43.7986	val's feval: 581480


In [219]:
prediction = lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration).astype(int)
evaluate(prediction, y_sub, w_sub)

643969.03

### Retrain! - LGBM

In [220]:
lgb_model_sub = lgb.train(
                    params,
                    lgbfulltrain,
                    lgb_model.best_iteration,
                    valid_sets = [lgbfulltrain],
                    valid_names = ['train'],
                    verbose_eval=5,
                    early_stopping_rounds=None,
                    feval = feval_lgbm,
                )

[5]	train's rmse: 39.6406	train's feval: 6.28435e+06
[10]	train's rmse: 38.229	train's feval: 1.02343e+07


In [221]:
prediction = lgb_model_sub.predict(X_sub, num_iteration=80).astype(int)
evaluate(prediction, y_sub, w_sub)

794533.5619999999

### Training the model (only new items)

In [228]:
lgbtrain2 = lgb.Dataset(X_train2, label=y_train2, weight=w_train2, categorical_feature=cat_feat)
lgbfulltrain2 = lgb.Dataset(X_full_train2, label=y_train2, weight=w_train2, categorical_feature=cat_feat)
lgbvalid2 = lgb.Dataset(X_val2, label=y_val2, weight=w_val2, categorical_feature=cat_feat)

In [229]:
params2 = {
          "objective" : "l2", # L2 works MUCH BETTER than L1
          "metric" :"rmse",
#           "learning_rate" : 0.5,
          'verbosity': 1,
          'max_depth': 5,
#           'num_leaves': 32,
          "min_data_in_leaf":3,
         }

num_round = 1000

lgb_new = lgb.train(
                    params2,
                    lgbtrain2,
                    num_round,
                    valid_sets = [lgbtrain2, lgbvalid2],
                    valid_names = ['train', 'val'],
                    verbose_eval=5,
                    early_stopping_rounds=5,
                    feval=feval_lgbm,
                 )

Training until validation scores don't improve for 5 rounds
[5]	train's rmse: 75.4316	train's feval: 6.53448e+06	val's rmse: 93.1416	val's feval: 838829
[10]	train's rmse: 68.8154	train's feval: 9.86951e+06	val's rmse: 85.9705	val's feval: 1.04007e+06
[15]	train's rmse: 64.8048	train's feval: 1.17628e+07	val's rmse: 86.7859	val's feval: 1.08046e+06
Early stopping, best iteration is:
[13]	train's rmse: 66.1412	train's feval: 1.1122e+07	val's rmse: 85.6068	val's feval: 1.07521e+06


In [230]:
prediction2 = lgb_new.predict(X_sub2)

In [231]:
new_items_id = test2["itemID"]
# Check to make sure itemIDs are in the correct order
if not new_items_id.is_monotonic_increasing:
    raise ValueError("ItemIDs not increasing! this will break the infos order!")

prices_new_items = infos.query("itemID in @new_items_id")["simulationPrice"]
assert len(y_sub2) == len(prices_new_items)


In [232]:
evaluate(prediction2, y_sub2.values, prices_new_items.values)

959573.406

### Retrain! (only new items) - LGBM

In [234]:
lgb_new_sub = lgb.train(
                    params2,
                    lgbfulltrain2,
                    lgb_new.best_iteration,
                    valid_sets = [lgbfulltrain2],
                    valid_names = ['train'],
                    verbose_eval=5,
                    early_stopping_rounds=None,
                    feval = feval_lgbm,
                )

LightGBMError: Length of label is not same with #data

### Merging models

<a href="#Table-of-Contents">GO TO BEGIN</a>

### Understanding feature importance

In [181]:
lgb_feat_v = lgb_model.feature_importance()
lgb_feat_imp = [p for p in zip(features, lgb_feat_v)]
lgb_feat_imp_df = pd.DataFrame(lgb_feat_imp, columns=['lgb_feature', 'importance'])
lgb_feat_imp_df = lgb_feat_imp_df.sort_values('importance', ascending=False)
lgb_feat_imp_df

Unnamed: 0,lgb_feature,importance
3,brand,38
10,is_new,14
24,orderSum_mean_rolled_4,12
6,category1,12
9,recommendedRetailPrice,7
15,orderSum_diff_1,4
0,group_backwards,3
11,percentage_accum_category3,3
22,orderSum_mean_rolled_1,2
13,group_backwards_cos,2


In [182]:
lgb_feat_v_sub = lgb_model_sub.feature_importance()
lgb_feat_imp_sub = [p for p in zip(features, lgb_feat_v_sub)]
lgb_feat_imp_sub_df = pd.DataFrame(lgb_feat_imp_sub, columns=['lgb_feature', 'importance_retrain'])
lgb_feat_imp_sub_df = lgb_feat_imp_sub_df.sort_values('importance_retrain', ascending=False)
lgb_feat_imp_sub_df

Unnamed: 0,lgb_feature,importance_retrain
3,brand,30
6,category1,15
10,is_new,10
24,orderSum_mean_rolled_4,9
22,orderSum_mean_rolled_1,9
9,recommendedRetailPrice,8
13,group_backwards_cos,4
23,orderSum_mean_rolled_2,3
16,orderSum_2,3
15,orderSum_diff_1,3


In [183]:
diff = [p for p in zip(features, lgb_feat_v_sub - lgb_feat_v)]
diff_df = pd.DataFrame(diff, columns=['lgb_feature', 'importance_gain_after_retrain'])
diff_df = diff_df.sort_values('importance_gain_after_retrain', ascending=False)
diff_df

Unnamed: 0,lgb_feature,importance_gain_after_retrain
22,orderSum_mean_rolled_1,7
16,orderSum_2,3
6,category1,3
13,group_backwards_cos,2
8,category3,2
23,orderSum_mean_rolled_2,2
1,itemID,1
9,recommendedRetailPrice,1
25,manufacturer_new_items,1
7,category2,0


---

## CatBoost

In [63]:
from catboost import CatBoost, CatBoostRegressor, Pool

In [64]:
ds_params = {
#     'cat_features': [2, 3, 5, 6, 7, 9],
}
train_pool = Pool(X_train, label = y_train, weight = w_train, **ds_params)
trainfull_pool = Pool(X_full_train, label = y_full_train, weight = w_full_train, **ds_params)
val_pool = Pool(X_val, label = y_val, weight = w_sub, **ds_params)
sub_pool = Pool(X_sub, label = y_sub, weight = w_sub, **ds_params)

### Training the model

In [65]:
cat_model = CatBoostRegressor(
    depth=7, 
    learning_rate=0.1, 
    loss_function='MAE',
    early_stopping_rounds=5,
    eval_metric = feval_cat(),
    thread_count=-1,
    verbose=10, # show us the progress after every 10 iterations
)

cat_model.fit(
    train_pool,
    eval_set=[train_pool, val_pool],
);


0:	learn: 0.0000000	test: 0.0000000	test1: 0.0000000	best: 0.0000000 (0)	total: 403ms	remaining: 6m 42s
10:	learn: 1038131.4479152	test: 1038131.4479152	test1: 148444.3560021	best: 148444.3560021 (10)	total: 2.58s	remaining: 3m 52s
20:	learn: 2783334.4942974	test: 2783334.4942974	test1: 274451.2300601	best: 274493.7800674 (19)	total: 4.84s	remaining: 3m 45s
30:	learn: 4315956.6835821	test: 4315956.6835821	test1: 346366.4960583	best: 346366.4960583 (30)	total: 7.01s	remaining: 3m 39s
40:	learn: 4380640.4013933	test: 4380640.4013933	test1: 348092.3180717	best: 350468.5160748 (38)	total: 9.07s	remaining: 3m 32s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 350468.5161
bestIteration = 38

Shrink model to first 39 iterations.


In [66]:
prediction = cat_model.predict(X_sub, ntree_end = cat_model.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

472915.056

### Retrain!

In [69]:
params = {**cat_model.get_params(), "iterations" : cat_model.best_iteration_}

cat_sub = CatBoostRegressor(**params)
cat_sub.fit(
    trainfull_pool,
    eval_set=[trainfull_pool],
);

0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 234ms	remaining: 8.66s
10:	learn: 1222396.9016862	test: 1222396.9016862	best: 1222396.9016862 (10)	total: 2.6s	remaining: 6.38s
20:	learn: 3074374.6702215	test: 3074374.6702215	best: 3074374.6702215 (20)	total: 5.06s	remaining: 4.09s
30:	learn: 4841309.5670845	test: 4841309.5670845	best: 4841309.5670845 (30)	total: 7.55s	remaining: 1.7s
37:	learn: 4890819.1130772	test: 4890819.1130772	best: 4890819.1130772 (37)	total: 9.28s	remaining: 0us

bestTest = 4890819.113
bestIteration = 37



In [70]:
prediction = cat_sub.predict(X_sub, ntree_end = cat_sub.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

483885.57999999996

<a href="#Table-of-Contents">GO TO BEGIN</a>

### Understanding feature importance

In [192]:
ctb_feat_v = cat_model.get_feature_importance()
ctb_feat_imp = [p for p in zip(features, ctb_feat_v)]
ctb_feat_imp_df = pd.DataFrame(ctb_feat_imp, columns=['ctb_feature', 'importance'])
ctb_feat_imp_df = ctb_feat_imp_df.sort_values('importance', ascending=False)
ctb_feat_imp_df

Unnamed: 0,ctb_feature,importance
9,recommendedRetailPrice,54.7736
8,category3,9.8393
1,itemID,5.291
5,customerRating,4.4554
24,orderSum_mean_rolled_4,3.2997
25,manufacturer_new_items,2.7955
3,brand,2.7256
21,orderSum_diff_4,2.4859
0,group_backwards,2.4308
19,orderSum_diff_3,2.0684


In [193]:
ctb_feat_v_sub = cat_sub.get_feature_importance()
ctb_feat_imp_sub = [p for p in zip(features, ctb_feat_v_sub)]
ctb_feat_imp_sub_df = pd.DataFrame(ctb_feat_imp_sub, columns=['ctb_feature', 'importance_retrain'])
ctb_feat_imp_sub_df = ctb_feat_imp_sub_df.sort_values('importance_retrain', ascending=False)
ctb_feat_imp_sub_df

Unnamed: 0,ctb_feature,importance_retrain
9,recommendedRetailPrice,59.6762
2,orderSum,6.5735
7,category2,6.2216
21,orderSum_diff_4,3.2813
4,manufacturer,2.8646
12,group_backwards_sin,2.5055
15,orderSum_diff_1,2.0029
19,orderSum_diff_3,1.9884
16,orderSum_2,1.817
22,orderSum_mean_rolled_1,1.7589


In [194]:
ctb_diff = [p for p in zip(features, ctb_feat_v_sub - ctb_feat_v)]
ctb_diff_df = pd.DataFrame(ctb_diff, columns=['ctb_feature', 'importance_gain_after_retrain'])
ctb_diff_df = ctb_diff_df.sort_values('importance_gain_after_retrain', ascending=False)
ctb_diff_df

Unnamed: 0,ctb_feature,importance_gain_after_retrain
2,orderSum,5.8892
7,category2,5.8723
9,recommendedRetailPrice,4.9025
12,group_backwards_sin,2.2657
4,manufacturer,2.1796
16,orderSum_2,1.4297
22,orderSum_mean_rolled_1,0.8423
17,orderSum_diff_2,0.8339
21,orderSum_diff_4,0.7954
15,orderSum_diff_1,0.7641


## Ensemble

In [74]:
cat_w = 1
lgb_w = 1
xgb_w = 1

ensemble = cat_sub.predict(X_sub, ntree_end = cat_sub.best_iteration_) * cat_w
ensemble += lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration) * lgb_w
ensemble += bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit) * xgb_w

ensemble = ensemble / (cat_w + lgb_w + xgb_w)

evaluate(ensemble.astype(int), y_sub, w_sub)

661698.294