In [97]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from utils import *
import sys
from datetime import datetime

In [2]:
sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


In [3]:
pd.set_option('display.max_columns', 50)
pd.set_option("display.precision", 4)

---

## Defining metrics

Baseline_score function

In [522]:
def evaluate(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.dot((prediction - np.maximum(prediction - target, 0) * 1.6), simulatedPrice)

Evaluation Metric

In [5]:
def feval_lgbm(prediction, dtrain):
    
    prediction = prediction.astype(int)
    target = dtrain.get_label()

    simulatedPrice = dtrain.get_weight()
    
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), True

In [6]:
def feval_xgb(prediction, dtrain):
    prediction = prediction.astype(int)
    target = dtrain.get_label()
    simulationPrice = dtrain.get_weight()
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)

In [7]:
class feval_cat(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, simulationPrice):
        prediction = np.array(approxes[0]).astype(int)
        target = np.array(target).astype(int)
        simulationPrice = np.array(simulationPrice)
        score = np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulationPrice)
        return score, 0

<hr>

## Building our dataset

In [8]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [9]:
# Changing our time signatures
process_time(orders)

In [10]:
df = dataset_builder(orders, items)

In [11]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,13,1,0.0,0,1,4.38,1,1,1,8.84
1,13,2,0.0,0,2,3.0,1,2,1,16.92
2,13,3,1.0,0,3,5.0,1,3,1,15.89
3,13,4,0.0,0,2,4.44,1,2,1,40.17
4,13,5,2.0,0,2,2.33,1,1,1,17.04


<hr>

## Feature building

In [12]:
# This cell adds a feature responsible for indicating if in the current week
# a given item has its first appearance.
orders_sorted_by_week = orders.sort_values('group_backwards', ascending=False)
weeks_grouped_by_items = orders_sorted_by_week.groupby('itemID', as_index=False)
items_first_appearance = weeks_grouped_by_items.first()[['itemID', 'group_backwards']]
items_first_appearance.rename(columns={'group_backwards':'first_appearance'}, inplace=True)
df['is_new'] = 0
df = pd.merge(df, items_first_appearance, left_on=['itemID'], right_on=['itemID'], how='left', validate='m:1')
df.loc[df['first_appearance'] == df['group_backwards'], 'is_new'] = 1
df.drop(columns=['first_appearance'], inplace=True)

In [13]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0
1,13,2,0.0,0,2,3.0,1,2,1,16.92,0
2,13,3,1.0,0,3,5.0,1,3,1,15.89,1
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1


**Cumulative sale by category**

In [14]:
# percentage_accum_cat_3 feature...
df = cumulative_sale_by_category(df, category='category3')

In [15]:
df.tail()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3
136014,1,10459,0.0,180,253,0.0,8,44,8,56.57,0,0.0011
136015,1,10460,0.0,0,253,0.0,8,44,8,163.81,0,0.0011
136016,1,10461,0.0,0,253,0.0,8,44,8,128.01,0,0.0
136017,1,10462,0.0,180,253,0.0,8,44,8,166.97,0,0.0011
136018,1,10463,0.0,0,253,0.0,8,44,8,154.82,0,0.0011


**Time Encoding**

In [16]:
# Encoding our weeks as a series of sines and cosines...
# This function will consider our period as a semester in a year,
# so we can try other types of time encoding later!
df = time_encoder(df, 'group_backwards', 26)

In [17]:
df.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.2162e-16,-1.0
1,13,2,0.0,0,2,3.0,1,2,1,16.92,0,0.0,-3.2162e-16,-1.0
2,13,3,1.0,0,3,5.0,1,3,1,15.89,1,0.0,-3.2162e-16,-1.0
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0,0.0,-3.2162e-16,-1.0
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0,-3.2162e-16,-1.0


**Lags and diffs**

In [18]:
NUMBER_OF_LAGS = 4

In [19]:
# This cell lags and diffs our feature 'orderSum'
shifting = df.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)

    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    

In [20]:
shifting.tail()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4
136014,1,10459,0.0,180,253,0.0,8,44,8,56.57,0,0.0011,0.2393,0.9709,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0
136015,1,10460,0.0,0,253,0.0,8,44,8,163.81,0,0.0011,0.2393,0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
136016,1,10461,0.0,0,253,0.0,8,44,8,128.01,0,0.0,0.2393,0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136017,1,10462,0.0,180,253,0.0,8,44,8,166.97,0,0.0011,0.2393,0.9709,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0
136018,1,10463,0.0,0,253,0.0,8,44,8,154.82,0,0.0011,0.2393,0.9709,0.0,-1.0,1.0,1.0,0.0,0.0,0.0,0.0


**Rolling window "orderSum"**

In [21]:
NUMBER_OF_WINDOWS = 3

In [22]:
%%time
# This cell creates rolling-window features based on 'orderSum' in our dataset!
item_group = shifting.groupby(["itemID", "group_backwards"]).agg({'orderSum':'sum'})

# We'll .shift(-1) because it sorts our "group_backwards", 
# so doing .shift(1) would cause a HUGE dataleak.
aux_shifting = item_group.groupby('itemID')[['orderSum']].shift(-1)

aux_shifting.sort_values(['itemID', 'group_backwards'], ascending=[True, False], inplace=True)

for i in range(NUMBER_OF_WINDOWS):
    rolled_window = aux_shifting.groupby(['itemID'], as_index=False)[['orderSum']].rolling(2 ** i).mean()
    rolled_window.rename(columns={'orderSum':f"orderSum_mean_rolled_{2 ** i}"}, inplace=True)
    shifting = pd.merge(shifting, rolled_window, left_on=['itemID', 'group_backwards'], right_on=['itemID', 'group_backwards'])

CPU times: user 38.4 s, sys: 309 ms, total: 38.7 s
Wall time: 38.5 s


In [23]:
# LGBM Says on docs that it automatically handles zero values as NaN,
# so we'll keep this standard...
shifting.fillna(0, inplace=True)

In [24]:
shifting.query('itemID == 5')

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10467,12,5,1.0,0,2,2.33,1,1,1,17.04,0,0.0179,0.23932,-0.9709,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
20930,11,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0101,0.46472,-0.8855,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.5,0.0
31393,10,5,1.0,0,2,2.33,1,1,1,17.04,0,0.0071,0.66312,-0.7485,0.0,-1.0,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.5,0.0
41856,9,5,0.0,0,2,2.33,1,1,1,17.04,0,0.007,0.82298,-0.5681,1.0,1.0,0.0,-1.0,1.0,-1.0,2.0,0.0,1.0,0.5,1.0
52319,8,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0053,0.93502,-0.3546,0.0,-1.0,1.0,1.0,0.0,-1.0,1.0,-1.0,0.0,0.5,0.5
62782,7,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0043,0.99271,-0.1205,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,-1.0,0.0,0.0,0.25
73245,6,5,127.0,0,2,2.33,1,1,1,17.04,0,0.0037,0.99271,0.1205,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,0.25
83708,5,5,4.0,0,2,2.33,1,1,1,17.04,0,0.1061,0.93502,0.3546,127.0,127.0,0.0,0.0,0.0,0.0,0.0,-1.0,127.0,63.5,31.75
94171,4,5,39.0,0,2,2.33,1,1,1,17.04,0,0.0872,0.82298,0.5681,4.0,-123.0,127.0,127.0,0.0,0.0,0.0,0.0,4.0,65.5,32.75


### Tobias features

In [25]:
def gbtransf(data, group_cols, targeted_cols, out_names, function, params = dict()):
  
    X = data.values
    col = {c : i for i, c in enumerate(data.columns)}

    # values that are going to calculated
    new_feat = np.zeros((len(data), len(out_names)))
    
    # numbers of the columns
    gcols = [col[c] for c in group_cols]
    tcols = [col[c] for c in targeted_cols]
    
    interval = None
    a = None
    i = 0
    while i < len(X):
        a = X[i, gcols]

        # find the whole interval of this group
        j = i
        while j < len(X):
            if (X[j, gcols] != a).any():
                break
            j += 1
        interval = X[i:j, tcols]

        # apply function on interval, save in new feature
        output = function(interval, **params)
        new_feat[i:j] = output

        # go to next group
        i = j
    
    out_df = pd.DataFrame(new_feat, columns = out_names, index = data.index)
        
    return out_df

In [26]:
shifting.sort_values(['itemID', 'group_backwards'], inplace = True, ascending=[True, False])

In [27]:
shifting.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10463,12,1,2.0,0,1,4.38,1,1,1,8.84,1,0.0,0.23932,-0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20926,11,1,313.0,0,1,4.38,1,1,1,8.84,0,0.0067,0.46472,-0.8855,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
31389,10,1,35.0,0,1,4.38,1,1,1,8.84,0,0.7466,0.66312,-0.7485,313.0,311.0,2.0,2.0,0.0,0.0,0.0,0.0,313.0,157.5,0.0
41852,9,1,3.0,0,1,4.38,1,1,1,8.84,0,0.6087,0.82298,-0.5681,35.0,-278.0,313.0,311.0,2.0,2.0,0.0,0.0,35.0,174.0,87.5


In [28]:
data = shifting.copy()

In [29]:
data.groupby("group_backwards")["is_new"].sum().to_dict()

{1: 728,
 2: 727,
 3: 794,
 4: 671,
 5: 785,
 6: 661,
 7: 716,
 8: 909,
 9: 785,
 10: 533,
 11: 371,
 12: 729,
 13: 1431}

In [30]:
the_cat = "manufacturer"

In [31]:
sla = data.groupby(["group_backwards", the_cat])["is_new"].sum().reset_index()

In [32]:
sla.sort_values(['group_backwards', the_cat], inplace = True, ascending=[False, True])

In [33]:
sla = sla.rename(columns={"is_new" : f"{the_cat}_new_items"})

In [34]:
data = pd.merge(data, sla, on = ["group_backwards", the_cat])

In [35]:
data["total_new"] = data["group_backwards"].map(data.groupby("group_backwards")["is_new"].sum().to_dict())

In [36]:
data.fillna(0, inplace=True)

In [37]:
# checking if we got what we wanted
data.query('itemID == 5')

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4,manufacturer_new_items,total_new
522,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59,1431
865,12,5,1.0,0,2,2.33,1,1,1,17.04,0,0.0179,0.23932,-0.9709,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,51,729
1208,11,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0101,0.46472,-0.8855,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.5,0.0,30,371
1551,10,5,1.0,0,2,2.33,1,1,1,17.04,0,0.0071,0.66312,-0.7485,0.0,-1.0,1.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.5,0.0,11,533
1894,9,5,0.0,0,2,2.33,1,1,1,17.04,0,0.007,0.82298,-0.5681,1.0,1.0,0.0,-1.0,1.0,-1.0,2.0,0.0,1.0,0.5,1.0,47,785
2237,8,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0053,0.93502,-0.3546,0.0,-1.0,1.0,1.0,0.0,-1.0,1.0,-1.0,0.0,0.5,0.5,19,909
2580,7,5,0.0,0,2,2.33,1,1,1,17.04,0,0.0043,0.99271,-0.1205,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,-1.0,0.0,0.0,0.25,16,716
2923,6,5,127.0,0,2,2.33,1,1,1,17.04,0,0.0037,0.99271,0.1205,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,0.25,34,661
3266,5,5,4.0,0,2,2.33,1,1,1,17.04,0,0.1061,0.93502,0.3546,127.0,127.0,0.0,0.0,0.0,0.0,0.0,-1.0,127.0,63.5,31.75,40,785
3609,4,5,39.0,0,2,2.33,1,1,1,17.04,0,0.0872,0.82298,0.5681,4.0,-123.0,127.0,127.0,0.0,0.0,0.0,0.0,4.0,65.5,32.75,4,671


In [38]:
data.sort_values(['itemID', 'group_backwards'], inplace=True, ascending=[True, False])

In [39]:
data.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4,manufacturer_new_items,total_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
40,12,1,2.0,0,1,4.38,1,1,1,8.84,1,0.0,0.23932,-0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,729
80,11,1,313.0,0,1,4.38,1,1,1,8.84,0,0.0067,0.46472,-0.8855,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,6,371
120,10,1,35.0,0,1,4.38,1,1,1,8.84,0,0.7466,0.66312,-0.7485,313.0,311.0,2.0,2.0,0.0,0.0,0.0,0.0,313.0,157.5,0.0,3,533
160,9,1,3.0,0,1,4.38,1,1,1,8.84,0,0.6087,0.82298,-0.5681,35.0,-278.0,313.0,311.0,2.0,2.0,0.0,0.0,35.0,174.0,87.5,0,785


---

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [40]:
from sklearn.metrics import mean_squared_error as mse

In [41]:
worst_possible_prediction = shifting.loc[shifting.group_backwards > 1]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 1]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 1]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 118.20105838913783


<hr>

## Dataset Splitting (Train until week 3 / Val. week 2/ Test week 1)
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
dataset = data

In [44]:
train = dataset.query('3 <= group_backwards <= 13').reset_index(drop = True)
full_train = dataset.query('2 <= group_backwards <= 13').reset_index(drop = True)
val = dataset.query('group_backwards == 2').reset_index(drop = True)
sub = dataset.query('group_backwards == 1').reset_index(drop = True)

In [45]:
features = train.columns.values

In [46]:
features

array(['group_backwards', 'itemID', 'orderSum', 'brand', 'manufacturer',
       'customerRating', 'category1', 'category2', 'category3',
       'recommendedRetailPrice', 'is_new', 'percentage_accum_category3',
       'group_backwards_sin', 'group_backwards_cos', 'orderSum_1',
       'orderSum_diff_1', 'orderSum_2', 'orderSum_diff_2', 'orderSum_3',
       'orderSum_diff_3', 'orderSum_4', 'orderSum_diff_4',
       'orderSum_mean_rolled_1', 'orderSum_mean_rolled_2',
       'orderSum_mean_rolled_4', 'manufacturer_new_items', 'total_new'],
      dtype=object)

In [47]:
train.head()

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,percentage_accum_category3,group_backwards_sin,group_backwards_cos,orderSum_1,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_1,orderSum_mean_rolled_2,orderSum_mean_rolled_4,manufacturer_new_items,total_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,0.0,-3.2162e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1431
1,12,1,2.0,0,1,4.38,1,1,1,8.84,1,0.0,0.23932,-0.9709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,729
2,11,1,313.0,0,1,4.38,1,1,1,8.84,0,0.0067,0.46472,-0.8855,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,6,371
3,10,1,35.0,0,1,4.38,1,1,1,8.84,0,0.7466,0.66312,-0.7485,313.0,311.0,2.0,2.0,0.0,0.0,0.0,0.0,313.0,157.5,0.0,3,533
4,9,1,3.0,0,1,4.38,1,1,1,8.84,0,0.6087,0.82298,-0.5681,35.0,-278.0,313.0,311.0,2.0,2.0,0.0,0.0,35.0,174.0,87.5,0,785


In [48]:
len(train), len(val), len(sub)

(115093, 10463, 10463)

In [49]:
weights = infos.set_index('itemID')['simulationPrice'].to_dict()

In [50]:
y_train = train.pop('orderSum').values
y_full_train = full_train.pop('orderSum').values
y_val = val.pop('orderSum').values
y_sub = sub.pop('orderSum').values

X_train = train.values
X_full_train = full_train.values
X_val = val.values
X_sub = sub.values

w_train = train['itemID'].map(weights)
w_full_train = full_train['itemID'].map(weights)
w_val = val['itemID'].map(weights)
w_sub = sub['itemID'].map(weights)

---

## XGBoost

In [51]:
import xgboost as xgb

In [52]:
xgb.__version__

'1.0.2'

In [53]:
# custom objective

def gradient(prediction, dtrain):
    y = dtrain.get_label()
    return -2 * (prediction - np.maximum(prediction - y, 0) * 1.6) * (1 - (prediction > y) * 1.6)

def hessian(prediction, dtrain):
    y = dtrain.get_label()
    return -2 * (1 - (prediction > y) * 1.6) ** 2

def objective(prediction, dtrain):
    w = dtrain.get_weight()
    grad = gradient(prediction, dtrain) * w
    hess = hessian(prediction, dtrain) * w
    return grad, hess

In [54]:
missing = 0
dtrain = xgb.DMatrix(X_train, y_train, w_train, missing = missing)
dfulltrain = xgb.DMatrix(X_full_train, y_full_train, w_full_train, missing = missing)
dval = xgb.DMatrix(X_val, y_val, w_val, missing = missing)
dsub = xgb.DMatrix(X_sub, y_sub, w_sub, missing = missing)

### Training the model

In [55]:
# specify parameters via map
param = {
    'max_depth':10,
    'eta':0.005,
    'objective':'reg:squarederror',
    'disable_default_eval_metric': 1,
    "min_child_weight" : 3,

}

num_round = 400

bst = xgb.train(param, dtrain,
                num_round,
                early_stopping_rounds = 10,
                evals = [(dtrain, 'train'), (dval, 'val')],
                feval = feval_xgb,
                verbose_eval=25,
                maximize = True,
                )

[0]	train-feval:122331.56006	val-feval:13554.07397
Multiple eval metrics have been passed: 'val-feval' will be used for early stopping.

Will train until val-feval hasn't improved in 10 rounds.
[25]	train-feval:4045713.45237	val-feval:315504.61481
[50]	train-feval:6496198.18996	val-feval:455168.78528
[75]	train-feval:8524369.89851	val-feval:543792.28432
[100]	train-feval:10129705.71744	val-feval:558981.43679
[125]	train-feval:11455965.00869	val-feval:531891.23698
Stopping. Best iteration:
[121]	train-feval:11410501.52750	val-feval:595515.76099



In [56]:
prediction = bst.predict(dsub, ntree_limit=bst.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

417981.3259999999

### Retrain!

In [57]:
bst_sub = xgb.train(
    param, 
    dfulltrain,
    num_boost_round = bst.best_ntree_limit,
    feval = feval_xgb, 
    maximize = True,
    evals = [(dfulltrain, 'ftrain')],
    verbose_eval=25,
)
bst_sub.best_ntree_limit

[0]	ftrain-feval:141452.75212
[25]	ftrain-feval:4423942.39844
[50]	ftrain-feval:7041687.50210
[75]	ftrain-feval:9252748.16756
[100]	ftrain-feval:10923820.56003
[121]	ftrain-feval:12326247.77054


122

In [58]:
prediction = bst_sub.predict(dsub, ntree_limit=bst_sub.best_ntree_limit).astype(int)
evaluate(prediction, y_sub, w_sub)

480858.03399999975

### Understanding feature importance

In [59]:
xgb_feat_v = bst.get_fscore().items()
xgb_feat_v = [imp for (trs, imp) in xgb_feat_v]
xgb_feat_imp = [p for p in zip(features, xgb_feat_v)]
xgb_feat_imp_df = pd.DataFrame(xgb_feat_imp, columns=['xgb_feature', 'importance'])
xgb_feat_imp_df = xgb_feat_imp_df.sort_values('importance', ascending=False)
xgb_feat_imp_df

Unnamed: 0,xgb_feature,importance
5,customerRating,14295
2,orderSum,9433
6,category1,8098
8,category3,5166
15,orderSum_diff_1,5137
7,category2,5133
16,orderSum_2,4227
9,recommendedRetailPrice,3041
18,orderSum_3,2484
3,brand,1951


In [60]:
xgb_feat_v_sub = bst_sub.get_fscore().items()
xgb_feat_v_sub = [imp for (trs, imp) in xgb_feat_v_sub]
xgb_feat_imp_sub = [p for p in zip(features, xgb_feat_v_sub)]
xgb_feat_imp_sub_df = pd.DataFrame(xgb_feat_imp_sub, columns=['xgb_feature', 'importance'])
xgb_feat_imp_sub_df = xgb_feat_imp_sub_df.sort_values('importance', ascending=False)
xgb_feat_imp_sub_df

Unnamed: 0,xgb_feature,importance
5,customerRating,14706
2,orderSum,8949
6,category1,8235
8,category3,5062
9,recommendedRetailPrice,5058
10,is_new,4811
11,percentage_accum_category3,4282
7,category2,3843
16,orderSum_2,2152
3,brand,1921


In [83]:
# REMEMBER THAT XGBOOST'S SCALE OF FEATURE IMPORTANCE IS CONSIDERABLY LARGE THAN CATBOOST'S OR LGBM'S

xgb_diff = np.array(xgb_feat_v_sub) - np.array(xgb_feat_v)
xgb_diff_imp = [p for p in zip(features, xgb_diff)]
xgb_diff_imp_df = pd.DataFrame(xgb_diff_imp, columns=['xgb_feature', 'importance_gain_after_retrain'])
xgb_diff_imp_df = xgb_diff_imp_df.sort_values('importance_gain_after_retrain', ascending=False)
xgb_diff_imp_df

Unnamed: 0,xgb_feature,gain_importance_after_retrain
10,is_new,4478
11,percentage_accum_category3,3454
9,recommendedRetailPrice,2017
19,orderSum_diff_3,549
5,customerRating,411
17,orderSum_diff_2,244
13,group_backwards_cos,230
6,category1,137
23,orderSum_mean_rolled_2,79
12,group_backwards_sin,65


---

## LGBM

In [62]:
import lightgbm as lgb

In [63]:
lgbtrain = lgb.Dataset(X_train, label=y_train, weight=w_train, categorical_feature=[2, 3, 5, 6, 7, 9])
lgbfulltrain = lgb.Dataset(X_full_train, label=y_full_train, weight=w_full_train, categorical_feature=[2, 3, 5, 6, 7, 9])
lgbvalid = lgb.Dataset(X_val, label=y_val, weight=w_val, categorical_feature=[2, 3, 5, 6, 7, 9])
lgbsubmis = lgb.Dataset(X_sub, label=y_sub, weight=w_sub, categorical_feature=[2, 3, 5, 6, 7, 9])

### Training the model

In [64]:
params = {
          "objective": "l1",
          "metric": "rmse",
          "learning_rate": 0.5,
          'verbosity': 1,
          'max_depth': 7,
          'num_leaves': 15,
          "min_data_in_leaf": 3500,
         }

num_round = 1000

lgb_model = lgb.train(
                    params,
                    lgbtrain,
                    num_round,
                    valid_sets = [lgbtrain, lgbvalid],
                    valid_names = ['train', 'val'],
                    verbose_eval=5,
                    early_stopping_rounds=5,
                    feval=feval_lgbm,
                    )



Training until validation scores don't improve for 5 rounds
[5]	train's rmse: 39.4962	train's feval: 4.83791e+06	val's rmse: 44.5092	val's feval: 307293
[10]	train's rmse: 38.632	train's feval: 7.37376e+06	val's rmse: 43.6451	val's feval: 561797
[15]	train's rmse: 38.638	train's feval: 7.35815e+06	val's rmse: 43.651	val's feval: 561537
Early stopping, best iteration is:
[11]	train's rmse: 38.6304	train's feval: 7.37006e+06	val's rmse: 43.6435	val's feval: 562558


In [65]:
prediction = lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration).astype(int)
evaluate(prediction, y_sub, w_sub)

548045.6039999999

### Retrain!

In [66]:
lgb_model_sub = lgb.train(
                    params,
                    lgbfulltrain,
                    lgb_model.best_iteration,
                    valid_sets = [lgbfulltrain],
                    valid_names = ['train'],
                    verbose_eval=5,
                    early_stopping_rounds=None,
                    feval = feval_lgbm,
                    )

[5]	train's rmse: 39.5943	train's feval: 6.28894e+06
[10]	train's rmse: 39.3364	train's feval: 6.63024e+06


In [67]:
prediction = lgb_model_sub.predict(X_sub, num_iteration=80).astype(int)
evaluate(prediction, y_sub, w_sub)

525972.34

### Understanding feature importance

In [68]:
lgb_feat_v = lgb_model.feature_importance()
lgb_feat_imp = [p for p in zip(features, lgb_feat_v)]
lgb_feat_imp_df = pd.DataFrame(lgb_feat_imp, columns=['lgb_feature', 'importance'])
lgb_feat_imp_df = lgb_feat_imp_df.sort_values('importance', ascending=False)
lgb_feat_imp_df

Unnamed: 0,lgb_feature,importance
3,brand,32
6,category1,31
24,orderSum_mean_rolled_4,21
10,is_new,13
9,recommendedRetailPrice,11
13,group_backwards_cos,8
11,percentage_accum_category3,7
23,orderSum_mean_rolled_2,6
22,orderSum_mean_rolled_1,3
0,group_backwards,3


In [69]:
lgb_feat_v_sub = lgb_model_sub.feature_importance()
lgb_feat_imp_sub = [p for p in zip(features, lgb_feat_v_sub)]
lgb_feat_imp_sub_df = pd.DataFrame(lgb_feat_imp_sub, columns=['lgb_feature', 'importance'])
lgb_feat_imp_sub_df = lgb_feat_imp_sub_df.sort_values('importance', ascending=False)
lgb_feat_imp_sub_df

Unnamed: 0,lgb_feature,importance
3,brand,35
6,category1,24
24,orderSum_mean_rolled_4,20
10,is_new,14
22,orderSum_mean_rolled_1,12
9,recommendedRetailPrice,11
13,group_backwards_cos,10
23,orderSum_mean_rolled_2,7
15,orderSum_diff_1,5
25,manufacturer_new_items,5


In [70]:
diff = [p for p in zip(features, lgb_feat_v_sub - lgb_feat_v)]
diff_df = pd.DataFrame(diff, columns=['lgb_feature', 'importance_gain_after_retrain'])
diff_df = diff_df.sort_values('importance_gain_after_retrain', ascending=False)
diff_df

Unnamed: 0,lgb_feature,gain_importance_after_retrain
22,orderSum_mean_rolled_1,9
25,manufacturer_new_items,4
15,orderSum_diff_1,3
3,brand,3
13,group_backwards_cos,2
0,group_backwards,1
23,orderSum_mean_rolled_2,1
10,is_new,1
16,orderSum_2,0
21,orderSum_diff_4,0


---

## CatBoost

In [71]:
from catboost import CatBoost, CatBoostRegressor, Pool

In [92]:
ds_params = {
#     'cat_features': [2, 3, 5, 6, 7, 9],
}
train_pool = Pool(X_train, label = y_train, weight = w_train, **ds_params)
trainfull_pool = Pool(X_full_train, label = y_full_train, weight = w_full_train, **ds_params)
val_pool = Pool(X_val, label = y_val, weight = w_sub, **ds_params)
sub_pool = Pool(X_sub, label = y_sub, weight = w_sub, **ds_params)

### Training the model

In [72]:
cat_model = CatBoostRegressor(
    depth=7, 
    learning_rate=0.1, 
    loss_function='MAE',
    early_stopping_rounds=5,
    eval_metric = feval_cat(),
    thread_count=-1,
    verbose=10, # show us the progress after every 10 iterations
)

cat_model.fit(
    train_pool,
    eval_set=[train_pool, val_pool],
);


0:	learn: 0.0000000	test: 0.0000000	test1: 0.0000000	best: 0.0000000 (0)	total: 270ms	remaining: 4m 29s
10:	learn: 1038131.4479152	test: 1038131.4479152	test1: 148444.3560021	best: 148444.3560021 (10)	total: 2.38s	remaining: 3m 33s
20:	learn: 2783334.4942974	test: 2783334.4942974	test1: 274451.2300601	best: 274493.7800674 (19)	total: 4.45s	remaining: 3m 27s
30:	learn: 4315956.6835821	test: 4315956.6835821	test1: 346366.4960583	best: 346366.4960583 (30)	total: 6.51s	remaining: 3m 23s
40:	learn: 4380640.4013933	test: 4380640.4013933	test1: 348092.3180717	best: 350468.5160748 (38)	total: 8.55s	remaining: 3m 19s
Stopped by overfitting detector  (5 iterations wait)

bestTest = 350468.5161
bestIteration = 38

Shrink model to first 39 iterations.


In [73]:
prediction = cat_model.predict(X_sub, ntree_end = cat_model.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

472915.056

### Retrain!

In [74]:
cat_model.best_iteration_

38

In [75]:
{**cat_model.get_params(), "iterations" : cat_model.best_iteration_}

{'learning_rate': 0.1,
 'depth': 7,
 'loss_function': 'MAE',
 'verbose': 10,
 'eval_metric': <__main__.feval_cat at 0x7f75faa0f710>,
 'early_stopping_rounds': 5,
 'iterations': 38}

In [76]:
cat_sub = CatBoostRegressor(**{**cat_model.get_params(), "iterations" : cat_model.best_iteration_})
cat_sub.fit(
    trainfull_pool,
    eval_set=[trainfull_pool],
);

0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 237ms	remaining: 8.76s
10:	learn: 1222396.9016862	test: 1222396.9016862	best: 1222396.9016862 (10)	total: 2.45s	remaining: 6s
20:	learn: 3074374.6702215	test: 3074374.6702215	best: 3074374.6702215 (20)	total: 4.63s	remaining: 3.75s
30:	learn: 4841309.5670845	test: 4841309.5670845	best: 4841309.5670845 (30)	total: 6.8s	remaining: 1.53s
37:	learn: 4890819.1130772	test: 4890819.1130772	best: 4890819.1130772 (37)	total: 8.32s	remaining: 0us

bestTest = 4890819.113
bestIteration = 37



In [77]:
prediction = cat_sub.predict(X_sub, ntree_end = cat_sub.best_iteration_).astype(int)
evaluate(prediction, y_sub, w_sub)

483885.57999999996

### Understanding feature importance

In [78]:
ctb_feat_v = cat_model.get_feature_importance()
ctb_feat_imp = [p for p in zip(features, ctb_feat_v)]
ctb_feat_imp_df = pd.DataFrame(ctb_feat_imp, columns=['ctb_feature', 'importance'])
ctb_feat_imp_df = ctb_feat_imp_df.sort_values('importance', ascending=False)
ctb_feat_imp_df

Unnamed: 0,ctb_feature,importance
9,recommendedRetailPrice,37.4993
8,category3,15.0845
12,group_backwards_sin,10.2735
21,orderSum_diff_4,3.462
15,orderSum_diff_1,3.4355
16,orderSum_2,3.3336
19,orderSum_diff_3,3.2317
25,manufacturer_new_items,2.7805
18,orderSum_3,2.0866
1,itemID,1.9976


In [80]:
ctb_feat_v_sub = cat_sub.get_feature_importance()
ctb_feat_imp_sub = [p for p in zip(features, ctb_feat_v_sub)]
ctb_feat_imp_sub_df = pd.DataFrame(ctb_feat_imp_sub, columns=['ctb_feature', 'importance'])
ctb_feat_imp_sub_df = ctb_feat_imp_sub_df.sort_values('importance', ascending=False)
ctb_feat_imp_sub_df

Unnamed: 0,ctb_feature,importance
9,recommendedRetailPrice,42.508
8,category3,15.5035
12,group_backwards_sin,11.3877
16,orderSum_2,3.4861
2,orderSum,2.9034
21,orderSum_diff_4,2.7843
1,itemID,2.6229
25,manufacturer_new_items,2.3368
20,orderSum_4,2.0537
4,manufacturer,1.9019


In [81]:
ctb_diff = [p for p in zip(features, ctb_feat_v_sub - ctb_feat_v)]
ctb_diff_df = pd.DataFrame(ctb_diff, columns=['ctb_feature', 'importance_gain_after_retrain'])
ctb_diff_df = ctb_diff_df.sort_values('importance_gain_after_retrain', ascending=False)
ctb_diff_df

Unnamed: 0,ctb_feature,gain_importance_after_retrain
9,recommendedRetailPrice,5.0087
2,orderSum,1.6358
12,group_backwards_sin,1.1143
1,itemID,0.6254
20,orderSum_4,0.5302
8,category3,0.4189
4,manufacturer,0.1747
16,orderSum_2,0.1525
5,customerRating,0.0686
6,category1,0.0284


## Ensemble

In [539]:
def ens_gradient(vectors, weights, costs):
    # vectors (n x 3), weights (1 x 3), costs (1 x n)
    w_sum = np.sum(weights)
    
    # calculando a jacobiana do ensemble
    ens = np.dot(vectors, weights.reshape(-1, 1))
    J = vectors * w_sum
    J -= ens
    J = J/(w_sum ** 2)
    
    # calculando o gradiente
    
    # tem que calcular a jacobiana da evaluate aqui
    J_eval = J # so pra ter algum resultado, tirar depois
    
    grad = np.dot(costs, J_eval)
    
    return grad, (ens/w_sum).reshape(-1) # retorna o gradiente e o ensemble

In [540]:
def ensemble(vectors, target, costs, tol=0.0001, epochs=10000, verbose=False, verbosity=500, lr=0.1):
    # vectors have a shape of (n x 3)
    weights = np.array([1.0, 1.0, 1.0]).reshape(1, -1)
    grad, ens = ens_gradient(vectors, weights, costs)
    
    for epoch in np.arange(epochs):
        loss = np.sum(target - ens)
        
        if np.abs(loss) <= tol:
            if verbose:
                print(f"Tolerance criterion met! Stopping at iteration {epoch + 1} with Loss: {loss}.")
            print(f"\nBest weights: {weights.reshape(1, -1)}")
            return ens
        
        # If user wants to see output
        if verbose and epoch % verbosity == 0:
            print(f"Epoch {(epoch + 1):04d} =>\tLoss: {loss}\tWeights: {weights.reshape(1, -1)}")
            
        weights -= grad*lr
        grad, ens = ens_gradient(vectors, weights, costs)
    
    if verbose:
        print(f"Epoch {epochs} =>\tLoss: {np.sum(target-ens)}\tWeights: {weights.reshape(1, -1)}")
    
    print(f"\nBest weights: {weights.reshape(1, -1)}")
    
    return ens

In [541]:
xgb_v = bst_sub.predict(dsub, ntree_limit=bst.best_ntree_limit)
lgb_v = lgb_model.predict(X_sub, num_iteration=lgb_model.best_iteration)
cat_v = cat_sub.predict(X_sub, ntree_end=cat_sub.best_iteration_)

In [542]:
xgb_v = xgb_v.reshape(-1, 1)
lgb_v = lgb_v.reshape(-1, 1)
cat_v = cat_v.reshape(-1, 1)
vectors = np.concatenate([xgb_v, lgb_v, cat_v], axis=1)
vectors.shape

(10463, 3)

In [546]:
ens = ensemble(vectors, y_sub, w_sub.values.reshape(1, -1), tol=3000, epochs=10000, verbose=True, verbosity=500, lr=1e-6)
evaluate(ens.reshape(-1), y_sub, w_sub)

Epoch 0001 =>	Loss: 234927.92044215882	Weights: [[1. 1. 1.]]
Epoch 0501 =>	Loss: 234686.09481062772	Weights: [[11.41941942 11.31828703 11.31112123]]
Epoch 1001 =>	Loss: 234646.58623284084	Weights: [[-5543.06339234 -5486.00982761 -5481.96726508]]
Epoch 1501 =>	Loss: 234647.10063979618	Weights: [[-5542.99164203 -5486.0423945  -5482.00722343]]
Epoch 2001 =>	Loss: 234647.61504665494	Weights: [[-5542.91989125 -5486.07496092 -5482.0471813 ]]
Epoch 2501 =>	Loss: 234648.12945341747	Weights: [[-5542.84814    -5486.10752686 -5482.0871387 ]]
Epoch 3001 =>	Loss: 234648.64386008383	Weights: [[-5542.77638827 -5486.14009233 -5482.12709562]]
Epoch 3501 =>	Loss: 234649.15826665427	Weights: [[-5542.70463608 -5486.17265732 -5482.16705207]]
Epoch 4001 =>	Loss: 234649.67267312887	Weights: [[-5542.63288342 -5486.20522185 -5482.20700804]]
Epoch 4501 =>	Loss: 234650.1870795079	Weights: [[-5542.56113029 -5486.23778589 -5482.24696354]]
Epoch 5001 =>	Loss: 234650.70148579153	Weights: [[-5542.48937669 -5486.27034

660739.402

In [545]:
man_weights = np.array([1, 2, 0])
man_weights = man_weights.reshape(-1, 1)
man_ens = np.dot(vectors, man_weights)
man_ens /= np.sum(man_weights)
evaluate(man_ens.reshape(-1), y_sub, w_sub)

689275.4759999999