# Submission notebook 

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from utils_submission import *
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

NUMBER_OF_LAGS = 4

sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


<hr>

## Defining metrics

Baseline_score function

In [2]:
def baseline_score(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice)

Evaluation Metric

In [3]:
def feval(prediction, dtrain):
    
    prediction = prediction.astype(int)
    target = dtrain.get_label()

    simulatedPrice = dtrain.get_weight()
    
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), True

<hr>

## Building our dataset
This notebook makes this step cleaner than the previous versions. So It'll be tidier and shorter than before!

In [4]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [5]:
# Changing our time signatures
process_time(orders)

<hr>

## Feature building

In [6]:
df = dataset_builder(orders, items)

**Tobias's transaction features**

In [7]:
multiIndex = pd.MultiIndex.from_product([items['itemID'], range(13, -1, -1)], names=['itemID', 'group_backwards'])
aux = pd.DataFrame(index=multiIndex)

orders['transactions'] = 1
item_transactions = orders.groupby(['itemID', 'group_backwards']).agg(item_transactions=('transactions','sum'))

# Adding this multiindex
item_transactions = pd.merge(aux, item_transactions, left_on=['itemID', 'group_backwards'], right_on=['itemID', 'group_backwards'], how='left')

item_transactions = item_transactions.groupby(['itemID'])[['item_transactions']].shift(1)
df = pd.merge(df, item_transactions, left_on=['group_backwards', 'itemID'], right_on=['group_backwards', 'itemID'], how='left')

# I could have done this so much easier than this way...
# ... but I won't spend time refatoring it.
singleIndex = pd.MultiIndex.from_product([range(13, -1, -1)], names=['group_backwards'])
last_week_index = pd.DataFrame(index=singleIndex)

last_week_transactions = orders.groupby(['group_backwards']).agg(last_week_transactions=('transactions','sum'))
last_week_transactions = pd.merge(last_week_index, last_week_transactions, left_on=['group_backwards'], right_on=['group_backwards'], how='left')

last_week_transactions = last_week_transactions.shift(1).fillna(0)

df = pd.merge(df, last_week_transactions, left_on=['group_backwards'], right_on=['group_backwards'], how='left', validate='m:1')

df = df.fillna(0)
df['transactions_feature'] = df['item_transactions'] + df['last_week_transactions']

**Adding 'is_new'**

In [8]:
# This cell adds a feature responsible for indicating if in the current week
# a given item has its first appearance.
orders_sorted_by_week = orders.sort_values('group_backwards', ascending=False)
weeks_grouped_by_items = orders_sorted_by_week.groupby('itemID', as_index=False)
items_first_appearance = weeks_grouped_by_items.first()[['itemID', 'group_backwards']]

items_first_appearance.rename(columns={'group_backwards':'first_appearance'}, inplace=True)

# We'll need to change this code a little bit to use this feature to submission.
# These lines take for granted that if an item didn't sell any units in the entire
# dataset, It MUST be sold on week 0...
allSales = df.groupby('itemID', as_index=False).agg({'orderSum':'sum'})
notSoldItems = allSales.loc[allSales['orderSum'] == 0]
notSoldItems['first_appearance'] = 0
items_first_appearance = pd.concat([items_first_appearance, notSoldItems.drop(columns=['orderSum'])])

df['is_new'] = 0

df = pd.merge(df, items_first_appearance, left_on=['itemID'], right_on=['itemID'], how='left', validate='m:1')

df.loc[df['first_appearance'] == df['group_backwards'], 'is_new'] = 1

df.drop(columns=['first_appearance'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notSoldItems['first_appearance'] = 0


In [34]:
for i in range(13, -1, -1):
    print(df.query('group_backwards == @i')['is_new'].sum())

1431
729
371
533
785
909
716
661
785
671
794
727
728
623


In [44]:
shifting.query('group_backwards == 13')['transactions_feature']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
10458    0.0
10459    0.0
10460    0.0
10461    0.0
10462    0.0
Name: transactions_feature, Length: 10463, dtype: float64

**How many new items by category and manufacturer (Tobias's contribuition)**

In [10]:
new_manufacturers = df.groupby(['group_backwards', 'manufacturer']).is_new.sum().reset_index().rename(columns={'is_new':'new_manufacturers'})
df = pd.merge(df, new_manufacturers, left_on=['group_backwards', 'manufacturer'], right_on=['group_backwards', 'manufacturer'], how='left', validate='m:1')

In [11]:
new_cat3 = df.groupby(['group_backwards', 'category3']).is_new.sum().reset_index().rename(columns={'is_new':'new_cat3'})
df = pd.merge(df, new_cat3, left_on=['group_backwards', 'category3'], right_on=['group_backwards', 'category3'], how='left', validate='m:1')

In [12]:
new_cat2 = df.groupby(['group_backwards', 'category2']).is_new.sum().reset_index().rename(columns={'is_new':'new_cat2'})
df = pd.merge(df, new_cat2, left_on=['group_backwards', 'category2'], right_on=['group_backwards', 'category2'], how='left', validate='m:1')

In [13]:
new_cat1 = df.groupby(['group_backwards', 'category1']).is_new.sum().reset_index().rename(columns={'is_new':'new_cat1'})
df = pd.merge(df, new_cat1, left_on=['group_backwards', 'category1'], right_on=['group_backwards', 'category1'], how='left', validate='m:1')

**Cumulative sale by category**

In [14]:
# percentage_accum_cat_3 feature...
df = cumulative_sale_by_category(df, category='category3')

**Time Encoding**

In [15]:
# Encoding our weeks as a series of sines and cosines...
# This function will consider our period as a semester in a year,
# so we can try other types of time encoding later!
df = time_encoder(df, 'group_backwards', 26)

**Lags and diffs**

In [16]:
# This cell lags and diffs our feature 'orderSum'
shifting = df.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)

    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    

**Rolling window "orderSum"**

In [17]:
%%time
# This cell creates rolling-window features based on 'orderSum' in our dataset!
item_group = shifting.groupby(["itemID", "group_backwards"]).agg({'orderSum':'sum'})

# We'll .shift(-1) because it sorts our "group_backwards", 
# so doing .shift(1) would cause a HUGE dataleak.
aux_shifting = item_group.groupby('itemID')[['orderSum']].shift(-1)

aux_shifting.sort_values(['itemID', 'group_backwards'], ascending=[True, False], inplace=True)

for i in range(3):
    rolled_window = aux_shifting.groupby(['itemID'], as_index=False)[['orderSum']].rolling(2 ** i).mean()
    rolled_window.rename(columns={'orderSum':f"orderSum_mean_rolled_{i}"}, inplace=True)
    shifting = pd.merge(shifting, rolled_window, left_on=['itemID', 'group_backwards'], right_on=['itemID', 'group_backwards'])

CPU times: user 36.6 s, sys: 317 ms, total: 36.9 s
Wall time: 37 s


In [18]:
# LGBM Says on docs that it automatically handles zero values as NaN,
# so we'll keep this standard...
shifting.fillna(0, inplace=True)

<hr>

## Let's take a moment to apreciate this beautiful dataset, shall we?

In [19]:

shifting

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,...,orderSum_diff_1,orderSum_2,orderSum_diff_2,orderSum_3,orderSum_diff_3,orderSum_4,orderSum_diff_4,orderSum_mean_rolled_0,orderSum_mean_rolled_1,orderSum_mean_rolled_2
0,13,1,0.0,0,1,4.38,1,1,1,8.84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
1,13,2,0.0,0,2,3.00,1,2,1,16.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
2,13,3,1.0,0,3,5.00,1,3,1,15.89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
3,13,4,0.0,0,2,4.44,1,2,1,40.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
4,13,5,2.0,0,2,2.33,1,1,1,17.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146477,0,10459,0.0,180,253,0.00,8,44,8,56.57,...,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,0.25
146478,0,10460,0.0,0,253,0.00,8,44,8,163.81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
146479,0,10461,0.0,0,253,0.00,8,44,8,128.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00
146480,0,10462,0.0,180,253,0.00,8,44,8,166.97,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.00


<hr>

## Dataset Splitting (Train until week 3 / Val. week 2/ Test week 1)
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

In [20]:
train = shifting.loc[shifting.group_backwards >= 3]
val = shifting.loc[shifting.group_backwards == 2]
test = shifting.loc[shifting.group_backwards == 1]

weights = infos.set_index('itemID')['simulationPrice'].to_dict()

w_train = train['itemID'].map(weights)
w_val = val['itemID'].map(weights)

In [21]:
# I recommend to the other members of the team keeping the
# datatypes of our datasets as Pandas DataFrames instead of Numpy,
# since It will easier to use Boosting Analysis frameworks
y_train = train['orderSum']
y_val = val['orderSum']
X_train = train.drop(columns=["orderSum"])
X_val = val.drop(columns=["orderSum"])

In [46]:
X_train.columns

Index(['group_backwards', 'itemID', 'brand', 'manufacturer', 'customerRating',
       'category1', 'category2', 'category3', 'recommendedRetailPrice',
       'item_transactions', 'last_week_transactions', 'transactions_feature',
       'is_new', 'new_manufacturers', 'new_cat3', 'new_cat2', 'new_cat1',
       'percentage_accum_category3', 'group_backwards_sin',
       'group_backwards_cos', 'orderSum_1', 'orderSum_diff_1', 'orderSum_2',
       'orderSum_diff_2', 'orderSum_3', 'orderSum_diff_3', 'orderSum_4',
       'orderSum_diff_4', 'orderSum_mean_rolled_0', 'orderSum_mean_rolled_1',
       'orderSum_mean_rolled_2'],
      dtype='object')

In [47]:
params = {
#           "objective" : "poisson",
          "objective" : "l1",
          "metric" :"rmse",
          "learning_rate" : 0.6,
          'verbosity': 1,
          'max_depth': 6,
          "min_data_in_leaf":2000,
         }

lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train, categorical_feature=[2, 4, 5, 6, 7, 12])
lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val, categorical_feature=[2, 4, 5, 6, 7, 12])

num_round = 1000
model = lgb.train(params,
                  lgbtrain,
                  num_round,
                  valid_sets = [lgbtrain, lgbvalid], 
                  verbose_eval=5,
                  early_stopping_rounds=5,
#                   fobj=objective,
                  feval=feval,
                 )



Training until validation scores don't improve for 5 rounds
[5]	training's rmse: 39.2624	training's feval: 6.15797e+06	valid_1's rmse: 44.2417	valid_1's feval: 489828
[10]	training's rmse: 38.9401	training's feval: 7.10896e+06	valid_1's rmse: 43.9315	valid_1's feval: 564675
Early stopping, best iteration is:
[9]	training's rmse: 38.9401	training's feval: 7.11256e+06	valid_1's rmse: 43.9311	valid_1's feval: 565159


In [32]:
X_train.columns[list(reversed(model.feature_importance().argsort()))]

Index(['category2', 'brand', 'new_manufacturers', 'percentage_accum_category3',
       'is_new', 'group_backwards', 'orderSum_2', 'item_transactions',
       'new_cat2', 'orderSum_diff_2', 'orderSum_diff_1', 'new_cat1',
       'orderSum_mean_rolled_1', 'customerRating', 'orderSum_1',
       'manufacturer', 'itemID', 'orderSum_diff_4', 'category3',
       'recommendedRetailPrice', 'orderSum_mean_rolled_2',
       'transactions_feature', 'orderSum_diff_3', 'orderSum_3', 'new_cat3',
       'orderSum_4', 'category1', 'group_backwards_sin',
       'orderSum_mean_rolled_0', 'group_backwards_cos',
       'last_week_transactions'],
      dtype='object')

**New items model with validation**

In [26]:
# Taking the first appearance of each item...
first_fortnight_item = orders.sort_values("group_backwards",
                                     ascending=False)\
                          .groupby(["itemID"])["group_backwards"].first()
first_fortnight_item = first_fortnight_item.reset_index()

In [36]:
train = shifting.loc[shifting.group_backwards >= 3]
val = shifting.loc[shifting.group_backwards == 2]
test = shifting.loc[shifting.group_backwards == 1]


new_items_train = pd.merge(first_fortnight_item, train, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")
new_items_val = pd.merge(first_fortnight_item, val, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")
new_items_test = pd.merge(first_fortnight_item, test, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")

# Check we didn't make mistakes...
assert len(new_items_train) + len(new_items_val) + len(new_items_test) == len(first_fortnight_item)
assert len(first_fortnight_item.query("group_backwards >= 3")) == len(new_items_train)
assert len(first_fortnight_item.query("group_backwards == 2")) == len(new_items_val)
assert len(first_fortnight_item.query("group_backwards == 1")) == len(new_items_test)

len(new_items_train), len(new_items_val), len(new_items_test)

(8385, 727, 728)

In [28]:
w_new_items_train = new_items_train['itemID'].map(weights)
w_new_items_val = new_items_val['itemID'].map(weights)

In [29]:
y_new_items_train = new_items_train['orderSum']
y_new_items_val = new_items_val['orderSum']
y_new_items_test = new_items_test['orderSum']
# Maybe other features don't make sense
X_new_items_train = new_items_train.drop(columns=["orderSum", "itemID", "is_new"])
X_new_items_val = new_items_val.drop(columns=["orderSum", "itemID", "is_new"])
X_new_items_test = new_items_test.drop(columns=["orderSum", "itemID", "is_new"])
# Make sure to change the categorical features if you drop more cols
cat_feats = [1, 2, 4, 5, 6]

In [30]:
params2 = {
#           "objective" : "poisson",
          #"objective" : "l1",
          "objective" : "l2", # L2 works MUCH BETTER than L1
          "metric" :"rmse",
          #"learning_rate" : 0.5,
          'verbosity': 1,
          'max_depth': 5,
          #'num_leaves': 32,
#           "min_data_in_leaf":2500,
         }
lgbtrain2 = lgb.Dataset(X_new_items_train, label=y_new_items_train, weight=w_new_items_train, 
                        categorical_feature=cat_feats)
lgbvalid2 = lgb.Dataset(X_new_items_val, label=y_new_items_val, weight=w_new_items_val, 
                        categorical_feature=cat_feats)

num_round = 1000
model_new_items = lgb.train(params2,
                  lgbtrain2,
                  num_round,
                  valid_sets = [lgbtrain2, lgbvalid2], 
                  verbose_eval=5,
                  early_stopping_rounds=5,
#                   fobj=objective,
                  feval=feval,
                 )

Training until validation scores don't improve for 5 rounds
[5]	training's rmse: 75.7426	training's feval: 6.83009e+06	valid_1's rmse: 95.9824	valid_1's feval: 815903
[10]	training's rmse: 69.3136	training's feval: 1.03007e+07	valid_1's rmse: 89.9562	valid_1's feval: 1.00066e+06
[15]	training's rmse: 65.1485	training's feval: 1.2268e+07	valid_1's rmse: 87.6193	valid_1's feval: 1.07888e+06
[20]	training's rmse: 63.1203	training's feval: 1.32957e+07	valid_1's rmse: 86.3373	valid_1's feval: 1.12503e+06
Early stopping, best iteration is:
[18]	training's rmse: 63.9206	training's feval: 1.28882e+07	valid_1's rmse: 86.0919	valid_1's feval: 1.11692e+06


<hr>

## <span style='color:red'>Submission training</span>
Here'll (finally) train with our entire dataset to submit our data.

In [None]:
train = shifting.loc[shifting.group_backwards >= 1]
test = shifting.loc[shifting.group_backwards == 0]

weights = infos.set_index('itemID')['simulationPrice'].to_dict()

w_train = train['itemID'].map(weights)

In [None]:
# I recommend to the other members of the team keeping the
# datatypes of our datasets as Pandas DataFrames instead of Numpy,
# since It will easier to use Boosting Analysis frameworks
y_train = train['orderSum']
X_train = train.drop(columns=["orderSum"])

In [None]:
X_train.columns

In [None]:
params = {
#           "objective" : "poisson",
          "objective" : "l1",
          "metric" :"rmse",
          "learning_rate" : 0.6,
          'verbosity': 1,
          'max_depth': 6,
          'num_leaves': 32,
          "min_data_in_leaf":2000,
         }

lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train, categorical_feature=[2, 3, 5, 6, 7, 9])

model = lgb.train(params,
                  lgbtrain,
                  model.best_iteration,
                  valid_sets = [lgbtrain], 
                  valid_names = ['train'],
                  verbose_eval=5,
                  early_stopping_rounds=5,
#                   fobj=objective,
                  feval=feval,
                  
                 )

**New items model without validation for week 0**

In [None]:
# Taking the first appearance of each item...
first_fortnight_item = orders.sort_values("group_backwards",
                                     ascending=False)\
                          .groupby(["itemID"])["group_backwards"].first()
first_fortnight_item = first_fortnight_item.reset_index()

In [None]:
new_items_train = pd.merge(first_fortnight_item, train, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")

"""
THIS IS WHERE WE INSERT OUR TEST LEAK.
Note that this change in the pipeline is necessary due to the fact that
now we're submitting our results!!!
"""
all_items_orders = shifting.groupby('itemID').agg({'orderSum':'sum'})
items_not_sold = all_items_orders.loc[all_items_orders['orderSum'] == 0].copy()
items_not_sold['group_backwards'] = 0
items_not_sold = items_not_sold.reset_index().drop(columns=['orderSum'])
new_items_test = pd.merge(items_not_sold, test, on=["itemID", "group_backwards"],
                  how="inner", validate="1:1")

# Check we didn't make mistakes...
assert len(new_items_train) == len(first_fortnight_item) # Changed this sanity check...
assert len(first_fortnight_item.query("group_backwards >= 1")) == len(new_items_train)
assert len(items_not_sold) == len(new_items_test) # Changed this sanity check...

len(new_items_train), len(new_items_test)

In [None]:
w_new_items_train = new_items_train['itemID'].map(weights)

In [None]:
y_new_items_train = new_items_train['orderSum']
y_new_items_test = new_items_test['orderSum']
# Maybe other features don't make sense
X_new_items_train = new_items_train.drop(columns=["orderSum", "itemID", "is_new"])
X_new_items_test = new_items_test.drop(columns=["orderSum", "itemID", "is_new"])

# Make sure to change the categorical features if you drop more cols
cat_feats = [1, 2, 4, 5, 6]

In [None]:
X_new_items_train.columns

In [None]:
params2 = {
#           "objective" : "poisson",
          #"objective" : "l1",
          "objective" : "l2", # L2 works MUCH BETTER than L1
          "metric" :"rmse",
          #"learning_rate" : 0.5,
          'verbosity': 1,
          'max_depth': 5,
          #'num_leaves': 32,
#           "min_data_in_leaf":2500,
         }
lgbtrain2 = lgb.Dataset(X_new_items_train, label=y_new_items_train, weight=w_new_items_train, 
                        categorical_feature=cat_feats)

model_new_items = lgb.train(params2,
                  lgbtrain2,
                  model_new_items.best_iteration,
                  valid_sets = [lgbtrain2], 
                  verbose_eval=5,
                  early_stopping_rounds=5,
#                   fobj=objective,
                  feval=feval,
                 )

<hr>

### Utilities

**Predicting "common" items at test time**

In [32]:
test.columns == train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [37]:
y_test = test['orderSum']
X_test = test.drop(columns=["orderSum"])
final_predictions = model.predict(X_test)

LightGBMError: The number of features in data (28) is not the same as it was in training data (27).

**Predicting new items at test time**

In [None]:
final_new_items_predictions = model_new_items.predict(X_new_items_test)

**Mergin' predictions**

In [None]:
assert test["itemID"].is_monotonic_increasing
new_items_idx = X_test["itemID"].isin(first_fortnight_item.query('group_backwards == 1')['itemID'])
final_predictions[new_items_idx] = final_new_items_predictions
final_predictions[final_predictions < 0] = 0

In [None]:
baseline_score(final_predictions, y_test.values, infos['simulationPrice']) / 10 ** 6

**Items that are new in the test week**

In [None]:
final_predictions[new_items_idx] = 0
final_predictions[final_predictions < 0] = 0
baseline_score(final_predictions, y_test.values, infos['simulationPrice']) / 10 ** 6