# LGBM - Accumulated Sales of Category 3

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime
import seaborn as sns

#from sasaki_features import add_feature_position_month
sys.path.append("../dora/models")
from utils import read_data, process_time, merge_data, dataset_builder, cumulative_sale_by_category

NUMBER_OF_LAGS = 4

In [2]:
from sasaki_features import add_feature_position_month

## Defining metrics

Baseline_score function

In [3]:
def baseline_score(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice)

Evaluation Metric

In [4]:
def feval(prediction, dtrain):
    
    prediction = prediction.astype(int)
    target = dtrain.get_label()

    simulatedPrice = dtrain.get_weight()
    
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), True

Objective Metric

In [5]:
def gradient(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight()
    return -2 * (predt - np.maximum(predt - y, 0) * 1.6) * (1 - (predt > y) * 1.6) * sp

def hessian(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight() 
    return -2 * ((1 - (predt > y) * 1.6) ** 2) * sp

def objective(predt, dtrain):
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

<hr>

## Building our dataset
This notebook makes this step cleaner than the previous versions. So It'll be tidier and shorter than before!

In [6]:
infos, items, orders = read_data("../main/datasets/")
process_time(orders)

orders_columns = set(orders.columns)
print(orders_columns)

{'days', 'salesPrice', 'transactID', 'days_backwards', 'itemID', 'order', 'time', 'group_backwards'}


In [7]:
df = dataset_builder(orders, items)

In [8]:
orders2_columns = set(df.columns)
print(orders2_columns - orders_columns)

{'category3', 'category2', 'customerRating', 'manufacturer', 'category1', 'orderSum', 'recommendedRetailPrice', 'brand'}


In [9]:
#apply function without information from futures
def apply_to_serie(data, function,extraParans={}):
    
    new_data = pd.DataFrame()
 
    for time in data['group_backwards'].unique():
        new_rows = function(data,time,**extraParans)
        new_data = pd.concat([new_data, new_rows])
        

    return new_data

In [10]:
def add_feature_freq(data,time):
    
    #a linha de baixo utiliza o orders sem row com orderm 0
    orders_aux = orders.query(f"group_backwards >  {time}")#remember, its backwards
    
    nDays = orders_aux['days'].nunique()
    #nWeek = orders_aux['week_backwards'].nunique()
    nGroup = orders_aux['group_backwards'].nunique()

    newInfo = items[['itemID']].copy()
    newInfo["group_backwards"] = time
    
    #how many days in average the item is sold in day/week/pair
    newInfo['freq_day'] = orders_aux.groupby('itemID', as_index=False)['days'].nunique()/nDays
    #newInfo['freq_week'] = orders_aux.groupby('itemID', as_index=False)['week_backwards'].nunique()/nWeek
    newInfo['freq_group'] = orders_aux.groupby('itemID', as_index=False)['group_backwards'].nunique()/nGroup
    
    current_time = data.query(f"group_backwards == {time}")
    return pd.merge(current_time,newInfo, on=['itemID','group_backwards'], how="left", validate="m:1")

df2 = apply_to_serie(df,  add_feature_freq)

In [11]:
def add_feature_min_max(data,time):
    orders_aux = data.query(f'group_backwards > {time}')

    newInfo = items[['itemID']].copy()
    newInfo['group_backwards'] = time
    
    #minimun and maximum sales in a pair
    #too much zeros, trying for last 4 pairs
    newInfo['min_sale'] = orders_aux.groupby(['itemID'])['orderSum'].min()
    newInfo['max_sale'] = orders_aux.groupby(['itemID'])['orderSum'].max()


    #minimun and maximum sales in a group recent
    order_recent = data.query(f'group_backwards > {time} & group_backwards < {time+4}')
    newInfo['min_sale_rec'] = order_recent.groupby(['itemID'])['orderSum'].min()
    newInfo['max_sale_rec'] = order_recent.groupby(['itemID'])['orderSum'].max()

    current_time = data.query(f"group_backwards == {time}")
    return pd.merge(current_time,newInfo, on=['itemID','group_backwards'], how="left", validate="m:1")

df2 = apply_to_serie(df2,  add_feature_min_max)

In [12]:
df2 = add_feature_position_month(df2)

In [13]:
# percentage_accum_cat_3 feature...
df3 = cumulative_sale_by_category(df2)

In [14]:
orders3_columns = set(df3.columns)
print(orders3_columns - orders2_columns)

{'max_sale', 'min_sale_rec', 'posM_l_group', 'min_sale', 'freq_group', 'max_sale_rec', 'freq_day', 'percentage_accum_cat_3', 'posM_m_group', 'posM_f_group'}


## Feature building

In [15]:
# This cell lags and diffs our feature 'orderSum'
shifting = df3.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)
    shifting[f'percentage_accum_cat_3_{i}'] = shifting.groupby('itemID')['percentage_accum_cat_3'].shift(i)

    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    shifting[f'percentage_accum_cat_3_{i}'] = shifting.groupby('itemID')[f'percentage_accum_cat_3_{i}'].diff()
    
# LGBM Says on docs that it automatically handles zero values as NaN
shifting.fillna(-1, inplace=True)

In [16]:
#finding first apperance
not_zero_order = shifting.query('orderSum != 0')
first_appearance = not_zero_order.groupby('itemID',as_index=False)['group_backwards'].max()#remenber backwards
first_appearance.columns = ['itemID','first_appearance']

shifting2 = pd.merge(shifting, first_appearance, on="itemID",how="left", validate="m:1")

#putting in relation with the current timestamp
#positive means that the itemID was never sold
#negative means that the itemID was already sold
shifting2['first_appearance'] = shifting2['group_backwards'] - shifting2['first_appearance'] 

#removing dataleak
func = lambda x : np.nan if x >= 0 else x
shifting2['first_appearance'] = shifting2['first_appearance'].apply(func)

In [17]:
orders4_columns = set(shifting2.columns)
print(orders4_columns - orders3_columns)

{'orderSum_3', 'orderSum_1', 'percentage_accum_cat_3_2', 'orderSum_diff_3', 'orderSum_diff_2', 'orderSum_2', 'percentage_accum_cat_3_4', 'first_appearance', 'orderSum_diff_1', 'orderSum_diff_4', 'orderSum_4', 'percentage_accum_cat_3_1', 'percentage_accum_cat_3_3'}


<hr>

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [18]:
worst_possible_prediction = shifting.loc[shifting.group_backwards < 13]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 13]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 13]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 90.29706562119341


<hr>

## Dataset Splitting
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

### Utilities

**Predicting at test time**

In [19]:
params = {
#           "objective" : "poisson",
          "objective" : "l1",
          "metric" :"rmse",
          "learning_rate" : 0.1,
          'verbosity': 1,
          'max_depth': 6,
          'num_leaves': 15,
          "min_data_in_leaf":2000,
         }

In [56]:
def get_result(data, not_drop_columns):
    new_features = list(set(data.columns) - orders2_columns)
    for colum in not_drop_columns :
        new_features.remove(colum)
    
    data = data.drop(columns=new_features)
    train = data.loc[data.group_backwards >= 3]
    val = data.loc[data.group_backwards == 2]
    test = data.loc[data.group_backwards == 1]

    weights = infos.set_index('itemID')['simulationPrice'].to_dict()

    w_train = train['itemID'].map(weights)
    w_val = val['itemID'].map(weights)


    y_train = train['orderSum']
    y_val = val['orderSum']
    X_train = train.drop(columns=["orderSum"])
    X_val = val.drop(columns=["orderSum"])


    lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train)
    lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val)

    num_round = 1000
    model = lgb.train(params,
                  lgbtrain,
                  num_round,
                  valid_sets = [lgbtrain, lgbvalid], 
                  verbose_eval=0,
                  early_stopping_rounds=5,
#                   fobj=objective,
                  feval=feval,)


    y_test = test['orderSum']
    X_test = test.drop(columns=["orderSum"])
    final_predictions = model.predict(X_test)

    final_predictions[final_predictions < 0] = 0

    return baseline_score(final_predictions, y_test.values, infos['simulationPrice'])

In [57]:
print(set(shifting2.columns) - orders2_columns)

{'posM_l_group', 'orderSum_diff_2', 'first_appearance', 'percentage_accum_cat_3_3', 'posM_f_group', 'max_sale', 'orderSum_3', 'orderSum_1', 'freq_group', 'freq_day', 'posM_m_group', 'min_sale_rec', 'min_sale', 'orderSum_diff_3', 'orderSum_2', 'percentage_accum_cat_3_4', 'orderSum_diff_1', 'orderSum_diff_4', 'orderSum_4', 'max_sale_rec', 'percentage_accum_cat_3_2', 'percentage_accum_cat_3', 'percentage_accum_cat_3_1'}


In [61]:
print(get_result(shifting2,[]))#no new features

print(get_result(shifting2, list(set(shifting2.columns) - orders2_columns)))#all new features

print(get_result(shifting2, ['posM_f_group', 'posM_m_group','posM_l_group']))#position in month

print(get_result(shifting2, ['first_appearance']))#'first_appearance'

print(get_result(shifting2, ['freq_day','freq_group']))#frequency

print(get_result(shifting2, ['min_sale','max_sale','max_sale_rec','min_sale_rec']))#min max

print(get_result(shifting2, ['max_sale_rec','min_sale_rec']))#min max recent

print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'orderSum_diff_1','orderSum_diff_2','orderSum_diff_3']))

print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3']))

print(get_result(shifting2, ['orderSum_diff_1','orderSum_diff_2','orderSum_diff_3']))

print(get_result(shifting2, ['percentage_accum_cat_3_3','percentage_accum_cat_3_4','percentage_accum_cat_3_1','percentage_accum_cat_3_2',
                            'percentage_accum_cat_3']))
#nenhuma outra feature ajuda sem o orderm das semanas anteriores

0.0
8350.74
0.0
0.0
0.0
0.0
0.0
15465.743999999999
32055.995999999996
9855.214
0.0


In [62]:

print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'posM_f_group', 'posM_m_group','posM_l_group']))#position in month

print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'first_appearance']))#'first_appearance'

print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'freq_day','freq_group']))#frequency

print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'min_sale','max_sale','max_sale_rec','min_sale_rec']))#min max

print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'max_sale_rec','min_sale_rec']))#min max recent


print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'percentage_accum_cat_3_3','percentage_accum_cat_3_4','percentage_accum_cat_3_1','percentage_accum_cat_3_2',
                            'percentage_accum_cat_3']))

47221.562000000005
15736.068
32055.995999999996
47109.98199999999
38018.914
30938.184


In [63]:

print('\n soh semana anterior')
print(get_result(shifting2, ['orderSum_diff_1','orderSum_1']))
print(get_result(shifting2, ['orderSum_1']))
print('\n duas anteriores')
print(get_result(shifting2, ['orderSum_diff_1','orderSum_1','orderSum_diff_2','orderSum_2']))
print(get_result(shifting2, ['orderSum_1','orderSum_2']))
print('\n tres semanas anteriores')
print(get_result(shifting2, ['orderSum_diff_1','orderSum_1','orderSum_diff_2','orderSum_2','orderSum_diff_3','orderSum_3']))
print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3']))


 soh semana anterior
24359.47
17609.922

 duas anteriores
20960.606
39649.575999999994

 tres semanas anteriores
15465.743999999999
32055.995999999996
