# LGBM - Accumulated Sales of Category 3

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime
import seaborn as sns

#from sasaki_features import add_feature_position_month
sys.path.append("../dora/models")
from utils import read_data, process_time, merge_data, dataset_builder, cumulative_sale_by_category

NUMBER_OF_LAGS = 4

In [2]:
from sasaki_features import add_feature_position_month

## Defining metrics

Baseline_score function

In [3]:
def baseline_score(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice)

Evaluation Metric

In [4]:
def feval(prediction, dtrain):
    
    prediction = prediction.astype(int)
    target = dtrain.get_label()

    simulatedPrice = dtrain.get_weight()
    
    return 'feval', np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), True

Objective Metric

In [5]:
def gradient(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight()
    return -2 * (predt - np.maximum(predt - y, 0) * 1.6) * (1 - (predt > y) * 1.6) * sp

def hessian(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight() 
    return -2 * ((1 - (predt > y) * 1.6) ** 2) * sp

def objective(predt, dtrain):
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

<hr>

## Building our dataset
This notebook makes this step cleaner than the previous versions. So It'll be tidier and shorter than before!

In [6]:
infos, items, orders = read_data("../main/datasets/")
process_time(orders)

orders_columns = set(orders.columns)
print(orders_columns)

{'itemID', 'salesPrice', 'group_backwards', 'days', 'time', 'order', 'transactID', 'days_backwards'}


In [7]:
df = dataset_builder(orders, items)

In [8]:
orders2_columns = set(df.columns)
print(orders2_columns - orders_columns)

{'orderSum', 'customerRating', 'recommendedRetailPrice', 'category3', 'manufacturer', 'category2', 'brand', 'category1'}


In [9]:
#apply function without information from futures
def apply_to_serie(data, function,extraParans={}):
    
    new_data = pd.DataFrame()
 
    for time in data['group_backwards'].unique():
        new_rows = function(data,time,**extraParans)
        new_data = pd.concat([new_data, new_rows])
        

    return new_data

In [10]:
def add_feature_freq(data,time):
    
    #a linha de baixo utiliza o orders sem row com orderm 0
    orders_aux = orders.query(f"group_backwards >  {time}")#remember, its backwards
    
    nDays = orders_aux['days'].nunique()
    #nWeek = orders_aux['week_backwards'].nunique()
    nGroup = orders_aux['group_backwards'].nunique()

    newInfo = items[['itemID']].copy()
    newInfo["group_backwards"] = time
    
    #how many days in average the item is sold in day/week/pair
    newInfo['freq_day'] = orders_aux.groupby('itemID', as_index=False)['days'].nunique()/nDays
    #newInfo['freq_week'] = orders_aux.groupby('itemID', as_index=False)['week_backwards'].nunique()/nWeek
    newInfo['freq_group'] = orders_aux.groupby('itemID', as_index=False)['group_backwards'].nunique()/nGroup
    
    current_time = data.query(f"group_backwards == {time}")
    return pd.merge(current_time,newInfo, on=['itemID','group_backwards'], how="left", validate="m:1")

df2 = apply_to_serie(df,  add_feature_freq)

In [11]:
def add_feature_min_max(data,time):
    orders_aux = data.query(f'group_backwards > {time}')

    newInfo = items[['itemID']].copy()
    newInfo['group_backwards'] = time
    
    #minimun and maximum sales in a pair
    #too much zeros, trying for last 4 pairs
    newInfo['min_sale'] = orders_aux.groupby(['itemID'])['orderSum'].min()
    newInfo['max_sale'] = orders_aux.groupby(['itemID'])['orderSum'].max()


    #minimun and maximum sales in a group recent
    order_recent = data.query(f'group_backwards > {time} & group_backwards < {time+4}')
    newInfo['min_sale_rec'] = order_recent.groupby(['itemID'])['orderSum'].min()
    newInfo['max_sale_rec'] = order_recent.groupby(['itemID'])['orderSum'].max()

    current_time = data.query(f"group_backwards == {time}")
    return pd.merge(current_time,newInfo, on=['itemID','group_backwards'], how="left", validate="m:1")

df2 = apply_to_serie(df2,  add_feature_min_max)

In [12]:
df2 = add_feature_position_month(df2)

In [13]:
# percentage_accum_cat_3 feature...
df3 = cumulative_sale_by_category(df2)

In [14]:
orders3_columns = set(df3.columns)
print(orders3_columns - orders2_columns)

{'percentage_accum_cat_3', 'max_sale', 'posM_f_group', 'min_sale_rec', 'posM_l_group', 'freq_day', 'max_sale_rec', 'min_sale', 'posM_m_group', 'freq_group'}


## Feature building

In [15]:
# This cell lags and diffs our feature 'orderSum'
shifting = df3.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)
    shifting[f'percentage_accum_cat_3_{i}'] = shifting.groupby('itemID')['percentage_accum_cat_3'].shift(i)

    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    shifting[f'percentage_accum_cat_3_{i}'] = shifting.groupby('itemID')[f'percentage_accum_cat_3_{i}'].diff()
    
# LGBM Says on docs that it automatically handles zero values as NaN
shifting.fillna(-1, inplace=True)

In [16]:
#finding first apperance
not_zero_order = shifting.query('orderSum != 0')
first_appearance = not_zero_order.groupby('itemID',as_index=False)['group_backwards'].max()#remenber backwards
first_appearance.columns = ['itemID','first_appearance']

shifting2 = pd.merge(shifting, first_appearance, on="itemID",how="left", validate="m:1")

#putting in relation with the current timestamp
#positive means that the itemID was never sold
#negative means that the itemID was already sold
shifting2['first_appearance'] = shifting2['group_backwards'] - shifting2['first_appearance'] 

#removing dataleak
func = lambda x : np.nan if x >= 0 else x
shifting2['first_appearance'] = shifting2['first_appearance'].apply(func)

In [17]:
orders4_columns = set(shifting2.columns)
print(orders4_columns - orders3_columns)

{'percentage_accum_cat_3_1', 'orderSum_4', 'percentage_accum_cat_3_3', 'orderSum_diff_1', 'first_appearance', 'orderSum_2', 'orderSum_3', 'percentage_accum_cat_3_2', 'percentage_accum_cat_3_4', 'orderSum_diff_3', 'orderSum_1', 'orderSum_diff_2', 'orderSum_diff_4'}


<hr>

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [18]:
worst_possible_prediction = shifting.loc[shifting.group_backwards < 13]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 13]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 13]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 90.29706562119341


<hr>

## Dataset Splitting
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

### Utilities

**Predicting at test time**

In [19]:
params = {
#           "objective" : "poisson",
          "objective" : "l1",
          "metric" :"rmse",
          "learning_rate" : 0.5,
          'verbosity': 1,
          'max_depth': 6,
          'num_leaves': 32,
          "min_data_in_leaf":3000,
         }

In [20]:
def get_result(data, not_drop_columns):
    new_features = list(set(data.columns) - orders2_columns)
    for colum in not_drop_columns :
        new_features.remove(colum)
    
    data = data.drop(columns=new_features)
    train = data.loc[data.group_backwards >= 3]
    train_full = data.loc[data.group_backwards >= 2]
    val = data.loc[data.group_backwards == 2]
    test = data.loc[data.group_backwards == 1]

    weights = infos.set_index('itemID')['simulationPrice'].to_dict()

    w_train = train['itemID'].map(weights)
    w_val = val['itemID'].map(weights)
    w_train_full = train_full['itemID'].map(weights)


    y_train = train['orderSum']
    y_train_full = train_full['orderSum']
    y_val = val['orderSum']
    y_test = test['orderSum']
        
    X_train = train.drop(columns=["orderSum"])
    X_train_full = train_full.drop(columns=["orderSum"])
    X_val = val.drop(columns=["orderSum"])
    X_test = test.drop(columns=["orderSum"])


    lgbtrain = lgb.Dataset(X_train, label = y_train, weight=w_train)
    lgbtrain_full = lgb.Dataset(X_train_full, label = y_train_full, weight=w_train_full)
    lgbvalid = lgb.Dataset(X_val, label = y_val, weight=w_val)

    
    num_round = 1000
    model = lgb.train(params,
                  lgbtrain,
                  num_round,
                  valid_sets = [lgbtrain, lgbvalid], 
                  verbose_eval=0,
                  early_stopping_rounds=5,
#                   fobj=objective,
                  feval=feval,)
    
    index_best_score = model.best_iteration
    
    final_predictions = model.predict(X_test)
    final_predictions[final_predictions < 0] = 0
    
    
    scores = []
    scores.append(model.best_score['training']['feval'])#score train 1
    scores.append(model.best_score['valid_1']['feval'])#score validation 1
    #score test 1
    scores.append(baseline_score(final_predictions, y_test.values, infos['simulationPrice']))
    
    #retraining with pair-week 2
    
    model2 = lgb.train(params,
                lgbtrain_full,
                num_boost_round = index_best_score,
                valid_sets = [lgbtrain_full], 
                verbose_eval=0,
                early_stopping_rounds=5,
                feval=feval,)
    
    
    final_predictions2 = model2.predict(X_test)
    final_predictions2[final_predictions2 < 0] = 0
    
    scores.append(model2.best_score['training']['feval'])#score train 2
    #score test 2
    scores.append(baseline_score(final_predictions2, y_test.values, infos['simulationPrice']))
        
    return scores

In [21]:
print(get_result(shifting2,[]))#no new features

[0.0, 0.0, 0.0, 0.0, 0.0]


In [22]:
print(set(shifting2.columns) - orders2_columns)

{'percentage_accum_cat_3', 'max_sale', 'percentage_accum_cat_3_1', 'orderSum_4', 'posM_l_group', 'orderSum_1', 'posM_m_group', 'posM_f_group', 'first_appearance', 'percentage_accum_cat_3_2', 'orderSum_2', 'orderSum_3', 'freq_day', 'max_sale_rec', 'min_sale', 'orderSum_diff_4', 'freq_group', 'percentage_accum_cat_3_3', 'orderSum_diff_1', 'min_sale_rec', 'percentage_accum_cat_3_4', 'orderSum_diff_3', 'orderSum_diff_2'}


In [23]:
base = get_result(shifting2,[])#no new features
diff = lambda x: [x - b for x,b in zip(x,base)]

print(f"valor  base: {base};")
print("utilizando a diferença da base para os valores abaixo")
print(f"score treino1, score validacao1, score test1, score treino2, score validacao2")

print("todas features,posicao do mes, first_appearance")
print(diff(get_result(shifting2, list(set(shifting2.columns) - orders2_columns))))#all new features
print(diff(get_result(shifting2, ['posM_f_group', 'posM_m_group','posM_l_group'])))#position in month
print(diff(get_result(shifting2, ['first_appearance'])))#'first_appearance'
print(f"")

print("frequencia, min max e min_rec max_rec")
print(diff(get_result(shifting2, ['freq_day','freq_group'])))#frequency
print(diff(get_result(shifting2, ['min_sale','max_sale','max_sale_rec','min_sale_rec'])))#min max
print(f"")

print(" min_rec max_rec, ordermSum e orderSum_diff")
print(diff(get_result(shifting2, ['max_sale_rec','min_sale_rec'])))#min max recent
print(diff(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'orderSum_diff_1','orderSum_diff_2','orderSum_diff_3'])))
print(f"")

print("ordermSum , orderSum_diff")
print(diff(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3'])))
print(diff(get_result(shifting2, ['orderSum_diff_1','orderSum_diff_2','orderSum_diff_3'])))

print(f"")
print(diff(get_result(shifting2, ['percentage_accum_cat_3_3','percentage_accum_cat_3_4','percentage_accum_cat_3_1','percentage_accum_cat_3_2',
                            'percentage_accum_cat_3'])))


valor  base: [0.0, 0.0, 0.0, 0.0, 0.0];
utilizando a diferença da base para os valores abaixo
score treino1, score validacao1, score test1, score treino2, score validacao2
todas features,posicao do mes, first_appearance
[357270.3263937055, 58272.07592918872, 42995.587999999996, 341099.4682347, 33248.91]
[0.0, 0.0, 0.0, 0.0, 0.0]
[58697.59612010716, 9862.858035534618, 9868.356, 124565.86794940231, 15997.854]

frequencia, min max e min_rec max_rec
[0.0, 0.0, 0.0, 0.0, 0.0]
[64207.7359539628, 14810.4919773221, 14176.787999999999, 121310.03404651879, 24534.143999999997]

 min_rec max_rec, ordermSum e orderSum_diff
[37194.06799402236, 9325.247991836071, 9350.673999999999, 51771.89793430566, 8623.189999999999]
[287543.92409093975, 52451.536004149915, 52080.12599999999, 235186.23024313443, 34563.268]

ordermSum , orderSum_diff
[287543.92409093975, 52451.536004149915, 52080.12599999999, 235186.23024313443, 34563.268]
[149613.57214532493, 25416.067952919006, 24432.782, 184355.11032218335, 20863

In [24]:
base = get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3'])
diff = lambda x: [x - b for x,b in zip(x,base)]

print(f"score treino1, score validacao1, score test1, score treino2, score validacao2\n")

print("posicao mes, first_appearance")
print(diff(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'posM_f_group', 'posM_m_group','posM_l_group'])))#position in month

print(diff(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'first_appearance'])))#'first_appearance'
print("")

print("freq, min max e min_rec max_rec")
print(diff(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'freq_day','freq_group'])))#frequency

print(diff(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'min_sale','max_sale','max_sale_rec','min_sale_rec'])))#min max
print("")

print("min e max, percentage_accum_cat")
print(diff(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'max_sale_rec','min_sale_rec'])))#min max recent


print(diff(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                             'percentage_accum_cat_3_3','percentage_accum_cat_3_4','percentage_accum_cat_3_1','percentage_accum_cat_3_2',
                            'percentage_accum_cat_3'])))

score treino1, score validacao1, score test1, score treino2, score validacao2

posicao mes, first_appearance
[0.0, 0.0, 0.0, 0.0, 0.0]
[143605.99178530579, 20531.571890294545, 17336.812000000005, 268229.11404116155, 32896.648]

freq, min max e min_rec max_rec
[0.0, 0.0, 0.0, 0.0, 0.0]
[102413.33003123396, 20008.875943148123, 18250.678, 164240.94596729288, 28888.906000000003]

min e max, percentage_accum_cat
[63016.17797181604, 4924.413992333408, 2117.6800000000076, 90969.07602602249, 18914.789999999994]
[123161.35074998136, 19445.964060932405, 14948.198000000004, 146258.08160477283, 4499.356]


### features min e max tem um aumento pequeno no score, pode ajudar um pouco ou ser ruido
### o mesmo pode se aplicar a posicao do mes

In [25]:

print('\n soh semana anterior')
print(get_result(shifting2, ['orderSum_diff_1','orderSum_1']))
print(get_result(shifting2, ['orderSum_1']))
print('\n duas anteriores')
print(get_result(shifting2, ['orderSum_diff_1','orderSum_1','orderSum_diff_2','orderSum_2']))
print(get_result(shifting2, ['orderSum_1','orderSum_2']))
print('\n tres semanas anteriores')
print(get_result(shifting2, ['orderSum_diff_1','orderSum_1','orderSum_diff_2','orderSum_2','orderSum_diff_3','orderSum_3']))
print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3']))


 soh semana anterior
[221927.53613772988, 37969.266003286844, 37655.801999999996, 338420.19624308345, 49896.12599999999]
[267898.7642508089, 56259.806106841555, 53490.168000000005, 264910.3982194542, 37059.882]

 duas anteriores
[287543.92409093975, 52451.536004149915, 52080.12599999999, 235186.23024313443, 34563.268]
[352349.25370877975, 67551.53769187331, 62382.77399999999, 378279.92024835345, 54696.498]

 tres semanas anteriores
[287543.92409093975, 52451.536004149915, 52080.12599999999, 235186.23024313443, 34563.268]
[287543.92409093975, 52451.536004149915, 52080.12599999999, 235186.23024313443, 34563.268]


### Usando features que parecem impactar para modelo final

In [27]:
print(get_result(shifting2, ['orderSum_1','orderSum_2','orderSum_3',
                                  'first_appearance',
                             'min_sale','max_sale','max_sale_rec','min_sale_rec',
                                'percentage_accum_cat_3_3','percentage_accum_cat_3_4','percentage_accum_cat_3_1','percentage_accum_cat_3_2',
                            'percentage_accum_cat_3']))#position in month

[330224.7365670383, 54993.51602548361, 43107.009999999995, 406210.5981041312, 44837.06]
