In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
from catboost import CatBoostRegressor, Pool, MetricVisualizer
from sasaki_features import add_feature_position_month
from datetime import datetime

!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
sys.path.append("../dora/models/")

from utils import  promo_detector, promotionAggregation

sys.path.append("../main")
from utils import read_data, process_time, merge_data

In [3]:
infos, items, orders = read_data("../main/datasets/")
print(orders.columns)

Index(['time', 'transactID', 'itemID', 'order', 'salesPrice'], dtype='object')


### adding created features

In [4]:
process_time(orders)
orders = promo_detector(orders)

In [5]:
print(orders.columns)

Index(['time', 'transactID', 'itemID', 'order', 'salesPrice', 'days',
       'days_backwards', 'week_backwards', 'group_backwards', 'salesPriceMode',
       'promotion'],
      dtype='object')


# trying catboost consider only groups (pair of weeks)

In [6]:
order_g = promotionAggregation(orders, items)
order_g = add_feature_position_month(order_g)

In [7]:
display(order_g)

Unnamed: 0,group_backwards,itemID,orderSum,promotion_mean,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,week_backwards,posM_f_group,posM_m_group,posM_l_group
0,1,1,3,0.000000,3.430000,0,1,4.38,1,1,1,8.84,1,16,23,29
1,1,3,140,0.000000,14.040000,0,3,5.00,1,3,1,15.89,1,16,23,29
2,1,4,145,0.000000,14.100000,0,2,4.44,1,2,1,40.17,1,16,23,29
3,1,5,1,1.000000,7.480000,0,2,2.33,1,1,1,17.04,1,16,23,29
4,1,7,1,0.000000,34.390000,0,3,4.00,1,3,1,26.40,1,16,23,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39510,13,9887,12,0.000000,1397.550000,178,225,4.50,8,44,8,1155.00,25,1,8,12
39511,13,9938,20,0.000000,1451.670000,178,225,0.00,8,44,8,671.79,25,1,8,12
39512,13,9986,2,0.000000,12.020000,187,234,0.00,8,44,8,45.54,25,1,8,12
39513,13,9999,7,0.000000,59.890000,182,227,5.00,8,44,8,38.57,25,1,8,12


### adding rows with zero sales

In [8]:
# Getting the IDs that were never sold
not_sold_items = items[np.logical_not(
    items.itemID.isin(sorted(orders['itemID'].unique())))]

new_rows = []
weeks_database = orders['group_backwards'].unique()


#features with same value for pair ('group', 'itemID')
from_example=['category1','category2','category3',
              'customerRating','recommendedRetailPrice',
              'manufacturer','brand']
    
for idd in order_g['itemID'].unique():
    orders_id = order_g[order_g.itemID == idd]
    example = orders_id.iloc[0]

    # finding weeks without itemID sales
    weeks_id = orders_id['group_backwards'].unique()
    weeks_without_id = np.setdiff1d(weeks_database, weeks_id)

    # creating new row
    for w in weeks_without_id:
        
        
        row = {'itemID': idd,
                         'group_backwards': w,
                         'salesPrice_mean': 0,
                         'orderSum': 0,
                         'promotion_mean': 0
              }
        for f in from_example:
            row[f] = example[f]
        
        new_rows.append(row)
#  Adding rows in every week with the IDs of the
# items that were never sold.

order_g = order_g.append(new_rows)  
order_g = order_g.sort_values(['group_backwards', 'itemID'], ascending=[False, True], ignore_index=True)

## Adding feature shifting
Compare to the dora model, I added salesPrice to the shift

In [9]:
order_g2 = order_g.copy()

for i in range(1, 3):
    # Carrying the data of weeks t-1
    order_g2[f'orderSum_{i}'] = order_g2.groupby('itemID')['orderSum'].shift(i)
    order_g2[f'promotion_mean_{i}'] = order_g2.groupby('itemID')['promotion_mean'].shift(i)
    order_g2[f'salesPrice_mean_{i}'] = order_g2.groupby('itemID')['salesPrice_mean'].shift(i)
    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    order_g2[f'orderSum_diff_{i}'] = order_g2.groupby('itemID')[f'orderSum_{i}'].diff()
    order_g2[f'promotion_mean_diff_{i}'] = order_g2.groupby('itemID')[f'promotion_mean_{i}'].diff()
    order_g2[f'salesPrice_mean_diff_{i}'] = order_g2.groupby('itemID')[f'salesPrice_mean_{i}'].diff()

#perguntar pro joao sobre o valor do fillna
order_g2.fillna(np.inf)

Unnamed: 0,group_backwards,itemID,orderSum,promotion_mean,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,...,salesPrice_mean_1,orderSum_diff_1,promotion_mean_diff_1,salesPrice_mean_diff_1,orderSum_2,promotion_mean_2,salesPrice_mean_2,orderSum_diff_2,promotion_mean_diff_2,salesPrice_mean_diff_2
0,13,1,0,0.0000,0.000000,0.0,1.0,4.38,1.0,1.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
1,13,2,0,0.0000,0.000000,0.0,2.0,3.00,1.0,2.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
2,13,3,1,0.0000,14.040000,0.0,3.0,5.00,1.0,3.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
3,13,4,0,0.0000,0.000000,0.0,2.0,4.44,1.0,2.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
4,13,5,2,0.0000,7.840000,0.0,2.0,2.33,1.0,1.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127915,1,10450,34,0.1875,53.555625,182.0,227.0,0.00,8.0,44.0,...,55.65,122.0,0.0,55.65,0.0,0.0,0.00,0.0,0.0,0.00
127916,1,10459,0,0.0000,0.000000,180.0,253.0,0.00,8.0,44.0,...,0.00,0.0,0.0,0.00,0.0,0.0,0.00,-1.0,0.0,-14.71
127917,1,10460,0,0.0000,0.000000,0.0,253.0,0.00,8.0,44.0,...,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00
127918,1,10462,0,0.0000,0.000000,180.0,253.0,0.00,8.0,44.0,...,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00


In [10]:
#categorical features
cat_features = ['brand','manufacturer','category1','category2','category3']

#passing to integer
for f in cat_features:
    order_g2[f] = order_g2[f].map(lambda x : int(x))  

## Data splitting

In [11]:
test = order_g2.loc[order_g2.group_backwards == 1]
val = order_g2.loc[order_g2.group_backwards == 2]
train = order_g2.loc[order_g2.group_backwards >= 3]


In [12]:
train_pool = Pool(
    data= train.drop(columns=["orderSum"]), 
    label= train['orderSum'], 
    cat_features= cat_features
)

validation_pool = Pool(
    data= val.drop(columns=["orderSum"]), 
    label= val['orderSum'], 
    cat_features= cat_features
)

## training and grid seach 
i dont know how to plot the train, if someone know please edit the code

In [None]:
model = CatBoostRegressor(iterations=1000, loss_function='RMSE',
                          use_best_model=True,
                          early_stopping_rounds=100)

bestModel = None
bestRMSE = 10000

#searching for best hyperparameters, descobrir sobre Bagging temperature e L2 regularization
depth= [4, 6, 8, 10]
learning_rate= [0.1,0.3, 0.7,None]

#grid search for best parameters
for d in tqdm(depth):
    for l in learning_rate:
        
        newModel = model.fit(train_pool, eval_set=validation_pool, verbose=False);
        RMSE = newModel.best_score_['validation']['RMSE']
        print(RMSE)
        if RMSE < bestRMSE:
            bestRMSE = RMSE
            bestModel = newModel
            

  0%|          | 0/4 [00:00<?, ?it/s]

89.4398748731065
89.4398748731065
89.4398748731065


 25%|██▌       | 1/4 [01:01<03:04, 61.57s/it]

89.4398748731065
89.4398748731065
89.4398748731065
89.4398748731065


 50%|█████     | 2/4 [02:04<02:03, 61.84s/it]

89.4398748731065
89.4398748731065
89.4398748731065
89.4398748731065


 75%|███████▌  | 3/4 [03:10<01:03, 63.11s/it]

89.4398748731065
89.4398748731065
89.4398748731065


In [None]:
print(f"best score: {bestRMSE}")
print(bestModel.get_all_params()['depth'])
print(bestModel.get_all_params()['learning_rate'])

## Predicing
Obs: i swapped negative predictive values to zero

In [None]:
test_pool = Pool(test.drop(columns=["orderSum"]),
                 cat_features= cat_features) 

preds = model.predict(test_pool)

#all prediction need to be positive andd integer
sold_items = test.copy()
preds = [max(x,0) for x in preds ]
sold_items["demandPrediction"] = np.rint(preds)
sold_items = sold_items[["itemID", "demandPrediction"]]


#items never sold will have 0 demandPrediction
not_sold_items = items[np.logical_not(
    items.itemID.isin(sorted(orders['itemID'].unique())))]
not_sold_items['demandPrediction'] = [0 for _ in range(len(not_sold_items))]
not_sold_items = not_sold_items[["itemID", "demandPrediction"]]

#to kagle csv
final = pd.concat([sold_items, not_sold_items])
final = final.sort_values(['itemID'],  ignore_index=True)
final.to_csv("kaggle_df.csv", index=False)

### saving model

In [None]:
now = datetime.now().strftime("%d-%m-%Y-%Hh%Mm%Ss")
modelName = 'cat-' + now
model.save_model(modelName)

### plotting feature importance

In [None]:
np.array(model.get_feature_importance(prettified=True))

there is suport for sharpley value, i will try later