In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
from catboost import CatBoostRegressor, Pool, MetricVisualizer
from sasaki_features import add_feature_position_month
from datetime import datetime

!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
sys.path.append("../dora/models/")

from utils import  promo_detector, promotionAggregation

sys.path.append("../main")
from utils import read_data, process_time, merge_data

In [3]:
infos, items, orders = read_data("../main/datasets/")
print(orders.columns)

Index(['time', 'transactID', 'itemID', 'order', 'salesPrice'], dtype='object')


### adding created features

In [4]:
process_time(orders)
orders = promo_detector(orders)

In [5]:
print(orders.columns)

Index(['time', 'transactID', 'itemID', 'order', 'salesPrice', 'days',
       'days_backwards', 'week_backwards', 'group_backwards', 'salesPriceMode',
       'promotion'],
      dtype='object')


# trying catboost consider only groups (pair of weeks)

In [6]:
order_g = promotionAggregation(orders, items)

### adding rows with zero sales

In [7]:
# Getting the IDs that were never sold
not_sold_items = items[np.logical_not(
    items.itemID.isin(sorted(orders['itemID'].unique())))]

new_rows = []
weeks_database = orders['group_backwards'].unique()


#features with same value for pair ('group', 'itemID')
from_example=['category1','category2','category3',
              'customerRating','recommendedRetailPrice',
              'manufacturer','brand']
    
for idd in order_g['itemID'].unique():
    orders_id = order_g[order_g.itemID == idd]
    example = orders_id.iloc[0]

    # finding weeks without itemID sales
    weeks_id = orders_id['group_backwards'].unique()
    weeks_without_id = np.setdiff1d(weeks_database, weeks_id)

    # creating new row
    for w in weeks_without_id:
        
        
        row = {'itemID': idd,
                         'group_backwards': w,
                         'salesPrice_mean': 0,
                         'orderSum': 0,
                         'promotion_mean': 0
              }
        for f in from_example:
            row[f] = example[f]
        
        new_rows.append(row)
#  Adding rows in every week with the IDs of the
# items that were never sold.

order_g = order_g.append(new_rows)  
order_g = order_g.sort_values(['group_backwards', 'itemID'], ascending=[False, True], ignore_index=True)

In [8]:
order_g = add_feature_position_month(order_g)

In [9]:
display(order_g)

Unnamed: 0,group_backwards,itemID,orderSum,promotion_mean,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,week_backwards,posM_f_group,posM_m_group,posM_l_group
0,13,1,0,0.0000,0.000000,0.0,1.0,4.38,1.0,1.0,1.0,8.84,25,1,8,12
1,13,2,0,0.0000,0.000000,0.0,2.0,3.00,1.0,2.0,1.0,16.92,25,1,8,12
2,13,3,1,0.0000,14.040000,0.0,3.0,5.00,1.0,3.0,1.0,15.89,25,1,8,12
3,13,4,0,0.0000,0.000000,0.0,2.0,4.44,1.0,2.0,1.0,40.17,25,1,8,12
4,13,5,2,0.0000,7.840000,0.0,2.0,2.33,1.0,1.0,1.0,17.04,25,1,8,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127915,1,10450,34,0.1875,53.555625,182.0,227.0,0.00,8.0,44.0,8.0,36.78,1,16,23,29
127916,1,10459,0,0.0000,0.000000,180.0,253.0,0.00,8.0,44.0,8.0,56.57,1,16,23,29
127917,1,10460,0,0.0000,0.000000,0.0,253.0,0.00,8.0,44.0,8.0,163.81,1,16,23,29
127918,1,10462,0,0.0000,0.000000,180.0,253.0,0.00,8.0,44.0,8.0,166.97,1,16,23,29


## Adding feature shifting
Compare to the dora model, I added salesPrice to the shift

In [10]:
order_g2 = order_g.copy()

for i in range(1, 3):
    # Carrying the data of weeks t-1
    order_g2[f'orderSum_{i}'] = order_g2.groupby('itemID')['orderSum'].shift(i)
    order_g2[f'promotion_mean_{i}'] = order_g2.groupby('itemID')['promotion_mean'].shift(i)
    order_g2[f'salesPrice_mean_{i}'] = order_g2.groupby('itemID')['salesPrice_mean'].shift(i)
    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    order_g2[f'orderSum_diff_{i}'] = order_g2.groupby('itemID')[f'orderSum_{i}'].diff()
    order_g2[f'promotion_mean_diff_{i}'] = order_g2.groupby('itemID')[f'promotion_mean_{i}'].diff()
    order_g2[f'salesPrice_mean_diff_{i}'] = order_g2.groupby('itemID')[f'salesPrice_mean_{i}'].diff()

order_g2.fillna(np.inf)

Unnamed: 0,group_backwards,itemID,orderSum,promotion_mean,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,...,salesPrice_mean_1,orderSum_diff_1,promotion_mean_diff_1,salesPrice_mean_diff_1,orderSum_2,promotion_mean_2,salesPrice_mean_2,orderSum_diff_2,promotion_mean_diff_2,salesPrice_mean_diff_2
0,13,1,0,0.0000,0.000000,0.0,1.0,4.38,1.0,1.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
1,13,2,0,0.0000,0.000000,0.0,2.0,3.00,1.0,2.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
2,13,3,1,0.0000,14.040000,0.0,3.0,5.00,1.0,3.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
3,13,4,0,0.0000,0.000000,0.0,2.0,4.44,1.0,2.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
4,13,5,2,0.0000,7.840000,0.0,2.0,2.33,1.0,1.0,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127915,1,10450,34,0.1875,53.555625,182.0,227.0,0.00,8.0,44.0,...,55.65,122.0,0.0,55.65,0.0,0.0,0.00,0.0,0.0,0.00
127916,1,10459,0,0.0000,0.000000,180.0,253.0,0.00,8.0,44.0,...,0.00,0.0,0.0,0.00,0.0,0.0,0.00,-1.0,0.0,-14.71
127917,1,10460,0,0.0000,0.000000,0.0,253.0,0.00,8.0,44.0,...,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00
127918,1,10462,0,0.0000,0.000000,180.0,253.0,0.00,8.0,44.0,...,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00


In [11]:
#categorical features
cat_features = ['brand','manufacturer','category1','category2','category3']

#passing to integer
for f in cat_features:
    order_g2[f] = order_g2[f].map(lambda x : int(x))  

## Data splitting

In [12]:
test = order_g2.loc[order_g2.group_backwards == 1]
val = order_g2.loc[order_g2.group_backwards == 2]
train = order_g2.loc[order_g2.group_backwards >= 3]


In [13]:
train_pool = Pool(
    data= train.drop(columns=["orderSum"]), 
    label= train['orderSum'], 
    weight= train['salesPrice_mean'],
    cat_features= cat_features
)

validation_pool = Pool(
    data= val.drop(columns=["orderSum"]), 
    label= val['orderSum'], 
    weight= val['salesPrice_mean'],
    cat_features= cat_features
)

## defining custom metric

In [14]:
class custom_obj(object):
    def __iter__(self):
        return iter('custom')
    
    def get_final_error(self, error, weight):
    
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for prediction,t,w in zip(approx, target, weight):
            
            weight_sum += w
            
            error_sum += -1* (prediction - (np.maximum(prediction - t, 0) * 1.6))  * w

        return error_sum, weight_sum
    def calc_ders_range(self, approxes, targets, weights):
        pred = np.array(approxes)
        target = np.array(targets)
        weight = np.array(weights)
        
        
        der1 = -2 *weight* (pred - (np.maximum(pred - target, 0) * 1.6)) * (1 - (pred > target) * 1.6)
        der2 = -2 *weight* (1 - (pred > target) * 1.6) ** 2

        return list(zip(der1,der2))

In [15]:
params1 = {'iterations': 50, 
         'loss_function':'RMSE',
         'use_best_model': True,
         'early_stopping_rounds': 50,
         'eval_metric':custom_obj(),
         'subsample':1,
}

params2 = {'iterations': 50, 
         'loss_function':'RMSE',
         'use_best_model': True,
         'early_stopping_rounds': 50,
}


model1=CatBoostRegressor(**params1) 
model2=CatBoostRegressor(**params2) 

model1.fit(train_pool,eval_set=validation_pool , verbose=False);
model2.fit(train_pool,eval_set=validation_pool , verbose=False);

## Predicing
Obs: i swapped negative predictive values to zero

In [16]:
test_pool = Pool(test.drop(columns=["orderSum"]),
                 weight= test['salesPrice_mean'],
                 cat_features= cat_features) 


preds = model1.predict(test_pool)

#all prediction need to be positive and integer
sold_items = test.copy()
preds = [max(x,0) for x in preds ]
sold_items['demandPrediction'] = preds
sold_items = sold_items[["itemID", "demandPrediction"]]


#items never sold will have 0 demandPrediction
not_sold_items = items[np.logical_not(
    items.itemID.isin(sorted(orders['itemID'].unique())))]
not_sold_items['demandPrediction'] = [0 for _ in range(len(not_sold_items))]
not_sold_items = not_sold_items[["itemID", "demandPrediction"]]

#to kagle csv
final = pd.concat([sold_items, not_sold_items])
final["demandPrediction"] = final["demandPrediction"].astype(np.uint8)
final = final.sort_values(['itemID'],  ignore_index=True)
final.to_csv("kaggle_df_cat1.csv", index=False, sep='|')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [17]:
test_pool = Pool(test.drop(columns=["orderSum"]),
                 weight= test['salesPrice_mean'],
                 cat_features= cat_features) 


preds = model2.predict(test_pool)

#all prediction need to be positive and integer
sold_items = test.copy()
preds = [max(x,0) for x in preds ]
sold_items['demandPrediction'] = preds
sold_items = sold_items[["itemID", "demandPrediction"]]


#items never sold will have 0 demandPrediction
not_sold_items = items[np.logical_not(
    items.itemID.isin(sorted(orders['itemID'].unique())))]
not_sold_items['demandPrediction'] = [0 for _ in range(len(not_sold_items))]
not_sold_items = not_sold_items[["itemID", "demandPrediction"]]

#to kagle csv
final = pd.concat([sold_items, not_sold_items])
final["demandPrediction"] = final["demandPrediction"].astype(np.uint8)
final = final.sort_values(['itemID'],  ignore_index=True)
final.to_csv("kaggle_df_cat2.csv", index=False, sep='|')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### saving model

In [18]:
now = datetime.now().strftime("%d-%m-%Y-%Hh%Mm%Ss")
model1.save_model('cat1-' + now)
model2.save_model('cat2-' + now)

### plotting feature importance

In [19]:
np.array(model1.get_feature_importance(prettified=True))

array([['category3', 17.675753163693084],
       ['category2', 14.954454708209228],
       ['customerRating', 9.333770392495946],
       ['promotion_mean', 8.971775289039083],
       ['recommendedRetailPrice', 7.811629465784488],
       ['orderSum_diff_1', 7.59115601577602],
       ['itemID', 7.044104988465939],
       ['orderSum_1', 3.752390286269896],
       ['manufacturer', 3.5083258439692546],
       ['category1', 3.4749137779875556],
       ['orderSum_diff_2', 3.4135696196955196],
       ['orderSum_2', 2.5493555894012467],
       ['posM_l_group', 2.3084323799265674],
       ['posM_m_group', 1.8570315981304548],
       ['salesPrice_mean', 1.2125081476474733],
       ['salesPrice_mean_diff_1', 0.9232355557980024],
       ['group_backwards', 0.692582323761849],
       ['salesPrice_mean_2', 0.6649116222957184],
       ['promotion_mean_1', 0.6033701830251585],
       ['posM_f_group', 0.5493675347373135],
       ['week_backwards', 0.5337742572156035],
       ['salesPrice_mean_diff_2', 0

In [20]:
np.array(model2.get_feature_importance(prettified=True))

array([['category2', 20.165313374860947],
       ['itemID', 17.483120559335855],
       ['promotion_mean', 11.67388898279162],
       ['recommendedRetailPrice', 8.518730012398644],
       ['category1', 5.796060718759691],
       ['category3', 5.263377845137978],
       ['customerRating', 4.900464984106118],
       ['salesPrice_mean_diff_1', 3.2614200700143083],
       ['posM_l_group', 2.92565966838384],
       ['manufacturer', 2.713054618526134],
       ['orderSum_diff_1', 2.1159433548402147],
       ['orderSum_2', 2.042110011156362],
       ['salesPrice_mean_1', 1.9202041255122198],
       ['orderSum_1', 1.7024205974433264],
       ['salesPrice_mean', 1.418591188972481],
       ['salesPrice_mean_2', 1.1944520365503408],
       ['week_backwards', 1.1456681370955246],
       ['brand', 0.9820806314398197],
       ['posM_f_group', 0.9512481636078166],
       ['promotion_mean_1', 0.7906354338336332],
       ['salesPrice_mean_diff_2', 0.6401292016596352],
       ['orderSum_diff_2', 0.623019

there is suport for sharpley value, i will try later

In [21]:
##searching for best hyperparameters, descobrir sobre Bagging temperature e L2 regularization
#depth= [4, 7, 10]
#learning_rate= [None, 0.3, 0.7]
#bagging_temperature=[0, 1, 10]
#L2_regularization=[1,5,10]
#
#params_model = []
#for b in bagging_temperature:
#    for r in L2_regularization:
#        for l in learning_rate:
#            for d in depth:
#                params_model.append({'iterations': 1000, 
#                                    'loss_function':'RMSE',
#                                    'use_best_model': True,
#                                    'early_stopping_rounds': 50,
#                                    'bagging_temperature': b,
#                                    'learning_rate': l,
#                                    'depth': d,
#                                    'l2_leaf_reg': r
#                })
#
#
#bestModel = None
#bestRMSE = 10000
#
#
#
#for param in tqdm(params_model):
#    
#    model = CatBoostRegressor(**param)
#    newModel = model.fit(train_pool, eval_set=validation_pool, verbose=False);
#    
#    
#    RMSE = newModel.best_score_['validation']['RMSE']
#    if RMSE < bestRMSE:
#        bestRMSE = RMSE
#        bestParam = param
#        bestModel = newModel
#                    
#            

In [22]:
#print(f"best score: {bestRMSE}")
#print(param)