In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
from catboost import CatBoostRegressor, Pool, MetricVisualizer
from sasaki_features import add_feature_position_month
from datetime import datetime

sys.path.append("../dora/models")
from utils import read_data, process_time, merge_data, promotionAggregation

#TENTAR FEATURES NOS MODELOS DO JOAO

## some functions from dora

### (por algum motivo eu nao consegui importar)

In [2]:
def baseline_score(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)

    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice)

## importing orders and applying already made features

In [3]:
infos, items, orders = read_data("../main/datasets/")
process_time(orders)

orders_columns = set(orders.columns)
print(orders_columns)

{'days', 'salesPrice', 'time', 'order', 'itemID', 'days_backwards', 'group_backwards', 'week_backwards', 'transactID'}


## Adding zero ordemSum rows

In [8]:
# Aggregating our data by pairs...
df = orders.groupby(['group_backwards', 'itemID'], as_index=False).agg({'order':'sum'}).rename(columns={'order':'orderSum'})

# Building our dataset through multiindexing...
multiIndex = pd.MultiIndex.from_product([range(13, 0, -1), items['itemID']], names=['group_backwards', 'itemID'])
aux = pd.DataFrame(index=multiIndex)
df = pd.merge(aux, df, left_on=['group_backwards', 'itemID'], right_on=['group_backwards', 'itemID'], how='left')
df.fillna(0, inplace = True)

# Gettin' informations about our items in our dataset...
orders2 = pd.merge(df, items, left_on=['itemID'], right_on=['itemID']).sort_values('group_backwards', ascending=False)

orders2_columns = set(orders2.columns)
print(orders2_columns - orders_columns)

{'category2', 'brand', 'category1', 'category3', 'manufacturer', 'customerRating', 'orderSum', 'recommendedRetailPrice'}


In [46]:
print(len(orders2))

136019


In [6]:
valid_ids = set( orders['itemID'].unique() )
all_ids = set(items['itemID'].unique())

## New features 1

In [60]:
orders4 = add_feature_position_month(orders2)

In [61]:
print(len(orders4))

136019


In [62]:
#apply function without information from futures
def apply_to_serie(data, function,extraParans={}):
    
    new_data = pd.DataFrame()
 
    for time in data['group_backwards'].unique():
        new_rows = function(data,time,**extraParans)
        new_data = pd.concat([new_data, new_rows])
        

    return new_data

In [63]:
def add_feature_freq(data,time):
    orders_aux = orders.query(f"group_backwards >  {time}")#remember, its backwards
    
    nDays = orders_aux['days'].nunique()
    nWeek = orders_aux['week_backwards'].nunique()
    nGroup = orders_aux['group_backwards'].nunique()

    newInfo = items[['itemID']].copy()
    newInfo["group_backwards"] = time
    
    #how many days in average the item is sold in day/week/pair
    newInfo['freq_day'] = orders_aux.groupby('itemID', as_index=False)['days'].nunique()/nDays
    newInfo['freq_week'] = orders_aux.groupby('itemID', as_index=False)['week_backwards'].nunique()/nWeek
    newInfo['freq_group'] = orders_aux.groupby('itemID', as_index=False)['group_backwards'].nunique()/nGroup
    
    current_time = data.query(f"group_backwards == {time}")
    return pd.merge(current_time,newInfo, on=['itemID','group_backwards'], how="left", validate="m:1")

orders4 = apply_to_serie(orders4,  add_feature_freq)

In [64]:
print(len(orders4))

136019


In [65]:
#ordenando features categoricas usando como metrica 
#vendas totais medias entre itemID de mesma categoria
def add_feature_ord_cat(data, time, category):
    orders_aux = data.query(f"group_backwards > {time}")#remember, its backwards

    newInfo = orders_aux.groupby(category,as_index=False).agg({'orderSum' : ['sum'],'itemID' : ['count']})
    newInfo["group_backwards"] = time
    

    #calculando a metrica para cada item diferente da coluna
    newInfo[f'avg_sales_{category}'] = newInfo[( 'orderSum',   'sum')] / newInfo[( 'itemID', 'count')]
    newInfo = newInfo[[category,"group_backwards", f'avg_sales_{category}']]
    newInfo.columns = [category,"group_backwards", f'avg_sales_{category}']
    
    
    current_time = data.query(f"group_backwards == {time}")
    return pd.merge(current_time,newInfo, on=[category,'group_backwards'], how="left", validate="m:1")


orders4 = apply_to_serie(orders4,  add_feature_ord_cat, extraParans={"category": "category3"})
orders4 = apply_to_serie(orders4,  add_feature_ord_cat, extraParans={"category": "brand"})
orders4 = apply_to_serie(orders4,  add_feature_ord_cat, extraParans={"category": "manufacturer"})

In [66]:
print(len(orders4))

136019


In [67]:
def add_feature_min_max(data,time):
    orders_aux = data.query(f'group_backwards > {time}')

    newInfo = items[['itemID']].copy()
    newInfo['group_backwards'] = time
    
    #minimun and maximum sales in a pair
    #too much zeros, trying for last 4 pairs
    newInfo['min_sale'] = orders_aux.groupby(['itemID'])['orderSum'].min()
    newInfo['max_sale'] = orders_aux.groupby(['itemID'])['orderSum'].max()


    #minimun and maximum sales in a group recent
    order_recent = data.query(f'group_backwards > {time} & group_backwards < {time+4}')
    newInfo['min_sale_rec'] = order_recent.groupby(['itemID'])['orderSum'].min()
    newInfo['max_sale_rec'] = order_recent.groupby(['itemID'])['orderSum'].max()

    current_time = data.query(f"group_backwards == {time}")
    return pd.merge(current_time,newInfo, on=['itemID','group_backwards'], how="left", validate="m:1")

orders4 = apply_to_serie(orders4,  add_feature_min_max)

In [68]:
print(len(orders4))

136019


In [69]:
ola = orders4[orders4.group_backwards != 13]
ola = ola[ola.min_sale.isnull()]
print(ola)

      group_backwards  itemID  orderSum  brand  manufacturer  customerRating  \
38                 12       1       2.0      0             1            4.38   
1345               11       1     313.0      0             1            4.38   
8053               10       1      35.0      0             1            4.38   
9339                9       1       3.0      0             1            4.38   
189                 8       1       1.0      0             1            4.38   
9739                7       1       1.0      0             1            4.38   
9712                6       1       2.0      0             1            4.38   
6784                5       1     299.0      0             1            4.38   
1290                4       1       3.0      0             1            4.38   
9220                3       1      31.0      0             1            4.38   
2790                2       1       0.0      0             1            4.38   
189                 1       1       3.0 

### add feature first appearance

In [70]:
#finding first apperance
not_zero_order = orders4.query('orderSum != 0')
first_appearance = not_zero_order.groupby('itemID',as_index=False)['group_backwards'].max()#remenber backwards
first_appearance.columns = ['itemID','first_appearance']

orders4 = pd.merge(orders4, first_appearance, on="itemID",how="left", validate="m:1")

#putting in relation with the current timestamp
#positive means that the itemID was never sold
#negative means that the itemID was already sold
orders4['first_appearance'] = orders4['group_backwards'] - orders4['first_appearance'] 

#removing dataleak
func = lambda x : np.nan if x >= 0 else x
orders4['first_appearance'] = orders4['first_appearance'].apply(func)


In [72]:
orders4_columns = set(orders4.columns)
print(orders4_columns - orders2_columns)

{'freq_day', 'avg_sales_manufacturer', 'min_sale_rec', 'posM_f_group', 'first_appearance', 'max_sale', 'max_sale_rec', 'posM_l_group', 'freq_group', 'avg_sales_brand', 'freq_week', 'avg_sales_category3', 'min_sale', 'posM_m_group'}


In [73]:
#freq feature has lots of nan; just checking if there is a error
print(len(first_appearance[first_appearance.first_appearance == 1]) * 13 + \
len(first_appearance[first_appearance.first_appearance == 2]) * 12 + \
len(first_appearance[first_appearance.first_appearance == 3]) * 11 + \
len(first_appearance[first_appearance.first_appearance == 4]) * 10 + \
len(first_appearance[first_appearance.first_appearance == 5]) * 9 + \
len(first_appearance[first_appearance.first_appearance == 6]) * 8 + \
len(first_appearance[first_appearance.first_appearance == 7]) * 7 + \
len(first_appearance[first_appearance.first_appearance == 8]) * 6 + \
len(first_appearance[first_appearance.first_appearance == 9]) * 5 + \
len(first_appearance[first_appearance.first_appearance == 10]) * 4 + \
len(first_appearance[first_appearance.first_appearance == 11]) * 3 + \
len(first_appearance[first_appearance.first_appearance == 12]) * 2 + \
len(first_appearance[first_appearance.first_appearance == 13]) * 1 + \
len(all_ids - valid_ids) * 13 )

display(orders4.freq_day.isnull().sum())

74609


74609

## Shift
### added the feature salesPrice_mean_ from older pairs

In [76]:
orders5 = orders4.copy()

for i in range(1, 3):
    # Carrying the data of weeks t-1
    orders5[f'orderSum_{i}'] = orders5.groupby('itemID')['orderSum'].shift(i)
    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    orders5[f'orderSum_diff_{i}'] = orders5.groupby('itemID')[f'orderSum_{i}'].diff()

orders5 =orders5.fillna(np.inf)

orders5_columns = set(orders5.columns)
print(orders5_columns - orders4_columns)

{'orderSum_1', 'orderSum_diff_1', 'orderSum_diff_2', 'orderSum_2'}


In [86]:
print(orders5.columns)

Index(['group_backwards', 'itemID', 'orderSum', 'brand', 'manufacturer',
       'customerRating', 'category1', 'category2', 'category3',
       'recommendedRetailPrice', 'posM_f_group', 'posM_m_group',
       'posM_l_group', 'freq_day', 'freq_week', 'freq_group',
       'avg_sales_category3', 'avg_sales_brand', 'avg_sales_manufacturer',
       'min_sale', 'max_sale', 'min_sale_rec', 'max_sale_rec',
       'first_appearance', 'orderSum_1', 'orderSum_diff_1', 'orderSum_2',
       'orderSum_diff_2'],
      dtype='object')


## Custom metrics

In [77]:
class custom_obj(object):
    def __iter__(self):
        return iter('custom')
    
    def get_final_error(self, error, weight):
    
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for prediction,t,w in zip(approx, target, weight):
            
            weight_sum += w
            
            error_sum += -1* (prediction - (np.maximum(prediction - t, 0) * 1.6))  * w

        return error_sum, weight_sum
    def calc_ders_range(self, approxes, targets, weights):
        pred = np.array(approxes)
        target = np.array(targets)
        weight = np.array(weights)
        
        
        der1 = -2 *weight* (pred - (np.maximum(pred - target, 0) * 1.6)) * (1 - (pred > target) * 1.6)
        der2 = -2 *weight* (1 - (pred > target) * 1.6) ** 2

        return list(zip(der1,der2))

In [78]:
orders6 = orders5.copy()

display(orders6.columns)

Index(['group_backwards', 'itemID', 'orderSum', 'brand', 'manufacturer',
       'customerRating', 'category1', 'category2', 'category3',
       'recommendedRetailPrice', 'posM_f_group', 'posM_m_group',
       'posM_l_group', 'freq_day', 'freq_week', 'freq_group',
       'avg_sales_category3', 'avg_sales_brand', 'avg_sales_manufacturer',
       'min_sale', 'max_sale', 'min_sale_rec', 'max_sale_rec',
       'first_appearance', 'orderSum_1', 'orderSum_diff_1', 'orderSum_2',
       'orderSum_diff_2'],
      dtype='object')

In [79]:
#categorical features
cat_features = ['brand','manufacturer','category1','category2','category3']

#passing to integer
for f in cat_features:
    orders6[f] = orders6[f].map(lambda x : int(x))
    
weight =pd.merge(orders6, infos[["itemID", "simulationPrice"]], 
                     on="itemID", validate="m:1")
weightt = weight[["itemID","group_backwards","simulationPrice"]]

params = {'iterations': 200, 
         'loss_function':'RMSE',
         'use_best_model': True,
         'early_stopping_rounds': 30,
}

params2= {'loss_function':custom_obj(),
         'iterations': 200, 
         'eval_metric':custom_obj(),
         'use_best_model': True,
         'early_stopping_rounds': 30,
         'subsample':1,
         }


params3= {'loss_function':'RMSE',
         'iterations': 200, 
         'eval_metric':custom_obj(),
         'early_stopping_rounds': 30,
         'use_best_model': True,
         }



In [87]:
#ADD NOT SOLD ITEMS IF YOUR MODEL DONT PREDICT ALL
def get_pred(modelo,test, nome):
    
    
    test_pool = Pool(test.drop(columns=["orderSum"]),
                 weight= test['recommendedRetailPrice'],
                 cat_features= cat_features
    ) 
        
    preds = modelo.predict(test_pool)

    #all prediction need to be positive and integer
    sold_items = test.copy()
    preds = [max(x,0) for x in preds ]
    sold_items['demandPrediction'] = preds
    sold_items = sold_items[["itemID", "demandPrediction"]]

    sold_items["demandPrediction"] = sold_items["demandPrediction"].astype(np.uint8)

    #to kagle csv
    return sold_items.sort_values(['itemID'],  ignore_index=True)
    #final.to_csv(f"pred/{nome}.csv", index=False, sep='|')

In [88]:
def get_result(data, not_drop_columns):
    
    new_features = list(orders5_columns - orders2_columns)
    for colum in not_drop_columns :
        new_features.remove(colum)
    
    data = data.drop(columns=new_features)
    
    test = data.query('group_backwards == 1')
    val = data.query('group_backwards == 2')
    train = data.query('group_backwards >= 3')


    train_pool = Pool(
        data= train.drop(columns=["orderSum"]), 
        label= train['orderSum'], 
        weight= weightt.query('group_backwards >= 3') ,
        cat_features= cat_features
    )
    
    validation_pool = Pool(
        data= val.drop(columns=["orderSum"]), 
        label= val['orderSum'], 
        weight= weightt.query('group_backwards == 2'),
        cat_features= cat_features
    )
    
    
    
    model=CatBoostRegressor(**params) 
    model.fit(train_pool,eval_set=validation_pool , verbose=False)
    
    #model2=CatBoostRegressor(**params2) 
    #model2.fit(train_pool,eval_set=validation_pool , verbose=False)
    
    model3=CatBoostRegressor(**params3) 
    model3.fit(train_pool,eval_set=validation_pool , verbose=False)
    
    
    target = test['orderSum'].values
    predct1 =get_pred(model,test, 'cat_pos1')['demandPrediction'].values
    predct3 =get_pred(model3,test, 'cat_pos1')['demandPrediction'].values
    
    score1 = baseline_score(predct1, target, infos['simulationPrice'])
    score3 = baseline_score(predct3, target, infos['simulationPrice'])
    
    return score1, score3


In [89]:
print(orders5_columns - orders2_columns)

{'freq_day', 'avg_sales_manufacturer', 'min_sale_rec', 'posM_f_group', 'first_appearance', 'max_sale', 'max_sale_rec', 'orderSum_diff_1', 'posM_l_group', 'freq_group', 'avg_sales_brand', 'orderSum_1', 'avg_sales_category3', 'freq_week', 'min_sale', 'orderSum_diff_2', 'posM_m_group', 'orderSum_2'}


## results

In [90]:
print(get_result(orders6,[]))#all new features

print(get_result(orders6, list(orders5_columns - orders2_columns)))#no new features

print(get_result(orders6, ['posM_f_group', 'posM_m_group','posM_l_group']))#position in month



(-2558694.946000001, -2363689.0980000007)
(-2850372.1780000012, -6256618.284000002)
(-2883996.318000001, -5860836.592000001)


In [91]:
print(get_result(orders6, ['first_appearance']))#'first_appearance'

print(get_result(orders6, ['freq_day','freq_group','freq_week']))#frequency

print(get_result(orders6, ['avg_sales_brand','avg_sales_manufacturer','avg_sales_category3']))#avg sales

print(get_result(orders6, ['min_sale','max_sale','max_sale_rec','min_sale_rec']))#min max

print(get_result(orders6, ['max_sale_rec','min_sale_rec']))#min max recent

(-2485403.892000001, -4908763.1000000015)
(-2685173.580000001, -5860836.592000001)
(-2472589.696000001, -5860836.592000001)
(-2589885.846000001, -6097441.118000001)
(-2562921.090000001, -1906474.5280000004)


In [None]:
print(get_result(orders6, ['salesPrice_mean_1','salesPrice_mean_diff_1','salesPrice_mean_2','salesPrice_mean_diff_2']))

print(get_result(orders6, ['promotion_mean_1','promotion_mean_2','promotion_mean_diff_1','promotion_mean_diff_2']))

print(get_result(orders6, ['orderSum_1','orderSum_2','orderSum_diff_1','orderSum_diff_2']))

In [None]:
print(get_result(orders6, ['salesPrice_mean_1','salesPrice_mean_diff_1','salesPrice_mean_2','salesPrice_mean_diff_2',
                          'first_appearance','min_sale','max_sale','max_sale_rec','min_sale_rec',
                          'orderSum_1','orderSum_2','orderSum_diff_1','orderSum_diff_2']))