In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
from catboost import CatBoostRegressor, Pool, MetricVisualizer
from sasaki_features import add_feature_position_month
from datetime import datetime

# Project helpers live outside the notebook directory, so their folders are
# appended to sys.path before importing from them.
sys.path.append("../dora/models/")
from utils import  promo_detector, promotionAggregation

# NOTE(review): this is a second module that is also named `utils`. Python
# caches the first imported `utils` in sys.modules and the new path is appended
# *after* the previous one, so this import is presumably resolved against the
# module loaded above — confirm that read_data / process_time / merge_data
# really come from ../main and not from ../dora/models.
sys.path.append("../main")
from utils import read_data, process_time, merge_data

In [None]:
# Load the three raw tables from the shared dataset folder.
infos, items, orders = read_data("../main/datasets/")

# Remember the raw column names so that later cells can print which columns
# each feature-engineering step added.
orders_columns = {*orders.columns}
print(orders_columns)

In [None]:
# The return value of process_time is not used, so it presumably adds the
# time columns to `orders` in place — TODO confirm.
process_time(orders)

# Detect promotions on the raw orders, then aggregate them together with the
# item table.
orders_with_promos = promo_detector(orders)
orders2 = promotionAggregation(orders_with_promos, items)

# Show the columns introduced by this step.
orders_columns2 = {*orders2.columns}
print(orders_columns2.difference(orders_columns))

In [None]:
# Drop every item whose only sales happen in group_backwards 1 or 2 (the
# test/validation groups): with no rows in the training groups (>= 3) they
# could cause a data leak.
is_eval_group = orders2['group_backwards'] <= 2
is_train_group = orders2['group_backwards'] >= 3

id_in_test = orders2.loc[is_eval_group, 'itemID'].unique()
id_in_train = orders2.loc[is_train_group, 'itemID'].unique()

remove = set(id_in_test).difference(id_in_train)
orders2 = orders2[~orders2['itemID'].isin(remove)]


In [None]:
# Every group_backwards value that appears anywhere in the data.
weeks_database = orders2['group_backwards'].unique()

# Features that have the same value on every row of an itemID; they are copied
# from the item's first row onto the synthetic rows.
from_example = ['category1', 'category2', 'category3',
                'customerRating', 'recommendedRetailPrice',
                'manufacturer', 'brand']

new_rows = []

# One groupby pass hands over each item's rows directly instead of re-filtering
# the whole frame once per item. sort=False keeps first-appearance order.
for idd, orders_id in orders2.groupby('itemID', sort=False):
    example = orders_id.iloc[0]
    static_features = {f: example[f] for f in from_example}

    # Groups in which this item has no row at all.
    weeks_id = orders_id['group_backwards'].unique()
    weeks_without_id = np.setdiff1d(weeks_database, weeks_id)

    # One explicit "zero sales" row per missing (item, group) pair.
    for w in weeks_without_id:
        new_rows.append({'itemID': idd,
                         'group_backwards': w,
                         'salesPrice_mean': 0,
                         'orderSum': 0,
                         'promotion_mean': 0,
                         **static_features})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. Skip the concat entirely when there is nothing to add.
if new_rows:
    orders2 = pd.concat([orders2, pd.DataFrame(new_rows)], ignore_index=True)

# Oldest group first, then by item, so that a per-item shift() in later cells
# looks back in time.
orders2 = orders2.sort_values(['group_backwards', 'itemID'],
                              ascending=[False, True], ignore_index=True)

## New features 1

In [None]:
orders3 = add_feature_position_month(orders2)

# Per-item statistics are computed on the training groups only
# (group_backwards >= 3) so that validation/test information is not used.
orders_aux = orders.query('group_backwards >= 3')
nDays = orders_aux['days'].nunique()
nWeek = orders_aux['week_backwards'].nunique()
nGroup = orders_aux['group_backwards'].nunique()

newInfo = items[['itemID']].copy()

# Fraction of days / weeks / groups in which the item was sold at least once.
# The groupby result is indexed by itemID, so it is attached with
# Series.map on the itemID column. A plain column assignment would align on
# newInfo's row index instead and pair values with the wrong items.
sold_by_item = orders_aux.groupby('itemID')
for feature, column, n_periods in (('freq_day', 'days', nDays),
                                   ('freq_week', 'week_backwards', nWeek),
                                   ('freq_group', 'group_backwards', nGroup)):
    newInfo[feature] = newInfo['itemID'].map(
        sold_by_item[column].nunique() / n_periods)

# Minimum and maximum sales of the item in a single group, over all training
# groups (includes the zero-filled rows, hence many zeros).
orders_aux = orders3.query('group_backwards >= 3')
sales_by_item = orders_aux.groupby('itemID')['orderSum']
newInfo['min_sale'] = newInfo['itemID'].map(sales_by_item.min())
newInfo['max_sale'] = newInfo['itemID'].map(sales_by_item.max())

# Same statistics restricted to the most recent training groups (3 to 7).
order_recent = orders3.query('group_backwards >= 3 & group_backwards <= 7')
recent_sales_by_item = order_recent.groupby('itemID')['orderSum']
newInfo['min_sale_rec'] = newInfo['itemID'].map(recent_sales_by_item.min())
newInfo['max_sale_rec'] = newInfo['itemID'].map(recent_sales_by_item.max())

orders3 = pd.merge(orders3, newInfo, on=['itemID'])

# Show the columns introduced by this step.
orders_columns3 = set(orders3.columns)
print(orders_columns3 - orders_columns2)


## Shift
### Add lagged features: `orderSum`, `promotion_mean` and `salesPrice_mean` from the two previous pairs, plus their differences

In [None]:
orders4 = orders3.copy()

# Columns whose past values are carried forward as lag features.
lagged_columns = ('orderSum', 'promotion_mean', 'salesPrice_mean')

for i in range(1, 3):
    # Value of each column i rows earlier within the same item.
    for source in lagged_columns:
        orders4[f'{source}_{i}'] = orders4.groupby('itemID')[source].shift(i)

    # Change of each lagged column with respect to the item's previous row.
    for source in lagged_columns:
        lag_name = f'{source}_{i}'
        orders4[f'{source}_diff_{i}'] = orders4.groupby('itemID')[lag_name].diff()

# Rows without enough history have no lag value; they are marked with +inf.
orders4 = orders4.fillna(np.inf)

# Show the columns introduced by this step.
orders_columns4 = set(orders4.columns)
print(orders_columns4.difference(orders_columns3))

## new features 2

In [None]:
# Attempt to give certain categorical features an order.
# The metric used is the average sales per row among rows that share the same
# category value.
def ordanalize_colum(order_g2, colum):
    """Add an average-sales statistic and an ordinal rank for a categorical column.

    The statistic is computed only from rows with ``group_backwards >= 3`` and
    then joined back onto every row of ``order_g2``.

    Parameters
    ----------
    order_g2 : pandas.DataFrame
        Must contain ``group_backwards``, ``orderSum``, ``itemID`` and ``colum``.
    colum : str
        Name of the categorical column to rank.

    Returns
    -------
    pandas.DataFrame
        A new frame with two extra columns: ``avg_sales_<colum>`` (sum of
        ``orderSum`` divided by the number of rows for that category value) and
        ``<colum>_order`` (position of the value when sorted by that average,
        starting at 0). Values absent from the filtered rows get NaN in both.
    """
    avg_name = f'avg_sales_{colum}'
    rank_name = f'{colum}_order'

    training_rows = order_g2[order_g2['group_backwards'] >= 3]

    # Total sales and number of rows for every distinct value of the column.
    stats = training_rows.groupby(colum, as_index=False).agg(
        total_sales=('orderSum', 'sum'),
        row_count=('itemID', 'count'),
    )
    stats[avg_name] = stats['total_sales'] / stats['row_count']

    # Rank the values from lowest to highest average sales.
    ranking = stats[[colum, avg_name]].sort_values(by=avg_name)
    ranking[rank_name] = range(len(ranking))

    return order_g2.merge(ranking, how='left', on=[colum])


In [None]:
# Add the avg_sales_* / *_order pair for each of these categorical columns,
# feeding the output of one call into the next.
orders5 = orders4
for categorical_column in ('category3', 'brand', 'manufacturer'):
    orders5 = ordanalize_colum(orders5, categorical_column)

# Show the columns introduced by this step.
orders_columns5 = set(orders5.columns)
print(orders_columns5.difference(orders_columns4))


## Custom metrics

In [None]:
class custom_obj(object):
    """User-defined CatBoost metric / objective with an over-prediction penalty.

    For a prediction ``p`` and target ``t`` the per-object score is
    ``p - 1.6 * max(p - t, 0)``: the prediction itself, reduced by 1.6 times
    the amount by which it exceeds the target.

    * As a metric (``evaluate``) the negated, weighted sum of that score is
      reported, and lower is better (``is_max_optimal`` returns False).
    * As an objective (``calc_ders_range``) the derivatives returned are those
      of ``-weight * score**2``.

    NOTE(review): the metric is linear in the score while the objective is
    quadratic, so the two do not optimise the same quantity — confirm this is
    intended before training with this class as ``loss_function``.
    """

    def __iter__(self):
        # Lets the instance be iterated as the characters of 'custom'.
        return iter('custom')

    def get_final_error(self, error, weight):
        """Return the accumulated error unchanged (a weighted sum, not a mean)."""
        return error

    def is_max_optimal(self):
        """Smaller metric values are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate the metric over all objects.

        Parameters
        ----------
        approxes : sequence of indexed containers
            Only the first container (``approxes[0]``) is used.
        target : indexed container of floats
        weight : indexed container of floats or None
            ``None`` means the pool has no weights; every object then counts
            with weight 1.

        Returns
        -------
        (error_sum, weight_sum) : tuple of floats
        """
        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i, (prediction, t) in enumerate(zip(approx, target)):
            # Unweighted pools pass weight=None; treat that as unit weights
            # instead of failing inside zip().
            w = 1.0 if weight is None else weight[i]

            weight_sum += w

            error_sum += -1 * (prediction - (np.maximum(prediction - t, 0) * 1.6)) * w

        return error_sum, weight_sum

    def calc_ders_range(self, approxes, targets, weights):
        """Return ``(der1, der2)`` pairs, one per object.

        With ``score = pred - 1.6 * max(pred - target, 0)`` and
        ``slope = 1 - 1.6 * (pred > target)``, the values are
        ``der1 = -2 * w * score * slope`` and ``der2 = -2 * w * slope**2``.

        ``weights`` may be ``None``, in which case unit weights are used.
        """
        pred = np.array(approxes)
        target = np.array(targets)
        # np.array(None) would produce an object array that cannot be
        # multiplied, so fall back to a scalar unit weight.
        weight = 1.0 if weights is None else np.array(weights)

        der1 = -2 * weight * (pred - (np.maximum(pred - target, 0) * 1.6)) * (1 - (pred > target) * 1.6)
        der2 = -2 * weight * (1 - (pred > target) * 1.6) ** 2

        return list(zip(der1, der2))

In [None]:
# Work on an independent copy so the feature-ablation cells below leave
# orders5 untouched.
orders6 = orders5.copy(deep=True)

In [None]:
# Ablation: remove the average-sales columns.
# Recorded result: got considerably worse.
orders6 = orders6.drop(
    ['avg_sales_category3', 'avg_sales_brand', 'avg_sales_manufacturer'],
    axis=1,
)

In [None]:
# Ablation: remove the ordinal-rank columns.
# Recorded result: seems to make no difference (improved by 40000).
orders6 = orders6.drop(
    ['manufacturer_order', 'category3_order', 'brand_order'],
    axis=1,
)

In [None]:
# Ablation: remove the selling-frequency columns.
# Recorded result: may improve (improved by 100000).
orders6 = orders6.drop(['freq_day', 'freq_week', 'freq_group'], axis=1)

In [None]:
#orders6 = orders6.drop(columns=['posM_f_group', 'posM_m_group', 'posM_l_group'])
# dropping these made the result slightly worse, so the drop stays disabled

In [None]:
# Ablation: remove the min/max sales columns.
# Recorded result: may improve a little (improved by 100000).
orders6 = orders6.drop(
    ['min_sale_rec', 'max_sale_rec', 'min_sale', 'max_sale'],
    axis=1,
)

In [None]:
# Show the feature table that is split into train / validation / test below.
display(orders6)

In [None]:
# Columns CatBoost should treat as categorical.
cat_features = ['brand','manufacturer','category1','category2','category3']

# Categorical columns are cast to integers before being handed to CatBoost.
for f in cat_features:
    orders6[f] = orders6[f].astype(int)

# Per-row sample weight = simulationPrice of the item.
# how="left" keeps exactly one output row per orders6 row, in the same order,
# so the weights stay aligned with the train/validation slices taken below.
weight = pd.merge(orders6, infos[["itemID", "simulationPrice"]],
                  on="itemID", how="left", validate="m:1")
assert weight["simulationPrice"].notna().all(), "item without simulationPrice"
weightt = weight[["itemID","group_backwards","simulationPrice"]]

# Time-based split: newest group = test, second newest = validation,
# everything older = training.
test = orders6.query('group_backwards == 1')
val = orders6.query('group_backwards == 2')
train = orders6.query('group_backwards >= 3')

# Only the simulationPrice column is the weight. Passing the whole weightt
# frame would also hand itemID and group_backwards to CatBoost as weights.
train_weight = weightt.query('group_backwards >= 3')['simulationPrice'].to_numpy()
val_weight = weightt.query('group_backwards == 2')['simulationPrice'].to_numpy()

# NOTE(review): salesPrice_mean and promotion_mean describe the same pair as
# the label and are 0 on the synthetic no-sale rows, so they may leak the
# target — confirm they are really known at prediction time.
train_pool = Pool(
    data= train.drop(columns=["orderSum"]),
    label= train['orderSum'],
    weight= train_weight,
    cat_features= cat_features
)

validation_pool = Pool(
    data= val.drop(columns=["orderSum"]),
    label= val['orderSum'],
    weight= val_weight,
    cat_features= cat_features
)

# Plain RMSE model, early-stopped on the validation RMSE.
params = {'iterations': 1000,
         'loss_function':'RMSE',
         'use_best_model': True,
         'early_stopping_rounds': 30,
}

# Custom objective and custom metric (currently not trained, see below).
params2= {'loss_function':custom_obj(),
         'iterations': 200,
         'eval_metric':custom_obj(),
         'use_best_model': True,
         'early_stopping_rounds': 30,
         'subsample':1,
         }

# RMSE objective, but model selection / early stopping on the custom metric.
params3= {'loss_function':'RMSE',
         'iterations': 1000,
         'eval_metric':custom_obj(),
         'early_stopping_rounds': 30,
         'use_best_model': True,
         }

model=CatBoostRegressor(**params)
model.fit(train_pool,eval_set=validation_pool , verbose=False)

#model2=CatBoostRegressor(**params2)
#model2.fit(train_pool,eval_set=validation_pool , verbose=False)

model3=CatBoostRegressor(**params3)
model3.fit(train_pool,eval_set=validation_pool , verbose=False)

In [None]:
# Pool for the newest group; the label column is removed from the features.
# NOTE(review): the weight is presumably irrelevant for predict() — confirm
# and drop it if so.
test_pool = Pool(test.drop(columns=["orderSum"]),
                 weight= test['salesPrice_mean'],
                 cat_features= cat_features)

# Items that have no row in orders2 get a fixed demandPrediction of 0.
# .assign() builds the column on a fresh frame instead of writing into a
# filtered slice of `items`, which avoids pandas' SettingWithCopyWarning.
has_order_rows = items['itemID'].isin(orders2['itemID'].unique())
not_sold_items = (
    items.loc[~has_order_rows, ['itemID']]
    .assign(demandPrediction=0)
)

def get_pred(modelo, nome):
    """Predict the test group with ``modelo`` and write the submission file.

    Reads the notebook-level ``test_pool``, ``test`` and ``not_sold_items``.

    Parameters
    ----------
    modelo : fitted model exposing ``predict(pool)``
    nome : str
        File stem; the result is written to ``pred/<nome>.csv`` with ``|`` as
        separator and the columns ``itemID`` and ``demandPrediction``.
    """
    import os

    preds = modelo.predict(test_pool)

    # Predictions must be non-negative.
    preds = np.maximum(preds, 0)

    sold_items = test[["itemID"]].copy()
    sold_items['demandPrediction'] = preds

    # Combine with the items that have no rows in the order data.
    final = pd.concat([sold_items, not_sold_items])

    # Predictions must be integers. The cast truncates the fractional part as
    # before, but int64 is used instead of uint8 so that demands above 255 are
    # no longer corrupted by the 8-bit range.
    final["demandPrediction"] = final["demandPrediction"].astype(np.int64)
    final = final.sort_values(['itemID'],  ignore_index=True)

    # to_csv does not create missing directories.
    os.makedirs("pred", exist_ok=True)
    final.to_csv(f"pred/{nome}.csv", index=False, sep='|')

In [None]:
# Write one submission file per trained model.
# (model2 / 'cat2' is disabled because model2 is not trained above.)
for fitted_model, file_stem in ((model, 'cat_pos1'), (model3, 'cat_pos2')):
    get_pred(fitted_model, file_stem)

In [None]:
#np.array(model.get_feature_importance(prettified=True))

In [None]:
# DO NOT USE THIS FEATURE: with it the loss went from -0.3 to -1
# (and for some reason it was the most important feature in the model that used it)

#average sales of items in the same category12

#aux = order_g2.groupby(['category1','category2'],as_index=False).agg({'orderSum' : ['sum','count']})
#aux['avg_sales_cat12'] = aux[( 'orderSum',   'sum')] / aux[( 'orderSum', 'count')]
#aux = aux[['category1','category2','avg_sales_cat12']]
#aux.columns = ['category1','category2','avg_sales_cat12']


#order_g2= pd.merge(order_g2,aux, on=['category1','category2'])