In [1]:
import pandas as pd
import numpy as np
import datetime

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric

from metrics_f1 import calc_f1_score

import collections
from itertools import chain

## Описание блоков в другом файле! Со скринкаста

In [31]:
import numpy as np

class ComponentReducer:
    def __init__(self):
        from collections import defaultdict
        self.last_customer_id = None
        self.last_order_id = None
        self.prev_ts = None
        self.possible_keys = defaultdict(list)
        self.arr_revenue = []
        self.arr_squares = []
        self.arr_deltas = []
        self.arr_of_arr_dishes = []
        self.cnt = 0
        self.items = dict()

    def print_result(self):
        final_features = [self.last_customer_id, self.last_order_id]

        for k in range(4, 1, -1):
            final_features.append(np.mean(self.arr_revenue[-k:]))
        
        try:
            final_features.append(self.arr_revenue[-1] / self.arr_revenue[-2])
        except:
            final_features.append(-1)
            
        for k in range(4, 1, -1):
            final_features.append(np.mean(self.arr_squares[-k:]))

        for key in ['drinks', 'burgers', 'snack', 'offer', 'chicken', 'bucket', 'count']:
            els = self.possible_keys[key]
            for k in range(4, 1, -1):
                final_features.append(np.mean(els[-k:]))
            try:
                final_features.append(els[-1] / els[-2])
            except:
                final_features.append(-1)
            
        final_features.append(np.mean(self.arr_deltas[-3:]))
        final_features.append(self.arr_deltas[-1])
        
        arr_1d_dishes = list(chain(*self.arr_of_arr_dishes))
        mc = collections.Counter(arr_1d_dishes)
        # print(self.arr_of_arr_dishes)
        for i in range(2):
            try:
                final_features.append(mc.most_common()[i][0])
            except:
                final_features.append(None)

        final_features.append(len(set(arr_1d_dishes)))
        
        
        hour = self.items['start_ts'].hour
        
        final_features.append(hour)
        
        if (hour >= 6) and (hour < 12):
            final_features.append('morning')
        elif (hour >= 12) and (hour < 17):
            final_features.append('morning')
        elif (hour >= 17) and (hour < 22):
            final_features.append('evening')
        else:
            final_features.append('night')
        
        weekday = self.items['start_ts'].weekday() + 1
        
        final_features.append(weekday)
        
        if weekday <= 5:
            final_features.append('workday')
        else:
            final_features.append('weekend')
                
        for col in ['revenue', 'square', 'drinks', 'burgers', 'snack', 'offer', 'chicken', 'bucket']:
            final_features.append(self.items[col])
            
        final_features.append(self.cnt)

        return final_features

    def reset(self):
        self.last_customer_id = None
        self.last_order_id = None
        self.prev_ts = None
        self.possible_keys.clear()
        self.arr_revenue = []
        self.arr_squares = []
        self.arr_deltas = []
        self.arr_of_arr_dishes = []
        self.cnt = 0
        self.items = dict()


    def get_item_dict(self, items):

        d = {
            'customer_id': items.loc['customer_id'],
            'start_ts': items.loc['startdatetime'],
            'revenue': items.loc['revenue'],
            'square': items.loc['ownareaall_sqm'],
            'drinks': items.loc['drinks'],
            'burgers': items.loc['burgers'],
            'snack': items.loc['snack'],
            'offer': items.loc['offer'],
            'chicken': items.loc['chicken'],
            'bucket': items.loc['bucket'],
            'count': items.loc['count'],
            'order_id': items.loc['order_id'],
            'dish_arr': items.loc['dish_name']
        }
        return d

    def push_line(self, line):
        from copy import deepcopy

        res = None
        # items = self.get_item_dict(line.strip().split('\t'))
        items = self.get_item_dict(line)


        order_id = items['order_id']
        customer_id = items['customer_id']
        unix_ts = items['start_ts']
        
        self.items = items

        if self.last_customer_id is not None and self.last_customer_id != customer_id:
            res = self.print_result()
            self.reset()

        if self.last_order_id is not None and self.last_order_id != order_id:
            res = self.print_result()

        self.arr_revenue.append(float(items['revenue']))
        self.arr_squares.append(float(items['square']))
        
        for key in ['drinks', 'burgers', 'snack', 'offer', 'chicken', 'bucket', 'count']:
            self.possible_keys[key].append(items[key])
            
        if self.prev_ts is not None:
            self.arr_deltas.append((self.prev_ts - unix_ts).total_seconds())
        else:
            self.arr_deltas.append(-1)
        
        self.arr_of_arr_dishes.append(items['dish_arr'])

        self.cnt += 1

        self.last_customer_id = customer_id
        self.last_order_id = order_id
        self.prev_ts = unix_ts
        
        return res

reducer = ComponentReducer()
reducer.reset()

In [2]:
def make_features(df, flg_train=True):

    if flg_train:
        df_orders = df.drop_duplicates(subset=['customer_id', 'startdatetime']).sort_values(by=['customer_id', 'startdatetime']).reset_index(drop=True)
        df_orders['next_startdatetime'] = df_orders.groupby(['customer_id'])['startdatetime'].shift(-1)
        df_orders['days_delta'] = (df_orders['next_startdatetime'] - df_orders['startdatetime']).apply(lambda row: row.total_seconds() // (24*60*60))

        df_orders['trg_days'] = df_orders.apply(lambda row: row.days_delta if str(row.days_delta) != 'nan' else row.date_diff_post, axis=1)
        df_orders['trg_flg'] = df_orders['trg_days'].apply(lambda row: 1 if row <= 60 else 0)

        # df_orders[['customer_id', 'startdatetime', 'next_startdatetime', 'buy_post', 'days_delta', 'date_diff_post', 'trg_days', 'trg_flg']]

        df_orders[['customer_id', 'startdatetime', 'trg_days', 'trg_flg']].to_pickle('new_trg.pkl')
#         df = df.merge(df_orders[['customer_id', 'startdatetime', 'trg_days', 'trg_flg']])

    df['drinks'] = df.dish_name.str.lower().str.contains(r'0[,.]|кофе|латте|американо|капучино|липтон|шейк|балтика|туборг|какао|кола|лимонад|чай|сок|адреналин|миринда|эвервесс|байкал|дюшес|мандарин|севен')
    df['burgers'] = df.dish_name.str.lower().str.contains(r'воппер|ангус|гранд|чизбургер|гамбургер')
    df['snack'] = df.dish_name.str.lower().str.contains(r'шт|фри')
    df['1rub'] = df.dish_name.str.lower().str.contains(r'1rub')
    df['chicken'] = df.dish_name.str.lower().str.contains(r'куриц|чикен')
    df['bucket'] = df.dish_name.str.lower().str.contains(r'букет')
    df['dessert'] = df.dish_name.str.lower().str.contains(r'пирожок|рожок|маффин')
    df['pivo'] = df.dish_name.str.lower().str.contains(r'туборг|балтика|пиво')
    df['sok'] = df.dish_name.str.lower().str.contains(r'сок|а4')
    df['toilet'] = df.format_name.str.lower().str.contains(r'без туалета')
    df['foodcourt'] = df.format_name.str.lower().str.contains(r'фудкорт')
    df['otdelno'] = df.format_name.str.lower().str.contains(r'отдельно стоящий')
    df['count'] = 1
    df['offer'] = df.revenue.apply(lambda x: True if x <= 19.98 else False)
    df['offer'] = df['1rub'] + df['offer']
    df = df.drop(columns=['1rub'])

    return df

## Read data

In [2]:
# df = pd.read_parquet("files/train.parquet")
df = pd.read_parquet('train_dataset_hackaton2023_train.parquet')

df = make_features(df)

In [4]:
df_check = df[
    ['customer_id', 'revenue', 'startdatetime', 'ownareaall_sqm', 'drinks',
     'burgers', 'snack', 'offer', 'chicken', 'bucket', 'dessert', 'pivo', 
     'sok', 'toilet', 'foodcourt', 'otdelno', 'count', 'dish_name']
].groupby(['customer_id', 'startdatetime']) \
    .agg({
    'revenue': 'sum',
    'ownareaall_sqm':'mean',
    'drinks':'sum',
    'burgers':'sum',
    'snack':'sum',
    'offer':'sum',
    'chicken':'sum',
    'bucket':'sum',
    'dessert':'sum',
    'pivo':'sum',
    'sok':'sum',
    'toilet':'max',
    'foodcourt':'max',
    'otdelno':'max',
    'count':'sum',
    'dish_name': list
}).reset_index()

In [10]:
df_sample = df_check.sort_values(by=['customer_id', 'startdatetime']).reset_index()
df_sample['order_id'] = df_sample['customer_id'].astype(str) + '_' + df_sample['startdatetime'].astype(str)

In [12]:
df_sample.to_pickle('feats.pkl')

## Reducer

In [3]:
df_sample = pd.read_pickle('feats.pkl')

In [42]:
from tqdm import tqdm

out_arr = []
cols = ['customer_id', 'startdatetime', 'revenue', 'ownareaall_sqm', 'drinks',
        'burgers', 'snack', 'offer', 'chicken', 'bucket', 'count', 'order_id', 'dish_name']

for index, row in tqdm(df_sample[cols].iterrows()):
    res = reducer.push_line(row)
    if res is not None:
        # print('\t      '.join([str(x) for x in res]))
        out_arr.append(res)

res = reducer.print_result()
if res is not None:
    # print('\t'.join([str(x) for x in res]))
    out_arr.append(res)

2880586it [09:11, 5226.67it/s]


In [45]:
df_train_new = pd.DataFrame(out_arr, columns=[
    'customer_id', 'order_id',
    'mean_revenue_last4', 'mean_revenue_last3', 'mean_revenue_last2', 'rat_revenue',
    'mean_squares_last4', 'mean_squares_last3', 'mean_squares_last2',
    'mean_drinks_last4', 'mean_drinks_last3', 'mean_drinks_last2', 'rat_drinks',
    'mean_burgers_last4', 'mean_burgers_last3', 'mean_burgers_last2', 'rat_burgers',
    'mean_snack_last4', 'mean_snack_last3', 'mean_snack_last2', 'rat_snack',
    'mean_offer_last4', 'mean_offer_last3', 'mean_offer_last2', 'rat_offer',
    'mean_chicken_last4', 'mean_chicken_last3', 'mean_chicken_last2', 'rat_chicken',
    'mean_bucket_last4', 'mean_bucket_last3', 'mean_bucket_last2', 'rat_bucket',
    'mean_count_last4', 'mean_count_last3', 'mean_count_last2', 'rat_count',
    'gap_last3', 'last_gap',
    'mc1_dish', 'mc2_dish',
    
    'nunique_dishes', 'cat_hour', 'cat_gr_hour', 'cat_weekday', 'cat_gr_weekday',
    
    'revenue_curr', 'square_curr', 'drinks_curr', 'burgers_curr', 'snack_curr', 'offer_curr', 
    'chicken_curr', 'bucket_curr',     
    'cnt'
])

* Read target

In [47]:
df_orders = pd.read_pickle('new_trg.pkl')

df_orders['order_id'] = df_orders['customer_id'].astype(str) + '_' + df_orders['startdatetime'].astype(str)

In [49]:
df_long = df_train_new.merge(df_orders[['order_id', 'trg_days', 'trg_flg']], on=['order_id']).copy()
df_long['trg_flg'].value_counts()

1    2739682
0     140905
Name: trg_flg, dtype: int64

In [51]:
df_long['mc1_dish'] = df_long['mc1_dish'].fillna('-1')
df_long['mc2_dish'] = df_long['mc2_dish'].fillna('-1')

In [54]:
df_last = df_long.sort_values(
    by=['customer_id', 'cnt'], ascending=[True, False]
).drop_duplicates(subset=['customer_id'])

In [55]:
df_last['ts'] = df_last['order_id'].apply(lambda row: row.split('_')[1])

Make oot

In [56]:
df_train_all = df_last[df_last['ts'] <= '2023-06-20'].copy()
df_oot = df_last[df_last['ts'] > '2023-06-20'].copy()

In [224]:
df_train_all['trg_flg'].value_counts()

1    297426
0    125677
Name: trg_flg, dtype: int64

In [225]:
df_oot['trg_flg'].value_counts()

1    61669
0    15228
Name: trg_flg, dtype: int64

Take all cols

In [216]:
cols = [
    # 'customer_id', 'revenue', 'dish_name', 'ownareaall_sqm', 'format_name',
    # 'customer_id',
    'mean_revenue_last4', 'mean_revenue_last3', 'mean_revenue_last2', 'rat_revenue',
    'mean_squares_last4', 'mean_squares_last3', 'mean_squares_last2',
    'mean_drinks_last4', 'mean_drinks_last3', 'mean_drinks_last2', 'rat_drinks',
    'mean_burgers_last4', 'mean_burgers_last3', 'mean_burgers_last2', 'rat_burgers',
    'mean_snack_last4', 'mean_snack_last3', 'mean_snack_last2', 'rat_snack',
    'mean_offer_last4', 'mean_offer_last3', 'mean_offer_last2', 'rat_offer',
    'mean_chicken_last4', 'mean_chicken_last3', 'mean_chicken_last2', 'rat_chicken',
    'mean_bucket_last4', 'mean_bucket_last3', 'mean_bucket_last2', 'rat_bucket',
    'mean_count_last4', 'mean_count_last3', 'mean_count_last2', 'rat_count',
    'gap_last3', 'last_gap',
    'mc1_dish', 'mc2_dish',
    
    'nunique_dishes', 'cat_hour', 'cat_gr_hour', 'cat_weekday', 'cat_gr_weekday',
    
    'revenue_curr', 'square_curr', 'drinks_curr', 'burgers_curr', 'snack_curr', 'offer_curr', 
    'chicken_curr', 'bucket_curr',  
]
target_col = 'trg_flg'

str_cols = ['mc1_dish', 'mc2_dish', 'customer_id', 'cat_hour', 'cat_gr_hour', 'cat_weekday', 'cat_gr_weekday']

# X_train, X_test, y_train, y_test = train_test_split(
#     df_long[df_long['cnt'] > 2][cols],
#     df_long[df_long['cnt'] > 2][target_col].copy(),
#     test_size=0.2,
#     random_state=23
# )

X_train, X_test, y_train, y_test = train_test_split(
    df_train_all[cols],
    df_train_all[target_col].copy(),
    test_size=0.2,
    random_state=23
)

In [59]:
y_train.value_counts()

1    237997
0    100485
Name: trg_flg, dtype: int64

Sample best from cb feature importance

In [248]:
cols = ['nunique_dishes',
 'gap_last3',
 'last_gap',
 'mean_count_last4',
 'mean_drinks_last4',
 'mean_revenue_last4',
 'mc1_dish',
 'mean_burgers_last4',
 'mc2_dish',
 'mean_chicken_last4',
 'rat_revenue',
 'mean_snack_last4',
 'mean_offer_last4',
 'square_curr',
 'mean_revenue_last2',
 'revenue_curr']

In [249]:
# import pickle
# with open('models/cls_0624798.pkl', 'wb') as f:
#     pickle.dump(cls, f)

In [251]:
cls = CatBoostClassifier(
    iterations = 1000,
    auto_class_weights='Balanced',
    has_time = True,
    cat_features = list(set(str_cols) & set(cols)),
    task_type = 'CPU',
    eval_metric = "F1",
#     custom_metric=['F1', 'AUC'],
    random_state = 23,
    silent=False,
    **params
) #learning_rate=0.5,

cls.fit(X_train[cols], y_train, eval_set=(X_test[cols], y_test))

# 929:	learn: 0.6805033	test: 0.6485976	best: 0.6485976 (929)	total: 33.3s	remaining: 2.5s

# bestTest = 0.6499612173
# bestIteration = 763

0:	learn: 0.5898600	test: 0.5902395	best: 0.5902395 (0)	total: 79.7ms	remaining: 1m 19s
1:	learn: 0.6286962	test: 0.6299714	best: 0.6299714 (1)	total: 143ms	remaining: 1m 11s
2:	learn: 0.6185400	test: 0.6207894	best: 0.6299714 (1)	total: 209ms	remaining: 1m 9s
3:	learn: 0.6207396	test: 0.6225254	best: 0.6299714 (1)	total: 276ms	remaining: 1m 8s
4:	learn: 0.6202113	test: 0.6216462	best: 0.6299714 (1)	total: 341ms	remaining: 1m 7s
5:	learn: 0.6214362	test: 0.6219898	best: 0.6299714 (1)	total: 396ms	remaining: 1m 5s
6:	learn: 0.6235428	test: 0.6252773	best: 0.6299714 (1)	total: 452ms	remaining: 1m 4s
7:	learn: 0.6240982	test: 0.6246000	best: 0.6299714 (1)	total: 534ms	remaining: 1m 6s
8:	learn: 0.6249435	test: 0.6253712	best: 0.6299714 (1)	total: 608ms	remaining: 1m 6s
9:	learn: 0.6266417	test: 0.6270471	best: 0.6299714 (1)	total: 677ms	remaining: 1m 7s
10:	learn: 0.6278557	test: 0.6278983	best: 0.6299714 (1)	total: 737ms	remaining: 1m 6s
11:	learn: 0.6291171	test: 0.6289574	best: 0.62997

<catboost.core.CatBoostClassifier at 0x354ab28b0>

Features

In [252]:
cb_imp = pd.DataFrame(
    {'feature_importance': cls.feature_importances_,
     'feature_names': cls.feature_names_}
).sort_values(by=['feature_importance'], ascending=False).reset_index(drop=True)

cb_imp

Unnamed: 0,feature_importance,feature_names
0,10.012388,nunique_dishes
1,9.450645,gap_last3
2,7.685068,mc2_dish
3,7.634792,last_gap
4,6.849347,mc1_dish
5,6.78622,rat_revenue
6,6.526494,square_curr
7,6.386373,revenue_curr
8,5.96152,mean_count_last4
9,5.958751,mean_revenue_last4


In [90]:
# cols =  
list(cb_imp.head(16)['feature_names'].values)
# cb_imp.head(18)
# cols = ['gap_last3',
#  'last_gap',
#  'mean_offer_last4',
#  'mean_drinks_last4',
#  'mean_burgers_last4',
#  'mc1_dish',
#  'mean_revenue_last3',
#  'mean_snack_last4',
#  'mean_revenue_last4',
#  'mc2_dish',
#  'mean_squares_last2',
#  'rat_revenue',
#  'mean_chicken_last4',
#  'mean_revenue_last2',
#  'mean_count_last4']

['nunique_dishes',
 'gap_last3',
 'last_gap',
 'mean_count_last4',
 'mean_drinks_last4',
 'mean_revenue_last4',
 'mc1_dish',
 'mean_burgers_last4',
 'mc2_dish',
 'mean_chicken_last4',
 'rat_revenue',
 'mean_snack_last4',
 'mean_offer_last4',
 'square_curr',
 'mean_revenue_last2',
 'revenue_curr']

In [253]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

target_col = 'trg_flg'

for _df, _trg, nm in zip(
        [X_train, X_test, df_oot[cls.feature_names_]], 
        [y_train, y_test, df_oot[target_col]], 
        ['train', 'test', 'oot']):
    
    predict = cls.predict_proba(_df[cls.feature_names_])[:, 1]
    pred = cls.predict(_df[cls.feature_names_])
    auc = roc_auc_score(_trg, predict)
    f1_macro = f1_score(_trg, pred, average='macro')
    f1_weighted = f1_score(_trg, pred, average='weighted')

    print(f'AUC of {nm: <6}: {auc:.3f}')
    print(f'F1 macro of {nm: <6}: {f1_macro:.3f}')
    print(f'F1 weighted of {nm: <6}: {f1_weighted:.3f} \n')



AUC of train : 0.820
F1 macro of train : 0.714
F1 weighted of train : 0.748 

AUC of test  : 0.692
F1 macro of test  : 0.623
F1 weighted of test  : 0.669 

AUC of oot   : 0.663
F1 macro of oot   : 0.585
F1 weighted of oot   : 0.709 



In [None]:
# AUC of train : 0.742
# F1 macro of train : 0.651
# F1 weighted of train : 0.691 

# AUC of test  : 0.700
# F1 macro of test  : 0.623
# F1 weighted of test  : 0.665 

# AUC of oot   : 0.671
# F1 macro of oot   : 0.583
# F1 weighted of oot   : 0.700 

## Plots

In [259]:
# import plotly.express as px
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_curve, auc
# from sklearn.datasets import make_classification

# for _df, y, nm in zip(
#         [X_train, X_test, df_oot[cls.feature_names_]], 
#         [y_train, y_test, df_oot[target_col]], 
#         ['train', 'test', 'oot']):
    
#     y_score = cls.predict_proba(_df[cls.feature_names_])[:, 1]
    
#     fpr, tpr, thresholds = roc_curve(y, y_score)

#     fig = px.area(
#         x=fpr, y=tpr,
#         title=f'ROC Curve {nm} (AUC={auc(fpr, tpr):.4f})',
#         labels=dict(x='False Positive Rate', y='True Positive Rate'),
#         width=700, height=500
#     )
#     fig.add_shape(
#         type='line', line=dict(dash='dash'),
#         x0=0, x1=1, y0=0, y1=1
#     )

#     df_thr = pd.DataFrame({
#         'False Positive Rate': fpr,
#         'True Positive Rate': tpr
#     }, index=thresholds)
#     df_thr.index.name = "Thresholds"
#     df_thr.columns.name = "Rate"

#     fig.update_yaxes(scaleanchor="x", scaleratio=1)
#     fig.update_xaxes(constrain='domain')
#     fig.show()
    
#     fig_thresh = px.line(
#         df_thr, title=f'{nm}: TPR and FPR at every threshold',
#         width=700, height=500
#     )

#     fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
#     fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
#     fig_thresh.show()

## Test part

In [73]:
df_test = pd.read_parquet('hackaton2023_test.gzip')

In [74]:
df_test = make_features(df_test, False)

## Preproc

In [75]:
df_check = df_test[
    ['customer_id', 'revenue', 'startdatetime', 'ownareaall_sqm', 'drinks',
     'burgers', 'snack', 'offer', 'chicken', 'bucket', 'dessert', 'pivo', 'sok', 
     'toilet', 'foodcourt', 'otdelno', 'count', 'dish_name']
].groupby(['customer_id', 'startdatetime']) \
    .agg({
    'revenue': 'sum',
    'ownareaall_sqm':'mean',
    'drinks':'sum',
    'burgers':'sum',
    'snack':'sum',
    'offer':'sum',
    'chicken':'sum',
    'bucket':'sum',
    'dessert':'sum',
    'pivo':'sum',
    'sok':'sum',
    'toilet':'max',
    'foodcourt':'max',
    'otdelno':'max',
    'count':'sum',
    'dish_name': list
}).reset_index()

In [80]:
df_check.shape

(592273, 18)

## Make feats

In [76]:
from tqdm import tqdm

df_sample = df_check.sort_values(by=['customer_id', 'startdatetime']).reset_index()
df_sample['order_id'] = df_sample['customer_id'].astype(str) + '_' + df_sample['startdatetime'].astype(str)

out_arr = []
cols = ['customer_id', 'startdatetime', 'revenue', 'ownareaall_sqm', 'drinks',
        'burgers', 'snack', 'offer', 'chicken', 'bucket', 'count', 'order_id', 'dish_name']

for index, row in tqdm(df_sample[cols].iterrows()):
    res = reducer.push_line(row)
    if res is not None:
        # print('\t      '.join([str(x) for x in res]))
        out_arr.append(res)

res = reducer.print_result()
if res is not None:
    # print('\t'.join([str(x) for x in res]))
    out_arr.append(res)

592273it [01:52, 5247.79it/s]


In [79]:
df_test_new = pd.DataFrame(out_arr, columns=[
    'customer_id', 'order_id',
    'mean_revenue_last4', 'mean_revenue_last3', 'mean_revenue_last2', 'rat_revenue',
    'mean_squares_last4', 'mean_squares_last3', 'mean_squares_last2',
    'mean_drinks_last4', 'mean_drinks_last3', 'mean_drinks_last2', 'rat_drinks',
    'mean_burgers_last4', 'mean_burgers_last3', 'mean_burgers_last2', 'rat_burgers',
    'mean_snack_last4', 'mean_snack_last3', 'mean_snack_last2', 'rat_snack',
    'mean_offer_last4', 'mean_offer_last3', 'mean_offer_last2', 'rat_offer',
    'mean_chicken_last4', 'mean_chicken_last3', 'mean_chicken_last2', 'rat_chicken',
    'mean_bucket_last4', 'mean_bucket_last3', 'mean_bucket_last2', 'rat_bucket',
    'mean_count_last4', 'mean_count_last3', 'mean_count_last2', 'rat_count',
    'gap_last3', 'last_gap',
    'mc1_dish', 'mc2_dish',
    
    'nunique_dishes', 'cat_hour', 'cat_gr_hour', 'cat_weekday', 'cat_gr_weekday',
    
    'revenue_curr', 'square_curr', 'drinks_curr', 'burgers_curr', 'snack_curr', 'offer_curr', 
    'chicken_curr', 'bucket_curr',     
    'cnt'
])

In [81]:
# del out_arr
# gc.collect()

In [82]:
cols = [
    # 'customer_id', 'revenue', 'dish_name', 'ownareaall_sqm', 'format_name',
    # 'customer_id',
    'mean_revenue_last4', 'mean_revenue_last3', 'mean_revenue_last2', 'rat_revenue',
    'mean_squares_last4', 'mean_squares_last3', 'mean_squares_last2',
    'mean_drinks_last4', 'mean_drinks_last3', 'mean_drinks_last2', 'rat_drinks',
    'mean_burgers_last4', 'mean_burgers_last3', 'mean_burgers_last2', 'rat_burgers',
    'mean_snack_last4', 'mean_snack_last3', 'mean_snack_last2', 'rat_snack',
    'mean_offer_last4', 'mean_offer_last3', 'mean_offer_last2', 'rat_offer',
    'mean_chicken_last4', 'mean_chicken_last3', 'mean_chicken_last2', 'rat_chicken',
    'mean_bucket_last4', 'mean_bucket_last3', 'mean_bucket_last2', 'rat_bucket',
    'mean_count_last4', 'mean_count_last3', 'mean_count_last2', 'rat_count',
    'gap_last3', 'last_gap',
    'mc1_dish', 'mc2_dish',
    
    'nunique_dishes', 'cat_hour', 'cat_gr_hour', 'cat_weekday', 'cat_gr_weekday',
    
    'revenue_curr', 'square_curr', 'drinks_curr', 'burgers_curr', 'snack_curr', 'offer_curr', 
    'chicken_curr', 'bucket_curr',  
]

Keep last row for scoring

In [83]:
df_test_smpl = df_test_new.sort_values(by='order_id', ascending=False).drop_duplicates(subset=['customer_id'], keep='first')

In [85]:
df_test_smpl['mc1_dish'] = df_test_smpl['mc1_dish'].fillna('-1')
df_test_smpl['mc2_dish'] = df_test_smpl['mc2_dish'].fillna('-1')

In [100]:
df_test_smpl['buy_post'] = cls.predict_proba(df_test_smpl[cols])[:, 1]

df_test_smpl['buy_post'] = df_test_smpl['buy_post'].apply(lambda x: 1 if x > 0.5 else 0) 
df_test_smpl['buy_post'].value_counts()

1    59071
0    53264
Name: buy_post, dtype: int64

Scoring

In [102]:
df_out = pd.read_csv('submission.csv', sep=';')[['customer_id', 'date_diff_post']].merge(
    df_test_smpl[['customer_id', 'buy_post']]
)

# .to_csv('submission_ex.csv', sep=';', header=True, index=False)

In [103]:
# df_out
df_out.to_csv('submission_ex12.csv', sep=';', header=True, index=False)

In [107]:
import pickle
with open(f'models/cls_064_top_{len(cols)}_feats.pkl', 'wb') as f:
    pickle.dump(cls, f)

In [153]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

for _df, _trg, nm in zip([X_train, X_test], [y_train, y_test], ['train', 'test']):
    predict = cls.predict_proba(_df[cols])[:, 1]
    pred = cls.predict(_df[cols])
    auc = roc_auc_score(_trg, predict)
    f1_macro = f1_score(_trg, pred, average='macro')
    f1_weighted = f1_score(_trg, pred, average='weighted')

    print(f'AUC of {nm: <6}: {auc:.3f}')
    print(f'F1 macro of {nm: <6}: {f1_macro:.3f}')
    print(f'F1 weighted of {nm: <6}: {f1_weighted:.3f} \n')

AUC of train : 0.718
F1 macro of train : 0.567
F1 weighted of train : 0.686 

AUC of test  : 0.670
F1 macro of test  : 0.550
F1 weighted of test  : 0.678 



In [112]:
df_long['trg_flg'].value_counts()

1    2739682
0     140905
Name: trg_flg, dtype: int64

In [125]:
import plotly.express as px
pd.options.display.float_format = '{:,.0f}'.format

for col in cols:
    
    print(col)
    print(df_long.groupby(['trg_flg'])[col].describe(), '\n')
#     fig = px.histogram(df_long, x=col, color="trg_flg")
#     fig.show()


gap_last3
            count     mean     std        min      25%      50%      75%  max
trg_flg                                                                      
0         140,905 -637,402 425,766 -1,737,800 -952,094 -601,322 -284,532   -7
1       2,739,682 -367,049 389,130 -2,592,836 -575,388 -240,158  -39,510   -1 

last_gap
            count     mean     std        min        25%      50%     75%  max
trg_flg                                                                       
0         140,905 -835,843 945,178 -5,205,567 -1,291,857 -518,897 -73,053   -1
1       2,739,682 -485,791 709,923 -5,204,804   -657,434 -176,846    -743   -1 

mean_offer_last4
            count  mean  std  min  25%  50%  75%  max
trg_flg                                              
0         140,905     0    1    0    0    0    1   12
1       2,739,682     0    1    0    0    0    0   90 

mean_drinks_last4
            count  mean  std  min  25%  50%  75%  max
trg_flg                                   

In [239]:
params = {'depth': 9,
  'l2_leaf_reg': 2,
  'learning_rate': 0.1,
  'boosting_type': 'Plain'}

cls_2 = CatBoostClassifier(
    iterations = 1000,
    auto_class_weights='Balanced',
    has_time = True,
    cat_features = list(set(str_cols) & set(cols)),
    task_type = 'CPU',
    eval_metric = "F1",
    random_state = 23,
    silent=False,
    **params
) #learning_rate=0.5,

cls_2.fit(
    pd.concat([X_train[cols], X_test[cols], df_oot[cols]], ignore_index=False),
    pd.concat([y_train, y_test, df_oot[target_col]], ignore_index=False),
    early_stopping_rounds=30
)

0:	learn: 0.6321329	total: 143ms	remaining: 2m 23s
1:	learn: 0.6246086	total: 271ms	remaining: 2m 15s
2:	learn: 0.6182201	total: 395ms	remaining: 2m 11s
3:	learn: 0.6211073	total: 516ms	remaining: 2m 8s
4:	learn: 0.6245569	total: 640ms	remaining: 2m 7s
5:	learn: 0.6219337	total: 758ms	remaining: 2m 5s
6:	learn: 0.6223001	total: 877ms	remaining: 2m 4s
7:	learn: 0.6229984	total: 995ms	remaining: 2m 3s
8:	learn: 0.6242118	total: 1.11s	remaining: 2m 2s
9:	learn: 0.6255875	total: 1.22s	remaining: 2m
10:	learn: 0.6276578	total: 1.34s	remaining: 2m
11:	learn: 0.6278264	total: 1.46s	remaining: 2m
12:	learn: 0.6281616	total: 1.58s	remaining: 2m
13:	learn: 0.6270772	total: 1.7s	remaining: 1m 59s
14:	learn: 0.6278961	total: 1.84s	remaining: 2m
15:	learn: 0.6285876	total: 1.95s	remaining: 1m 59s
16:	learn: 0.6292192	total: 2.07s	remaining: 1m 59s
17:	learn: 0.6302025	total: 2.2s	remaining: 1m 59s
18:	learn: 0.6303213	total: 2.34s	remaining: 2m
19:	learn: 0.6309761	total: 2.45s	remaining: 2m
20:	le

<catboost.core.CatBoostClassifier at 0x354b46340>

In [240]:
df_test_smpl['buy_post'] = cls_2.predict_proba(df_test_smpl[cols])[:, 1]

df_test_smpl['buy_post'] = df_test_smpl['buy_post'].apply(lambda x: 1 if x > 0.52 else 0) 
df_test_smpl['buy_post'].value_counts()

1    65240
0    47095
Name: buy_post, dtype: int64

In [115]:
df_out = pd.read_csv('submission.csv', sep=';')[['customer_id', 'date_diff_post']].merge(
    df_test_smpl[['customer_id', 'buy_post']]
)
df_out.to_csv('submission_ex12.csv', sep=';', header=True, index=False)

In [111]:
# import pickle
# with open('models/cls_0625.pkl', 'wb') as f:
#     pickle.dump(cls_2, f)

## Regressor

In [116]:
from catboost.utils import eval_metric
from catboost import CatBoostRegressor

In [124]:
df_train_rmse = df_train_all[~df_train_all['trg_days'].isna()].copy()
df_oot_rmse = df_oot[~df_oot['trg_days'].isna()].copy()

In [125]:
target_col_rmse = 'trg_days'

In [180]:
rmse_cols = ['nunique_dishes',
 'gap_last3',
 'last_gap',
 'mean_count_last4',
 'mean_revenue_last4',
 'mean_drinks_last4',
 'mean_burgers_last4',
 'mean_offer_last4',
 'mean_snack_last4',
 'rat_revenue',
 'mean_chicken_last4',
 'mc1_dish',
 'mc2_dish']

In [182]:
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    df_train_rmse[rmse_cols],
    df_train_rmse[target_col_rmse].copy(),
    test_size=0.2,
    random_state=23
)

In [183]:
cb_rmse = CatBoostRegressor(
    iterations = 1000, 
    has_time = True, 
    cat_features = list(set(str_cols) & set(rmse_cols)),
    task_type = 'CPU', loss_function='RMSE', 
    random_state = 23, silent=False
)

cb_rmse.fit(X_train_r[rmse_cols], y_train_r, eval_set=(X_test_r[rmse_cols], y_test_r))

Learning rate set to 0.120227
0:	learn: 15.5387451	test: 15.6014664	best: 15.6014664 (0)	total: 56.3ms	remaining: 56.2s
1:	learn: 15.4234806	test: 15.4867118	best: 15.4867118 (1)	total: 92.5ms	remaining: 46.1s
2:	learn: 15.3339398	test: 15.3985602	best: 15.3985602 (2)	total: 128ms	remaining: 42.4s
3:	learn: 15.2512303	test: 15.3144807	best: 15.3144807 (3)	total: 157ms	remaining: 39.2s
4:	learn: 15.1803971	test: 15.2422949	best: 15.2422949 (4)	total: 184ms	remaining: 36.7s
5:	learn: 15.1213107	test: 15.1824532	best: 15.1824532 (5)	total: 230ms	remaining: 38.1s
6:	learn: 15.0718247	test: 15.1328198	best: 15.1328198 (6)	total: 250ms	remaining: 35.5s
7:	learn: 15.0297578	test: 15.0907526	best: 15.0907526 (7)	total: 274ms	remaining: 34s
8:	learn: 14.9934241	test: 15.0549089	best: 15.0549089 (8)	total: 297ms	remaining: 32.7s
9:	learn: 14.9594498	test: 15.0214854	best: 15.0214854 (9)	total: 327ms	remaining: 32.3s
10:	learn: 14.9295306	test: 14.9906322	best: 14.9906322 (10)	total: 353ms	remain

<catboost.core.CatBoostRegressor at 0x346a356d0>

In [166]:
[np.median(y_train_r)]

[22.0]

In [185]:
from sklearn.metrics import mean_squared_error as mse

for _df, _trg, nm in zip(
        [X_train_r, X_test_r, df_oot_rmse[rmse_cols]], 
        [y_train_r, y_test_r, df_oot_rmse[target_col_rmse]], 
        ['train', 'test', 'oot']):
    
    pred = cb_rmse.predict(_df[rmse_cols])
    mse_val = np.sqrt(mse(_trg, pred))
    
    pred_default = [22] * _df.shape[0]
    mse_default = np.sqrt(mse(_trg, pred_default))

    print(f'RMSE of {nm: <6}: {mse_val:.3f} ({mse_default:.3f})')
    
# RMSE of train : 14.292
# RMSE of test  : 14.555
# RMSE of oot   : 14.596

RMSE of train : 14.306 (15.931)
RMSE of test  : 14.553 (16.012)
RMSE of oot   : 14.599 (15.123)


In [186]:
cb_imp_rmse = pd.DataFrame(
    {'feature_importance': cb_rmse.feature_importances_,
     'feature_names': cb_rmse.feature_names_}
).sort_values(by=['feature_importance'], ascending=False).reset_index(drop=True)

cb_imp_rmse

Unnamed: 0,feature_importance,feature_names
0,26.619333,nunique_dishes
1,14.18174,gap_last3
2,10.506269,last_gap
3,8.892528,mean_count_last4
4,6.886974,mean_revenue_last4
5,6.593787,mean_drinks_last4
6,4.621135,rat_revenue
7,4.556413,mean_burgers_last4
8,4.231381,mean_offer_last4
9,3.842078,mean_snack_last4


In [187]:
import pickle
with open('models/regr_rmse.pkl', 'wb') as f:
    pickle.dump(cb_rmse, f)

In [189]:
pd.Series(cb_rmse.predict(df_test_smpl[rmse_cols])).describe()

count    112335.000000
mean         26.553596
std           6.005855
min          -3.636316
25%          22.875124
50%          26.623485
75%          30.740809
max          48.899952
dtype: float64

### Common predict

In [256]:
df_test_smpl['buy_post'] = cls_2.predict_proba(df_test_smpl[cls_2.feature_names_])[:, 1]

df_test_smpl['buy_post'] = df_test_smpl['buy_post'].apply(lambda x: 1 if x > 0.5 else 0) 

df_test_smpl['buy_post'].value_counts()

1    62333
0    50002
Name: buy_post, dtype: int64

In [257]:
df_test_smpl['date_diff_post'] = cb_rmse.predict(df_test_smpl[rmse_cols])

df_test_smpl['date_diff_post'] = df_test_smpl['date_diff_post'].apply(lambda x: 1 if x < 1 else x) 

df_test_smpl['date_diff_post'] = df_test_smpl.apply(
    lambda row: row.date_diff_post if row.buy_post == 1 else None, axis=1
) 

df_test_smpl['date_diff_post'].describe()

count    62333.000000
mean        22.954825
std          4.792491
min          1.000000
25%         20.589447
50%         23.564772
75%         26.088465
max         44.959235
Name: date_diff_post, dtype: float64

In [258]:
df_out = pd.read_csv('submission.csv', sep=';')[['customer_id']].merge(
    df_test_smpl[['customer_id', 'date_diff_post', 'buy_post']]
)


df_out.to_csv('submission_ex16.csv', sep=';', header=True, index=False)
df_out

Unnamed: 0,customer_id,date_diff_post,buy_post
0,13220760,14.322425,1
1,30315975,,0
2,21679985,26.699351,1
3,29754274,23.958043,1
4,7797823,16.809864,1
...,...,...,...
112329,14023374,21.520394,1
112330,34923292,,0
112331,32761835,,0
112332,38322785,,0
