In [1]:
import numpy as np
import pandas as pd 
import sys
import pickle
import catboost
from catboost import Pool, CatBoostClassifier, CatBoostRanker
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import multiprocessing
from psutil import virtual_memory
from tqdm.notebook import tqdm

In [2]:
clickstream_path = 'data/clickstream.csv'
transactions_path = 'data/transactions.csv'
matching_path = 'data/train_matching.csv'

In [3]:
class CFG:
    num_negatives = 70
    num_easiest_examples = 7

    class First_model:
        depth = 5
        iterations = 5000
        
    class Second_model:
        depth = 9
        iterations = 18000
        lr = 0.02

    seed = 777

In [4]:
def make_comb_features(dataframe):
    for x in range(24):
        dataframe[f'diff_hours_{x}'] = dataframe[f'click_h_{x}'] - dataframe[f'trans_h_{x}']

all_dicts = {}
clickstream = pd.read_csv(clickstream_path)

all_dicts['rtk_le'] = LabelEncoder().fit(clickstream['user_id'])
clickstream['user_id'] = all_dicts['rtk_le'].transform(clickstream['user_id'])+1
clickstream_dtypes = {'user_id':np.int16, 'cat_id':np.int16, 'new_uid':np.int32}
clickstream = clickstream.astype(clickstream_dtypes)

clickstream['timestamp'] = pd.to_datetime(clickstream['timestamp'])
clickstream['date'] = clickstream['timestamp'].dt.date.astype('category')
clickstream['hour'] = clickstream['timestamp'].dt.hour.astype('category')
clickstream['weekday'] = clickstream['timestamp'].dt.dayofweek.astype('category')

clickstream_embed = clickstream.pivot_table(index = 'user_id', 
                            values=['timestamp'],
                            columns=['cat_id'],
                            aggfunc=['count']).fillna(0)
clickstream_embed2 = clickstream.pivot_table(index = 'user_id', 
                            values=['timestamp'],
                            columns=['date'],
                            aggfunc=['count']).fillna(0)
clickstream_embed3 = clickstream.pivot_table(index = 'user_id', 
                            values=['new_uid'],
                            aggfunc=['nunique']).fillna(0)
clickstream_embed4 = clickstream.groupby('user_id')['timestamp'].apply(lambda x: np.max(x) - np.min(x)).dt.days.astype('int16').to_frame()
clickstream_embed5 = clickstream.pivot_table(index = 'user_id', 
                            values=['timestamp'],
                            columns=['weekday','hour'],
                            aggfunc=['count']).fillna(0)
clickstream_embed.columns = [f'rtk-{str(i[0])}-{str(i[2])}' for i in clickstream_embed.columns]
clickstream_embed2.columns = [f'rtk-{str(i[0])}-{str(i[2])}' for i in clickstream_embed2.columns]
clickstream_embed3.columns = [f'rtk-{str(i[0])}-{str(i[1])}' for i in clickstream_embed3.columns]
clickstream_embed4.columns = [f'rtk-max_date_diff' for i in clickstream_embed4.columns]
clickstream_embed5.columns = [f'rtk-{str(i[0])}-weekday-{str(i[2])}-nhour-{str(i[3])}' for i in clickstream_embed5.columns]
clickstream_embed = clickstream_embed.merge(clickstream_embed2, left_on='user_id', right_index=True).merge(
                                            clickstream_embed3, left_on='user_id', right_index=True).merge(
                                            clickstream_embed4, left_on='user_id', right_index=True).merge(
                                            clickstream_embed5, left_on='user_id', right_index=True)
clickstream_embed.loc[0] = np.empty(len(clickstream_embed.columns))

clickstream['hour'] = clickstream['timestamp'].dt.hour
cl_sv = pd.pivot_table(clickstream, index='user_id', columns='hour', values = 'timestamp', aggfunc = 'count').fillna(0)
cl_sv['summs'] = cl_sv.sum(axis=1)
for i in cl_sv.columns[:-1]:
    cl_sv[i] /= cl_sv['summs']
cl_sv.columns = ['click_h_'+ str(i) for i in cl_sv.columns]
del clickstream, clickstream_embed2, clickstream_embed3, clickstream_embed4, clickstream_embed5
gc.collect()

dtype_clickstream = list()
for x in clickstream_embed.dtypes.tolist():
    if x == 'int64' or x == 'int32' or x == 'int16':
        dtype_clickstream.append('int16')
    elif x == 'float64' or x == 'float32':
        dtype_clickstream.append('float32')
    else:
        dtype_clickstream.append('object')

dtype_clickstream = dict(zip(clickstream_embed.columns.tolist(),dtype_clickstream))
clickstream_embed = clickstream_embed.astype(dtype_clickstream)


transactions = pd.read_csv(transactions_path)
transactions['transaction_dttm'] = pd.to_datetime(transactions['transaction_dttm'])
all_dicts['bank_le'] = LabelEncoder().fit(transactions['user_id'])
transactions['user_id'] = all_dicts['bank_le'].transform(transactions['user_id'])+1
transactions_dtypes = {'user_id':np.int16, 'mcc_code':np.int16, 'currency_rk':np.int8}
transactions = transactions.astype(transactions_dtypes)

transactions['date'] = transactions['transaction_dttm'].dt.date.astype('category')
transactions['hour'] = transactions['transaction_dttm'].dt.hour.astype('category')
transactions['weekday'] = transactions['transaction_dttm'].dt.dayofweek.astype('category')

bankclient_embed = transactions.pivot_table(index = 'user_id',
                            values=['transaction_amt'],
                            columns=['mcc_code'],
                            aggfunc=['sum', 'mean', 'count']).fillna(0)
bankclient_embed.columns = [f'{str(i[0])}-{str(i[2])}' for i in bankclient_embed.columns]
bankclient_embed2 = transactions.pivot_table(index = 'user_id', 
                            values=['transaction_amt'],
                            columns=['currency_rk'],
                            aggfunc=['sum', 'mean', 'count']).fillna(0)
bankclient_embed2.columns = [f'{str(i[0])}-{str(i[2])}' for i in bankclient_embed2.columns]
bankclient_embed3 = transactions.pivot_table(index = 'user_id', 
                            values=['transaction_dttm'],
                            columns=['date'],
                            aggfunc=['count']).fillna(0)
bankclient_embed3.columns = [f'{str(i[0])}-{str(i[2])}' for i in bankclient_embed3.columns]
bankclient_embed4 = transactions.pivot_table(index = 'user_id', 
                            values=['transaction_dttm'],
                            columns=['weekday','hour'],
                            aggfunc=['count']).fillna(0)
bankclient_embed4.columns = [f'bnk-{str(i[0])}-weekday-{str(i[2])}-nhour-{str(i[3])}' for i in bankclient_embed4.columns]
bankclient_embed5 = transactions.groupby('user_id')['transaction_dttm'].apply(lambda x: np.max(x) - np.min(x)).dt.days.astype('int16').to_frame()
bankclient_embed5.columns = [f'bnk-max_date_diff' for i in bankclient_embed5.columns]
bankclient_embed = bankclient_embed.merge(bankclient_embed2, left_on='user_id', right_index=True
                                        ).merge(bankclient_embed3, left_on='user_id', right_index=True
                                                ).merge(bankclient_embed4, left_on='user_id', right_index=True
                                                    ).merge(bankclient_embed5, left_on='user_id', right_index=True)

tr_sv = pd.pivot_table(transactions, index='user_id', columns='hour', values = 'transaction_amt', aggfunc = 'count').fillna(0)
tr_sv['summs'] = tr_sv.sum(axis=1)
for i in tr_sv.columns[:-1]:
    tr_sv[i] /= tr_sv['summs']
tr_sv.columns = ['trans_h_'+ str(i) for i in tr_sv.columns]

del transactions, bankclient_embed2, bankclient_embed3, bankclient_embed4
gc.collect()

dtype_bankclient = list()
for x in bankclient_embed.dtypes.tolist():
    if x == 'int64' or x == 'int32' or x == 'int16':
        dtype_bankclient.append('int16')
    elif x == 'float64' or x == 'float32':
        dtype_bankclient.append('float32')
    else:
        dtype_bankclient.append('object')
    
dtype_bankclient = dict(zip(bankclient_embed.columns.tolist(),dtype_bankclient))
bankclient_embed = bankclient_embed.astype(dtype_bankclient)

main_train_df = pd.read_csv(matching_path)
main_train_df['bank'] = all_dicts['bank_le'].transform(main_train_df['bank'])+1
main_train_df = main_train_df[main_train_df['rtk']!='0']
main_train_df['rtk'] = all_dicts['rtk_le'].transform(main_train_df['rtk'])+1
main_train_df['bank+rtk'] = main_train_df['bank'].astype('str')+'+'+main_train_df['rtk'].astype('str')
train = main_train_df

k = CFG.num_negatives
cor_dict = train.set_index('bank')['rtk'].to_dict()

train_bank_ids = train['bank']
train_rtk_ids = train[train.bank.isin(train_bank_ids)]['rtk'].drop_duplicates()
df_train = pd.DataFrame(train_bank_ids, columns=['bank'])
df_train['rtk'] = df_train['bank'].apply(lambda x: [cor_dict[x]] + train_rtk_ids.sample(k, random_state=x+CFG.seed).values.tolist())

df_train = df_train.explode('rtk')

train['bank+rtk'] = train['bank'].astype('str')+'_'+train['rtk'].astype('str')
df_train['bank+rtk'] = df_train['bank'].astype('str')+'_'+df_train['rtk'].astype('str')
df_train['target'] = df_train['bank+rtk'].isin(train['bank+rtk']).astype('int')

df_train.drop_duplicates('bank+rtk', inplace=True)
df_train.reset_index(inplace=True, drop=True)

X_train=df_train.merge(bankclient_embed, how='left', left_on='bank', right_index=True
                    ).merge(clickstream_embed, how='left', left_on='rtk', right_index=True
                        ).merge(cl_sv, how='left', left_on='rtk', right_index=True
                            ).merge(tr_sv, how='left', left_on='bank', right_index=True
                                ).fillna(0)

make_comb_features(X_train)

In [5]:
clf = CatBoostClassifier(
    depth=CFG.First_model.depth,
    iterations=CFG.First_model.iterations,
    random_seed=CFG.seed,
)
feature_list = X_train.drop(['bank', 'rtk', 'target', 'bank+rtk'], axis=1).columns.tolist()
clf.fit(Pool(X_train[feature_list], X_train['target']), verbose=200, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.090298
0:	learn: 0.6133920	total: 228ms	remaining: 3m 47s
200:	learn: 0.2707968	total: 32.9s	remaining: 2m 10s
400:	learn: 0.2619088	total: 1m 1s	remaining: 1m 32s
600:	learn: 0.2538106	total: 1m 30s	remaining: 1m
800:	learn: 0.2475027	total: 1m 58s	remaining: 29.5s
999:	learn: 0.2420566	total: 2m 26s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1ad02f850>

In [10]:
X_train['predicts'] = clf.predict_proba(X_train[feature_list])[:, 1]

for x in range(CFG.num_easiest_examples):
    X_train = X_train[~X_train['predicts'].isin(X_train[X_train['target']==0].groupby('bank')['predicts'].apply(min).values)]

In [12]:
feature_imp = clf.get_feature_importance()
bad_cols = [feature_list[i] for i in range(len(feature_list)) if feature_imp[i] < 0.025]
X_train.drop(bad_cols, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(bad_cols, axis=1, inplace=True)


In [13]:
clf = CatBoostRanker(
    loss_function='YetiRank',
    learning_rate=CFG.Second_model.lr,
    depth=CFG.Second_model.depth,
    iterations=CFG.Second_model.iterations,
    custom_metric=['MRR'],
    random_seed=CFG.seed,
)
feature_list = X_train.drop(['bank', 'rtk', 'target', 'predicts', 'bank+rtk'], axis=1).columns.tolist()
clf.fit(Pool(X_train[feature_list], X_train['target'], group_id=X_train['bank']), verbose=200, plot=True)
clf.save_model('catboost.cbm')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	total: 66.8ms	remaining: 13.3s
199:	total: 11.8s	remaining: 0us


<catboost.core.CatBoostRanker at 0x1bfce1df0>