### Imports

In [107]:
import pandas as pd
import dask.dataframe as dd
from os import listdir
from os.path import isfile, join
import numpy as np
from catboost import CatBoostClassifier, Pool, CatBoost
import sklearn as sk

relative_directory = '../../'
pd.set_option('display.max_rows', 200)

### Clickstream Reading

In [108]:
class DataReader:
    def __init__(self, chunkfolder_path, read_func_name, concat_func_name=None):        
        self.chunkfolder_path=chunkfolder_path
        self.read = read_func_name
        self.concat= concat_func_name
        return

    def read_chunks(self):
        self.chunk_path = tuple(chunkfolder_path + '/' + f for f in listdir(self.chunkfolder_path) if isfile(join(self.chunkfolder_path, f)))
        self.chunks_iterator = map(self.read,self.chunk_path)
        return
    
    def put_chunks_to_memory(self, first_chunk_only=False):
        if first_chunk_only:
            self.chunks = [next(self.chunks_iterator)]
        else:
            self.chunks = list(self.chunks_iterator)
        return
    
    def concat_chunks(self):
        if self.concat:
            self.concatenated_chunks = self.concat(self.chunks)
        return


In [109]:
chunkfolder_path = relative_directory+'clickstream'

ChunkReader = DataReader(chunkfolder_path, pd.read_parquet, pd.concat) # создаем экземпляр для чтения файлов из директории clickstream
ChunkReader.read_chunks() # сохраняем пути до файлов
print(ChunkReader.chunk_path) # проверяем пути до файлов
ChunkReader.put_chunks_to_memory(first_chunk_only=True) # читаем только первый chunk
print(len(ChunkReader.chunks)) # один чанк
ChunkReader.put_chunks_to_memory(first_chunk_only=False) # складываем чанки в лист
print(len(ChunkReader.chunks)) # чанки по одному лежат тут.
ChunkReader.concat_chunks() # объединяем в один большой


('../../clickstream/part-00000.parquet', '../../clickstream/part-00009.parquet', '../../clickstream/part-00003.parquet', '../../clickstream/part-00001.parquet', '../../clickstream/part-00004.parquet', '../../clickstream/part-00007.parquet', '../../clickstream/part-00005.parquet', '../../clickstream/part-00008.parquet', '../../clickstream/part-00006.parquet', '../../clickstream/part-00002.parquet')
1
9


In [110]:
train_target_df = pd.read_csv(relative_directory+'alfabattle2_abattle_train_target.csv',
                              parse_dates=['timestamp'], infer_datetime_format=True)

In [111]:
merged_df = ChunkReader.chunks[0].merge(train_target_df, how='inner', on='session_id') # джойним кликстрим на сессии


In [112]:
del ChunkReader


In [113]:
merged_df.head(5)

Unnamed: 0,timestamp_x,application_id,client,session_id,event_type,event_category,event_name,event_label,device_screen_name,timezone,device_is_webview,page_urlhost,page_urlpath_full,net_connection_type,net_connection_tech,client_pin,timestamp_y,multi_class_target
0,2020-06-05 22:31:50.839,anketa,149ffff87bb793e49097e89b7634f048,2458e7b8e9415e7ead1486485572b318,pv,,,,,Europe/Minsk,True,anketa.alfabank.ru,03eb3ceab6204f8b602f1863f7fbce01,,,149ffff87bb793e49097e89b7634f048,2020-06-05 22:13:18,statement
1,2020-06-05 22:31:51.441,anketa,149ffff87bb793e49097e89b7634f048,2458e7b8e9415e7ead1486485572b318,se,Ecommerce,View,f4d9dee663a4faf8a9a99344120c6647,,Europe/Minsk,True,anketa.alfabank.ru,d0a697abf7c744a109a1c0ab4d052ef7,,,149ffff87bb793e49097e89b7634f048,2020-06-05 22:13:18,statement
2,2020-06-05 22:31:50.970,anketa,149ffff87bb793e49097e89b7634f048,2458e7b8e9415e7ead1486485572b318,se,Non-interaction,show,51c65b92f640d5892d452468d4ed5fef,,Europe/Minsk,True,anketa.alfabank.ru,d0a697abf7c744a109a1c0ab4d052ef7,,,149ffff87bb793e49097e89b7634f048,2020-06-05 22:13:18,statement
3,2020-06-05 22:31:51.438,anketa,149ffff87bb793e49097e89b7634f048,2458e7b8e9415e7ead1486485572b318,se,Non-interaction,show,51c65b92f640d5892d452468d4ed5fef,,Europe/Minsk,True,anketa.alfabank.ru,d0a697abf7c744a109a1c0ab4d052ef7,,,149ffff87bb793e49097e89b7634f048,2020-06-05 22:13:18,statement
4,2020-06-05 22:17:09.510,mobile,149ffff87bb793e49097e89b7634f048,2458e7b8e9415e7ead1486485572b318,se,Application Lifecycle,Background,,,Europe/Moscow,True,,,mobile,LTE,149ffff87bb793e49097e89b7634f048,2020-06-05 22:13:18,statement


### First event in session calculating

In [114]:
merged_df["event_num"] = merged_df.sort_values(['timestamp_x'], ascending=True)\
             .groupby(['session_id'])\
             .cumcount() + 1
merged_df["event_count"] = merged_df.groupby(['client'])['event_num'].transform('mean')
merged_df["session_start_timestamp"] = merged_df.groupby(['session_id'])['timestamp_x'].transform('min')

def most_frequent_cat(x):
    mode = pd.Series.mode(x)
    if len(mode) >0:
        mode = mode[0]
    else:
        mode='undefined'
    return mode

def most_frequent_cat_share(x):
    rows = len(x)
    frequencies = x.value_counts()
    if len(frequencies):
        most_frequent=frequencies.iloc[0]/rows
    else:
        most_frequent=0.0
    return pd.Series([most_frequent]).repeat(rows)

merged_df["most_freq_ev_cat"]=merged_df.groupby(['client']).event_category.transform(most_frequent_cat)
merged_df['most_freq_ev_cat_share'] = merged_df.groupby(['client']).event_category.\
                                                        transform(most_frequent_cat_share)
merged_df["most_freq_ev_type"]=merged_df.groupby(['client']).event_type.transform(most_frequent_cat)
merged_df['most_freq_ev_type_share'] = merged_df.groupby(['client']).event_type.\
                                                        transform(most_frequent_cat_share)

merged_df["most_freq_ev_name"]=merged_df.groupby(['client']).event_name.transform(most_frequent_cat)
merged_df['most_freq_ev_name_share'] = merged_df.groupby(['client']).event_name.\
                                                        transform(most_frequent_cat_share)

merged_df["most_freq_ev_label"]=merged_df.groupby(['client']).event_label.transform(most_frequent_cat)
merged_df['most_freq_ev_label_share'] = merged_df.groupby(['client']).event_label.\
                                                        transform(most_frequent_cat_share)

merged_df["most_freq_screen"]=merged_df.groupby(['client']).device_screen_name.transform(most_frequent_cat)
merged_df['most_freq_screen_share'] = merged_df.groupby(['client']).device_screen_name.\
                                                        transform(most_frequent_cat_share)

merged_df["most_freq_conn_type"]=merged_df.groupby(['client']).net_connection_type.transform(most_frequent_cat)
merged_df['most_freq_conn_type_share'] = merged_df.groupby(['client']).net_connection_type.\
                                                        transform(most_frequent_cat_share)

merged_df["most_freq_conn_tech"]=merged_df.groupby(['client']).net_connection_tech.transform(most_frequent_cat)
merged_df['most_freq_conn_tech_share'] = merged_df.groupby(['client']).net_connection_tech.\
                                                        transform(most_frequent_cat_share)

merged_df["most_freq_timezone"]=merged_df.groupby(['client']).timezone.transform(most_frequent_cat)

In [10]:
# merged_df['prev_target'] = merged_df.sort_values(by=['session_start_timestamp'], ascending=True)\
#                        .groupby(['client'])['multi_class_target'].shift(1)

# merged_df['prev_target2'] = merged_df.sort_values(by=['session_start_timestamp'], ascending=True)\
#                        .groupby(['client'])['multi_class_target'].shift(2)

# merged_df['prev_target3'] = merged_df.sort_values(by=['session_start_timestamp'], ascending=True)\
#                        .groupby(['client'])['multi_class_target'].shift(3)

# merged_df['prev_target4'] = merged_df.sort_values(by=['session_start_timestamp'], ascending=True)\
#                        .groupby(['client'])['multi_class_target'].shift(4)

# merged_df['prev_target5'] = merged_df.sort_values(by=['session_start_timestamp'], ascending=True)\
#                        .groupby(['client'])['multi_class_target'].shift(5)

# merged_df['prev_target6'] = merged_df.sort_values(by=['session_start_timestamp'], ascending=True)\
#                        .groupby(['client'])['multi_class_target'].shift(6)

# merged_df['prev_target7'] = merged_df.sort_values(by=['session_start_timestamp'], ascending=True)\
#                        .groupby(['client'])['multi_class_target'].shift(7)

In [115]:
random_client = merged_df[merged_df.client=='ee36508bb8ed3975276d32d6db98a04c']

In [116]:
random_client

Unnamed: 0,timestamp_x,application_id,client,session_id,event_type,event_category,event_name,event_label,device_screen_name,timezone,...,client_pin,timestamp_y,multi_class_target,event_num,event_count,session_start_timestamp,most_freq_ev_cat,most_freq_ev_cat_share,most_freq_ev_type,most_freq_ev_type_share
160854,2020-06-21 07:04:23.114,mobile,ee36508bb8ed3975276d32d6db98a04c,21ac81ea4072e9a56ff49505d29a58b5,se,Application Lifecycle,Foreground,,,Asia/Vladivostok,...,ee36508bb8ed3975276d32d6db98a04c,2020-06-21 07:04:02,own_transfer,2,11,2020-06-21 07:04:02.797,Application Lifecycle,0.207059,se,0.557647
160855,2020-06-21 07:04:02.797,mobile,ee36508bb8ed3975276d32d6db98a04c,21ac81ea4072e9a56ff49505d29a58b5,se,Widget Dashboard,Click > Account Item,,Accounts And Cards Widget,Asia/Vladivostok,...,ee36508bb8ed3975276d32d6db98a04c,2020-06-21 07:04:02,own_transfer,1,11,2020-06-21 07:04:02.797,Application Lifecycle,0.207059,se,0.557647
160856,2020-06-21 07:04:37.136,mobile,ee36508bb8ed3975276d32d6db98a04c,21ac81ea4072e9a56ff49505d29a58b5,se,Widget Dashboard,Drag > Account Item,,Accounts And Cards Widget,Asia/Vladivostok,...,ee36508bb8ed3975276d32d6db98a04c,2020-06-21 07:04:02,own_transfer,7,11,2020-06-21 07:04:02.797,Application Lifecycle,0.207059,se,0.557647
160857,2020-06-21 07:04:36.010,mobile,ee36508bb8ed3975276d32d6db98a04c,21ac81ea4072e9a56ff49505d29a58b5,se,Widget Dashboard,Expand > Account Items,,Accounts And Cards Widget,Asia/Vladivostok,...,ee36508bb8ed3975276d32d6db98a04c,2020-06-21 07:04:02,own_transfer,6,11,2020-06-21 07:04:02.797,Application Lifecycle,0.207059,se,0.557647
160858,2020-06-21 07:04:38.714,mobile,ee36508bb8ed3975276d32d6db98a04c,21ac81ea4072e9a56ff49505d29a58b5,se,Advice,Preselect Accounts,24f96f775c5236e0f7594ef40cc86666,Own Transfer,Asia/Vladivostok,...,ee36508bb8ed3975276d32d6db98a04c,2020-06-21 07:04:02,own_transfer,9,11,2020-06-21 07:04:02.797,Application Lifecycle,0.207059,se,0.557647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8143414,2020-02-11 06:26:40.767,mobile,ee36508bb8ed3975276d32d6db98a04c,8544e3cbf116474d76ebdb5c60ba062b,sv,,,,SignInActivity,Australia/Brisbane,...,ee36508bb8ed3975276d32d6db98a04c,2020-02-11 06:26:40,main_screen,1,2,2020-02-11 06:26:40.767,Application Lifecycle,0.207059,se,0.557647
8143415,2020-02-01 06:03:50.795,mobile,ee36508bb8ed3975276d32d6db98a04c,50ef7607322b14b61acee249edf1e970,sv,,,,MainListFragment,Australia/Brisbane,...,ee36508bb8ed3975276d32d6db98a04c,2020-02-01 06:03:44,main_screen,2,2,2020-02-01 06:03:44.371,Application Lifecycle,0.207059,se,0.557647
8143416,2020-02-01 06:03:44.371,mobile,ee36508bb8ed3975276d32d6db98a04c,50ef7607322b14b61acee249edf1e970,sv,,,,SignInActivity,Australia/Brisbane,...,ee36508bb8ed3975276d32d6db98a04c,2020-02-01 06:03:44,main_screen,1,2,2020-02-01 06:03:44.371,Application Lifecycle,0.207059,se,0.557647
8143417,2020-01-04 04:55:58.935,mobile,ee36508bb8ed3975276d32d6db98a04c,71d99cd0286a0f39ea988cf8b5da24c0,sv,,,,MainListFragment,Australia/Brisbane,...,ee36508bb8ed3975276d32d6db98a04c,2020-01-04 04:55:52,main_screen,2,2,2020-01-04 04:55:52.455,Application Lifecycle,0.207059,se,0.557647


### From clickkstream to train-target

In [150]:
merged_df['row_number'] = merged_df.sort_values(['timestamp_y'], ascending=False)\
             .groupby(['client_pin'])\
             .cumcount() + 1

In [254]:
feature_list = ['most_freq_timezone', 'most_freq_ev_cat_share','most_freq_ev_type_share',
                'most_freq_ev_name_share','most_freq_ev_label_share','most_freq_screen_share',
                'most_freq_conn_type_share','most_freq_conn_tech_share','client_pin','most_freq_ev_cat',
                'most_freq_ev_type','most_freq_ev_name','most_freq_ev_label','most_freq_screen',
                'most_freq_conn_type','most_freq_conn_tech']
client_features = merged_df[merged_df.row_number==1][feature_list]

In [255]:

def add_dateparts(df, fldname, drop_date_columns=True):
    '''
    Adds features extracted from date. Full list below:
    'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear','Is_month_end',
    'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'
    Returns: 0 if everything is fine
    '''
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
              'Is_year_end', 'Is_year_start', 'Hour', 'Minute', 'Second'):
        df[n] = np.int_(getattr(fld.dt, n.lower()))
    if drop_date_columns:
        df.drop(fldname, axis=1, inplace=True)
    return 


### Date feature generating

In [256]:
train_target_df = pd.read_csv(relative_directory+'alfabattle2_abattle_train_target.csv',
                              parse_dates=['timestamp'], infer_datetime_format=True)

train_dataset = train_target_df.merge(client_features, how='left', on='client_pin')
train_dataset.most_freq_timezone.fillna("Europe/Moscow", inplace=True)
train_dataset['session_local_timestamp'] = train_dataset.apply(lambda x: x.timestamp.\
                                            tz_localize(tz = "Europe/Moscow").tz_convert(x.most_freq_timezone).\
                                            tz_localize(None), axis = 1)
                                            
add_dateparts(train_dataset, 'session_local_timestamp') # добавллям фичи из даты, удаляем timestamp

test_dataset = pd.read_csv(relative_directory+'alfabattle2_prediction_session_timestamp.csv',
                              parse_dates=['timestamp'], infer_datetime_format=True)
test_dataset = test_dataset.merge(client_features, how='left', on='client_pin')
test_dataset.most_freq_timezone.fillna("Europe/Moscow", inplace=True)
test_dataset['session_local_timestamp'] = test_dataset.apply(lambda x: x.timestamp.\
                                         tz_localize(tz = "Europe/Moscow").tz_convert(x.most_freq_timezone).\
                                        tz_localize(None), axis = 1)
add_dateparts(test_dataset, 'session_local_timestamp')

  


### Train and val split

In [306]:
# Проблема в том, что по-хорошему нельзя случайным образом разбивать выборку на test и train
# Сессии из будущего плохо использовать для предсказания прошлого
# ~10% юзеров исключить из train, чтобы посмотреть, как модель справляется с новым юзерами
# остальные данные для валидации надо подобрать так, чтобы в валидации были только данные из будущего относительно train-а

train_dataset["client_hash"] = train_dataset.client_pin.apply(hash)%10
train_dataset["client"] = train_dataset["client_pin"]
val_dataset = train_dataset[train_dataset.client_hash.isin([1,5,8])] # максимальный месяц идет в валидацию
train_dataset_cut = train_dataset[~(train_dataset.client_hash.isin([1,5,8]))]

### Detelete extra columns

In [309]:
columns_to_drop = ['Year', 'Month', 'session_id', 'client_hash','most_freq_timezone','Is_year_end',
                   'Is_year_start', 'timestamp', 'client_pin']

train_dataset_cut.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')
val_dataset.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Making a model

In [310]:
train_dataset_cut.head()

Unnamed: 0,multi_class_target,most_freq_ev_cat_share,most_freq_ev_type_share,most_freq_ev_name_share,most_freq_ev_label_share,most_freq_screen_share,most_freq_conn_type_share,most_freq_conn_tech_share,most_freq_ev_cat,most_freq_ev_type,...,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Hour,Minute,Second,client
0,main_screen,,,,,,,,,,...,0,167,0,0,0,0,14,1,12,7cf9221322a0e2fdefb1b998b8f2ab29
3,main_screen,,,,,,,,,,...,6,75,0,0,0,0,19,50,23,91f55a33d7502c1a1fa5da7ff2f7b648
4,main_screen,,,,,,,,,,...,3,51,0,0,0,0,7,56,58,3ef1020bda95ce7836d2680fa553ecb7
7,main_screen,,,,,,,,,,...,3,205,0,0,0,0,5,23,22,cb11659c5bec156fbaaac453230456c3
8,own_transfer,,,,,,,,,,...,6,117,0,0,0,0,17,21,26,c7a854d2f479950f69341e670cf829b1


In [311]:
train_dataset_cut[['most_freq_ev_cat_share_isnan', 'most_freq_ev_type_share_isnan',
       'most_freq_ev_name_share_isnan', 'most_freq_ev_label_share_isnan',
       'most_freq_screen_share_isnan', 'most_freq_conn_type_share_isnan',
       'most_freq_conn_tech_share_isnan']]=train_dataset_cut[['most_freq_ev_cat_share','most_freq_ev_type_share',
       'most_freq_ev_name_share', 'most_freq_ev_label_share',
       'most_freq_screen_share', 'most_freq_conn_type_share',
       'most_freq_conn_tech_share']].isnull()


train_dataset_cut[['most_freq_ev_cat_share', 'most_freq_ev_type_share',
       'most_freq_ev_name_share', 'most_freq_ev_label_share',
       'most_freq_screen_share', 'most_freq_conn_type_share',
       'most_freq_conn_tech_share']] = train_dataset_cut[['most_freq_ev_cat_share', 'most_freq_ev_type_share',
       'most_freq_ev_name_share', 'most_freq_ev_label_share',
       'most_freq_screen_share', 'most_freq_conn_type_share',
       'most_freq_conn_tech_share']].fillna(0)

train_dataset_cut[['most_freq_ev_cat_isnan', 'most_freq_ev_type_isnan', 'most_freq_ev_name_isnan',
       'most_freq_ev_label_isnan', 'most_freq_screen_isnan', 'most_freq_conn_type_isnan',
       'most_freq_conn_tech_isnan', 'Week_isnan', 'Day_isnan', 'Dayofweek_isnan', 'Dayofyear_isnan',
       'Is_month_end_isnan', 'Is_month_start_isnan', 'Is_quarter_end_isnan', 'Is_quarter_start_isnan',
       'Hour_isnan', 'Minute_isnan', 'Second_isnan']] = train_dataset_cut[['most_freq_ev_cat', 'most_freq_ev_type', 'most_freq_ev_name',
       'most_freq_ev_label', 'most_freq_screen', 'most_freq_conn_type',
       'most_freq_conn_tech', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
       'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
       'Hour', 'Minute', 'Second']].isnull()


train_dataset_cut[['most_freq_ev_cat', 'most_freq_ev_type', 'most_freq_ev_name',
       'most_freq_ev_label', 'most_freq_screen', 'most_freq_conn_type',
       'most_freq_conn_tech', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
       'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
       'Hour', 'Minute', 'Second']] = train_dataset_cut[['most_freq_ev_cat', 'most_freq_ev_type', 'most_freq_ev_name',
       'most_freq_ev_label', 'most_freq_screen', 'most_freq_conn_type',
       'most_freq_conn_tech', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
       'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
       'Hour', 'Minute', 'Second']].fillna('undefined')

val_dataset[['most_freq_ev_cat_share_isnan', 'most_freq_ev_type_share_isnan',
       'most_freq_ev_name_share_isnan', 'most_freq_ev_label_share_isnan',
       'most_freq_screen_share_isnan', 'most_freq_conn_type_share_isnan',
       'most_freq_conn_tech_share_isnan']]=val_dataset[['most_freq_ev_cat_share','most_freq_ev_type_share',
       'most_freq_ev_name_share', 'most_freq_ev_label_share',
       'most_freq_screen_share', 'most_freq_conn_type_share',
       'most_freq_conn_tech_share']].isnull()

val_dataset[['most_freq_ev_cat_share', 'most_freq_ev_type_share',
       'most_freq_ev_name_share', 'most_freq_ev_label_share',
       'most_freq_screen_share', 'most_freq_conn_type_share',
       'most_freq_conn_tech_share']] = val_dataset[['most_freq_ev_cat_share', 'most_freq_ev_type_share',
       'most_freq_ev_name_share', 'most_freq_ev_label_share',
       'most_freq_screen_share', 'most_freq_conn_type_share',
       'most_freq_conn_tech_share']].fillna(0)

val_dataset[['most_freq_ev_cat_isnan', 'most_freq_ev_type_isnan', 'most_freq_ev_name_isnan',
       'most_freq_ev_label_isnan', 'most_freq_screen_isnan', 'most_freq_conn_type_isnan',
       'most_freq_conn_tech_isnan', 'Week_isnan', 'Day_isnan', 'Dayofweek_isnan', 'Dayofyear_isnan',
       'Is_month_end_isnan', 'Is_month_start_isnan', 'Is_quarter_end_isnan', 'Is_quarter_start_isnan',
       'Hour_isnan', 'Minute_isnan', 'Second_isnan']] = val_dataset[['most_freq_ev_cat', 'most_freq_ev_type', 'most_freq_ev_name',
       'most_freq_ev_label', 'most_freq_screen', 'most_freq_conn_type',
       'most_freq_conn_tech', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
       'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
       'Hour', 'Minute', 'Second']].isnull()

val_dataset[['most_freq_ev_cat', 'most_freq_ev_type', 'most_freq_ev_name',
       'most_freq_ev_label', 'most_freq_screen', 'most_freq_conn_type',
       'most_freq_conn_tech', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
       'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
       'Hour', 'Minute', 'Second']] = val_dataset[['most_freq_ev_cat', 'most_freq_ev_type', 'most_freq_ev_name',
       'most_freq_ev_label', 'most_freq_screen', 'most_freq_conn_type',
       'most_freq_conn_tech', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
       'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
       'Hour', 'Minute', 'Second']].fillna('undefined')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [312]:
train_data = train_dataset_cut.drop(["multi_class_target"], axis=1).values
train_labels = train_dataset_cut.multi_class_target.values
val_data = val_dataset.drop(["multi_class_target"], axis=1).values
val_labels = val_dataset.multi_class_target.values

train_pool = Pool(train_data, 
                  train_labels, cat_features=[i for i in range(train_data.shape[1]) if i > 6])
val_pool = Pool(val_data, 
                  val_labels, cat_features=[i for i in range(val_data.shape[1]) if i > 6])

# test_pool= Pool(test_dataset.values, cat_features=[i for i in range(test_dataset.values.shape[1])])

In [315]:

param = {'iterations':10, 'loss_function':'MultiClass','eval_metric':'TotalF1',
         'l2_leaf_reg':1.0, 'auto_class_weights':'Balanced', 'boosting_type':'Plain',
        'depth':10, 'random_strength':3}
model = CatBoost(param)
#train the model
model.fit(train_pool, eval_set=val_pool) 
# # make the prediction using the resulting model
# preds_class = model.predict(val_pool, prediction_type='Class')
# preds_proba = model.predict(val_pool, prediction_type='Probability')
# preds_raw_vals = model.predict(val_pool, prediction_type='RawFormulaVal')

model.best_score_

0:	learn: 0.3931818	test: 0.0179520	best: 0.0179520 (0)	total: 21.1s	remaining: 3m 10s
1:	learn: 0.4030577	test: 0.0179520	best: 0.0179520 (1)	total: 40.3s	remaining: 2m 41s
2:	learn: 0.4250433	test: 0.0179520	best: 0.0179520 (1)	total: 59.2s	remaining: 2m 18s
3:	learn: 0.4216452	test: 0.0179520	best: 0.0179520 (1)	total: 1m 12s	remaining: 1m 49s
4:	learn: 0.4306081	test: 0.0242752	best: 0.0242752 (4)	total: 1m 32s	remaining: 1m 32s
5:	learn: 0.4321379	test: 0.0242752	best: 0.0242752 (4)	total: 1m 45s	remaining: 1m 10s
6:	learn: 0.4316757	test: 0.0273041	best: 0.0273041 (6)	total: 2m 5s	remaining: 54s
7:	learn: 0.4385075	test: 0.0243230	best: 0.0273041 (6)	total: 2m 24s	remaining: 36.1s
8:	learn: 0.4387202	test: 0.0242632	best: 0.0273041 (6)	total: 2m 44s	remaining: 18.2s
9:	learn: 0.4392476	test: 0.0243910	best: 0.0273041 (6)	total: 2m 57s	remaining: 0us

bestTest = 0.02730408398
bestIteration = 6

Shrink model to first 7 iterations.


{'learn': {'TotalF1': 0.4392476479259311, 'MultiClass': 2.0198890966347123},
 'validation': {'TotalF1': 0.027304083976894763,
  'MultiClass': 2.302738280055479}}

In [316]:
model.feature_importances_

array([7.55873047e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 4.50776153e-02, 1.19330628e-03, 2.50199936e-02,
       0.00000000e+00, 1.57486654e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.83565063e-07,
       0.00000000e+00, 9.98515467e+01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

# Make prediction

In [17]:
def preprocess(df):
    X = df.copy()
    add_dateparts(X, 'timestamp')
    columns_to_drop = ['Year', 'Month']
    X.drop(columns_to_drop, axis=1, inplace=True)
    return X
#alfabattle2_prediction_session_timestamp_test = pd.read_csv("alfabattle2_prediction_session_timestamp.csv")
#preprocess(alfabattle2_prediction_session_timestamp_test)

In [60]:
%reload_ext autoreload
%autoreload 2
import utils

utils.make_submit(model, preprocess, {"model": "Catboost"})
prediction = pd.read_csv("prediction.csv")
prediction.head()

  df[n] = np.int_(getattr(fld.dt, n.lower()))


submits/2020-12-15_21:17:48.464900_None/prediction.csv created
submits/2020-12-15_21:17:48.464900_None/meta_info.json created


Unnamed: 0,client_pin,prediction
0,f0c674b2bb4dc64be607029271d706ec,card_recharge
1,90725b54ce77576883813d87749df6bd,statement
2,eb0f82d74c7b7bd5eafbd5b5f8cb3e2a,credit_info
3,831bf4c0ecccc55e536b8cfb9153d672,card2card_transfer
4,3f1a5a1c492ce877af833113e59c5797,mobile_recharge
