### Imports

In [1]:
import pandas as pd
import dask.dataframe as dd
from os import listdir
from os.path import isfile, join
import numpy as np
from catboost import CatBoostClassifier, Pool, CatBoost
import sklearn as sk

### Clickstream Reading

In [2]:
class DataReader:
    def __init__(self, chunkfolder_path, read_func_name, concat_func_name=None):        
        self.chunkfolder_path=chunkfolder_path
        self.read = read_func_name
        self.concat= concat_func_name
        return

    def read_chunks(self):
        self.chunk_path = tuple(chunkfolder_path + '/' + f for f in listdir(self.chunkfolder_path) if isfile(join(self.chunkfolder_path, f)))
        self.chunks_iterator = map(self.read,self.chunk_path)
        return
    
    def put_chunks_to_memory(self, first_chunk_only=False):
        if first_chunk_only:
            self.chunks = [next(self.chunks_iterator)]
        else:
            self.chunks = list(self.chunks_iterator)
        return
    
    def concat_chunks(self):
        if self.concat:
            self.concatenated_chunks = self.concat(self.chunks)
        return


In [3]:
chunkfolder_path = 'clickstream'

ChunkReader = DataReader(chunkfolder_path, pd.read_parquet, pd.concat) # создаем экземпляр для чтения файлов из директории clickstream
ChunkReader.read_chunks() # сохраняем пути до файлов
print(ChunkReader.chunk_path) # проверяем пути до файлов
ChunkReader.put_chunks_to_memory(first_chunk_only=True) # читаем только первый chunk
print(len(ChunkReader.chunks)) # один чанк
ChunkReader.put_chunks_to_memory(first_chunk_only=False) # складываем чанки в лист
print(len(ChunkReader.chunks)) # чанки по одному лежат тут.
ChunkReader.concat_chunks() # объединяем в один большой


('clickstream/part-00000.parquet', 'clickstream/part-00009.parquet', 'clickstream/part-00003.parquet', 'clickstream/part-00001.parquet', 'clickstream/part-00004.parquet', 'clickstream/part-00007.parquet', 'clickstream/part-00005.parquet', 'clickstream/part-00008.parquet', 'clickstream/part-00006.parquet', 'clickstream/part-00002.parquet')
1
9


In [4]:
train_target_df = pd.read_csv('alfabattle2_abattle_train_target.csv',
                              parse_dates=['timestamp'], infer_datetime_format=True)

In [5]:
merged_df = ChunkReader.chunks[0].merge(train_target_df, how='inner', on='session_id') # джойним кликстрим на сессии


### First event in session calculating

In [6]:
merged_df["event_num"] = merged_df.sort_values(['timestamp_x'], ascending=True)\
             .groupby(['session_id'])\
             .cumcount() + 1

In [7]:
random_client = merged_df[merged_df.client=='8f1fc0c1216403afc07b5d602680b313']

### Feature generating

In [2]:
train_target_df = pd.read_csv('alfabattle2_abattle_train_target.csv',
                              parse_dates=['timestamp'], infer_datetime_format=True)

def add_dateparts(df, fldname, drop_date_columns=True):
    '''
    Adds features extracted from date. Full list below:
    'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear','Is_month_end',
    'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'
    Returns: 0 if everything is fine
    '''
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
              'Is_year_end', 'Is_year_start', 'Hour', 'Minute', 'Second'):
        df[n] = np.int_(getattr(fld.dt, n.lower()))
    if drop_date_columns:
        df.drop(fldname, axis=1, inplace=True)
    return 

add_dateparts(train_target_df, 'timestamp') # добавллям фичи из даты, удаляем timestamp
train_target_df

test_dataset = pd.read_csv('alfabattle2_prediction_session_timestamp.csv',
                              parse_dates=['timestamp'], infer_datetime_format=True)
add_dateparts(test_dataset, 'timestamp')

### Train and val split

In [3]:
# Проблема в том, что по-хорошему нельзя случайным образом разбивать выборку на test и train
# Сессии из будущего плохо использовать для предсказания прошлого
# ~10% юзеров исключить из train, чтобы посмотреть, как модель справляется с новым юзерами
# остальные данные для валидации надо подобрать так, чтобы в валидации были только данные из будущего относительно train-а

train_target_df["client_hash"] = train_target_df.client_pin.apply(hash)%10
val_dataset = train_target_df[train_target_df.Month==9] # максимальный месяц идет в валидацию
train_dataset = train_target_df[(train_target_df.Month!=9)|(train_target_df.client_hash==10)]

### Detelete extra columns

In [4]:
train_dataset
columns_to_drop = ['Year', 'Month', 'session_id', 'client_hash']
train_dataset.drop(columns_to_drop, axis=1, inplace=True)
val_dataset.drop(columns_to_drop, axis=1, inplace=True)
test_dataset.drop(['Year', 'Month'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Making a model

In [11]:
train_dataset

Unnamed: 0,client_pin,multi_class_target,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Hour,Minute,Second
0,7cf9221322a0e2fdefb1b998b8f2ab29,main_screen,25,15,0,167,0,0,0,0,0,0,14,1,12
1,5f16c0ab27a806fd08db3122921adf3a,invest,12,21,5,81,0,0,0,0,0,0,12,59,34
2,ec868fc2b388293cf10e18ee9518d72f,statement,4,24,4,24,0,0,0,0,0,0,18,18,55
3,91f55a33d7502c1a1fa5da7ff2f7b648,main_screen,11,15,6,75,0,0,0,0,0,0,19,50,23
4,3ef1020bda95ce7836d2680fa553ecb7,main_screen,8,20,3,51,0,0,0,0,0,0,7,56,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5065345,8f1fc0c1216403afc07b5d602680b313,main_screen,25,16,1,168,0,0,0,0,0,0,15,21,0
5065346,2675ab80450c36c8f2600d8dcd60b43c,main_screen,17,22,2,113,0,0,0,0,0,0,20,31,9
5065347,2c5e889f5360eabefc21181240d0d38e,main_screen,23,2,1,154,0,0,0,0,0,0,11,43,6
5065348,eb90469b91e22f5ba12986acd873def9,main_screen,14,30,0,90,0,0,0,0,0,0,0,39,3


In [5]:
train_data = train_dataset.drop(["multi_class_target"], axis=1).values
train_labels = train_dataset.multi_class_target.values
val_data = val_dataset.drop(["multi_class_target"], axis=1).values
val_labels = val_dataset.multi_class_target.values

train_pool = Pool(train_data, 
                  train_labels, cat_features=[i for i in range(train_data.shape[1])])
val_pool = Pool(val_data, 
                  val_labels, cat_features=[i for i in range(val_data.shape[1])])

test_pool= Pool(test_dataset.values, cat_features=[i for i in range(test_dataset.values.shape[1])])

In [6]:

param = {'iterations':5, 'loss_function':'MultiClass','eval_metric':'TotalF1',
         'l2_leaf_reg':3.0, 'auto_class_weights':'Balanced', 'boosting_type':'Plain',
        'depth':10}
model = CatBoost(param)
#train the model
model.fit(train_pool, eval_set=val_pool) 
# make the prediction using the resulting model
preds_class = model.predict(val_pool, prediction_type='Class')
preds_proba = model.predict(val_pool, prediction_type='Probability')
preds_raw_vals = model.predict(val_pool, prediction_type='RawFormulaVal')

model.best_score_

0:	learn: 0.4273354	test: 0.3997972	best: 0.3997972 (0)	total: 49.4s	remaining: 15m 38s
1:	learn: 0.4367500	test: 0.4083757	best: 0.4083757 (1)	total: 1m 35s	remaining: 14m 22s
2:	learn: 0.4358085	test: 0.4073154	best: 0.4083757 (1)	total: 2m 14s	remaining: 12m 41s
3:	learn: 0.4397085	test: 0.4110579	best: 0.4110579 (3)	total: 2m 59s	remaining: 11m 58s
4:	learn: 0.4420618	test: 0.4112746	best: 0.4112746 (4)	total: 3m 38s	remaining: 10m 54s
5:	learn: 0.4402289	test: 0.4101375	best: 0.4112746 (4)	total: 4m 31s	remaining: 10m 32s
6:	learn: 0.4417178	test: 0.4121822	best: 0.4121822 (6)	total: 5m 10s	remaining: 9m 36s
7:	learn: 0.4420684	test: 0.4118245	best: 0.4121822 (6)	total: 5m 48s	remaining: 8m 42s
8:	learn: 0.4437574	test: 0.4126399	best: 0.4126399 (8)	total: 6m 43s	remaining: 8m 12s
9:	learn: 0.4440061	test: 0.4124160	best: 0.4126399 (8)	total: 7m 15s	remaining: 7m 15s
10:	learn: 0.4442843	test: 0.4133888	best: 0.4133888 (10)	total: 7m 42s	remaining: 6m 18s
11:	learn: 0.4447333	test

{'learn': {'TotalF1': 0.4491822659215517, 'MultiClass': 1.866505789556933},
 'validation': {'TotalF1': 0.41650624680951115,
  'MultiClass': 1.9144766801747028}}

### Checking test

In [7]:
model.feature_importances_

array([100.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.])