In [1]:
# Enable IPython's autoreload extension so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 1: before each execution, reload only the modules registered via %aimport.
%autoreload 1
# Import the project-local helper module and register it for auto-reloading.
%aimport myutils

In [43]:
import sys

import catboost
from catboost import CatBoost, Pool
import numpy as np
import pandas as pd

# Report the interpreter and CatBoost versions in two aligned columns
# (labels are left-justified to a width of 16 characters).
for label, version in (
    ('python:', sys.version.split('\n')[0]),
    ('catboost:', catboost.__version__),
):
    print(label.ljust(16), version)

python:          3.8.9 (default, Apr 13 2022, 08:48:06) 
catboost:        1.0.6


## Define constants

In [3]:
# Seed handed to the model for reproducibility: 2**31 - 1 == 2147483647.
RANDOM_SEED = 2**31 - 1

# Directories are kept as plain strings with a trailing slash because file
# names are appended to them with `+` elsewhere in the notebook.
DATA_PATH, WORKING_PATH = '../data/', './working/'

# Column names
TARGET = 'is_bad'  # label column
TRAIN = 'TRAIN'  # boolean flag column: True for train rows, False for valid rows

## Load data

In [4]:
def _read_split(fname, is_train):
    """Read one CSV from DATA_PATH and tag every row with the train/valid flag."""
    frame = pd.read_csv(DATA_PATH + fname, parse_dates=['datetime_submitted'])
    frame[TRAIN] = is_train
    return frame


# Stack train and valid into a single frame so preprocessing is applied to both
# at once; the TRAIN flag column allows them to be separated again later.
# The per-split frames are never bound to a name, so nothing is left to delete.
df = pd.concat(
    [_read_split('train.csv', True), _read_split('val.csv', False)],
    ignore_index=True,
)

## Prepare dataframe
### Preprocess

In [5]:
# Offset, in Unix seconds (1559347215 == 2019-06-01 00:00:15 UTC), subtracted from
# every submission time. Presumably the earliest submission in the data — confirm.
timestamp_origin_s = 1559347215

# Whole seconds since the Unix epoch. The cast is pinned to int64 because plain
# 'int' is platform-dependent (32-bit on Windows with NumPy < 2), and integer floor
# division replaces the float division + second cast, avoiding a float64 round trip.
# Assumes the column has nanosecond resolution (hence the 10**9 divisor).
epoch_seconds = df['datetime_submitted'].astype('int64') // 10**9
df['timestamp'] = epoch_seconds - timestamp_origin_s

# Per-row weight = (size of the largest category) / (size of the row's category),
# so the most frequent category gets weight 1.0 and rarer ones get more.
# NOTE(review): sizes are counted on whatever rows `df` holds at this point; if that
# is train and valid combined, validation frequencies influence training weights.
category_size = df.groupby('category')['category'].transform('count')
df['weight'] = category_size.max() / category_size

### Feature engineering

In [6]:
# Time-of-day features (seconds since midnight, plus sin/cos of
# 2*pi*seconds/86400) used to be built here and were dropped again in this same
# cell before anything could read them. That dead computation is removed; the
# resulting frame is unchanged.
#
# Only the raw datetime column is dropped.
df = df.drop(columns=['datetime_submitted'])

### View df

In [7]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,title,description,subcategory,category,price,region,city,is_bad,TRAIN,timestamp,weight
0,Диван-кровать,Продаем диван-кровать. Удобный механизм - евро...,Мебель и интерьер,Для дома и дачи,7000.0,Россия,Москва,0,True,0,1.601919
1,Кожух рулевой колонки Даф хф 91 4509834,Кожух рулевой колонки DAF XF 94 (60066004)/\n ...,Запчасти и аксессуары,Транспорт,2290.0,Россия,Москва,0,True,29,1.0
2,Дешёвый буст аккаунтов Dota 4,! Буст аккаунтов с ммр выше 1000ммр не беру ! ...,Предложение услуг,Услуги,200.0,Северная Осетия,Владикавказ,1,True,35,3.743797
3,Телевизор sharp.Смарт тв.Интернет,Продам телевизор . Диагональ 450.наличие входа...,Аудио и видео,Бытовая электроника,25000.0,Калининградская область,Советск,1,True,35,1.385993
4,Открытка-конверт,Открытки-конверты ручной работы/\nВыполнены в ...,Коллекционирование,Хобби и отдых,150.0,Ставропольский край,Ессентукская,0,True,41,3.298685


### Split df into train and valid

In [8]:
def _unpack_split(rows):
    """Break one slice of df into (features, target, weight, timestamp).

    The target is returned as an array; weight and timestamp are returned as
    Series sharing the features' fresh 0..n-1 index.
    """
    target = rows[TARGET].values
    features = rows.drop(columns=[TRAIN, TARGET]).reset_index(drop=True)
    weight = features['weight']
    timestamp = features['timestamp']
    # weight / timestamp are bookkeeping columns, not model inputs
    return features.drop(columns=['weight', 'timestamp']), target, weight, timestamp


train_mask = df[TRAIN]
X_train, y_train, weight_train, timestamp_train = _unpack_split(df.loc[train_mask])
X_valid, y_valid, weight_valid, timestamp_valid = _unpack_split(df.loc[~train_mask])

# The combined frame is no longer needed; release it
del df, train_mask

### Get phone feature

In [9]:
%%time
# Tokenization mode
mode = (True, False)
mode_int = 1 + int(mode[0]) + 2*int(mode[1])

# Get tokenized columns
X_train['phone'] = myutils.get_tokenized_x(
    X_train['title'] + ' ' + X_train['description'],
    fname=WORKING_PATH + 'tokenized_x_train_w1_at_v3_' + str(mode_int) + '.csv',
    stopwords_fname = DATA_PATH + 'stopwords-ru.txt',
    regexp=r'(?u)[\w@]+',
    mode=mode,
    saving=True,
)
X_valid['phone'] = myutils.get_tokenized_x(
    X_valid['title'] + ' ' + X_valid['description'],
    fname=WORKING_PATH + 'tokenized_x_valid_w1_at_v3_' + str(mode_int) + '.csv',
    stopwords_fname = DATA_PATH + 'stopwords-ru.txt',
    regexp=r'(?u)[\w@]+',
    mode=mode,
    saving=True,
)

CPU times: user 7.65 s, sys: 510 ms, total: 8.16 s
Wall time: 8.17 s


In [10]:
%%time
# Get phone feature
X_train['phone'] = myutils.get_phone_feature(
    X_train['phone'],
    fname=WORKING_PATH + 'phone_train_w1_at.csv',
    saving=True,
)
X_valid['phone'] = myutils.get_phone_feature(
    X_valid['phone'],
    fname=WORKING_PATH + 'phone_valid_w1_at.csv',
    saving=True,
)

CPU times: user 105 ms, sys: 35 ms, total: 140 ms
Wall time: 141 ms


### Tokenize text data

In [11]:
%%time
# Tokenization mode
mode = (True, True)
mode_int = 1 + int(mode[0]) + 2*int(mode[1])

# Get tokenized columns
X_train['description'] = myutils.get_tokenized_x(
    X_train['title'] + ' ' + X_train['description'],
    fname=WORKING_PATH + 'tokenized_x_train_w1_at_v3_' + str(mode_int) + '.csv',
    stopwords_fname = DATA_PATH + 'stopwords-ru.txt',
    regexp=r'(?u)[\w@]+',
    mode=mode,
    saving=True,
)
X_valid['description'] = myutils.get_tokenized_x(
    X_valid['title'] + ' ' + X_valid['description'],
    fname=WORKING_PATH + 'tokenized_x_valid_w1_at_v3_' + str(mode_int) + '.csv',
    stopwords_fname = DATA_PATH + 'stopwords-ru.txt',
    regexp=r'(?u)[\w@]+',
    mode=mode,
    saving=True,
)

# Drop 'title' column
X_train.drop(columns='title', inplace=True)
X_valid.drop(columns='title', inplace=True)

CPU times: user 7.46 s, sys: 459 ms, total: 7.92 s
Wall time: 7.92 s


In [12]:
# Preview the first ten rows of the prepared training features.
X_train.iloc[:10]

Unnamed: 0,description,subcategory,category,price,region,city,phone
0,диван кровать продавать диван кровать удобный ...,Мебель и интерьер,Для дома и дачи,7000.0,Россия,Москва,0
1,кожух рулевой колонка даф хф 9 кожух рулевой к...,Запчасти и аксессуары,Транспорт,2290.0,Россия,Москва,5
2,дешёвый буста аккаунт dota 1 буста аккаунт ммр...,Предложение услуг,Услуги,200.0,Северная Осетия,Владикавказ,0
3,телевизор sharp смарт тв интернет продать теле...,Аудио и видео,Бытовая электроника,25000.0,Калининградская область,Советск,0
4,открытка конверт открытка конверт ручной работ...,Коллекционирование,Хобби и отдых,150.0,Ставропольский край,Ессентукская,0
5,зимний шина hankook winter i pike rs w 4 разме...,Запчасти и аксессуары,Транспорт,11000.0,Московская область,Железнодорожный,6
6,lada priora 4 приор 3 норма кондинционер 3 маш...,Автомобили,Транспорт,340000.0,Чеченская Республика,Грозный,0
7,дверь входной продать дверь входной дать дверь...,Ремонт и строительство,Для дома и дачи,3000.0,Россия,Санкт-Петербург,7
8,джинсы фирма gulliver продавать джинсы фирма g...,Детская одежда и обувь,Личные вещи,500.0,Россия,Москва,0
9,кроссовок nike air max 3 premium купить asos п...,"Одежда, обувь, аксессуары",Личные вещи,8000.0,Россия,Москва,0


In [13]:
# Shapes of features (X) and targets (y): one row per split, one column per object
shape_table = pd.DataFrame({
    column: {'train': train_part.shape, 'valid': valid_part.shape}
    for column, (train_part, valid_part) in {
        'X': (X_train, X_valid),
        'y': (y_train, y_valid),
    }.items()
})
shape_table

Unnamed: 0,X,y
train,"(984487, 7)","(984487,)"
valid,"(16237, 7)","(16237,)"


## Define classifier and fit

In [16]:
# Column roles for CatBoost; columns not listed here are passed through as-is
cat_features = ['category', 'subcategory', 'region', 'city', 'phone']
text_features = ['description']


def _build_pool(features, labels, weights):
    """Wrap one split in a CatBoost Pool with the shared column roles and row weights."""
    return Pool(
        data=features,
        label=labels,
        cat_features=cat_features,
        text_features=text_features,
        weight=weights,
    )


train_pool = _build_pool(X_train, y_train, weight_train)
valid_pool = _build_pool(X_valid, y_valid, weight_valid)

In [33]:
# Training schedule
iterations, early_stopping_rounds, learning_rate = 10000, 2000, 0.15


def _ngram_dictionary(dictionary_id, token_level_type, gram_order):
    """Build one dictionary spec; only id, token level and n-gram order vary."""
    return {
        'start_token_id': '0',
        'occurrence_lower_bound': '5',
        'skip_step': '0',
        'end_of_word_token_policy': 'Insert',
        'token_level_type': token_level_type,
        'end_of_sentence_token_policy': 'Skip',
        'gram_order': gram_order,
        'max_dictionary_size': '50000',
        'dictionary_id': dictionary_id,
    }


# Single tokenizer: splits on a space delimiter, with lowercasing and
# lemmatizing switched off in this spec.
_space_tokenizer = {
    'number_token': '🔢',
    'skip_empty': '1',
    'number_process_policy': 'LeaveAsIs',
    'tokenizer_id': 'Space',
    'token_types': ['Number', 'Unknown', 'Word'],
    'delimiter': ' ',
    'languages': [],
    'lemmatizing': '0',
    'split_by_set': '0',
    'lowercasing': '0',
    'subtokens_policy': 'SingleToken',
    'separator_type': 'ByDelimiter',
}

# All option values are strings, exactly as they are handed to CatBoost.
text_processing = {
    'tokenizers': [_space_tokenizer],
    # Word uni-/bi-grams plus letter 1-/2-/3-grams
    'dictionaries': [
        _ngram_dictionary('BiGram', 'Word', '2'),
        _ngram_dictionary('Word', 'Word', '1'),
        _ngram_dictionary('L1', 'Letter', '1'),
        _ngram_dictionary('L2', 'Letter', '2'),
        _ngram_dictionary('L3', 'Letter', '3'),
    ],
    # BoW over every dictionary; NaiveBayes over single words only
    'feature_processing': {
        'default': [
            {
                'tokenizers_names': ['Space'],
                'dictionaries_names': ['Word', 'BiGram', 'L1', 'L2', 'L3'],
                'feature_calcers': ['BoW'],
            },
            {
                'tokenizers_names': ['Space'],
                'dictionaries_names': ['Word'],
                'feature_calcers': ['NaiveBayes'],
            },
        ],
    },
}

# Full training configuration. Snapshots are written under train_dir every
# `snapshot_interval` (300), so an interrupted run can be resumed.
param = dict(
    iterations=iterations,
    early_stopping_rounds=early_stopping_rounds,
    learning_rate=learning_rate,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=RANDOM_SEED,
    verbose=50,
    train_dir=WORKING_PATH + 'catboost/',
    save_snapshot=True,
    snapshot_interval=300,
    task_type='CPU',
    text_processing=text_processing,
)

# Fit on the train pool; the valid pool is the evaluation set
model = CatBoost(param)
model.fit(train_pool, eval_set=valid_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.8739149	best: 0.8739149 (0)	total: 979ms	remaining: 2h 43m 8s
50:	test: 0.9457845	best: 0.9457845 (50)	total: 58.5s	remaining: 3h 10m 5s
100:	test: 0.9535251	best: 0.9537171 (98)	total: 1m 53s	remaining: 3h 6m
150:	test: 0.9569135	best: 0.9569135 (150)	total: 2m 58s	remaining: 3h 13m 35s
200:	test: 0.9593629	best: 0.9593629 (200)	total: 4m 11s	remaining: 3h 24m 32s
250:	test: 0.9611623	best: 0.9612183 (248)	total: 5m 30s	remaining: 3h 33m 41s
300:	test: 0.9624698	best: 0.9624705 (297)	total: 6m 53s	remaining: 3h 42m 13s
350:	test: 0.9628766	best: 0.9628786 (349)	total: 8m 24s	remaining: 3h 51m 9s
400:	test: 0.9635746	best: 0.9635746 (400)	total: 9m 43s	remaining: 3h 52m 54s
450:	test: 0.9640638	best: 0.9641107 (449)	total: 10m 59s	remaining: 3h 52m 33s
500:	test: 0.9644035	best: 0.9644628 (490)	total: 12m 14s	remaining: 3h 51m 58s
550:	test: 0.9650063	best: 0.9650320 (548)	total: 13m 24s	remaining: 3h 49m 48s
600:	test: 0.9652234	best: 0.9652234 (600)	total: 14m 43s	remainin

<catboost.core.CatBoost at 0x28103ae50>

In [35]:
# Feature importances as a table (prettified=True)
importance_table = model.get_feature_importance(prettified=True)
importance_table

Unnamed: 0,Feature Id,Importances
0,description,78.213563
1,phone,10.174655
2,subcategory,5.171285
3,category,3.215437
4,city,1.243797
5,region,0.992567
6,price,0.988695


In [36]:
# The file name encodes the run settings: iteration count, then the learning
# rate with its decimal point removed (0.15 -> '015').
lr_tag = str(learning_rate).replace('.', '')
model_fname = ''.join([WORKING_PATH, 'model_', str(iterations), '_', lr_tag, '.cbm'])
model.save_model(model_fname)

## Predict

In [37]:
%%time
preds_proba = model.predict(valid_pool, prediction_type='Probability')[:, 1]
preds_proba

CPU times: user 11.9 s, sys: 186 ms, total: 12.1 s
Wall time: 1.78 s


array([9.98001986e-01, 3.12286240e-03, 8.45823133e-01, ...,
       7.66845921e-04, 6.23675778e-04, 9.02255425e-01])

## Compute metric

In [38]:
# Score the validation predictions, passing the category column so the helper
# can also report a score per category
macro_score, micro_score, roc_auc = myutils.get_score(y_valid, preds_proba, X_valid['category'])

# Overall scores first, then the per-category values, best to worst
for label, score in (('Macro:', macro_score), ('Micro:', micro_score)):
    print(label, score)
print()
per_category = pd.Series(roc_auc)
per_category.sort_values(ascending=False)

Macro: 0.9452982186251624
Micro: 0.9561080102433172



Транспорт              0.991114
Недвижимость           0.983867
Работа                 0.965288
Животные               0.953832
Для дома и дачи        0.951627
Хобби и отдых          0.950236
Бытовая электроника    0.948473
Для бизнеса            0.934783
Услуги                 0.920082
Личные вещи            0.853680
dtype: float64