In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
!pip install catboost optuna shap -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m538.2/538.2 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [346]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
import shap
import matplotlib.pyplot as plt
import seaborn
import optuna
from sklearn.metrics import ndcg_score

In [347]:
train_df = pd.read_csv('/content/gdrive/MyDrive/data/train_df.csv')
test_df = pd.read_csv('/content/gdrive/MyDrive/data/test_df.csv')

## EDA и препроцессинг
* удалим дубликаты
* проверим есть ли пропуски
* проверим датасет на наличие константных фичей
* удалим `target` и `search_id`

In [348]:
train_df.shape

(15081, 81)

In [349]:
train_df.drop_duplicates(inplace=True)
test_df.drop_duplicates(inplace=True)

In [350]:
train_df.isna().sum().sum()

0

In [351]:
cols_to_drop = train_df.nunique().index[train_df.nunique() == 1]
cols_to_drop_test = test_df.nunique().index[test_df.nunique() == 1]
cols_to_drop

Index(['feature_0', 'feature_73', 'feature_74', 'feature_75'], dtype='object')

In [352]:
train_df.drop(cols_to_drop, axis=1, inplace=True)
test_df.drop(cols_to_drop_test, axis=1, inplace=True)

Найдем отношение класса `0` к классу `1`

In [353]:
disbalance_coef = len(train_df[train_df['target'] == 0]) / len(train_df[train_df['target'] == 1])
disbalance_coef

45.798136645962735

In [354]:
train_df.drop(['search_id'], axis=1, inplace=True)
train_data = train_df.drop(['target'], axis=1)
target = train_df['target']

test = test_df.drop(['search_id', 'target'], axis=1)
test_target = test_df['target']

Видим, что в датасете имеется сильный дизбаланс классов. При обучении попробуем домножить коэффициенты меньшего класса на коэффициент отношения классов

## Обучение
* будем использовать catboost
* посмотрим на важность фичей
* попробуем обучить модель без фичей, которые не имеют важности

In [355]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target, test_size=0.2, random_state=42)

In [356]:
def train(params):
    model = CatBoostClassifier(**params)

    model.fit(
        X_train, y_train,
        eval_set = (X_test, y_test),
        verbose=50,
        early_stopping_rounds=30
    )

    return model

In [360]:
params = {
    'iterations': 400,
    'learning_rate': 0.01,
    'auto_class_weights': 'SqrtBalanced',
    # 'scale_pos_weight': disbalance_coef,
    'depth': 4,
    'random_state': 42
}

model = train(params)

predictions = model.predict_proba(X_test)[:, 1]
ndcg_score([y_test.values], [predictions])

0:	learn: 0.6855635	test: 0.6856694	best: 0.6856694 (0)	total: 9.73ms	remaining: 3.88s
50:	learn: 0.4600539	test: 0.4706634	best: 0.4706634 (50)	total: 544ms	remaining: 3.72s
100:	learn: 0.3845500	test: 0.4050861	best: 0.4050861 (100)	total: 1.09s	remaining: 3.23s
150:	learn: 0.3552754	test: 0.3853142	best: 0.3853142 (150)	total: 1.62s	remaining: 2.67s
200:	learn: 0.3408175	test: 0.3788586	best: 0.3788586 (200)	total: 2.18s	remaining: 2.16s
250:	learn: 0.3305693	test: 0.3769791	best: 0.3769791 (250)	total: 2.7s	remaining: 1.6s
300:	learn: 0.3219382	test: 0.3769942	best: 0.3765682 (273)	total: 3.83s	remaining: 1.26s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.3765681576
bestIteration = 273

Shrink model to first 274 iterations.


0.49711927840325515

Посмотрим на важность признаков катбуста


In [362]:
fi = model.get_feature_importance(prettified=True)
useless_features = list(fi[fi['Importances'] == 0]['Feature Id'])

In [363]:
train_data = train_df.drop(['target'] + useless_features, axis=1)
X_train, X_test, y_train, y_test = train_test_split(train_data, target, test_size=0.2, random_state=42)

params = {
    'iterations': 400,
    'learning_rate': 0.01,
    'auto_class_weights': 'SqrtBalanced',
    # 'scale_pos_weight': disbalance_coef,
    'depth': 4,
    'random_state': 42
}

model = train(params)

predictions = model.predict_proba(X_test)[:, 1]
ndcg_score([y_test.values], [predictions])

0:	learn: 0.6855635	test: 0.6856694	best: 0.6856694 (0)	total: 9.94ms	remaining: 3.97s
50:	learn: 0.4600539	test: 0.4706634	best: 0.4706634 (50)	total: 562ms	remaining: 3.85s
100:	learn: 0.3845500	test: 0.4050861	best: 0.4050861 (100)	total: 1.09s	remaining: 3.24s
150:	learn: 0.3552754	test: 0.3853142	best: 0.3853142 (150)	total: 1.63s	remaining: 2.69s
200:	learn: 0.3408175	test: 0.3788586	best: 0.3788586 (200)	total: 2.18s	remaining: 2.15s
250:	learn: 0.3305693	test: 0.3769791	best: 0.3769791 (250)	total: 2.73s	remaining: 1.62s
300:	learn: 0.3219382	test: 0.3769942	best: 0.3765682 (273)	total: 3.26s	remaining: 1.07s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.3765681576
bestIteration = 273

Shrink model to first 274 iterations.


0.49711927840325515

In [364]:
test_pred = model.predict_proba(test)[:, 1]

In [365]:
ndcg_score([test_target.values], [test_pred])

0.5498456162949592