In [1]:
from IPython.core.display import HTML

HTML(
    r"""
<style>
    .output-plaintext, .output-stream, .output {
        font-family: "JetBrainsMono Nerd Font Mono"; # Any monospaced font should work
    }
</style>
"""
)

In [14]:
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import catboost
from catboost import CatBoostClassifier, Pool
from catboost.metrics import MultiLogloss, AUCMulticlass, AUC, Accuracy, Logloss

In [3]:
seed = 42

In [4]:
train_df = pd.read_csv("./data/prepared/train.csv", low_memory=False).drop(
    columns=["id"]
)
test_df = pd.read_csv("./data/prepared/test.csv", low_memory=False).drop(columns=["id"])
train_labels_df = pd.read_csv(
    "./data/prepared/train_labels.csv", low_memory=False
).drop(columns=["id"])


# size = 1_000
# train_df = train_df.sample(n=size)
# test_df = test_df.sample(n=size)
# train_labels_df = train_labels_df.sample(n=size)

In [5]:
train_df

Unnamed: 0,release,n_0000,n_0001,n_0002,n_0003,n_0004,n_0005,n_0006,n_0007,n_0009,...,c_1366,c_1367,c_1369,c_1370,c_1372,c_1373,c_1374,c_1375,c_1376,c_1377
0,a,,,0.025449,,,0.368421,,,,...,,,,,a,,q,,,
1,a,,,0.031297,,,0.315789,,,,...,,,,a,a,,,,,
2,a,,,0.024475,,,0.342105,,,,...,,,,a,a,,b,,,
3,a,,,0.041694,,,0.447368,,,,...,,,,,a,,,,,
4,c,,,0.038120,,,0.315789,,,,...,,,,b,a,,a,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,a,,,0.053931,,,0.394737,,,,...,a,,,,a,,,,,b
7996,a,,,0.031731,,,0.394737,,,,...,,,,,a,,a,,,
7997,c,,0.904762,0.033463,,,0.394737,,,,...,,,,,a,,,,,
7998,c,,,0.047109,,,0.289474,,,,...,,,,,a,,q,,,


In [6]:
train_labels_df

Unnamed: 0,service_a,service_b,service_c,service_d,service_e,service_f,service_g,service_h,service_i,service_j,service_k,service_l,service_m,service_n
0,1,1,0,0,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,0,0,0,0,0,1,1,0,0,0
2,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,0,0,0,1,0,0,0,0,0,1,0,1,0,0
4,0,0,0,1,1,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,1,1,1,0,0,0,1,0,0,1,1,1,1,1
7996,0,0,0,0,0,0,0,0,0,1,1,0,0,0
7997,1,0,0,0,0,0,0,0,0,1,1,0,0,0
7998,1,1,1,0,0,0,0,0,0,1,1,0,0,0


### Обучим втупую на всех фичах

In [8]:
cat_features = train_df.dtypes[train_df.dtypes == object].index.tolist()

Catboost хочет, чтобы в categorical фичах наны заменялись на стринги

In [9]:
train_df[cat_features] = train_df[cat_features].fillna("nan")

In [9]:
train_features, valid_features, train_targets, valid_targets = train_test_split(
    train_df, train_labels_df, test_size=0.2
)

In [10]:
features = train_features.columns[:30]
features

Index(['release', 'n_0000', 'n_0001', 'n_0002', 'n_0003', 'n_0004', 'n_0005',
       'n_0006', 'n_0007', 'n_0009', 'n_0010', 'n_0012', 'n_0013', 'n_0014',
       'n_0015', 'n_0016', 'n_0017', 'n_0018', 'n_0019', 'n_0020', 'n_0021',
       'n_0022', 'n_0023', 'n_0024', 'n_0025', 'n_0026', 'n_0027', 'n_0028',
       'n_0029', 'n_0030'],
      dtype='object')

#### TODO: поиграться с норм метриками, типа precision/recall/f1 - weighted?)

Метрика logloss на каждом лейбле отдельно и среднее арифметическое — совпадает с MultiLogLoss() из Catboost

In [42]:
train_pool = Pool(
    data=train_features,
    label=train_targets,
    cat_features=cat_features,
    thread_count=4,
)

valid_pool = Pool(
    data=valid_features,
    label=valid_targets,
    cat_features=cat_features,
    thread_count=4,
)

params = {
    "learning_rate": 1,
    "iterations": 10,
    "random_seed": seed,
    "use_best_model": True,
    "thread_count": 4,
    "loss_function": MultiLogloss(),
}

catboost_clf = CatBoostClassifier(**params)
catboost_clf.fit(train_pool, eval_set=valid_pool, plot=False)

NameError: name 'LogLoss' is not defined

In [None]:
valid_preds = catboost_clf.predict(valid_features)

metrics.roc_auc_score(valid_targets, valid_preds)

0.6530730124589476

In [27]:
valid_probas = catboost_clf.predict_proba(valid_features)

metrics.log_loss(valid_targets, valid_probas)

6.908886564076436

Чо-то хрень какая-то

### baseline: top 90% features

In [73]:
train_counts = train_df.describe(include="all").loc["count"]
well_defined_columns = train_counts[train_counts > 7000].index
well_done_df = train_df[well_defined_columns]

In [75]:
train_features, valid_features, train_targets, valid_targets = train_test_split(
    well_done_df, train_labels_df, test_size=0.2
)

train_pool = Pool(
    data=train_features,
    label=train_targets,
    cat_features=cat_features,
    thread_count=4,
)

valid_pool = Pool(
    data=valid_features,
    label=valid_targets,
    cat_features=cat_features,
    thread_count=4,
)

params = {
    "learning_rate": 0.1,
    "iterations": 15,
    "random_seed": seed,
    "use_best_model": True,
    "thread_count": 4,
    "loss_function": MultiLogloss(),
}

catboost_clf = CatBoostClassifier(**params)
catboost_clf.fit(train_pool, eval_set=valid_pool, plot=False)

0:	learn: 0.6104919	test: 0.6098172	best: 0.6098172 (0)	total: 15.9s	remaining: 3m 42s
1:	learn: 0.5526389	test: 0.5512414	best: 0.5512414 (1)	total: 31.9s	remaining: 3m 27s
2:	learn: 0.5032236	test: 0.5011189	best: 0.5011189 (2)	total: 43.1s	remaining: 2m 52s
3:	learn: 0.4575585	test: 0.4548185	best: 0.4548185 (3)	total: 49.2s	remaining: 2m 15s
4:	learn: 0.4198556	test: 0.4168571	best: 0.4168571 (4)	total: 1m 5s	remaining: 2m 11s
5:	learn: 0.3955510	test: 0.3919960	best: 0.3919960 (5)	total: 1m 17s	remaining: 1m 55s
6:	learn: 0.3753491	test: 0.3712820	best: 0.3712820 (6)	total: 1m 25s	remaining: 1m 38s
7:	learn: 0.3604275	test: 0.3560849	best: 0.3560849 (7)	total: 1m 36s	remaining: 1m 24s
8:	learn: 0.3491552	test: 0.3445047	best: 0.3445047 (8)	total: 1m 47s	remaining: 1m 11s
9:	learn: 0.3363290	test: 0.3322391	best: 0.3322391 (9)	total: 2m 2s	remaining: 1m 1s
10:	learn: 0.3299836	test: 0.3259065	best: 0.3259065 (10)	total: 2m 13s	remaining: 48.5s
11:	learn: 0.3196863	test: 0.3164611	b

<catboost.core.CatBoostClassifier at 0x148361700>

In [78]:
valid_probas = catboost_clf.predict_proba(valid_features)

log_loss_well_done = pd.Series({
    label: round(
        metrics.log_loss(valid_targets.iloc[:, label_id], valid_preds[:, label_id]),
        2
    )
    for label_id, label in enumerate(train_labels_df.columns)
}
)
log_loss_well_done["mean"] = log_loss_well_done.mean()
log_loss_well_done

service_a    18.160000
service_b    14.730000
service_c    10.610000
service_d     0.700000
service_e     1.980000
service_f     1.420000
service_g     2.000000
service_h    13.110000
service_i     0.830000
service_j     5.720000
service_k     9.010000
service_l     7.120000
service_m     5.860000
service_n     8.450000
mean          7.121429
dtype: float64

MultiLogLoss выдает какую-то хрень

### Обучим отдельные классификаторы

In [11]:
train_features, valid_features, train_targets, valid_targets = train_test_split(
    train_df, train_labels_df, test_size=0.2
)

In [21]:
def train_on_label(train_features, train_targets, valid_features, valid_targets) -> tuple[CatBoostClassifier, float, float]:
    train_pool = Pool(
        data=train_features,
        label=train_targets,
        cat_features=cat_features,
        thread_count=4,
    )

    valid_pool = Pool(
        data=valid_features,
        label=valid_targets,
        cat_features=cat_features,
        thread_count=4,
    )

    params = {
        "learning_rate": 0.1,
        "iterations": 100,
        "random_seed": seed,
        "use_best_model": True,
        "thread_count": 4,
        "loss_function": Logloss(),
    }

    catboost_clf = CatBoostClassifier(**params)
    catboost_clf.fit(train_pool, eval_set=valid_pool, plot=False, verbose=False)

    valid_preds = catboost_clf.predict(valid_features)
    valid_probas = catboost_clf.predict_proba(valid_features)

    log_loss = metrics.log_loss(valid_targets, valid_probas)
    roc_auc = metrics.roc_auc_score(valid_targets, valid_preds)

    print(f"log loss: {log_loss:.2}")
    print(f"roc auc:  {roc_auc:.2}")

    return catboost_clf, log_loss, roc_auc

In [22]:
classifiers = {}
metrics_individual = pd.DataFrame(columns=("log_loss", "roc_auc"))

for label in train_targets.columns:
    print(label)
    clf, log_loss, roc_auc = train_on_label(train_features, train_targets[label], valid_features, valid_targets[label])
    classifiers[label] = clf
    metrics_individual.loc[label] = (log_loss, roc_auc)
    print()

service_a
log loss: 0.37
roc auc:  0.83

service_b
log loss: 0.49
roc auc:  0.73

service_c
log loss: 0.51
roc auc:  0.53

service_d
log loss: 0.028
roc auc:  0.8

service_e
log loss: 0.15
roc auc:  0.54

service_f
log loss: 0.06
roc auc:  0.54

service_g
log loss: 0.2
roc auc:  0.5

service_h
log loss: 0.43
roc auc:  0.71

service_i
log loss: 0.041
roc auc:  0.57

service_j
log loss: 0.36
roc auc:  0.59

service_k
log loss: 0.45
roc auc:  0.58

service_l
log loss: 0.11
roc auc:  0.95

service_m
log loss: 0.097
roc auc:  0.9

service_n
log loss: 0.33
roc auc:  0.6



In [31]:
metrics_individual.loc["mean"] = metrics_individual.mean()

In [32]:
metrics_individual.round(2)

Unnamed: 0,log_loss,roc_auc
service_a,0.37,0.83
service_b,0.49,0.73
service_c,0.51,0.53
service_d,0.03,0.8
service_e,0.15,0.54
service_f,0.06,0.54
service_g,0.2,0.5
service_h,0.43,0.71
service_i,0.04,0.57
service_j,0.36,0.59


Для сравнения — метрики модели LogReg. Catboost выиграл в logloss, но сильно проиграл в рокауке

```yaml
log_loss:
  service_a: 0.37
  service_b: 0.49
  service_c: 0.51
  service_d: 0.03
  service_e: 0.18
  service_f: 0.05
  service_g: 0.16
  service_h: 0.47
  service_i: 0.06
  service_j: 0.35
  service_k: 0.45
  service_l: 0.22
  service_m: 0.19
  service_n: 0.33
  mean: 0.28
roc_auc:
  service_a: 0.91
  service_b: 0.81
  service_c: 0.75
  service_d: 0.99
  service_e: 0.82
  service_f: 0.98
  service_g: 0.85
  service_h: 0.81
  service_i: 0.9
  service_j: 0.75
  service_k: 0.74
  service_l: 0.9
  service_m: 0.91
  service_n: 0.85
  mean: 0.86
```

обучим еще на сабсете

In [35]:
train_counts = train_df.describe(include="all").loc["count"]
well_defined_columns = train_counts[train_counts > 7000].index
well_done_df = train_df[well_defined_columns]

In [37]:
train_features, valid_features, train_targets, valid_targets = train_test_split(
    well_done_df, train_labels_df, test_size=0.2
)

classifiers = {}
metrics_individual = pd.DataFrame(columns=("log_loss", "roc_auc"))

for label in train_targets.columns:
    print(label)
    clf, log_loss, roc_auc = train_on_label(train_features, train_targets[label], valid_features, valid_targets[label])
    classifiers[label] = clf
    metrics_individual.loc[label] = (log_loss, roc_auc)
    print()

service_a
log loss: 0.35
roc auc:  0.85

service_b
log loss: 0.5
roc auc:  0.73

service_c
log loss: 0.52
roc auc:  0.53

service_d
log loss: 0.024
roc auc:  0.75

service_e
log loss: 0.16
roc auc:  0.54

service_f
log loss: 0.058
roc auc:  0.55

service_g
log loss: 0.16
roc auc:  0.5

service_h
log loss: 0.43
roc auc:  0.72

service_i
log loss: 0.051
roc auc:  0.58

service_j
log loss: 0.36
roc auc:  0.61

service_k
log loss: 0.45
roc auc:  0.59

service_l
log loss: 0.085
roc auc:  0.95

service_m
log loss: 0.095
roc auc:  0.93

service_n
log loss: 0.33
roc auc:  0.61



## разбираюсь почему не работает plot

In [4]:
from catboost.datasets import titanic

train_df, test_df = titanic()

train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

X = train_df.drop("Survived", axis=1)
y = train_df.Survived
categorical_features_indices = np.where(X.dtypes != float)[0]
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.75, random_state=42
)

X_test = test_df

model = CatBoostClassifier(
    custom_loss=[Accuracy()], random_seed=42, logging_level="Silent"
)

model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    #     logging_level='Verbose',  # you can uncomment this for text output
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))