У нас сильный дисбаланс классов. На таких данных обучать модели сложно. Давай справляться с этим через работу с данными и с метриками

In [21]:
import numpy as np
import pandas as pd
import sklearn

from sklearn import metrics
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    RepeatedStratifiedKFold,
    GridSearchCV,
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler

In [2]:
from IPython.core.display import HTML

HTML(
    r"""
<style>
    .output-plaintext, .output-stream, .output {
        font-family: "JetBrainsMono Nerd Font Mono"; # Any monospaced font should work
    }
</style>
"""
)

np.set_printoptions(precision=2)
pd.set_option("display.precision", 2)
seed = 42

Read data

In [3]:
data = pd.read_csv("./data/prepared/train.csv", low_memory=False).drop(columns=["id"])
labels = pd.read_csv("./data/prepared/train_labels.csv", low_memory=False).drop(
    columns=["id"]
)

Будем работать с датасетом с меньшим количеством фичей, чтобы не париться о нанах

In [4]:
feature_counts = data.describe(include="all").loc["count"]
well_defined_columns = feature_counts[feature_counts > 7000].index
data = data[well_defined_columns]

In [36]:
data.select_dtypes(include="object").columns

Index(['release', 'c_0368', 'c_0401', 'c_0426', 'c_0444', 'c_0456', 'c_0461',
       'c_0466', 'c_0500', 'c_0543', 'c_0544', 'c_0554', 'c_0582', 'c_0590',
       'c_0601', 'c_0623', 'c_0638', 'c_0653', 'c_0657', 'c_0662', 'c_0665',
       'c_0699', 'c_0704', 'c_0707', 'c_0709', 'c_0738', 'c_0755', 'c_0761',
       'c_0762', 'c_0764', 'c_0770', 'c_0809', 'c_0838', 'c_0845', 'c_0858',
       'c_0870', 'c_0887', 'c_0891', 'c_0917', 'c_0939', 'c_0956', 'c_0975',
       'c_0980', 'c_0983', 'c_0996', 'c_1004', 'c_1052', 'c_1055', 'c_1069',
       'c_1075', 'c_1101', 'c_1122', 'c_1130', 'c_1131', 'c_1145', 'c_1158',
       'c_1186', 'c_1189', 'c_1210', 'c_1223', 'c_1227', 'c_1236', 'c_1244',
       'c_1252', 'c_1259', 'c_1286', 'c_1316', 'c_1348', 'c_1372'],
      dtype='object')

In [26]:
cat_features = list(data.select_dtypes(include="object").columns)

# We can't use include="float", as we also have ints (that are *presumably* numerical)
numeric_features = list(data.select_dtypes(exclude="object").columns)

Посмотрим на количество категорий в разных фичах

In [34]:
data[cat_features].apply(pd.value_counts).count().values

array([ 3,  2,  2,  2,  2,  2,  2,  3,  9,  3,  5,  2,  6,  4,  2,  2,  5,
        5,  6,  2,  2,  3,  2,  2,  2,  2,  2,  3,  3,  3,  4,  2,  2,  6,
        2,  2,  2,  5,  8,  2,  2,  3,  4,  2,  2,  2,  2,  2,  2,  2,  5,
        2,  3,  2,  3, 11,  2,  2, 10,  4,  2,  2,  6,  2, 32,  2,  2,  4,
        2])

In [38]:
(data.dtypes == object)

release     True
n_0002     False
n_0005     False
n_0019     False
n_0038     False
           ...  
c_1259      True
c_1286      True
c_1316      True
c_1348      True
c_1372      True
Length: 90, dtype: bool

In [48]:
data

Unnamed: 0,release,n_0002,n_0005,n_0019,n_0038,n_0047,n_0050,n_0052,n_0061,n_0067,...,c_1223,c_1227,c_1236,c_1244,c_1252,c_1259,c_1286,c_1316,c_1348,c_1372
0,a,0.03,0.37,0.0,0.19,1,1,1,1,0.93,...,c,a,c,d,b,n,b,b,b,a
1,a,0.03,0.32,0.0,0.18,1,1,1,1,0.93,...,a,a,c,d,b,e,b,b,b,a
2,a,0.02,0.34,0.0,0.29,1,1,1,1,0.43,...,c,a,a,d,b,w,b,b,b,a
3,a,0.04,0.45,0.0,0.37,1,1,1,1,0.57,...,c,a,c,d,b,e,b,a,b,a
4,c,0.04,0.32,0.0,0.18,1,1,1,1,0.93,...,c,a,c,d,b,e,b,b,b,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,a,0.05,0.39,0.0,0.21,1,1,1,1,0.00,...,c,b,c,a,b,i,a,a,c,a
7996,a,0.03,0.39,0.0,0.29,1,1,1,1,0.71,...,c,a,c,d,b,w,b,b,b,a
7997,c,0.03,0.39,0.0,0.18,1,1,1,1,1.00,...,c,a,c,d,b,n,b,b,b,a
7998,c,0.05,0.29,0.2,0.18,1,1,1,1,0.36,...,c,a,a,a,b,n,b,b,b,a


Возьмем пока что один проблемный лейбл, потом попробуем перенести на все лейблы

In [6]:
labels = labels["service_i"]

Раньше я делал impute на всем датасете, так делать не стоит. Impute не должен производиться на валидационной части. Засунем его в пайплайн.

Еще я делал impute поверх категориальных фичей, переведенных в One-Hot. Это работало корректно, но мы не можем положить Onehot в SMOTE, поэтому one hot будем делать после него, а Impute запроцессим отдельно.

Мы будем овер-семплить минорный класс с помощью интерполяции [SMOTE](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/) и андер-семплить мажорный класс рандомно.

Мы будем использовать ColumnTransformer далее в пайплайне. Он поменяет местами колонки, что собьет дальнейшние вычисления в SMOTENC, где мы передаем параметр categorical_variables. Поэтому сначала отсортируем колонки по типу, чтобы их порядок сохранился после Impute.

In [52]:
data = data.reindex(columns=sorted(data.columns, key=lambda col: data.dtypes[col]))
data.columns[:5], data.columns[-5:]

(Index(['n_0047', 'n_0050', 'n_0052', 'n_0061', 'n_0075'], dtype='object'),
 Index(['c_1259', 'c_1286', 'c_1316', 'c_1348', 'c_1372'], dtype='object'))

In [78]:
cat_features = np.where(data.dtypes == object)[0]
numerical_features = np.where(data.dtypes != object)[0]

cat_features, numerical_features

(array([21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
        38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
        55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
        89]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20]))

In [None]:
pipeline = Pipeline(
    (
        # TODO: will SMOTE randomly add indicators? it may be bad
        # ("impute_continious", SimpleImputer(strategy="mean", add_indicator=True)),
        (
            "impute",
            ColumnTransformer(
                (
                    (
                        "impute_numerical",
                        SimpleImputer(strategy="mean"),
                        numerical_features,
                    ),
                    (
                        "impute_categorical",
                        SimpleImputer(strategy="most_frequent"),
                        cat_features,
                    ),
                )
            ),
        ),
        # ("impute", KNNImputer()),
        (
            "over_sample_mior",
            SMOTENC(
                sampling_strategy=0.1,
                categorical_features=cat_features,
                random_state=seed,
            ),
        ),
        ("under_sample_major", RandomUnderSampler(sampling_strategy=0.5)),
        # ("one_hot_categorical", OneHotEncoder(drop="if_binary")),
        (
            "one_hot_categorical",
            ColumnTransformer(
                (("inner", OneHotEncoder(handle_unknown="ignore"), cat_features),),
                remainder="passthrough",
            ),
        ),
        ("model", LogisticRegression(solver="liblinear")),
    )
)

In [103]:
pipeline.fit(data, labels)

In [106]:
pipeline.predict(data)

array([1, 0, 0, ..., 0, 0, 0])

## TODO: add regularization

In [108]:
metrics.log_loss(labels, pipeline.predict_proba(data))

0.19752135057805129

In [112]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
scores = cross_val_score(pipeline, data, labels, scoring="neg_log_loss", cv=cv, n_jobs=-1)

In [113]:
scores

array([-0.24, -0.23, -0.22, -0.2 , -0.23, -0.21, -0.21, -0.21, -0.19,
       -0.25, -0.23, -0.19, -0.21, -0.24, -0.22, -0.22, -0.21, -0.24,
       -0.21, -0.25, -0.22, -0.2 , -0.2 , -0.23, -0.21, -0.21, -0.24,
       -0.22, -0.18, -0.23])

Пока что хуже, чем бейзлайн: было в районе 0.10

In [114]:
train_features, valid_features, train_targets, valid_targets = train_test_split(
    data, labels, random_state=seed, test_size=0.2
)

In [116]:
pipeline.fit(train_features, train_targets)
metrics.log_loss(valid_targets, pipeline.predict_proba(valid_features))

0.2198744240019927

### Попробуем тот же пайплайн без smote -- воспроизведем старые результаты?

In [117]:
pipeline = Pipeline(
    (
        # TODO: will SMOTE randomly add indicators? it may be bad
        # ("impute_continious", SimpleImputer(strategy="mean", add_indicator=True)),
        (
            "impute",
            ColumnTransformer(
                (
                    (
                        "impute_numerical",
                        SimpleImputer(strategy="mean"),
                        numerical_features,
                    ),
                    (
                        "impute_categorical",
                        SimpleImputer(strategy="most_frequent"),
                        cat_features,
                    ),
                )
            ),
        ),
        (
            "one_hot_categorical",
            ColumnTransformer(
                (("inner", OneHotEncoder(handle_unknown="ignore"), cat_features),),
                remainder="passthrough",
            ),
        ),
        ("model", LogisticRegression(solver="liblinear")),
    )
)

In [118]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
scores = cross_val_score(pipeline, data, labels, scoring="neg_log_loss", cv=cv, n_jobs=-1)
scores

In [119]:
scores

array([-0.08, -0.08, -0.07, -0.08, -0.07, -0.08, -0.07, -0.07, -0.07,
       -0.09, -0.08, -0.07, -0.09, -0.07, -0.08, -0.09, -0.07, -0.09,
       -0.08, -0.07, -0.08, -0.07, -0.07, -0.08, -0.08, -0.09, -0.07,
       -0.07, -0.07, -0.09])

Получили то же самое, что в бейзлайне. Попробуем играться с семплингом

In [125]:
pipeline = Pipeline(
    (
        # TODO: will SMOTE randomly add indicators? it may be bad
        # ("impute_continious", SimpleImputer(strategy="mean", add_indicator=True)),
        (
            "impute",
            ColumnTransformer(
                (
                    (
                        "impute_numerical",
                        SimpleImputer(strategy="mean"),
                        numerical_features,
                    ),
                    (
                        "impute_categorical",
                        SimpleImputer(strategy="most_frequent"),
                        cat_features,
                    ),
                )
            ),
        ),
        # ("impute", KNNImputer()),
        (
            "over_sample_minor",
            SMOTENC(
                sampling_strategy=0.1,
                categorical_features=cat_features,
                random_state=seed,
                k_neighbors=5,
            ),
        ),
        # ("under_sample_major", RandomUnderSampler(sampling_strategy=0.5)),
        (
            "one_hot_categorical",
            ColumnTransformer(
                (("inner", OneHotEncoder(handle_unknown="ignore"), cat_features),),
                remainder="passthrough",
            ),
        ),
        ("model", LogisticRegression(solver="liblinear")),
    )
)

search = GridSearchCV(
    estimator=pipeline,
    param_grid={
        "over_sample_minor__sampling_strategy": [
            0.05,
            0.10,
            0.15,
            0.20,
            0.25,
            0.30,
            0.35,
            0.40,
            0.45,
            0.50,
        ],
        "over_sample_minor__k_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9]
    },
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed),
    scoring="neg_log_loss",
    n_jobs=-1,
)

search.fit(data, labels)



In [126]:
search.best_score_

-0.0937902497721044

In [127]:
search.best_params_

{'over_sample_minor__k_neighbors': 2,
 'over_sample_minor__sampling_strategy': 0.05}

Просто работать с minority — не помогает. лучшие параметры — когда мы вообще ничего не делаем 🤡

## Обучать с class_weight
cамый простой бейзлайн

In [134]:
labels.value_counts(normalize=True)

0    0.98
1    0.02
Name: service_i, dtype: float64

In [136]:
pipeline = Pipeline(
    (
        # TODO: will SMOTE randomly add indicators? it may be bad
        # ("impute_continious", SimpleImputer(strategy="mean", add_indicator=True)),
        (
            "impute",
            ColumnTransformer(
                (
                    (
                        "impute_numerical",
                        SimpleImputer(strategy="mean"),
                        numerical_features,
                    ),
                    (
                        "impute_categorical",
                        SimpleImputer(strategy="most_frequent"),
                        cat_features,
                    ),
                )
            ),
        ),
        (
            "one_hot_categorical",
            ColumnTransformer(
                (("inner", OneHotEncoder(handle_unknown="ignore"), cat_features),),
                remainder="passthrough",
            ),
        ),
        ("model", LogisticRegression(solver="liblinear", class_weight={1: .95})),
    )
)

# scores = cross_val_score(
#     pipeline,
#     data,
#     labels,
#     scoring="neg_log_loss",
#     cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed),
#     n_jobs=-1,
# )
# scores

search = GridSearchCV(
    estimator=pipeline,
    param_grid={
        "model__class_weight": [{1: weight} for weight in np.linspace(0.05, 0.95, 20)]
    },
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed),
    scoring="neg_log_loss",
    n_jobs=-1,
)

search.fit(data, labels)

In [137]:
search.best_params_, search.best_score_

({'model__class_weight': {1: 0.8552631578947368}}, -0.07823196745435311)

### nan to -1
другой способ impute: numerical → -1, categorical → "missed"

In [161]:
pipeline = Pipeline(
    (
        # TODO: will SMOTE randomly add indicators? it may be bad
        # ("impute_continious", SimpleImputer(strategy="mean", add_indicator=True)),
        (
            "impute",
            ColumnTransformer(
                (
                    (
                        "impute_numerical",
                        SimpleImputer(strategy="constant", fill_value=-1),
                        numerical_features,
                    ),
                    (
                        "impute_categorical",
                        SimpleImputer(strategy="constant", fill_value="missed"),
                        cat_features,
                    ),
                )
            ),
        ),
        (
            "one_hot_categorical",
            ColumnTransformer(
                (("inner", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),),
                remainder="passthrough",
            ),
        ),
        ("model", LogisticRegression(solver="liblinear")),
    )
)

scores = cross_val_score(
    pipeline,
    data,
    labels,
    scoring="neg_log_loss",
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed),
    n_jobs=-1,
)
scores


array([-0.08, -0.08, -0.08, -0.08, -0.07, -0.08, -0.07, -0.07, -0.07,
       -0.09, -0.08, -0.08, -0.09, -0.07, -0.09, -0.09, -0.07, -0.09,
       -0.08, -0.07, -0.08, -0.07, -0.08, -0.08, -0.08, -0.09, -0.07,
       -0.07, -0.07, -0.09])

не помогает

### Random forest??

In [164]:
pipeline = Pipeline(
    (
        # TODO: will SMOTE randomly add indicators? it may be bad
        # ("impute_continious", SimpleImputer(strategy="mean", add_indicator=True)),
        (
            "impute",
            ColumnTransformer(
                (
                    (
                        "impute_numerical",
                        SimpleImputer(strategy="mean"),
                        numerical_features,
                    ),
                    (
                        "impute_categorical",
                        SimpleImputer(strategy="most_frequent"),
                        cat_features,
                    ),
                )
            ),
        ),
        # (
        #     "over_sample_minor",
        #     SMOTENC(
        #         sampling_strategy=0.1,
        #         categorical_features=cat_features,
        #         random_state=seed,
        #         k_neighbors=5,
        #     ),
        # ),
        # ("under_sample_major", RandomUnderSampler(sampling_strategy=0.5)),
        (
            "one_hot_categorical",
            ColumnTransformer(
                (("inner", OneHotEncoder(handle_unknown="ignore"), cat_features),),
                remainder="passthrough",
            ),
        ),
        ("model", RandomForestClassifier()),
    )
)

scores = cross_val_score(
    pipeline,
    data,
    labels,
    scoring="neg_log_loss",
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed),
    n_jobs=-1,
)
scores

# search = GridSearchCV(
#     estimator=pipeline,
#     param_grid={
#         "model__class_weight": [{1: weight} for weight in np.linspace(0.05, 0.95, 20)]
#     },
#     cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed),
#     scoring="neg_log_loss",
#     n_jobs=-1,
# )

# search.fit(data, labels)

array([-0.11, -0.11, -0.2 , -0.12, -0.12, -0.15, -0.07, -0.16, -0.08,
       -0.2 , -0.15, -0.15, -0.19, -0.24, -0.16, -0.2 , -0.07, -0.08,
       -0.07, -0.11, -0.23, -0.08, -0.07, -0.07, -0.28, -0.11, -0.07,
       -0.15, -0.16, -0.16])

## попробовать другие лейблы

In [166]:
all_labels = pd.read_csv("./data/prepared/train_labels.csv", low_memory=False).drop(
    columns=["id"]
)
all_labels.apply(lambda x: pd.value_counts(x, normalize=True))

Unnamed: 0,service_a,service_b,service_c,service_d,service_e,service_f,service_g,service_h,service_i,service_j,service_k,service_l,service_m,service_n
0,0.53,0.67,0.74,0.98,0.95,0.97,0.95,0.7,0.98,0.15,0.22,0.89,0.91,0.82
1,0.47,0.33,0.26,0.02,0.05,0.03,0.05,0.3,0.02,0.85,0.78,0.11,0.09,0.18


Модель плохо работали на service_h — log_loss=0.47. попробуем на ней

In [168]:
labels_new = all_labels["service_h"]

In [192]:
pipeline = Pipeline(
    (
        # TODO: will SMOTE randomly add indicators? it may be bad
        # ("impute_continious", SimpleImputer(strategy="mean", add_indicator=True)),
        (
            "impute",
            ColumnTransformer(
                (
                    (
                        "impute_numerical",
                        SimpleImputer(strategy="mean"),
                        numerical_features,
                    ),
                    (
                        "impute_categorical",
                        SimpleImputer(strategy="most_frequent"),
                        cat_features,
                    ),
                )
            ),
        ),
        (
            "over_sample_minor",
            SMOTENC(
                categorical_features=cat_features,
                random_state=seed,
            ),
        ),
        (
            "one_hot_categorical",
            ColumnTransformer(
                (("inner", OneHotEncoder(handle_unknown="ignore"), cat_features),),
                remainder="passthrough",
            ),
        ),
        ("model", LogisticRegression(solver="liblinear")),
    )
)

scores = cross_val_score(
    pipeline,
    data,
    labels_new,
    scoring="neg_log_loss",
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed),
    n_jobs=-1,
)
scores.mean(), scores.std()

(-0.5332241364124012, 0.023589668176814328)

все ломается

In [194]:
pipeline = Pipeline(
    (
        # TODO: will SMOTE randomly add indicators? it may be bad
        # ("impute_continious", SimpleImputer(strategy="mean", add_indicator=True)),
        (
            "impute",
            ColumnTransformer(
                (
                    (
                        "impute_numerical",
                        SimpleImputer(strategy="mean"),
                        numerical_features,
                    ),
                    (
                        "impute_categorical",
                        SimpleImputer(strategy="most_frequent"),
                        cat_features,
                    ),
                )
            ),
        ),
        (
            "one_hot_categorical",
            ColumnTransformer(
                (("inner", OneHotEncoder(handle_unknown="ignore"), cat_features),),
                remainder="passthrough",
            ),
        ),
        ("model", RandomForestClassifier()),
    )
)

pipeline.fit(data, labels)

In [208]:
(pipeline["model"].feature_importances_ > 0.05).any()

False

никаких важных фичей не нашел

In [225]:
def get_pipeline(
    categorical_features: np.ndarray, numerical_features: np.ndarray
) -> Pipeline:
    """
    Arguments:
        categorical_features, numerical_features: arrays with indices of columns
        that are categorical/numerical. e.g. `[0, 15, 35, 36]`
    Returns:
        sklearn.Pipeline with following steps:
        1. impute NaNs. mean for numerical and most frequent for categorical
        2. one-hot encode categorical features
        3. apply LogisticRegression with default parameters
    """
    imputer = ColumnTransformer(
        (
            (
                "impute_numerical",
                SimpleImputer(strategy="mean"),
                numerical_features,
            ),
            (
                "impute_categorical",
                SimpleImputer(strategy="most_frequent"),
                categorical_features,
            ),
        )
    )
    one_hot_encoder = ColumnTransformer(
        (
            (
                "inner",
                OneHotEncoder(handle_unknown="ignore"),
                categorical_features,
            ),
        ),
        remainder="passthrough",
    )
    model = MultiOutputClassifier(LogisticRegression(solver="liblinear"))

    return Pipeline(
        (
            ("impute", imputer),
            ("one_hot_categorical", one_hot_encoder),
            ("model", model),
        )
    )

In [226]:
data.columns

Index(['n_0047', 'n_0050', 'n_0052', 'n_0061', 'n_0075', 'n_0091', 'o_0176',
       'o_0264', 'n_0002', 'n_0005', 'n_0019', 'n_0038', 'n_0067', 'n_0078',
       'n_0083', 'n_0108', 'n_0109', 'o_0120', 'o_0144', 'o_0201', 'o_0230',
       'release', 'c_0368', 'c_0401', 'c_0426', 'c_0444', 'c_0456', 'c_0461',
       'c_0466', 'c_0500', 'c_0543', 'c_0544', 'c_0554', 'c_0582', 'c_0590',
       'c_0601', 'c_0623', 'c_0638', 'c_0653', 'c_0657', 'c_0662', 'c_0665',
       'c_0699', 'c_0704', 'c_0707', 'c_0709', 'c_0738', 'c_0755', 'c_0761',
       'c_0762', 'c_0764', 'c_0770', 'c_0809', 'c_0838', 'c_0845', 'c_0858',
       'c_0870', 'c_0887', 'c_0891', 'c_0917', 'c_0939', 'c_0956', 'c_0975',
       'c_0980', 'c_0983', 'c_0996', 'c_1004', 'c_1052', 'c_1055', 'c_1069',
       'c_1075', 'c_1101', 'c_1122', 'c_1130', 'c_1131', 'c_1145', 'c_1158',
       'c_1186', 'c_1189', 'c_1210', 'c_1223', 'c_1227', 'c_1236', 'c_1244',
       'c_1252', 'c_1259', 'c_1286', 'c_1316', 'c_1348', 'c_1372'],
      d