In [2]:
import numpy as np
import pandas as pd
import sklearn
import plotly.express as px

from sklearn import metrics
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    RepeatedStratifiedKFold,
    GridSearchCV,
)
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [3]:
from IPython.core.display import HTML

HTML(
    r"""
<style>
    .output-plaintext, .output-stream, .output {
        font-family: "JetBrainsMono Nerd Font Mono"; # Any monospaced font should work
    }
</style>
"""
)

np.set_printoptions(precision=2)
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", "{:.2f}".format)
seed = 42

In [4]:
data = pd.read_csv("./data/prepared/train.csv", low_memory=False).drop(columns=["id"])
labels = pd.read_csv("./data/prepared/train_labels.csv", low_memory=False).drop(
    columns=["id"]
)

In [36]:
feature_counts = data.describe(include="all").loc["count"]
feature_counts

release      8000
n_0000      12.00
n_0001     388.00
n_0002    7662.00
n_0003     112.00
            ...  
c_1373        440
c_1374       3155
c_1375        563
c_1376          7
c_1377       1908
Name: count, Length: 1194, dtype: object

Попробуем посчитать корреляцию между всем...

In [37]:
from sklearn.preprocessing import OrdinalEncoder

data_for_corr = pd.DataFrame(
    OrdinalEncoder().fit_transform(data),
    columns=data.columns
)

In [78]:
corr = data_for_corr.corr().abs()
# remove correlation with itself and lower triangle
# corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

In [79]:
corr

Unnamed: 0,release,n_0000,n_0001,n_0002,n_0003,n_0004,n_0005,n_0006,n_0007,n_0009,...,c_1366,c_1367,c_1369,c_1370,c_1372,c_1373,c_1374,c_1375,c_1376,c_1377
release,1.00,0.27,,0.11,,0.00,0.01,,0.00,,...,0.07,,0.05,0.01,0.03,0.03,0.00,0.01,0.09,0.04
n_0000,0.27,1.00,,0.54,,,0.24,,,,...,,,,,,,0.61,,,
n_0001,,,1.00,0.69,0.36,,0.19,0.02,0.01,0.00,...,0.16,,0.06,0.16,0.06,0.12,0.14,0.07,1.00,0.17
n_0002,0.11,0.54,0.69,1.00,0.34,0.68,0.15,0.09,0.01,0.32,...,0.01,,0.10,0.07,0.01,0.17,0.07,0.01,0.75,0.03
n_0003,,,0.36,0.34,1.00,,0.01,0.13,0.46,,...,0.00,,0.79,0.56,,0.04,0.02,0.14,,0.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c_1373,0.03,,0.12,0.17,0.04,0.39,0.06,0.03,0.32,1.00,...,0.11,,1.00,0.06,0.06,1.00,0.01,0.09,0.67,0.07
c_1374,0.00,0.61,0.14,0.07,0.02,0.22,0.02,0.02,0.03,0.13,...,0.05,,0.28,0.03,0.01,0.01,1.00,0.05,0.50,0.01
c_1375,0.01,,0.07,0.01,0.14,,0.01,0.06,0.00,0.06,...,0.06,,0.13,0.18,0.02,0.09,0.05,1.00,,0.20
c_1376,0.09,,1.00,0.75,,,0.67,,,,...,,,,,,0.67,0.50,,1.00,


In [80]:
corr.abs() > 0.4

Unnamed: 0,release,n_0000,n_0001,n_0002,n_0003,n_0004,n_0005,n_0006,n_0007,n_0009,...,c_1366,c_1367,c_1369,c_1370,c_1372,c_1373,c_1374,c_1375,c_1376,c_1377
release,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
n_0000,False,True,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
n_0001,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
n_0002,False,True,True,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
n_0003,False,False,False,False,True,False,False,False,True,False,...,False,False,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c_1373,False,False,False,False,False,False,False,False,False,True,...,False,False,True,False,False,True,False,False,True,False
c_1374,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
c_1375,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
c_1376,False,False,True,True,False,False,True,False,False,False,...,False,False,False,False,False,True,True,False,True,False


In [81]:
(corr.abs() > 0.4).sum()

release     81
n_0000     177
n_0001     202
n_0002     158
n_0003     117
          ... 
c_1373     122
c_1374      83
c_1375     125
c_1376     171
c_1377      64
Length: 1194, dtype: int64

In [82]:
corr["c_1376"]

release   0.09
n_0000     NaN
n_0001    1.00
n_0002    0.75
n_0003     NaN
          ... 
c_1373    0.67
c_1374    0.50
c_1375     NaN
c_1376    1.00
c_1377     NaN
Name: c_1376, Length: 1194, dtype: float64

In [52]:
data["n_0001"].notna().sum()

388

In [58]:
data[["n_0001", "c_1376"]][data[["n_0001", "c_1376"]].notna().all(axis=1)]

Unnamed: 0,n_0001,c_1376
1139,0.9,a
5890,0.5,b


вот это корреляция...

с таким нельзя удалять просто все колонки, где высокая корреляция. видимо, все дело здесь в нанах

In [69]:
pd.set_option("display.max_rows", 200)

In [84]:
corr["n_0001"][corr["n_0001"] > 0.5]

n_0001   1.00
n_0002   0.69
n_0012   0.79
n_0020   0.93
n_0026   0.84
n_0030   0.82
n_0032   0.60
n_0033   1.00
n_0036   0.52
n_0078   0.79
n_0080   0.88
n_0082   1.00
n_0092   1.00
n_0093   1.00
n_0103   1.00
n_0108   0.79
n_0113   0.50
o_0119   0.72
o_0124   0.60
o_0130   0.54
o_0131   1.00
o_0134   1.00
o_0135   0.52
o_0138   0.62
o_0140   0.60
o_0141   0.70
o_0146   0.51
o_0150   1.00
o_0155   0.65
o_0156   1.00
o_0162   1.00
o_0170   0.65
o_0174   1.00
o_0178   1.00
o_0181   0.65
o_0183   0.81
o_0193   0.98
o_0196   0.51
o_0197   0.64
o_0198   0.79
o_0205   0.79
o_0209   0.72
o_0222   0.69
o_0223   0.79
o_0231   0.65
o_0244   0.68
o_0248   0.59
o_0253   0.90
o_0255   0.95
o_0256   0.63
o_0269   0.66
o_0281   0.63
o_0286   0.56
o_0287   1.00
o_0288   1.00
o_0294   1.00
o_0295   0.73
o_0298   0.66
o_0303   0.56
o_0307   1.00
o_0310   1.00
o_0325   0.99
c_0336   1.00
c_0340   0.53
c_0343   0.63
c_0344   1.00
c_0385   0.99
c_0387   1.00
c_0397   0.79
c_0398   0.68
c_0414   1.00
c_0439

In [72]:
data[["n_0001", "n_0002"]][data[["n_0001", "n_0002"]].notna().all(axis=1)].values

array([[0.81, 0.03],
       [0.45, 0.05],
       [0.31, 0.05],
       [0.79, 0.04],
       [0.38, 0.04],
       [0.38, 0.06],
       [0.79, 0.04],
       [0.9 , 0.02],
       [0.76, 0.03],
       [0.33, 0.05],
       [0.38, 0.06],
       [0.55, 0.05],
       [0.38, 0.06],
       [0.81, 0.03],
       [0.4 , 0.05],
       [0.79, 0.03],
       [0.6 , 0.04],
       [0.33, 0.06],
       [0.5 , 0.04],
       [0.57, 0.04],
       [0.98, 0.03],
       [0.48, 0.05],
       [0.45, 0.04],
       [0.81, 0.04],
       [0.9 , 0.02],
       [0.76, 0.04],
       [0.45, 0.05],
       [0.88, 0.03],
       [0.36, 0.05],
       [0.67, 0.05],
       [0.79, 0.05],
       [0.93, 0.04],
       [0.48, 0.05],
       [0.38, 0.04],
       [0.86, 0.03],
       [0.86, 0.03],
       [0.5 , 0.04],
       [0.79, 0.03],
       [0.88, 0.03],
       [0.79, 0.04],
       [0.33, 0.06],
       [0.64, 0.04],
       [0.81, 0.03],
       [0.38, 0.03],
       [0.86, 0.03],
       [0.48, 0.03],
       [0.79, 0.04],
       [0.81,

тут тоже непонятно. у второй колонки почти все значения в промежутке [0.02-0.06]. наверное, поэтому легко корреляцию выводить

In [89]:
first = "n_0001"
second = "n_0020"
data[[first, second]][data[[first, second]].notna().all(axis=1)]

Unnamed: 0,n_0001,n_0020
442,0.33,0.24
725,0.86,0.95
820,0.33,0.22
1383,0.45,0.35
1431,0.9,0.89
1768,0.31,0.35
2598,0.55,0.57
3642,0.9,0.86
4172,0.88,0.86
4353,0.93,0.97


Ну, здесь корреляция действительно наблюдается. Но только пересечений по записям у них всего лишь 20 штук

### Эксперимент: маскируем рандомные фичи в валидации, чтобы имитировать тест сет

In [5]:
def get_pipeline(
    categorical_features: np.ndarray, numerical_features: np.ndarray
) -> Pipeline:
    """
    Arguments:
        categorical_features, numerical_features: arrays with indices of columns
        that are categorical/numerical. e.g. `[0, 15, 35, 36]`
    Returns:
        sklearn.Pipeline with following steps:
        1. impute NaNs. mean for numerical and most frequent for categorical
        2. one-hot encode categorical features
        3. apply LogisticRegression with default parameters
    """
    imputer = ColumnTransformer(
        (
            (
                "impute_numerical",
                SimpleImputer(strategy="mean"),
                numerical_features,
            ),
            (
                "impute_categorical",
                SimpleImputer(strategy="most_frequent"),
                categorical_features,
            ),
        )
    )
    one_hot_encoder = ColumnTransformer(
        (
            (
                "inner",
                OneHotEncoder(handle_unknown="ignore"),
                categorical_features,
            ),
        ),
        remainder="passthrough",
    )
    model = MultiOutputClassifier(LogisticRegression(solver="liblinear"))

    return Pipeline(
        (
            ("impute", imputer),
            ("one_hot_categorical", one_hot_encoder),
            ("model", model),
        )
    )

In [10]:
from sklearn.metrics import roc_auc_score, log_loss

In [9]:
def compute_metrics(
    labels: list[str], targets: pd.DataFrame, preds: np.ndarray, probas: np.ndarray
) -> pd.DataFrame:
    """
    Compute LogLoss and ROC AUC.

    Returns pd.DataFrame with dimensions [len(labels) + 1, 2]
    """
    metrics = pd.DataFrame(index=labels)

    metrics["log_loss"] = [
        round(
            log_loss(targets.iloc[:, label_id], probas[:, label_id]),
            2,
        )
        for label_id, label in enumerate(labels)
    ]

    metrics["roc_auc"] = roc_auc_score(
        targets,
        probas,
        average=None,
    )

    metrics.loc["mean", "log_loss"] = metrics["log_loss"].mean()
    metrics.loc["mean", "roc_auc"] = metrics["roc_auc"].mean()

    return metrics.round(2)

In [7]:
# Drop ill-defined columns
train_counts = data.describe(include="all").loc["count"]
well_defined_columns = train_counts[train_counts > 7000].index
# Drop columns with value `1` for all records
well_defined_columns = well_defined_columns.drop(
    ["n_0047", "n_0050", "n_0052", "n_0061", "n_0075", "n_0091"]
)
data = data[well_defined_columns]

# We should sort columns here because ColumnTransformer will mess column order
# So the categorical features indices will be same before and after imputing
data = data.reindex(columns=sorted(data.columns, key=lambda col: data.dtypes[col]))

In [110]:
pipeline = get_pipeline(
    categorical_features=np.where(data.dtypes == object)[0],
    numerical_features=np.where(data.dtypes != object)[0],
)

for i in range(10):
    train_features, valid_features, train_targets, valid_targets = train_test_split(
        data, labels, random_state=seed, test_size=0.2
    )

    pipeline.fit(train_features, train_targets)

    # mask random features to simulate prod situations
    # where many columns may be NaN
    columns_to_mask = np.random.choice(
        len(data.columns),
        size=int(len(data.columns) * 0.1),
        replace=False,
    )
    print("masking features № ", columns_to_mask)
    valid_features[valid_features.columns[columns_to_mask]] = np.nan

    valid_preds = pipeline.predict(valid_features)
    valid_probas = pipeline.predict_proba(valid_features)
    valid_probas = np.transpose([y_pred[:, 1] for y_pred in valid_probas])

    metrics = compute_metrics(
        labels=labels.columns,
        targets=valid_targets,
        preds=valid_preds,
        probas=valid_probas,
    )
    print(metrics.loc["mean"])

masking features №  [18 24 59 70 37 33  0 40]
log_loss   0.31
roc_auc    0.80
Name: mean, dtype: float64
masking features №  [43 14 46 41 77 22 66 45]
log_loss   0.30
roc_auc    0.80
Name: mean, dtype: float64
masking features №  [41 23 33  6 19 78 13 56]
log_loss   0.30
roc_auc    0.81
Name: mean, dtype: float64
masking features №  [64 23 63 21 72 14 59 50]
log_loss   0.30
roc_auc    0.81
Name: mean, dtype: float64
masking features №  [16 49 28 46 64  9 18 70]
log_loss   0.31
roc_auc    0.80
Name: mean, dtype: float64
masking features №  [62 15 83 17 58 66 23 75]
log_loss   0.31
roc_auc    0.80
Name: mean, dtype: float64
masking features №  [50 46 18 33 20 65 44 77]
log_loss   0.30
roc_auc    0.81
Name: mean, dtype: float64
masking features №  [77 79 47  0 66  1 71  2]
log_loss   0.44
roc_auc    0.75
Name: mean, dtype: float64
masking features №  [75 68  0 73 13 50 72 41]
log_loss   0.31
roc_auc    0.80
Name: mean, dtype: float64
masking features №  [77 16 37 48 71 42 28  1]
log_loss 

Когда наним 79-й лейбл, сильно проседаем

In [92]:
data[data.columns[[64, 19, 79, 59]]]

Unnamed: 0,c_1075,c_0444,c_1259,c_0996
0,a,a,n,a
1,b,a,e,b
2,a,a,w,b
3,a,a,e,a
4,a,a,e,a
...,...,...,...,...
7995,b,a,i,b
7996,a,b,w,a
7997,a,a,n,a
7998,b,a,n,b


In [95]:
data["c_1259"].describe()

count     8000
unique      32
top          n
freq      1806
Name: c_1259, dtype: object

In [100]:
from sklearn.preprocessing import OrdinalEncoder

In [109]:
feature_transformed = OrdinalEncoder().fit_transform(data[["c_1259"]])
pd.DataFrame({"c_1259": feature_transformed[:, 0], "service_a": labels["service_a"]}).corr()

Unnamed: 0,c_1259,service_a
c_1259,1.0,0.18
service_a,0.18,1.0


In [112]:
train_features, valid_features, train_targets, valid_targets = train_test_split(
    data, labels, random_state=seed, test_size=0.2
)

pipeline.fit(train_features, train_targets)

# mask random features to simulate prod situations
# where many columns may be NaN
columns_to_mask = [79]
print("masking features № ", columns_to_mask)
valid_features[valid_features.columns[columns_to_mask]] = np.nan

valid_preds = pipeline.predict(valid_features)
valid_probas = pipeline.predict_proba(valid_features)
valid_probas = np.transpose([y_pred[:, 1] for y_pred in valid_probas])

metrics = compute_metrics(
    labels=labels.columns,
    targets=valid_targets,
    preds=valid_preds,
    probas=valid_probas,
)
print(metrics)

masking features №  [79]
           log_loss  roc_auc
service_a      1.78     0.68
service_b      0.78     0.62
service_c      0.57     0.66
service_d      0.09     0.84
service_e      0.21     0.74
service_f      0.08     0.95
service_g      0.19     0.75
service_h      0.59     0.70
service_i      0.09     0.70
service_j      0.37     0.69
service_k      0.48     0.70
service_l      0.27     0.84
service_m      0.22     0.87
service_n      0.36     0.81
mean           0.43     0.75


Для сравнения метрики без выбрасывания

In [115]:
train_features, valid_features, train_targets, valid_targets = train_test_split(
    data, labels, random_state=seed, test_size=0.2
)

pipeline.fit(train_features, train_targets)

valid_preds = pipeline.predict(valid_features)
valid_probas = pipeline.predict_proba(valid_features)
valid_probas = np.transpose([y_pred[:, 1] for y_pred in valid_probas])

metrics_norm = compute_metrics(
    labels=labels.columns,
    targets=valid_targets,
    preds=valid_preds,
    probas=valid_probas,
)
print(metrics_norm)

           log_loss  roc_auc
service_a      0.40     0.89
service_b      0.52     0.79
service_c      0.53     0.71
service_d      0.05     0.96
service_e      0.20     0.76
service_f      0.08     0.96
service_g      0.18     0.77
service_h      0.49     0.79
service_i      0.08     0.74
service_j      0.38     0.68
service_k      0.48     0.70
service_l      0.25     0.86
service_m      0.21     0.88
service_n      0.36     0.81
mean           0.30     0.81


In [117]:
(metrics - metrics_norm).abs()

Unnamed: 0,log_loss,roc_auc
service_a,1.38,0.21
service_b,0.26,0.17
service_c,0.04,0.05
service_d,0.04,0.12
service_e,0.01,0.02
service_f,0.0,0.01
service_g,0.01,0.02
service_h,0.1,0.09
service_i,0.01,0.04
service_j,0.01,0.01


Корреляция маленькая, а результат большой. Мб это самая большая корреляция из всех фичей?

In [125]:
data_for_corr = pd.DataFrame(
    OrdinalEncoder().fit_transform(data),
    columns=data.columns
)
corr = pd.concat((data_for_corr, labels["service_a"]), axis="columns").corr().abs()
corr["service_a"][corr["service_a"] > 0.1]

n_0002      0.27
n_0078      0.31
n_0108      0.31
n_0109      0.20
o_0230      0.11
c_0554      0.13
c_0638      0.15
c_0699      0.21
c_0707      0.16
c_0809      0.14
c_1227      0.14
c_1244      0.15
c_1259      0.18
service_a   1.00
Name: service_a, dtype: float64

Да нет, ничего особенного