# EDA

В README не объясняют, что за задачу мы решаем и какие у нас данные, надо разобраться.


* В файле `problem_train.csv` в каждой строке содержится информация об объекте
с уникальным идентификатором, заданным в столбце `id`.

* В файле `problem_labels.csv` для каждого `id` из файла `problem_train.csv` приведена информация о
принадлежности к 14 категориям.

* В файле `problem_test.csv` дана информация об объектах, аналогичная содержащейся в файле
`problem_train.csv`.

* Используя `problem_train.csv` и `problem_labels.csv` в качестве данных для обучения, постройте файл
`problem_test_labels.csv`, в котором для каждого объекта из файла `problem_test.csv` укажите
вероятность его принадлежности к каждой из 14 рассматриваемых категорий.

* Для оценки качества полученного результата будет использоваться метрика `LogLoss`. По каждой из 14 
категорий метрика считается независимо, затем берется среднее арифметическое значений метрик,
полученных для категорий.

* Пожалуйста, предоставьте краткое описание пути поиска решения и код, использованный для получения
результата.

In [1]:
# Inject a CSS rule that sets a monospaced font on the notebook's `.output*`
# elements. HTML is imported from the public `IPython.display` module.
from IPython.display import HTML

HTML(
    r"""
<style>
    .output-plaintext, .output-stream, .output {
        /* Any monospaced font should work; `monospace` is the generic fallback. */
        font-family: "JetBrainsMono Nerd Font Mono", monospace;
    }
</style>
"""
)

In [2]:
import os
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import pandas as pd

In [3]:
# Make "plotly_dark" the default template for every Plotly figure created below.
pio.templates.default = "plotly_dark"

## TODO: change to white before submitting

### Откроем данные

In [4]:
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-dtype columns.
train_df = pd.read_csv("./data/raw/problem_train.csv", low_memory=False)

In [5]:
train_df

Unnamed: 0,id,release,n_0000,n_0001,n_0002,n_0003,n_0004,n_0005,n_0006,n_0007,...,c_1368,c_1369,c_1370,c_1371,c_1372,c_1373,c_1374,c_1375,c_1376,c_1377
0,11193,a,,,0.025449,,,0.368421,,,...,,,,,a,,q,,,
1,11382,a,,,0.031297,,,0.315789,,,...,,,a,,a,,,,,
2,16531,a,,,0.024475,,,0.342105,,,...,,,a,,a,,b,,,
3,1896,a,,,0.041694,,,0.447368,,,...,,,,,a,,,,,
4,18262,c,,,0.038120,,,0.315789,,,...,,,b,,a,,a,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10898,a,,,0.053931,,,0.394737,,,...,,,,,a,,,,,b
7996,16664,a,,,0.031731,,,0.394737,,,...,,,,,a,,a,,,
7997,5334,c,,0.904762,0.033463,,,0.394737,,,...,,,,,a,,,,,
7998,7905,c,,,0.047109,,,0.289474,,,...,,,,,a,,q,,,


In [6]:
# include="all" summarises object columns as well as numeric ones, so `desc`
# has the count/unique/top/freq rows next to mean/std/quantiles.
desc = train_df.describe(include="all")
desc

Unnamed: 0,id,release,n_0000,n_0001,n_0002,n_0003,n_0004,n_0005,n_0006,n_0007,...,c_1368,c_1369,c_1370,c_1371,c_1372,c_1373,c_1374,c_1375,c_1376,c_1377
count,8000.0,8000,12.0,388.0,7662.0,112.0,58.0,7658.0,1348.0,995.0,...,3,48,377,2,7998,440,3155,563,7,1908
unique,,3,,,,,,,,,...,3,11,3,2,2,5,24,19,2,2
top,,a,,,,,,,,,...,d,m,b,c,a,c,b,j,b,b
freq,,3838,,,,,,,,,...,1,22,312,1,7460,167,1559,232,4,1441
mean,9220.303375,,0.307692,0.655744,0.040935,0.102749,0.352874,0.395981,0.193175,0.012012,...,,,,,,,,,,
std,5236.8979,,0.245454,0.216399,0.024599,0.153007,0.209337,0.085,0.236683,0.041382,...,,,,,,,,,,
min,0.0,,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,...,,,,,,,,,,
25%,4773.5,,0.134615,0.428571,0.031974,0.011905,0.2,0.342105,0.0,0.0,...,,,,,,,,,,
50%,9221.5,,0.307692,0.702381,0.041369,0.047619,0.283333,0.394737,0.2,0.0,...,,,,,,,,,,
75%,13702.75,,0.403846,0.857143,0.04895,0.142857,0.525,0.421053,0.4,0.011905,...,,,,,,,,,,


#### Удостоверимся, что id не повторяется

In [7]:
# True would mean that at least one `id` value occurs more than once.
train_df["id"].duplicated().any()

False

#### Как определены фичи

In [8]:
px.histogram(desc.loc["count"])

In [9]:
# Columns whose non-null count is zero, i.e. completely empty in train.
non_null_counts = desc.loc["count"]
na_cols = non_null_counts.index[non_null_counts == 0]
na_cols

Index(['n_0101', 'o_0224', 'o_0257', 'o_0308', 'c_0491', 'c_0529', 'c_0541',
       'c_0596', 'c_0632', 'c_0644', 'c_0693', 'c_0708', 'c_0710', 'c_0763',
       'c_0784', 'c_0803', 'c_0848', 'c_0902', 'c_0959', 'c_1009', 'c_1072',
       'c_1129', 'c_1148', 'c_1168', 'c_1230', 'c_1257', 'c_1277', 'c_1281',
       'c_1308', 'c_1332'],
      dtype='object')

### Посмотрим, какие размерности у категориальных фичей

In [10]:
px.histogram(desc.loc["unique"])

## TODO: Check if there are columns with text type. like: dog

### Посмотрим на типы данных

In [11]:
train_df.dtypes

id           int64
release     object
n_0000     float64
n_0001     float64
n_0002     float64
            ...   
c_1373      object
c_1374      object
c_1375      object
c_1376      object
c_1377      object
Length: 1379, dtype: object

In [12]:
train_df.dtypes.value_counts()

object     1025
float64     345
int64         9
dtype: int64

In [13]:
train_df.columns

Index(['id', 'release', 'n_0000', 'n_0001', 'n_0002', 'n_0003', 'n_0004',
       'n_0005', 'n_0006', 'n_0007',
       ...
       'c_1368', 'c_1369', 'c_1370', 'c_1371', 'c_1372', 'c_1373', 'c_1374',
       'c_1375', 'c_1376', 'c_1377'],
      dtype='object', length=1379)

In [14]:
# Count columns by the first two characters of their name.
train_df.columns.str[:2].value_counts()

c_    1050
o_     211
n_     116
id       1
re       1
dtype: int64

#### Есть колонки начинающиеся с `o_`, это что?

In [15]:
# Select the columns whose name starts with "o_".
has_o_prefix = train_df.columns.str.startswith("o_")
train_df.columns[has_o_prefix]

Index(['o_0116', 'o_0117', 'o_0118', 'o_0119', 'o_0120', 'o_0121', 'o_0122',
       'o_0123', 'o_0124', 'o_0125',
       ...
       'o_0317', 'o_0318', 'o_0319', 'o_0320', 'o_0321', 'o_0322', 'o_0323',
       'o_0324', 'o_0325', 'o_0326'],
      dtype='object', length=211)

In [16]:
train_df["o_0326"]

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
7995    11.0
7996     NaN
7997     NaN
7998     9.0
7999     NaN
Name: o_0326, Length: 8000, dtype: float64

#### Интересно, int64 -- не только id. Возможно, это те же n_, которые просто определились как int?

In [17]:
# Names of the columns that pandas parsed as int64.
int_columns = train_df.select_dtypes(include="int64").columns
int_columns

Index(['id', 'n_0047', 'n_0050', 'n_0052', 'n_0061', 'n_0075', 'n_0091',
       'o_0176', 'o_0264'],
      dtype='object')

In [18]:
train_df[int_columns]

Unnamed: 0,id,n_0047,n_0050,n_0052,n_0061,n_0075,n_0091,o_0176,o_0264
0,11193,1,1,1,1,1,1,303,7
1,11382,1,1,1,1,1,1,293,7
2,16531,1,1,1,1,1,1,131,0
3,1896,1,1,1,1,1,1,113,4
4,18262,1,1,1,1,1,1,240,6
...,...,...,...,...,...,...,...,...,...
7995,10898,1,1,1,1,1,1,4,5
7996,16664,1,1,1,1,1,1,207,5
7997,5334,1,1,1,1,1,1,369,8
7998,7905,1,1,1,1,1,1,69,4


In [19]:
# Count distinct value combinations across the int-typed n_ columns; a single
# row in the result means every record shares the same combination.
constant_candidates = ["n_0047", "n_0050", "n_0052", "n_0061", "n_0075", "n_0091"]
train_df[constant_candidates].value_counts()

n_0047  n_0050  n_0052  n_0061  n_0075  n_0091
1       1       1       1       1       1         8000
dtype: int64

Какие-то бесполезные колонки: все значения равны 1
## TODO: удалить эти колонки???

### Подгрузим лейблы

In [20]:
train_labels_df = pd.read_csv("./data/raw/problem_labels.csv")
train_labels_df

Unnamed: 0,id,service_a,service_b,service_c,service_d,service_e,service_f,service_g,service_h,service_i,service_j,service_k,service_l,service_m,service_n
0,11193,1,1,0,0,0,0,0,0,0,1,1,0,0,0
1,11382,0,0,0,0,0,0,0,0,0,1,1,0,0,0
2,16531,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,1896,0,0,0,1,0,0,0,0,0,1,0,1,0,0
4,18262,0,0,0,1,1,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10898,1,1,1,0,0,0,1,0,0,1,1,1,1,1
7996,16664,0,0,0,0,0,0,0,0,0,1,1,0,0,0
7997,5334,1,0,0,0,0,0,0,0,0,1,1,0,0,0
7998,7905,1,1,1,0,0,0,0,0,0,1,1,0,0,0


Проверим, что айдишники совпадают

In [21]:
# Check that labels and features list the same ids in the same row order.
# Series.equals returns False when lengths, indexes or dtypes differ, whereas
# `==` between two differently labelled Series raises.
train_labels_df["id"].equals(train_df["id"])

True

Пока что мало что понятно. Есть категориальные и вещественные (float) переменные. Возможно, признаки были сгенерированы из исходных данных.

В названиях есть release, service -- мб это что-то связанное с разными продуктами? или с разными сервисами внутри тинькофф?

Одной записи соответствует несколько лейблов, то есть у нас мультилейбл задача

#### мб стоит взглянуть на матрицу корреляции между лейблами?)))

In [22]:
# Pairwise correlation between the label columns (the id column is excluded).
label_corr = train_labels_df.drop(columns=["id"]).corr()
px.imshow(label_corr)

Соседние лейблы в целом сильнее скоррелированы. например i-m = 0.80, j-k = 0.53, f-g = 0.40, d-e = 0.42, a-b = 0.56, b-c = 0.4.

Есть еще h-i = 0.30, c-g = 0.27

Эта корреляция мб не будет мешать модели, если на каждый будет отдельная модель

#### Все лейблы определены на всех записях

In [23]:
train_labels_df.describe()

Unnamed: 0,id,service_a,service_b,service_c,service_d,service_e,service_f,service_g,service_h,service_i,service_j,service_k,service_l,service_m,service_n
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,9220.303375,0.472125,0.32825,0.261625,0.017,0.053875,0.029125,0.051875,0.297,0.016125,0.84825,0.782,0.108,0.088625,0.178
std,5236.8979,0.499254,0.469606,0.439547,0.129279,0.225785,0.168167,0.221788,0.456965,0.125964,0.358801,0.412913,0.3104,0.284219,0.382537
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4773.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
50%,9221.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,13702.75,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
max,18302.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Вот только вопрос - 8000 это много или мало 🤔 Думаю, достаточно

### Посмотрим на тест

In [24]:
test_df = pd.read_csv("./data/raw/problem_test.csv", low_memory=False)
test_df

Unnamed: 0,id,release,n_0000,n_0001,n_0002,n_0003,n_0004,n_0005,n_0006,n_0007,...,c_1368,c_1369,c_1370,c_1371,c_1372,c_1373,c_1374,c_1375,c_1376,c_1377
0,7957,a,,,0.029890,,,0.368421,,,...,,,,,a,,q,,,
1,693,c,,0.452381,0.032164,,,0.421053,0.0,,...,,,,,a,,x,,,
2,15558,c,,0.547619,0.049166,,,0.500000,0.2,,...,,,,,a,e,b,,,
3,15614,a,,,0.048733,,,0.394737,,,...,,,,,b,,,,,
4,8149,b,,,0.049924,,,0.368421,,,...,,,,,a,,,,,b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,200,b,,,0.014728,,,0.368421,,0.059524,...,,,,,a,,,,,
1996,10876,a,,,0.052415,,,0.578947,,0.000000,...,,,,,a,,,,,
1997,4035,a,,,0.038770,,,0.368421,,,...,,,,,a,,b,,,
1998,3640,b,,,0.039420,,,0.552632,,,...,,,,,a,,,,,


In [25]:
test_desc = test_df.describe(include="all")
test_desc

Unnamed: 0,id,release,n_0000,n_0001,n_0002,n_0003,n_0004,n_0005,n_0006,n_0007,...,c_1368,c_1369,c_1370,c_1371,c_1372,c_1373,c_1374,c_1375,c_1376,c_1377
count,2000.0,2000,2.0,116.0,1904.0,20.0,14.0,1903.0,373.0,258.0,...,0.0,12,109,0.0,1999,108,779,149,1,495
unique,,3,,,,,,,,,...,,7,2,,2,5,13,15,1,2
top,,a,,,,,,,,,...,,m,b,,a,a,b,j,a,b
freq,,940,,,,,,,,,...,,4,91,,1841,41,391,53,1,372
mean,9166.8875,,0.384615,0.658046,0.041033,0.107341,0.295238,0.397157,0.213405,0.00969,...,,,,,,,,,,
std,5322.849729,,0.217571,0.208902,0.024645,0.166684,0.188043,0.084669,0.252157,0.018267,...,,,,,,,,,,
min,23.0,,0.230769,0.0,0.006714,0.0,0.066667,0.026316,0.0,0.0,...,,,,,,,,,,
25%,4502.5,,0.307692,0.452381,0.032164,0.025794,0.141667,0.342105,0.0,0.0,...,,,,,,,,,,
50%,9236.0,,0.384615,0.678571,0.041585,0.063492,0.283333,0.394737,0.2,0.0,...,,,,,,,,,,
75%,13802.25,,0.461538,0.857143,0.049085,0.105159,0.391667,0.421053,0.4,0.011905,...,,,,,,,,,,


In [26]:
# Columns whose non-null count is zero, i.e. completely empty in test.
test_non_null_counts = test_desc.loc["count"]
test_na_cols = test_non_null_counts.index[test_non_null_counts == 0]
test_na_cols

Index(['n_0008', 'n_0011', 'n_0055', 'n_0101', 'n_0111', 'o_0117', 'o_0121',
       'o_0122', 'o_0126', 'o_0149',
       ...
       'c_1288', 'c_1292', 'c_1308', 'c_1312', 'c_1332', 'c_1334', 'c_1349',
       'c_1360', 'c_1368', 'c_1371'],
      dtype='object', length=179)

Тут уже много фичей не определено: 179 колонок полностью пустые

#### Удостоверимся, что у нас одинаковые фичи там и там

In [27]:
# Columns present in both frames; the feature sets match only if the length of
# the result equals the column count of each frame.
train_df.columns.intersection(test_df.columns)

Index(['id', 'release', 'n_0000', 'n_0001', 'n_0002', 'n_0003', 'n_0004',
       'n_0005', 'n_0006', 'n_0007',
       ...
       'c_1368', 'c_1369', 'c_1370', 'c_1371', 'c_1372', 'c_1373', 'c_1374',
       'c_1375', 'c_1376', 'c_1377'],
      dtype='object', length=1379)

## PCA/t-SNE на хорошо определенных фичах (заполнено более 7000 из 8000 строк, т.е. 87.5%+)

In [28]:
# Non-null count per column, then only the columns with more than 7000 values.
train_summary = train_df.describe(include="all")
train_counts = train_summary.loc["count"]
train_counts.loc[train_counts > 7000]

id         8000.0
release      8000
n_0002     7662.0
n_0005     7658.0
n_0019     7538.0
            ...  
c_1259       8000
c_1286       7995
c_1316       7911
c_1348       8000
c_1372       7998
Name: count, Length: 91, dtype: object

In [29]:
# Keep only the columns that have more than 7000 non-null values.
well_populated_cols = train_counts.index[train_counts > 7000]
well_done_df = train_df[well_populated_cols]
well_done_df

Unnamed: 0,id,release,n_0002,n_0005,n_0019,n_0038,n_0047,n_0050,n_0052,n_0061,...,c_1223,c_1227,c_1236,c_1244,c_1252,c_1259,c_1286,c_1316,c_1348,c_1372
0,11193,a,0.025449,0.368421,0.0,0.193548,1,1,1,1,...,c,a,c,d,b,n,b,b,b,a
1,11382,a,0.031297,0.315789,0.0,0.177419,1,1,1,1,...,a,a,c,d,b,e,b,b,b,a
2,16531,a,0.024475,0.342105,0.0,0.290323,1,1,1,1,...,c,a,a,d,b,w,b,b,b,a
3,1896,a,0.041694,0.447368,0.0,0.370968,1,1,1,1,...,c,a,c,d,b,e,b,a,b,a
4,18262,c,0.038120,0.315789,0.0,0.177419,1,1,1,1,...,c,a,c,d,b,e,b,b,b,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10898,a,0.053931,0.394737,0.0,0.209677,1,1,1,1,...,c,b,c,a,b,i,a,a,c,a
7996,16664,a,0.031731,0.394737,0.0,0.290323,1,1,1,1,...,c,a,c,d,b,w,b,b,b,a
7997,5334,c,0.033463,0.394737,0.0,0.177419,1,1,1,1,...,c,a,c,d,b,n,b,b,b,a
7998,7905,c,0.047109,0.289474,0.2,0.177419,1,1,1,1,...,c,a,a,a,b,n,b,b,b,a


#### todo: maybe remove o_ and similar int features, because they're actually categorical???

In [30]:
well_done_df.describe()

Unnamed: 0,id,n_0002,n_0005,n_0019,n_0038,n_0047,n_0050,n_0052,n_0061,n_0067,...,n_0083,n_0091,n_0108,n_0109,o_0120,o_0144,o_0176,o_0201,o_0230,o_0264
count,8000.0,7662.0,7658.0,7538.0,7286.0,8000.0,8000.0,8000.0,8000.0,8000.0,...,7992.0,8000.0,8000.0,8000.0,7974.0,7974.0,8000.0,7867.0,7931.0,8000.0
mean,9220.303375,0.040935,0.395981,0.02035,0.238586,1.0,1.0,1.0,1.0,0.562482,...,0.664654,1.0,0.487258,0.080375,0.474918,7.467018,198.3955,1.733443,1.364393,4.585875
std,5236.8979,0.024599,0.085,0.072958,0.079213,0.0,0.0,0.0,0.0,0.292893,...,0.41482,0.0,0.261672,0.084939,0.894713,1.670485,118.487761,1.129209,1.044707,2.237912
min,0.0,0.0,0.0,0.0,0.032258,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4773.5,0.031974,0.342105,0.0,0.193548,1.0,1.0,1.0,1.0,0.357143,...,0.25,1.0,0.266667,0.0,0.0,6.0,92.0,1.0,1.0,4.0
50%,9221.5,0.041369,0.394737,0.0,0.209677,1.0,1.0,1.0,1.0,0.571429,...,1.0,1.0,0.466667,0.0625,0.0,7.0,193.0,2.0,1.0,5.0
75%,13702.75,0.04895,0.421053,0.0,0.258065,1.0,1.0,1.0,1.0,0.857143,...,1.0,1.0,0.7,0.125,1.0,8.0,315.0,2.0,2.0,6.0
max,18302.0,1.0,1.0,1.0,0.919355,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.75,6.0,18.0,369.0,4.0,3.0,10.0


#### Start with "n_" columns

In [31]:
# Restrict the well-populated frame to the columns named "n_*".
n_cols = well_done_df.columns[well_done_df.columns.str.startswith("n_")]

numeric_df = well_done_df[n_cols]
numeric_df

Unnamed: 0,n_0002,n_0005,n_0019,n_0038,n_0047,n_0050,n_0052,n_0061,n_0067,n_0075,n_0078,n_0083,n_0091,n_0108,n_0109
0,0.025449,0.368421,0.0,0.193548,1,1,1,1,0.928571,1,0.800000,1.000000,1,0.800000,0.1875
1,0.031297,0.315789,0.0,0.177419,1,1,1,1,0.928571,1,0.666667,0.000000,1,0.666667,0.1875
2,0.024475,0.342105,0.0,0.290323,1,1,1,1,0.428571,1,0.833333,1.000000,1,0.833333,0.1875
3,0.041694,0.447368,0.0,0.370968,1,1,1,1,0.571429,1,0.566667,0.833333,1,0.566667,0.1875
4,0.038120,0.315789,0.0,0.177419,1,1,1,1,0.928571,1,0.600000,0.666667,1,0.600000,0.2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.053931,0.394737,0.0,0.209677,1,1,1,1,0.000000,1,0.166667,0.000000,1,0.166667,0.0625
7996,0.031731,0.394737,0.0,0.290323,1,1,1,1,0.714286,1,0.733333,0.083333,1,0.733333,0.1250
7997,0.033463,0.394737,0.0,0.177419,1,1,1,1,1.000000,1,0.833333,1.000000,1,0.833333,0.0625
7998,0.047109,0.289474,0.2,0.177419,1,1,1,1,0.357143,1,0.400000,1.000000,1,0.400000,0.1250


PCA wants imputing

In [32]:
from sklearn.impute import SimpleImputer

# Replace NaNs (the imputer's default missing marker) with the column mean.
# Despite its name, `imputed_df` is a NumPy array unless sklearn's output
# format has been reconfigured.
imp = SimpleImputer(strategy="mean")
imputed_df = imp.fit_transform(numeric_df)

In [33]:
from sklearn.decomposition import PCA

# Project the imputed n_ features onto their first five principal components;
# `comps` has one row per record and one column per component.
# NOTE(review): the features go into PCA without standardisation.
pca = PCA(n_components=5)
comps = pca.fit_transform(imputed_df)

In [34]:
pca.explained_variance_ratio_

array([0.48430617, 0.29450547, 0.1668958 , 0.02291048, 0.01235706])

Мы смогли 15 фичей переварить в 3 компоненты с почти полным покрытием (~95% дисперсии)

In [35]:
train_labels_df

Unnamed: 0,id,service_a,service_b,service_c,service_d,service_e,service_f,service_g,service_h,service_i,service_j,service_k,service_l,service_m,service_n
0,11193,1,1,0,0,0,0,0,0,0,1,1,0,0,0
1,11382,0,0,0,0,0,0,0,0,0,1,1,0,0,0
2,16531,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,1896,0,0,0,1,0,0,0,0,0,1,0,1,0,0
4,18262,0,0,0,1,1,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10898,1,1,1,0,0,0,1,0,0,1,1,1,1,1
7996,16664,0,0,0,0,0,0,0,0,0,1,1,0,0,0
7997,5334,1,0,0,0,0,0,0,0,0,1,1,0,0,0
7998,7905,1,1,1,0,0,0,0,0,0,1,1,0,0,0


In [36]:
# First two principal components, one point per training record.
px.scatter(x=comps[:, 0], y=comps[:, 1])

 Похоже на проекцию цилиндра -- плотные бока, пустая середина 🤔

In [37]:
# Same projection coloured by the service_a label; this relies on the rows of
# `comps` and `train_labels_df` being in the same order.
px.scatter(x=comps[:, 0], y=comps[:, 1], color=train_labels_df["service_a"])

In [38]:
# Same projection coloured by the service_d label; this relies on the rows of
# `comps` and `train_labels_df` being in the same order.
px.scatter(x=comps[:, 0], y=comps[:, 1], color=train_labels_df["service_d"])

Мы даже на этих 15 количественных признаках получаем какую-то разделенность!! задача в принципе решаема

#### 3DDDDDDDD

In [39]:
# 3-D view of the first three principal components, coloured by one label.
column = "service_a"
pc1, pc2, pc3 = comps[:, 0], comps[:, 1], comps[:, 2]
px.scatter_3d(x=pc1, y=pc2, z=pc3, color=train_labels_df[column])

### TODO: some n_ columns (e.g. n_0038) have discrete, repeating values like 0.177419, 0.209677, 0.290323
можно посчитать их дискретизацию

#### TODO: 
некоторые фичи могут почти не давать никакой информации в целом, но для конкретного лейбла быть очень важны
например признак наличия слова "суицид" в переписке → в лейбл `сервис психотерапия`

это имеет смысл, потому что у нас есть лейблы, у которых ооочень мало единиц


#### TODO: для некоторых признаков можно построить фичу `признак is nan`. мб это будет коррелировать с какими-то сервисами

## TODO: BASELINE - use pca 3 components

### TODO: baseline -- более 80 %, только категориальные, ...

### TODO: можно сделать простые преобразования над нумерик фичами, у которых сильно изменяющийся масштаб (e-3, e+3, ...)
например логарифм

## TODO: как заполнять наны?

```
x   y   z
1   a   NaN
2   a   42
4   b   NaN
-2  NaN NaN
5   a   NaN
```

* построить линрег f(x, y) -> z. для z ~ 20 % не нан

* 15% + 5% = train + test

* использовать эту новую фичу уже в бустинге