# Считывание данных

In [342]:
import pandas as pd
import numpy as np

In [343]:
data = pd.read_csv('data/data.csv')
data.head()

Unnamed: 0,qn1,qn2,qn3,qn4,qn5,qn6,qn7,qn8,qn9,qn10,qn11,target
0,,0.496372,1.356517,,0.201783,47.104283,0.2017825,1.191783,35139180.0,0.365426,1.624946,0
1,247.899905,,,2.009905,,,12230770000.0,3.219905,12116250.0,,,0
2,,-1.1687,-1.292688,,-1.292688,356.428482,-1.292688,-0.012688,151653.6,0.088284,-0.151629,0
3,13.669695,0.669695,,-0.315305,-1.330305,,117000600.0,0.339695,506861.8,-0.330305,,0
4,102.404032,-0.118875,12.493512,0.252032,-0.59586,79.846382,15129000000.0,3.304032,108049.0,-0.541999,-0.10743,0


Посмотрим на данные

In [344]:
data.target.value_counts()

target
0    972
1     28
Name: count, dtype: int64

In [345]:
data.isna().sum()

qn1       587
qn2        73
qn3        80
qn4       587
qn5        73
qn6        84
qn7         0
qn8       164
qn9         0
qn10       74
qn11       86
target      0
dtype: int64

# Однофакторный анализ

## Availability

In [346]:
# считаем долю не NaN значений
availability = data.notna().sum() / data.shape[0]
availability

qn1       0.413
qn2       0.927
qn3       0.920
qn4       0.413
qn5       0.927
qn6       0.916
qn7       1.000
qn8       0.836
qn9       1.000
qn10      0.926
qn11      0.914
target    1.000
dtype: float64

In [347]:
# выбираем те признаки, у которых доля Availability >= 0.8
availability[availability >= 0.8]

qn2       0.927
qn3       0.920
qn5       0.927
qn6       0.916
qn7       1.000
qn8       0.836
qn9       1.000
qn10      0.926
qn11      0.914
target    1.000
dtype: float64

In [348]:
# оставляем в данных только те признаки, у которых availability >= 80%
data = data[availability[availability >= 0.8].index]
data.head()

Unnamed: 0,qn2,qn3,qn5,qn6,qn7,qn8,qn9,qn10,qn11,target
0,0.496372,1.356517,0.201783,47.104283,0.2017825,1.191783,35139180.0,0.365426,1.624946,0
1,,,,,12230770000.0,3.219905,12116250.0,,,0
2,-1.1687,-1.292688,-1.292688,356.428482,-1.292688,-0.012688,151653.6,0.088284,-0.151629,0
3,0.669695,,-1.330305,,117000600.0,0.339695,506861.8,-0.330305,,0
4,-0.118875,12.493512,-0.59586,79.846382,15129000000.0,3.304032,108049.0,-0.541999,-0.10743,0


## WoE преобразование

Воспользуемся библиотекой scorecardpy для WoE преобразования

In [349]:
import scorecardpy as sc

In [350]:
# отбросим признаки, которые имеют iv меньше 0.02, то есть малоинформативные признаки

data_iv = sc.var_filter(data, y="target", iv_limit=0.02)
data_iv.head()

[INFO] filtering variables ...


Unnamed: 0,qn3,qn6,qn5,qn11,qn8,qn2,qn10,target
0,1.356517,47.104283,0.201783,1.624946,1.191783,0.496372,0.365426,0
1,,,,,3.219905,,,0
2,-1.292688,356.428482,-1.292688,-0.151629,-0.012688,-1.1687,0.088284,0
3,,,-1.330305,,0.339695,0.669695,-0.330305,0
4,12.493512,79.846382,-0.59586,-0.10743,3.304032,-0.118875,-0.541999,0


In [351]:
# выполним WoE преобразование

bins = sc.woebin(data_iv, y="target")
data_woe = sc.woebin_ply(data_iv, bins)
data_woe.head()

[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  .stack().replace('missing', np.nan) \
  .stack().replace('missing', np.nan) \
  ).groupby(['variable', 'rowid', 'bin_chr'], group_keys=False).agg({'bad':sum,'good':sum})\
  ).groupby(['variable', 'rowid', 'bin_chr'], group_keys=False).agg({'bad':sum,'good':sum})\
  ).groupby(['variable', 'rowid', 'bin_chr'], group_keys=False).agg({'bad':sum,'good':sum})\
  ).groupby(['variable', 'rowid', 'bin_chr'], group_keys=False).agg({'bad':sum,'good':sum})\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
Length: 927
Categories (41, object): ['[-inf,-3.2)' < '[-3.2,-2.8)' < '[-2.8,-2.

[INFO] converting into woe values ...


  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum}).reset_index().assign(bin=lambda x: x['bstbin'])\
  .agg({'good':sum, 'bad':sum}).reset_index().assign(bin=lambda x: x['bstbin'])\
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  .agg({'good':sum, 'bad':sum}).reset_index()\
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  .agg({'good':sum, 'bad':sum}).reset_index()\
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum}).reset_

Unnamed: 0,target,qn3_woe,qn5_woe,qn11_woe,qn6_woe,qn8_woe,qn2_woe,qn10_woe
0,0,-0.022381,0.148218,0.050644,-1.723281,-0.634899,0.139635,-0.381395
1,0,-0.822297,-0.880306,-0.8955,-0.871689,-0.344669,-0.880306,-0.893911
2,0,1.070613,-0.073545,-0.001028,0.701412,-0.18255,0.034274,0.470992
3,0,-0.822297,-0.073545,-0.8955,-0.871689,-0.634899,0.139635,1.281922
4,0,-0.924487,-0.073545,-0.001028,-1.723281,-0.344669,0.327098,-0.903326


# Многофакторный анализ

### Отбор признаков через L1-регуляризацию

In [352]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

X = data_woe.drop(columns=['target']).to_numpy()
y = data_woe.target.to_numpy()

k_fold = StratifiedKFold(n_splits=5, shuffle=True)
scaler = StandardScaler()
model_l1 = LogisticRegression(penalty='l1', solver='liblinear')

gini_best = -1
C_best = None
coef_best = None
for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    model_l1.C = C
    ginis = []

    for i, (train_index, test_index) in enumerate(k_fold.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model_l1.fit(X_train, y_train)
        y_pred = model_l1.predict(X_test)

        gini = 2 * roc_auc_score(y_test, y_pred) - 1
        ginis.append(gini)

    gini_mean = np.mean(ginis)
    if gini_mean > gini_best:
        gini_best = gini_mean
        C_best = C
        coef_best = model_l1.coef_

In [353]:
gini_best

0.15333333333333332

In [354]:
model_l1.coef_

array([[0.31160177, 0.14106104, 0.3622679 , 0.7686407 , 0.47519825,
        0.42918343, 0.45891712]])

Видно, что ни один из коэффициентов не занулился, значит мы не будеи выкидывать никакие признаки из рассмотрения

### Обучение модели с L2-регуляризацией

In [355]:
k_fold = StratifiedKFold(n_splits=5, shuffle=True)
scaler = StandardScaler()
model_l2 = LogisticRegression(penalty='l2', solver='liblinear')

gini_best = -1
C_best = None
coef_best = None
for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    model_l2.C = C
    ginis = []

    for i, (train_index, test_index) in enumerate(k_fold.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model_l2.fit(X_train, y_train)
        y_pred = model_l2.predict(X_test)

        gini = 2 * roc_auc_score(y_test, y_pred) - 1
        ginis.append(gini)

    gini_mean = np.mean(ginis)
    if gini_mean > gini_best:
        gini_best = gini_mean
        C_best = C
        coef_best = model_l2.coef_

### Итоговая модель

In [356]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LogisticRegression(penalty='l2', solver='liblinear', C=C_best)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
roc_auc = roc_auc_score(y_test, y_pred)
gini = 2 * roc_auc - 1

In [357]:
gini

0.19999999999999996

In [358]:
roc_auc

0.6

## Воспользуемся catboost

Подберем коэффициент перед L2-регуляризацией через grid_search

In [436]:
from catboost import CatBoostClassifier, Pool, cv

train_pool = Pool(X, y)

parameters = {
    'eval_metric': 'NormalizedGini:hints=skip_train~false',  # используем коэффициент Джини как метрику (как и нужно в задании)
    'loss_function': 'Logloss',
}


grid_parameters = {
    'l2_leaf_reg': [0.1, 1, 10, 100],
}

model = CatBoostClassifier(**parameters)
grid_search_result = model.grid_search(grid_parameters, train_pool, cv=5)

0:	learn: 0.7963893	test: 0.5111684	best: 0.5111684 (0)	total: 4.09ms	remaining: 4.08s
1:	learn: 0.8150269	test: 0.4639175	best: 0.5111684 (0)	total: 11.9ms	remaining: 5.92s
2:	learn: 0.9038327	test: 0.4201031	best: 0.5111684 (0)	total: 16.7ms	remaining: 5.55s
3:	learn: 0.9114863	test: 0.4407216	best: 0.5111684 (0)	total: 19.8ms	remaining: 4.93s
4:	learn: 0.9055270	test: 0.4261168	best: 0.5111684 (0)	total: 25.7ms	remaining: 5.11s
5:	learn: 0.9051764	test: 0.4948454	best: 0.5111684 (0)	total: 31.3ms	remaining: 5.18s
6:	learn: 0.9030147	test: 0.5317869	best: 0.5317869 (6)	total: 35.1ms	remaining: 4.98s
7:	learn: 0.9300070	test: 0.5154639	best: 0.5317869 (6)	total: 37.5ms	remaining: 4.65s
8:	learn: 0.9265015	test: 0.5189003	best: 0.5317869 (6)	total: 40.7ms	remaining: 4.48s
9:	learn: 0.9263262	test: 0.5274914	best: 0.5317869 (6)	total: 44.1ms	remaining: 4.37s
10:	learn: 0.9315261	test: 0.4656357	best: 0.5317869 (6)	total: 46.2ms	remaining: 4.16s
11:	learn: 0.9318766	test: 0.4175258	best:

## Истоговая модель

In [446]:
best_params = grid_search_result['params']
best_model = CatBoostClassifier({**parameters, **best_params}, use_best_model=True)
# best_model.fit(train_pool)

Посмотрим на метрику модели через кросс-валидацию

In [447]:
cv_data = cv(train_pool, {**parameters, **best_params}, fold_count=5, verbose_eval=False)

Training on fold [0/5]

bestTest = 0.7820512821
bestIteration = 5

Training on fold [1/5]

bestTest = 0.264957265
bestIteration = 1

Training on fold [2/5]

bestTest = 0.9321305842
bestIteration = 5

Training on fold [3/5]

bestTest = 0.9072164948
bestIteration = 155

Training on fold [4/5]

bestTest = 0.4381443299
bestIteration = 14



In [445]:
mean_metric = cv_data['test-NormalizedGini-mean'].iloc[-1]  # Например, для AUC
print('Mean NormalizedGini:', mean_metric)

Mean NormalizedGini: 0.5442347343378271
