In [None]:
import warnings
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from genetic_selection import GeneticSelectionCV
warnings.simplefilter("ignore")

## Загрузка и предобработка данных

In [None]:
# Read the training and validation splits from CSV files in the working directory.
train, valid = (pd.read_csv(path) for path in ("train.csv", "valid.csv"))

## Обработка категориальных переменных

In [None]:
# Every object-dtype column of the training frame is treated as categorical.
categorical_features = list(train.select_dtypes(include=["object"]).columns)

In [None]:
# One-hot encoding: missing categories become the explicit level "NA".
# Each split is encoded on its own, so the resulting dummy columns may differ
# between train and valid when a level is present in only one of them.
train_categorical, valid_categorical = (
    pd.get_dummies(frame[categorical_features].fillna("NA"))
    for frame in (train, valid)
)

## Обработка числовых переменных

In [None]:
# All non-object columns except the target and the row identifier.
# Index.drop raises KeyError if either label is absent, same as DataFrame.drop.
non_object_columns = train.select_dtypes(exclude=["object"]).columns
numeric_features = non_object_columns.drop(["TARGET", "SK_ID_CURR"]).tolist()

In [None]:
# Standardisation: (x - x.mean()) / x.std()
# Missing values are replaced with 0 first; the scaler statistics are learned
# on the training split only and then reused for the validation split.
train_numeric_filled = train[numeric_features].fillna(0)
valid_numeric_filled = valid[numeric_features].fillna(0)

scaler = StandardScaler().fit(train_numeric_filled)
train_numeric = scaler.transform(train_numeric_filled)
valid_numeric = scaler.transform(valid_numeric_filled)

## Объединение числовых и категориальных переменных

In [None]:
# Wrap the scaled arrays back into labelled frames and append the one-hot
# columns side by side (axis=1) to get the full design matrices.
X_train = pd.concat(
    [pd.DataFrame(train_numeric, columns=numeric_features), train_categorical],
    axis=1,
)
X_valid = pd.concat(
    [pd.DataFrame(valid_numeric, columns=numeric_features), valid_categorical],
    axis=1,
)

In [None]:
y_train, y_valid = train["TARGET"], valid["TARGET"]

# The categorical columns were one-hot encoded separately for each split, so a
# level that occurs in only one split yields a column the other split lacks
# (e.g. "NAME_FAMILY_STATUS_Unknown", which used to be dropped by name here).
# Keep only the columns present in both frames, in X_train's order. Unlike a
# hard-coded drop, this handles any such column and is safe to re-run.
valid_columns = set(X_valid.columns)
shared_columns = [column for column in X_train.columns if column in valid_columns]
X_train, X_valid = X_train[shared_columns], X_valid[shared_columns]

In [None]:
# Report the dimensions of both design matrices; they should have the same
# number of columns before any model is fitted.
train_rows, train_cols = X_train.shape
valid_rows, valid_cols = X_valid.shape
print("train.shape: {} rows, {} cols".format(train_rows, train_cols))
print("valid.shape: {} rows, {} cols".format(valid_rows, valid_cols))

train.shape: 184506 rows, 249 cols
valid.shape: 123005 rows, 249 cols


## Обучение базовой модели

In [None]:
# Baseline: L2-regularised logistic regression on all features.
# fit() returns the estimator itself, so the chained call leaves the fitted
# model in `model`; the bare name on the last line displays its repr.
model = LogisticRegression(C=0.001, penalty="l2").fit(X_train, y_train)
model

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Score the baseline on the validation split with ROC AUC.
y_pred = model.predict_proba(X_valid)
# Column 1 is passed as the score of the positive class
# (assumes TARGET is binary 0/1 - TODO confirm).
positive_proba = y_pred[:, 1]
score = roc_auc_score(y_valid, positive_proba)

print(f"Baseline score: {round(score, 4)}")

Baseline score: 0.7284


## Отбор признаков с помощью генетики

__Описание объекта `GeneticSelectionCV`__:

* max_features - максимальное количество отобранных переменных;
* n_population - размер популяции генетического алгоритма;
* crossover_proba - вероятность для осуществления кроссовера;
* mutation_proba - вероятность для осуществления мутации;
* n_generations - количество итераций генетического алгоритма;

In [None]:
# Estimator evaluated by the genetic search. It was not defined in any visible
# cell, so this cell raised NameError on a fresh kernel. It is configured the
# same way as the baseline and final models so the scores are comparable.
estimator = LogisticRegression(penalty="l2", C=0.001)

selector = GeneticSelectionCV(
    estimator,
    cv=5,                              # 5-fold cross-validation per candidate
    verbose=1,
    scoring="roc_auc",                 # same metric as the validation score
    max_features=50,                   # upper bound on selected features
    n_population=249,                  # equals the 249 columns of X_train
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=50,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.05,
    tournament_size=3,
    n_gen_no_change=5,                 # early-stopping patience, in generations
    caching=True,
    n_jobs=-1                          # use all available cores
)
# NOTE(review): the search is stochastic and no random seed is set anywhere in
# the visible cells, so the selected subset may differ between runs.

In [None]:
# Run the genetic search on the training data; any remaining NaNs are replaced
# with 0 before fitting.
X_train_filled = X_train.fillna(0)
selector = selector.fit(X_train_filled, y_train)

Selecting features with genetic algorithm.
gen	nevals	avg                              	std                    	min              	max              
0  	249   	[-10000.            124.40963855]	[0.         8.01860709]	[-10000.    105.]	[-10000.    149.]
1  	147   	[-10000.            118.00401606]	[0.         5.83680195]	[-10000.     97.]	[-10000.    137.]
2  	138   	[-10000.            113.36546185]	[0.         4.92666826]	[-10000.     97.]	[-10000.    133.]
3  	149   	[-10000.            109.80321285]	[0.         5.15204854]	[-10000.     97.]	[-10000.    126.]
4  	146   	[-10000.            106.04016064]	[0.         5.01947948]	[-10000.     90.]	[-10000.    121.]
5  	167   	[-10000.            102.14457831]	[0.         5.66777517]	[-10000.     84.]	[-10000.    118.]
6  	158   	[-10000.             97.73092369]	[0.         5.44817566]	[-10000.     84.]	[-10000.    116.]
7  	159   	[-10000.             94.30522088]	[0.         5.39626645]	[-10000.     79.]	[-10000.    108.]
8  	168   	[

In [None]:
# Display the selector's feature mask; a later cell uses it to index
# X_train.columns and obtain the names of the selected features.
selector.support_

## Проверим модельку на отобранных признаках

In [None]:
# Names of the columns kept by the genetic search, followed by a two-row
# preview of the reduced training matrix.
selected_features = X_train.columns[selector.support_]
X_train.loc[:, selected_features].head(2)

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,OWN_CAR_AGE,REGION_RATING_CLIENT_W_CITY,...,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_University,WALLSMATERIAL_MODE_Mixed,COMMUNICATION_TYPE_PUSH
0,-0.576826,-0.300016,-0.369717,0.007297,-0.237377,-0.579955,-0.47426,0.558594,1.542072,-0.065096,...,0,0,0,0,0,0,0,0,0,1
1,0.80478,0.22501,-0.753393,-1.090716,-0.931573,-0.101438,-0.45729,0.524575,-0.456493,-0.065096,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Refit the same logistic regression, now restricted to the selected features.
# fit() returns the estimator itself; the bare name displays its repr.
X_train_selected = X_train[selected_features]
model = LogisticRegression(C=0.001, penalty="l2").fit(X_train_selected, y_train)
model

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Validation ROC AUC of the model trained on the selected features only.
y_pred = model.predict_proba(X_valid[selected_features])
# Column 1 is passed as the score of the positive class
# (assumes TARGET is binary 0/1 - TODO confirm).
score = roc_auc_score(y_valid, y_pred[:, 1])

# The label used to say "Baseline score", copied from the baseline cell, which
# misreported what this number measures.
print(f"Selected features score: {round(score, 4)}")

Baseline score: 0.7245
