In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.preprocessing import StandardScaler as ss
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

## Преобработка данных

In [2]:
df = pd.read_csv('application_data.csv') # загружаем данные

In [3]:
df.head() # знакомимся с данными

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df.rename(columns=str.lower, inplace=True) # приводим названия колонок в нижний регистр

In [5]:
df.info() # смотрим общую статистику

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62100 entries, 0 to 62099
Columns: 122 entries, sk_id_curr to amt_req_credit_bureau_year
dtypes: float64(74), int64(32), object(16)
memory usage: 57.8+ MB


In [6]:
df = df.select_dtypes(exclude='object') # удаляем категориальные колонки

In [7]:
# данные, где больше 50-60% пустые, создают фактор случайности, удаляем такие колонки
nan_threshold = round(df.shape[0] * 0.4)

In [8]:
nan_counts = df.isnull().sum().to_frame(name='counts').query('counts > 0')

In [9]:
cols_to_delete = nan_counts.query('counts > @nan_threshold').reset_index()['index'].tolist()

In [10]:
df = df.drop(columns=cols_to_delete)

In [11]:
# формируем список колонок, где нужно заменить пустые значения
cols_with_nan = df.isnull().sum().to_frame(name='counts').query('counts > 0').reset_index()['index'].tolist()

In [12]:
df[cols_with_nan].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62100 entries, 0 to 62099
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   amt_annuity                 62095 non-null  float64
 1   amt_goods_price             62050 non-null  float64
 2   cnt_fam_members             62099 non-null  float64
 3   ext_source_2                61952 non-null  float64
 4   ext_source_3                49793 non-null  float64
 5   obs_30_cnt_social_circle    61894 non-null  float64
 6   def_30_cnt_social_circle    61894 non-null  float64
 7   obs_60_cnt_social_circle    61894 non-null  float64
 8   def_60_cnt_social_circle    61894 non-null  float64
 9   days_last_phone_change      62099 non-null  float64
 10  flag_document_13            62099 non-null  float64
 11  flag_document_14            62099 non-null  float64
 12  flag_document_15            62099 non-null  float64
 13  flag_document_16            620

In [13]:
# заполняем пустые значения медианой, так как она более устойчива к выбросам
for col in cols_with_nan:
    df[col].fillna(df[col].median(), inplace=True)

In [14]:
# проверяем, что в цифровых данных не содержатся одинаковые значения
df.describe()

Unnamed: 0,sk_id_curr,target,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,...,flag_document_18,flag_document_19,flag_document_20,flag_document_21,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
count,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,...,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0,62100.0
mean,136015.838374,0.080386,0.419839,169954.0,599498.5,27076.086957,538696.5,0.02087,-16024.697746,63049.181514,...,0.008406,0.000676,0.000483,0.000338,0.006135,0.006763,0.028792,0.233897,0.227359,1.763591
std,20739.985265,0.271893,0.725057,478988.0,402968.5,14503.882851,370064.2,0.013792,4361.121119,140650.644829,...,0.091298,0.025998,0.021974,0.018386,0.082695,0.104886,0.187042,0.869564,0.574432,1.763247
min,100002.0,0.0,0.0,25650.0,45000.0,2052.0,45000.0,0.000533,-25184.0,-17531.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,118118.75,0.0,0.0,112500.0,270000.0,16456.5,238500.0,0.010006,-19652.0,-2790.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,136041.0,0.0,0.0,144000.0,513531.0,24903.0,450000.0,0.01885,-15746.0,-1218.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,153954.25,0.0,1.0,202500.0,808650.0,34587.0,679500.0,0.028663,-12387.0,-290.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,172018.0,1.0,11.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7676.0,365243.0,...,1.0,1.0,1.0,1.0,3.0,6.0,6.0,24.0,8.0,25.0


In [15]:
num_cols = df.select_dtypes(exclude='object').columns.tolist()

In [16]:
# нормализуем числовые данных
for num_col in num_cols:
  if num_col.startswith('flag')==False and num_col!='target':
    scaler = ss()
    df[num_col] = scaler.fit_transform(df[num_col].values.reshape(-1, 1))

In [17]:
df.head()

Unnamed: 0,sk_id_curr,target,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,...,flag_document_18,flag_document_19,flag_document_20,flag_document_21,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
0,-1.736459,1,-0.579047,0.067948,-0.478704,-0.163791,-0.507204,-0.150012,1.50506,-0.452801,...,0.0,0.0,0.0,0.0,-0.074192,-0.064483,-0.153936,-0.268984,-0.395802,-0.433063
1,-1.736411,0,-0.579047,0.208871,1.722243,0.594495,1.596502,-1.256441,-0.169752,-0.456718,...,0.0,0.0,0.0,0.0,-0.074192,-0.064483,-0.153936,-0.268984,-0.395802,-1.000203
2,-1.736362,0,-0.579047,-0.213898,-1.152701,-1.401435,-1.090891,-0.78581,-0.692787,-0.449871,...,0.0,0.0,0.0,0.0,-0.074192,-0.064483,-0.153936,-0.268984,-0.395802,-1.000203
3,-1.736266,0,-0.579047,-0.072975,-0.711764,0.179982,-0.653126,-0.931763,-0.683385,-0.469878,...,0.0,0.0,0.0,0.0,-0.074192,-0.064483,-0.153936,-0.268984,-0.395802,-0.433063
4,-1.736218,0,-0.579047,-0.10116,-0.214655,-0.359258,-0.069438,0.565035,-0.895947,-0.469871,...,0.0,0.0,0.0,0.0,-0.074192,-0.064483,-0.153936,-0.268984,-0.395802,-1.000203


## Уменьшение размерности и применение оверсемплинга

В качестве метода уменьшения размерности был выбран PCA, так как он использует дисперцию как способ измерить степень колебания значений столбца, а большая дисперсия подразумевает содержание большей информации. Для этого и была проведена нормализация данных.


In [18]:
X = df.drop(columns='target')
y = df['target']

In [19]:
y.value_counts(normalize=True) # посмотреть на распределение целевой переменной

0    0.919614
1    0.080386
Name: target, dtype: float64

В целях балансировки распределения целевой переменной мы применим оверсемплинг после уменьшения размерности.

In [20]:
# уменьшаем размерность в 3 раза
vectors = np.array(X.values.tolist())
pca20D = PCA(20)
X = pca20D.fit_transform(vectors)

In [21]:
# применяем оверсемплинг
method = SMOTE(random_state=42)
X_resampled, y_resampled = method.fit_resample(X, y)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, stratify=y_resampled, test_size=0.2, random_state=42)

## Построение модели

In [25]:
params = {'learning_rate': [0.01, 0.05, 0.1], 'max_depth': [7,10,15], 'n_estimators': [100,200,300]}

xgb_model_init = xgb.XGBClassifier(random_state=42)
random_cv = RandomizedSearchCV(xgb_model_init, params, cv=3, n_iter=7).fit(X_train, y_train)

xgb_model = random_cv.best_estimator_

In [26]:
xgb_preds = xgb_model.predict(X_test)

In [27]:
def get_metrics(sample_name, model_name, model_preds, y_true):
    metrics = {'sample': sample_name ,'model':model_name, 'accuracy': round(accuracy_score(y_true, model_preds), 3),\
                'recall': round(recall_score(y_true, model_preds), 3), \
                'precision': round(precision_score(y_true, model_preds), 3), \
                'f1': round(f1_score(y_true, model_preds), 3), \
                'roc_auc': round(roc_auc_score(y_true, model_preds), 3)}
    return metrics

In [28]:
metrics = []
metrics.append(get_metrics('train','xgb', xgb_model.predict(X_train), y_train))
metrics.append(get_metrics('test','xgb', xgb_preds, y_test))

In [29]:
pd.DataFrame(metrics)

Unnamed: 0,sample,model,accuracy,recall,precision,f1,roc_auc
0,train,xgb,0.995,1.0,0.989,0.995,0.995
1,test,xgb,0.922,0.956,0.894,0.924,0.922


Была выбрана модель бустинга, так как она на практике показывает результат лучше, чем классические модели. Гиперпараметры были выбраны путем подбора. Модель показывает хорошее качество (~0.9). При этом модель несильно переобучена, так как метрики на тренировочной и обучающей выборки близки по значению.