In [1475]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shap

import category_encoders as ce
from imblearn.over_sampling import SMOTE, ADASYN
from scipy.stats import pearsonr, spearmanr, chi2_contingency
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [1476]:
# Load the two zipped CSV files: `ar` (application_record) and `cr` (credit_record).
ar = pd.read_csv('data/application_record.zip', compression='zip')
cr = pd.read_csv('data/credit_record.zip', compression='zip')

In [1477]:
# general information about the application dataset (columns, non-null counts, dtypes)
ar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

In [1478]:
# number of unique IDs (out of 438,557 rows in total)
ar['ID'].nunique()

438510

In [1479]:
# Keep a single row per applicant ID (the last occurrence wins), then show the new shape.
ar = ar.drop_duplicates(subset='ID', keep='last')
ar.shape

(438510, 18)

In [1480]:
# general information about the dataset that holds the credit overdue records
cr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [1481]:
# number of unique IDs in the credit records
cr['ID'].unique().size

45985

In [1482]:
# count the missing (null) values per feature
ar.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134187
CNT_FAM_MEMBERS             0
dtype: int64

In [None]:
# visual check for missing values: heatmap of the boolean null mask of `ar`
sns.heatmap(ar.isnull())

In [None]:
# OCCUPATION_TYPE has too many missing values, so the column is removed.
ar = ar.drop(columns='OCCUPATION_TYPE')

In [None]:
# visual check for missing values again, after OCCUPATION_TYPE was dropped
sns.heatmap(ar.isnull())

In [None]:
# visual check for missing values in the credit records
sns.heatmap(cr.isnull())

In [None]:
# Variance of every numeric column of `ar`; the printed header is Russian for "Feature variances".
variances = ar.select_dtypes(include=['number']).var()
print("Дисперсия признаков:\n", variances)

In [None]:
# FLAG_MOBIL has very low variance, so the column is removed.
ar = ar.drop(columns='FLAG_MOBIL')

In [None]:
cr['STATUS'].value_counts() 

- **0:** 1-29 days past due 
- **1:** 30-59 days past due 
- **2:** 60-89 days overdue 
- **3:** 90-119 days overdue 
- **4:** 120-149 days overdue 
- **5:** Overdue or bad debts, write-offs for more than 150 days 
- **C:** paid off that month 
- **X:** No loan for the month

In [None]:
# Fold the codes 'C' and 'X' into 0 and cast the column to integers,
# then show how often each resulting code occurs.
cr['STATUS'] = cr['STATUS'].replace(['C', 'X'], 0).astype(int)
cr['STATUS'].value_counts()

In [None]:
overdue = cr.copy()


In [None]:
# Highest STATUS code ever recorded for each client ID.
# Only STATUS is aggregated (MONTHS_BALANCE was computed and immediately dropped before),
# and the method form .max() replaces agg(max): passing the Python builtin to agg is
# deprecated in recent pandas. The result keeps the same two columns: ID and STATUS.
overdue = overdue.groupby('ID', as_index=False)['STATUS'].max()
overdue

In [None]:
# Inner-join the application features with the per-ID STATUS on ID:
# only IDs present in both frames are kept.
df = ar.join(overdue.set_index('ID'), on='ID', how='inner')
df.head()

In [None]:
# cr['STATUS'] = cr['STATUS'].apply(lambda x: 1 if x >= 1 else 0)
# cr['STATUS'].value_counts() 

# Binary target: 1 when the STATUS code is 1 or higher, otherwise 0.
df['STATUS'] = (df['STATUS'] >= 1).astype(int)
df['STATUS'].value_counts()

In [None]:
# # убираем поле с информацией о дате просрочки, оставляем только таргет
# cr = cr.groupby('ID').agg(max).reset_index()
# cr.drop('MONTHS_BALANCE', axis=1, inplace=True)
# cr

In [None]:
# # объединяем в итоговом датафрейме признаки с таргетом
# df = ar.join(cr.set_index('ID'), on='ID', how='inner')
# df.head()

In [None]:
# Вставляем доп.признак с количеством дней просрочки на предпоследнюю позицию
# df.insert(len(df.columns) - 1, 'OVERDUE', df['STATUS'])

# df['OVERDUE'].value_counts()

In [None]:
# df['STATUS'].value_counts() 

In [None]:
# df['STATUS'] = df['STATUS'].apply(lambda x: 1 if x >= 1 else 0)
# df['STATUS'].value_counts() 

In [None]:
# Pearson correlation matrix of the numeric columns.
# df still contains string-typed columns at this point (they are encoded further down),
# and DataFrame.corr() raises on those in pandas >= 2.0 unless numeric_only=True is passed.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Матрица корреляций (Пирсон)')
plt.show()

In [None]:
correlation_results = []

# Pearson and Spearman correlation of every numeric feature with the target STATUS.
for col in df.select_dtypes(include='number').columns:
    # Skip the target itself: the column is named 'STATUS' (the previous 'target'
    # check never matched, so STATUS was correlated with itself).
    if col != 'STATUS':
        pearson_corr, _ = pearsonr(df[col], df['STATUS'])
        spearman_corr, _ = spearmanr(df[col], df['STATUS'])
        correlation_results.append((col, pearson_corr, spearman_corr))

# Collect the results in a DataFrame, strongest positive Pearson correlation first
correlation_df = pd.DataFrame(correlation_results, columns=['Feature', 'Pearson', 'Spearman'])
print(correlation_df.sort_values(by='Pearson', ascending=False))

In [None]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# vif_data = pd.DataFrame()
# vif_data['Feature'] = df.columns
# vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
# print(vif_data)
# VIF > 5 или 10 указывает на сильную мультиколлинеарность.

In [None]:
# Features removed before encoding. CNT_FAM_MEMBERS, DAYS_BIRTH, CODE_GENDER,
# FLAG_PHONE, FLAG_WORK_PHONE and FLAG_EMAIL were considered as well but are kept.
df = df.drop(columns=[
    'CNT_CHILDREN',
    'NAME_FAMILY_STATUS',
    'NAME_EDUCATION_TYPE',
    'DAYS_EMPLOYED',
])

В соответствии с типом признака (номинальный или порядковый) и количеством уникальных значений определим, каким образом будем кодировать категориальные переменные.

In [None]:
# inspect the column dtypes
df.dtypes

In [None]:
# Names of the columns whose dtype is `object`:
# build a (column name, is-object flag) frame, then keep the names where the flag is True.
object_type = pd.DataFrame(df.dtypes =='object').reset_index()
object_type = object_type[object_type[0] == True]['index']
object_type

In [None]:
df['FLAG_OWN_CAR'].value_counts() # binary encoder

In [None]:
df['FLAG_OWN_REALTY'].value_counts()

In [None]:
df['NAME_INCOME_TYPE'].value_counts() # values are not equally frequent, so TargetEncoding is not suitable; treated as ordinal -> LabelEncoding

In [None]:
df['NAME_HOUSING_TYPE'].value_counts() # values are not equally frequent, so target encoding is not suitable -> BinaryEncoding

In [None]:
# Encode the two-valued string flags as 0/1 integers.
# The explicit astype(int) makes the integer dtype independent of replace()'s implicit
# downcasting of object columns (deprecated in pandas 2.2) and raises immediately if a
# value outside the mapping is present, instead of leaving a string column behind.
df['FLAG_OWN_CAR'] = df['FLAG_OWN_CAR'].replace({'Y' : 1, 'N' : 0}).astype(int)
df['FLAG_OWN_REALTY'] = df['FLAG_OWN_REALTY'].replace({'Y' : 1, 'N' : 0}).astype(int)
df['CODE_GENDER'] = df['CODE_GENDER'].replace({'F' : 1, 'M' : 0}).astype(int)

In [None]:
# Integer-encode the remaining categorical features with LabelEncoder.
# The features are encoded in this order, so `le` ends up fitted on NAME_HOUSING_TYPE.
for col in ('NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE'):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
# mean_encoding = df.groupby('NAME_FAMILY_STATUS')['STATUS'].mean()
# df['NAME_FAMILY_STATUS'] = df['NAME_FAMILY_STATUS'].map(mean_encoding)

In [None]:
# inspect the column dtypes after encoding
df.dtypes

In [None]:
df

In [None]:
df['STATUS'].value_counts()

In [None]:
# Pearson correlation matrix of the encoded frame, drawn as an annotated heatmap.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Матрица корреляций (Пирсон)')
plt.show()

In [None]:
correlation_results = []

# Pearson and Spearman correlation of every numeric feature with the target STATUS.
for col in df.select_dtypes(include='number').columns:
    # Skip the target itself: the column is named 'STATUS' (the previous 'target'
    # check never matched, so STATUS was correlated with itself).
    if col != 'STATUS':
        pearson_corr, _ = pearsonr(df[col], df['STATUS'])
        spearman_corr, _ = spearmanr(df[col], df['STATUS'])
        correlation_results.append((col, pearson_corr, spearman_corr))

# Collect the results in a DataFrame, strongest positive Pearson correlation first
correlation_df = pd.DataFrame(correlation_results, columns=['Feature', 'Pearson', 'Spearman'])
print(correlation_df.sort_values(by='Pearson', ascending=False))

In [None]:
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# vif_data = pd.DataFrame()
# vif_data['Feature'] = df.columns
# vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
# print(vif_data)
# VIF > 5 или 10 указывает на сильную мультиколлинеарность.

In [None]:
# Box plot of AMT_INCOME_TOTAL for each value of the binary target STATUS.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='STATUS', y='AMT_INCOME_TOTAL', ax=ax)
ax.set_title('Распределение дохода для дефолтных и недефолтных заемщиков')
plt.show()

In [None]:
# # проверяем наличие выбросов с помощью визуализации

# fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(18,10))
# sns.scatterplot(x=df['ID'], y=df['FLAG_OWN_CAR'], data=df, ax=ax[0][0])
# sns.scatterplot(x=df['ID'], y=df['FLAG_OWN_REALTY'], data=df, ax=ax[0][1])
# sns.scatterplot(x=df['ID'], y=df['AMT_INCOME_TOTAL'], data=df, ax=ax[0][2])

# sns.scatterplot(x=df['ID'], y=df['NAME_INCOME_TYPE'], data=df, ax=ax[1][0])
# sns.scatterplot(x=df['ID'], y=df['NAME_HOUSING_TYPE'], data=df, ax=ax[1][1])
# sns.scatterplot(x=df['ID'], y=df['FLAG_WORK_PHONE'], data=df, ax=ax[1][2])

# sns.scatterplot(x=df['ID'], y=df['FLAG_PHONE'], data=df, ax=ax[2][0])
# sns.scatterplot(x=df['ID'], y=df['FLAG_EMAIL'], data=df, ax=ax[2][1])

In [None]:
# # Вычисляем Z-оценки и удаляем выбросы (|Z| > 3)
# df['z_score'] = zscore(df['AMT_INCOME_TOTAL'])
# df = df[np.abs(df['z_score']) <= 3]

In [None]:
# # Вычисляем Z-оценки и удаляем выбросы (|Z| > 3)
# df['z_score'] = zscore(df['NAME_INCOME_TYPE'])
# df = df[np.abs(df['z_score']) <= 3]

In [None]:
# # проверяем наличие выбросов с помощью визуализации после очистки

# fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(18,10))
# sns.scatterplot(x=df['ID'], y=df['FLAG_OWN_CAR'], data=df, ax=ax[0][0])
# sns.scatterplot(x=df['ID'], y=df['FLAG_OWN_REALTY'], data=df, ax=ax[0][1])
# sns.scatterplot(x=df['ID'], y=df['AMT_INCOME_TOTAL'], data=df, ax=ax[0][2])

# sns.scatterplot(x=df['ID'], y=df['NAME_INCOME_TYPE'], data=df, ax=ax[1][0])
# sns.scatterplot(x=df['ID'], y=df['NAME_HOUSING_TYPE'], data=df, ax=ax[1][1])
# sns.scatterplot(x=df['ID'], y=df['FLAG_WORK_PHONE'], data=df, ax=ax[1][2])

# sns.scatterplot(x=df['ID'], y=df['FLAG_PHONE'], data=df, ax=ax[2][0])
# sns.scatterplot(x=df['ID'], y=df['FLAG_EMAIL'], data=df, ax=ax[2][1])

In [None]:
# генерация признаков + PSI 

In [None]:
# Convert DAYS_BIRTH to age in completed years. The value is negated, i.e. treated as a
# negative day count. Negate first and floor-divide afterwards: floor division of the
# negative number rounds away from zero (-7301 // 365 == -21), which overstated almost
# every age by one year.
df['DAYS_BIRTH'] = (-df['DAYS_BIRTH']) // 365
df['DAYS_BIRTH']

In [None]:
# Histogram of the DAYS_BIRTH column (10 bins).
fig, ax = plt.subplots()
ax.hist(df['DAYS_BIRTH'], bins=10, edgecolor='black')
plt.show()

In [None]:
# Bin edges; with right=False every bin is left-closed: [20, 30), [30, 40), ..., [60, 100)
bins = [20, 30, 40, 50, 60, 100]

# Labels spell out the intervals pd.cut actually produces below
# (the previous '21-30', '31-40', ... described right-closed bins).
labels = ['20-29', '30-39', '40-49', '50-59', '60+']

# Replace the numeric value by its bin label
df['DAYS_BIRTH'] = pd.cut(df['DAYS_BIRTH'], bins=bins, labels=labels, right=False)

In [None]:
df['DAYS_BIRTH'].value_counts()

In [None]:
# Mean (target) encoding: replace each DAYS_BIRTH bin by the mean of STATUS inside it.
# NOTE(review): the means are computed on the whole frame, before the train/test split
# further down, so test-set targets leak into this feature - consider fitting the
# encoding on the training rows only.
# observed is passed explicitly because its default for categorical groupers is
# deprecated; astype(float) guarantees a plain numeric column instead of a categorical.
mean_encoding = df.groupby('DAYS_BIRTH', observed=False)['STATUS'].mean()
df['DAYS_BIRTH'] = df['DAYS_BIRTH'].map(mean_encoding).astype(float)

In [None]:
df

In [None]:
# Features: every column except the row identifier and the target. Selecting by name
# (instead of the positional slice 1:-1) cannot silently keep the target or lose a
# feature if the column order ever changes.
X = df.drop(columns=['ID', 'STATUS'])
y = df['STATUS']

In [None]:
# Split into train and test sets (80/20). stratify=y keeps the share of the rare
# positive class the same in both parts, so the test metrics are not driven by
# how many positives happened to land in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Bring the features to a common scale; RobustScaler is not sensitive to outliers,
# unlike MinMaxScaler. The scaler is fitted on the training part only.
# The original row index is kept so the scaled frames stay aligned with y_train / y_test.
rs = RobustScaler()
X_scaled = pd.DataFrame(rs.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(rs.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# # приводим признаки к одной шкале, MinMaxScaler() чувствителен к выбросам 
# rs = MinMaxScaler()
# X_scaled = pd.DataFrame(rs.fit_transform(X_train), columns=X_train.columns)
# X_test_scaled = pd.DataFrame(rs.transform(X_test), columns=X_test.columns)

In [None]:
X_scaled

In [None]:
y_train.value_counts()

In [None]:
# SMOTE (Synthetic Minority Over-sampling Technique) applied to the training part only.
# random_state is fixed so the synthetic samples - and every metric computed
# downstream - are reproducible on a fresh run.
oversample = SMOTE(random_state=42)
X_balanced, y_balanced = oversample.fit_resample(X_scaled, y_train)

In [None]:
y_balanced.value_counts()

In [None]:
# def calculate_psi(expected, actual, bins=10, epsilon=1e-6):
#     # Разбиваем данные на бины
#     breakpoints = np.histogram_bin_edges(expected, bins=bins)
#     expected_counts, _ = np.histogram(expected, bins=breakpoints)
#     actual_counts, _ = np.histogram(actual, bins=breakpoints)
    
#     # Добавляем epsilon, чтобы избежать деления на ноль
#     expected_counts = expected_counts + epsilon
#     actual_counts = actual_counts + epsilon
    
#     # Нормализуем до процентов
#     expected_percents = expected_counts / np.sum(expected_counts)
#     actual_percents = actual_counts / np.sum(actual_counts)
    
#     # Рассчитываем PSI
#     psi = np.sum((actual_percents - expected_percents) * np.log(actual_percents / expected_percents))
    
#     return psi

# def calculate_psi_for_dataframe(df_expected, df_actual, bins=10):
#     psi_results = {}
    
#     for column in df_expected.columns:
#         expected = df_expected[column]
#         actual = df_actual[column]
#         psi_value = calculate_psi(expected, actual, bins=bins)
#         psi_results[column] = psi_value
    
#     return psi_results

In [None]:
# # Рассчитываем PSI для всех признаков
# psi_results = calculate_psi_for_dataframe(X_balanced, X_test_balanced, bins=10)

# # Выводим результаты
# for feature, psi in psi_results.items():
#     print(f"PSI для признака {feature}: {psi:.4f}")

In [None]:
# classifiers = {
#     "LogisticRegression" : LogisticRegression(max_iter=1000),
#     "KNeighbors" : KNeighborsClassifier(),
#     "SVC" : SVC(),
#     "DecisionTree" : DecisionTreeClassifier(),
#     "RandomForest" : RandomForestClassifier(),
#     "XGBoost" : XGBClassifier()
# }

In [None]:
# train_scores = []
# test_scores = []

# for key, classifier in classifiers.items():
#     classifier.fit(X_balanced, y_balanced)
#     train_score = classifier.score(X_balanced, y_balanced)
#     train_scores.append(train_score)
#     test_score = classifier.score(X_test, y_test)
#     test_scores.append(test_score)

# print(train_scores)
# print(test_scores)

In [None]:
# Fit XGBoost on the scaled, SMOTE-balanced training data.
xgb = XGBClassifier()
model = xgb.fit(X_balanced, y_balanced)
train_score = xgb.score(X_balanced, y_balanced)
# The model was trained on RobustScaler-transformed features, so it must be evaluated
# on X_test_scaled; scoring the raw X_test fed it features on a different scale.
test_score = xgb.score(X_test_scaled, y_test)
xgb_prediction = xgb.predict(X_test_scaled)

print(f"xgb train score = {train_score}")
print(f"xgb test score = {test_score}")

In [None]:
print(classification_report(y_test, xgb_prediction)) #1 xgb

In [None]:
# Compute AUC from the predicted probability of the positive class: with hard 0/1
# labels the ROC curve collapses to a single threshold and understates the ranking quality.
# The scaled test features are used because the model was trained on scaled data.
xgb_proba = xgb.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, xgb_proba)

# Gini coefficient derived from AUC
gini = 2 * auc - 1
print(f"xgb AUC: {auc}")
print(f"xgb Gini: {gini}")

In [None]:
# Confusion matrix of the XGBoost predictions against the true test labels.
cm = confusion_matrix(y_test, xgb_prediction)
print("Confusion Matrix:\n", cm)

In [None]:
# Build a SHAP explainer for the fitted XGBoost model. `xgb` is referenced directly:
# `model` is rebound to the logistic regression further down, so using it here breaks
# as soon as this cell is re-run after that one. (shap is imported with the other
# imports at the top of the notebook.)
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_balanced)

# Feature-importance summary plot
shap.summary_plot(shap_values, X_balanced, feature_names=X_balanced.columns)

In [None]:
# SHAP Explanation object for the training data and the waterfall plot of its first row.
# `xgb` is used instead of `model`, which a later cell rebinds to the logistic regression.
explainer = shap.TreeExplainer(xgb)
shap_values = explainer(X_balanced)
shap.plots.waterfall(shap_values[0])

In [None]:
# Waterfall plot for the second row. The Explanation object computed in the previous
# cell is reused: rebuilding the explainer and recomputing the SHAP values for the
# whole training set produced the same object at a noticeable cost.
shap.plots.waterfall(shap_values[1])

In [None]:
shap.plots.bar(shap_values)

In [None]:
# Fit logistic regression on the scaled, SMOTE-balanced training data.
logreg = LogisticRegression(max_iter=1000)
model = logreg.fit(X_balanced, y_balanced)
train_score = logreg.score(X_balanced, y_balanced)
# The model was trained on RobustScaler-transformed features, so it must be evaluated
# on X_test_scaled; scoring the raw X_test fed it features on a different scale.
test_score = logreg.score(X_test_scaled, y_test)
logreg_prediction = logreg.predict(X_test_scaled)

print(f"logreg train score = {train_score}")
print(f"logreg test score = {test_score}")

In [None]:
print(classification_report(y_test, logreg_prediction)) #logreg

In [None]:
# Compute AUC from the predicted probability of the positive class: with hard 0/1
# labels the ROC curve collapses to a single threshold and understates the ranking quality.
# The scaled test features are used because the model was trained on scaled data.
logreg_proba = logreg.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, logreg_proba)

# Gini coefficient derived from AUC
gini = 2 * auc - 1
print(f"logreg AUC: {auc}")
print(f"logreg Gini: {gini}")