In [111]:
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import *
from matplotlib import pyplot as plt
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell

In [112]:
# Notebook-wide display/runtime settings.
# Render matplotlib text with the GULIM font family (presumably so the Korean labels used in this notebook display correctly).
plt.rcParams['font.family'] = 'GULIM'
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [113]:
def standard_scale_train(train_data, feature_names):
    """
    Fit a sklearn StandardScaler on selected columns and return a z-scored copy.

    The input frame is left untouched; only the columns named in
    ``feature_names`` are replaced in the returned copy.

    :param train_data: training set (DataFrame)
    :param feature_names: list of column names to standardise
    :return: tuple of (copy of ``train_data`` with the selected columns scaled,
             the fitted StandardScaler)
    """
    scaler = StandardScaler()
    # Fit and transform on the selected columns only.
    scaled_block = scaler.fit_transform(train_data[feature_names])
    train_data_scaled = train_data.copy()
    train_data_scaled[feature_names] = scaled_block
    return train_data_scaled, scaler


def standard_scale_val(X_val, feature_names, scaler):
    """
    Apply an already-fitted scaler to selected columns of a validation set.

    The input frame is left untouched; only the columns named in
    ``feature_names`` are replaced in the returned copy.

    :param X_val: validation set (DataFrame)
    :param feature_names: list of column names to standardise
    :param scaler: StandardScaler previously fitted on the training set
    :return: copy of ``X_val`` with the selected columns scaled
    """
    # Transform only; the scaler is not re-fitted here.
    scaled_block = scaler.transform(X_val[feature_names])
    X_val_scaled = X_val.copy()
    X_val_scaled[feature_names] = scaled_block
    return X_val_scaled

# Data Check

In [114]:
# Read the raw CSV and discard the 'ID' column.
input_dir = 'Database/'
file_data = 'train.csv'
csv_path = os.path.join(input_dir, file_data)
data = pd.read_csv(csv_path).drop(columns=['ID'])
# Hold out 20% of the rows (shuffled, fixed seed) as the test split.
train, test = train_test_split(data, shuffle=True, random_state=42, test_size=0.2)

In [115]:
# Columns that are label-encoded further down.
# '대출등급' is the prediction target; it is listed here so it is encoded together with the features.
discrete_list = [
    '대출기간',
    '주택소유상태',
    '대출목적',
    '최근_2년간_연체_횟수',
    '연체계좌수',
    '총연체금액',
    '부채_대비_소득_비율',
    '대출등급',
]
# Columns that are standardised; the two ratio columns are derived later in the notebook.
continuous_list = [
    '대출금액',
    '연간소득',
    '총계좌수',
    '근로기간',
    '이자/대출',
    '원금/대출',
]

In [116]:
# Print the column names, non-null counts and dtypes of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77035 entries, 56034 to 15795
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   대출금액          77035 non-null  int64  
 1   대출기간          77035 non-null  object 
 2   근로기간          77035 non-null  object 
 3   주택소유상태        77035 non-null  object 
 4   연간소득          77035 non-null  int64  
 5   부채_대비_소득_비율   77035 non-null  float64
 6   총계좌수          77035 non-null  int64  
 7   대출목적          77035 non-null  object 
 8   최근_2년간_연체_횟수  77035 non-null  int64  
 9   총상환원금         77035 non-null  int64  
 10  총상환이자         77035 non-null  float64
 11  총연체금액         77035 non-null  float64
 12  연체계좌수         77035 non-null  float64
 13  대출등급          77035 non-null  object 
dtypes: float64(4), int64(5), object(5)
memory usage: 8.8+ MB


# General Preprocessing

In [117]:
# Bucket the debt-to-income ratio into five ordinal bands:
# (-1, 15], (15, 30], (30, 45], (45, 60], (60, inf).
bins = [-1, 15, 30, 45, 60, np.inf]
labels = list(range(5))
for frame in (train, test):
    frame['부채_대비_소득_비율'] = pd.cut(frame['부채_대비_소득_비율'], bins=bins, labels=labels)

In [118]:
# Collapse the total delinquent amount into a two-level flag:
# 0 for values in (-1, 100], 1 for anything above 100.
bins = [-1, 100.0, np.inf]
labels = list(range(2))
for frame in (train, test):
    frame['총연체금액'] = pd.cut(frame['총연체금액'], bins=bins, labels=labels)

In [119]:
# Bucket the number of delinquent accounts into three ordinal bands:
# (-1, 2], (2, 4], (4, inf).
bins = [-1, 2, 4, np.inf]
labels = list(range(3))
for frame in (train, test):
    frame['연체계좌수'] = pd.cut(frame['연체계좌수'], bins=bins, labels=labels)

In [120]:
# Bucket the delinquency count of the last two years into four ordinal bands:
# (-1, 6], (6, 12], (12, 18], (18, inf).
bins = [-1, 6, 12, 18, np.inf]
labels = list(range(4))
for frame in (train, test):
    frame['최근_2년간_연체_횟수'] = pd.cut(frame['최근_2년간_연체_횟수'], bins=bins, labels=labels)

In [121]:
# Reduce the free-text employment length to its first run of digits, as an integer.
# Rows whose text contains no digits at all become 0.
for frame in (train, test):
    first_number = frame['근로기간'].str.extract(r'(\d+)', expand=False)
    frame['근로기간'] = first_number.fillna(0).astype(int)

# Train 이산형변수 인코딩

In [122]:
# Label-encode every discrete column of the training split on a copy.
# One fitted encoder per column is kept in `ec_dict` so the test split can be
# encoded with the same mapping later.
df_1 = train.copy()
ec_dict = {col: LabelEncoder() for col in discrete_list}
for col, encoder in ec_dict.items():
    df_1[col] = encoder.fit_transform(df_1[col])

# Train 파생변수

In [123]:
# Add repaid-interest and repaid-principal ratios relative to the loan amount.
# Rows with a loan amount of 0 get a ratio of 0 instead of a division result.
df_2 = df_1.copy()
loan_amount = df_2['대출금액']
zero_loan = loan_amount == 0
for ratio_col, repaid_col in (('이자/대출', '총상환이자'), ('원금/대출', '총상환원금')):
    df_2[ratio_col] = np.where(zero_loan, 0, df_2[repaid_col] / loan_amount)

# Train 이상치제거

In [124]:
# Winsorize annual income: the lowest and highest 0.05% of values are pulled in
# to the nearest retained value.
income_tail_limits = [0.0005, 0.0005]
df_3 = df_2.copy()
df_3['연간소득'] = winsorize(df_3['연간소득'], limits=income_tail_limits)

In [125]:
# fig, ax = plt.subplots(figsize=(10, 8))
# df_3[continuous_list].boxplot(ax=ax)
# plt.xticks(rotation=45)
# fig.suptitle('Train Data', fontsize=16)
# fig.tight_layout(rect=[0, 0, 1, 0.96])
# fig.show()

# Train 칼럼삭제

In [126]:
# Drop the raw repayment totals; the ratio features derived from them stay in the frame.
df_4 = df_3.drop(columns=['총상환원금', '총상환이자'])
# Create the output folder if needed so the export does not fail when 'Files/' is missing.
os.makedirs('Files', exist_ok=True)
df_4.to_csv('Files/train_pre.csv')

# Train 분포저장

In [127]:
# Separate the target ('대출등급') from the features, then hold out 20% of the
# preprocessed training rows for validation (fixed seed).
target_col = '대출등급'
y = df_4[target_col]
X = df_4.drop(columns=[target_col])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [128]:
# Standardise the continuous features: fit the scaler on the training split only,
# then reuse the fitted scaler for the validation split.
# The helpers already return a full copy of the frame with the scaled columns
# swapped in, so their result is used directly. Writing that frame into a second
# copy through `.loc[:, continuous_list]` would push float values into int64
# columns in place, which newer pandas reports as an incompatible-dtype assignment
# (a warning that the global warnings filter in this notebook would hide).
X_train_norm, scaler = standard_scale_train(X_train, continuous_list)
X_val_norm = standard_scale_val(X_val, continuous_list, scaler)

In [129]:
# class_counts = y_val.value_counts()
# plt.pie(class_counts, labels=class_counts.index, startangle=140, autopct='%1.1f%%')
# plt.axis('equal')
# plt.title('대출등급')
# plt.show()

# 전처리 성능체크

In [130]:
# Fit a decision tree on the scaled training split and score it on the validation split.
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train_norm, y_train)
dt_pred = model_dt.predict(X_val_norm)

print('Decision Tree Classifier Performance\n')
print(f'Accuracy: {accuracy_score(y_val, dt_pred):.4f}')
# NOTE(review): in the recorded output the three micro-averaged scores are identical to accuracy.
micro_metrics = (
    ('Recall', recall_score),
    ('Precision', precision_score),
    ('F1 Score', f1_score),
)
for metric_name, metric_fn in micro_metrics:
    print(f'{metric_name} (micro): {metric_fn(y_val, dt_pred, average="micro"):.4f}')

Decision Tree Classifier Performance

Accuracy: 0.9452
Recall (micro): 0.9452
Precision (micro): 0.9452
F1 Score (micro): 0.9452


# Test 이산형변수 인코딩

In [131]:
# Encode the discrete columns of the test split with the encoders fitted on the
# training split (no re-fitting here).
# NOTE(review): presumably this fails on a category that never occurred in training — confirm.
df_1 = test.copy()
for col in discrete_list:
    fitted_encoder = ec_dict[col]
    df_1[col] = fitted_encoder.transform(df_1[col])

# Test 파생변수

In [132]:
# Add the same repaid-interest and repaid-principal ratios to the test frame.
# Rows with a loan amount of 0 get a ratio of 0 instead of a division result.
df_2 = df_1.copy()
loan_amount = df_2['대출금액']
zero_loan = loan_amount == 0
for ratio_col, repaid_col in (('이자/대출', '총상환이자'), ('원금/대출', '총상환원금')):
    df_2[ratio_col] = np.where(zero_loan, 0, df_2[repaid_col] / loan_amount)

# Test 이상치제거

In [133]:
# Cap annual income at the bounds the training split was winsorized to, instead of
# re-estimating the 0.05% tails from the test rows themselves. The model was trained
# on income limited to the training bounds, and bounds taken from the scored data
# would change with the size and content of that data.
# `train['연간소득']` still holds the raw income, so winsorizing it again with the
# same limits reproduces the training bounds as its min and max.
train_income_capped = winsorize(train['연간소득'], limits=[0.0005, 0.0005])
income_floor, income_cap = train_income_capped.min(), train_income_capped.max()
df_3 = df_2.copy()
df_3['연간소득'] = df_3['연간소득'].clip(lower=income_floor, upper=income_cap)

# Test 칼럼삭제

In [134]:
# Remove the raw repayment totals from the test frame; `drop` returns a new frame,
# so `df_3` itself is left unchanged.
repayment_cols = ['총상환원금', '총상환이자']
df_4 = df_3.drop(columns=repayment_cols)

# Test 이산형변수 칼럼통일

In [135]:
# Split the test frame into target and features.
Y2 = df_4['대출등급']
X2 = df_4.drop(columns=['대출등급'])

# Columns present in the training features but absent from the test features
# are added to the test features as all-zero columns.
absent_in_test = [name for name in X_val_norm.columns if name not in X2.columns]
X2[absent_in_test] = 0

# Selecting exactly the training columns, in training order, also discards any
# column that exists only in the test features.
X2 = X2[X_val_norm.columns]

# Test 연속형변수 분포통일

In [136]:
# Standardise the test features with the scaler that was fitted on the training split.
# The helper returns a full copy of the frame with the scaled columns swapped in, so
# its result is used directly. Writing it back through `.loc[:, continuous_list]`
# would push float values into int64 columns in place, which newer pandas reports as
# an incompatible-dtype assignment.
X2 = standard_scale_val(X2, continuous_list, scaler)

# Test 성능체크

In [139]:
# Score the already-fitted decision tree on the held-out test split.
pred = model_dt.predict(X2)

print('Decision Tree Classifier Performance\n')
print(f'Accuracy: {accuracy_score(Y2, pred):.4f}')
# NOTE(review): in the recorded output the three micro-averaged scores are identical to accuracy.
micro_metrics = (
    ('Recall', recall_score),
    ('Precision', precision_score),
    ('F1 Score', f1_score),
)
for metric_name, metric_fn in micro_metrics:
    print(f'{metric_name} (micro): {metric_fn(Y2, pred, average="micro"):.4f}')

Decision Tree Classifier Performance

Accuracy: 0.9439
Recall (micro): 0.9439
Precision (micro): 0.9439
F1 Score (micro): 0.9439


In [138]:
# Persist the preprocessed test features and their encoded target.
# Create the output folder if needed so the export does not fail when 'Files/' is missing.
os.makedirs('Files', exist_ok=True)
X2.to_csv('Files/test_pre_input.csv')
Y2.to_csv('Files/test_pre_output.csv')