# 고객 대출등급 예측: Modeling

In [3]:
import numpy as np
import random
import os
import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

import pandas as pd

# Load the dataset
file_path = 'data/train.csv'
data = pd.read_csv(file_path)

## 데이터 전처리

In [5]:
# 결측치 확인
missing_values = data.isnull().sum()

# 결측치가 있는 컬럼만 출력
missing_values = missing_values[missing_values > 0]

missing_values

Series([], dtype: int64)

#### Encoding

In [6]:
# 범주형 변수 재확인
categorical_columns = data.select_dtypes(include=['object']).columns

# 범주형 변수 출력
categorical_columns


Index(['ID', '대출기간', '근로기간', '주택소유상태', '대출목적', '대출등급'], dtype='object')

In [7]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-Hot Encoder 생성
onehot_encoder = OneHotEncoder()

# Label Encoder 생성
label_encoder = LabelEncoder()

# One-Hot Encoding 적용할 범주형 변수 선택 (ID 제외)
onehot_columns = ['대출기간', '근로기간', '주택소유상태', '대출목적']
data_onehot = pd.DataFrame(onehot_encoder.fit_transform(data[onehot_columns]).toarray())

# Label Encoding 적용 (대출등급)
data['대출등급_encoded'] = label_encoder.fit_transform(data['대출등급'])

# 인코딩된 데이터와 기존 데이터 합치기
data_encoded = pd.concat([data, data_onehot], axis=1)

# 원본 범주형 변수 제거
data_encoded.drop(onehot_columns + ['대출등급'], axis=1, inplace=True)

data_encoded.head()


Unnamed: 0,ID,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,...,24,25,26,27,28,29,30,31,32,33
0,TRAIN_00000,12480000,72000000,18.9,15,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TRAIN_00001,14400000,130800000,22.33,21,0,373572,234060.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,TRAIN_00002,12000000,96000000,8.6,14,0,928644,151944.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TRAIN_00003,14400000,132000000,15.09,15,0,325824,153108.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TRAIN_00004,18000000,71736000,25.39,19,0,228540,148956.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Scaling

In [9]:
# 수치형 변수 재확인 (ID와 인코딩된 대출등급 제외)
numerical_columns_for_scaling = ['대출금액', '연간소득', '부채_대비_소득_비율', '총계좌수', '최근_2년간_연체_횟수', '총상환원금', '총상환이자', '총연체금액', '연체계좌수']

# Standard Scaling 적용
data_encoded[numerical_columns_for_scaling] = scaler.fit_transform(data_encoded[numerical_columns_for_scaling])

# 스케일링 후 데이터 확인
data_encoded.head()

Unnamed: 0,ID,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,...,24,25,26,27,28,29,30,31,32,33
0,TRAIN_00000,-0.563848,-0.220218,-0.014287,-0.852449,-0.376102,-0.800303,-0.972784,-0.038438,-0.072595,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TRAIN_00001,-0.377964,0.370332,0.08789,-0.356109,-0.376102,-0.436814,-0.441082,-0.038438,-0.072595,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,TRAIN_00002,-0.610319,0.020823,-0.321114,-0.935172,-0.376102,0.103276,-0.627621,-0.038438,-0.072595,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TRAIN_00003,-0.377964,0.382384,-0.127783,-0.852449,-0.376102,-0.483274,-0.624977,-0.038438,-0.072595,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TRAIN_00004,-0.029431,-0.22287,0.179044,-0.521556,-0.376102,-0.577932,-0.634409,-0.038438,-0.072595,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Modeling 1 : Random Forest

In [16]:
# 데이터 전처리 및 분할
X = data_encoded.drop(['ID', '대출등급_encoded'], axis=1)
y = data_encoded['대출등급_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모든 열 이름을 문자열로 변환
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# 랜덤 포레스트 분류기 훈련
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# 훈련 세트에서의 성능 평가
training_score = rf_classifier.score(X_train, y_train)

# 테스트 세트에서의 성능 평가
y_pred = rf_classifier.predict(X_test)
f1_test = f1_score(y_test, y_pred, average='macro')

training_score, f1_test


(1.0, 0.5567505123735182)

#### Hyper Parameters Tuning

In [17]:
from sklearn.model_selection import GridSearchCV

# 튜닝할 하이퍼파라미터 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearchCV 설정
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1_macro')

# GridSearchCV를 사용한 훈련
grid_search.fit(X_train, y_train)

# 최적의 파라미터와 그 때의 성능
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score

Fitting 3 folds for each of 81 candidates, totalling 243 fits


({'max_depth': 30,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 200},
 0.5224301306758062)

In [18]:
# 최적의 하이퍼파라미터로 랜덤 포레스트 분류기 재구성
rf_classifier_optimized = RandomForestClassifier(
    n_estimators=200, 
    max_depth=30, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    random_state=42
)

# 최적화된 모델 훈련
rf_classifier_optimized.fit(X_train, y_train)

# 테스트 세트에서 예측
y_pred_optimized = rf_classifier_optimized.predict(X_test)

# 최적화된 모델의 성능 평가 (Macro F1-Score)
f1_test_optimized = f1_score(y_test, y_pred_optimized, average='macro')
f1_test_optimized

0.5556508176994651