In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

test_df = pd.read_csv("./data/test.csv")
train_df = pd.read_csv("./data/train.csv")

### 내 풀이

In [4]:
train_df

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,TRAIN_00000,12480000,36 months,6 years,RENT,72000000,18.90,15,부채 통합,0,0,0.0,0.0,0.0,C
1,TRAIN_00001,14400000,60 months,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B
2,TRAIN_00002,12000000,36 months,5 years,MORTGAGE,96000000,8.60,14,부채 통합,0,928644,151944.0,0.0,0.0,A
3,TRAIN_00003,14400000,36 months,8 years,MORTGAGE,132000000,15.09,15,부채 통합,0,325824,153108.0,0.0,0.0,C
4,TRAIN_00004,18000000,60 months,Unknown,RENT,71736000,25.39,19,주요 구매,0,228540,148956.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96289,TRAIN_96289,14400000,36 months,10+ years,MORTGAGE,210000000,9.33,33,신용 카드,0,974580,492168.0,0.0,0.0,C
96290,TRAIN_96290,28800000,60 months,10+ years,MORTGAGE,132000000,5.16,25,주택 개선,0,583728,855084.0,0.0,0.0,E
96291,TRAIN_96291,14400000,36 months,1 year,MORTGAGE,84000000,11.24,22,신용 카드,0,1489128,241236.0,0.0,0.0,A
96292,TRAIN_96292,15600000,36 months,5 years,MORTGAGE,66330000,17.30,21,부채 통합,2,1378368,818076.0,0.0,0.0,D


In [4]:
# Column bookkeeping for this approach.
drop_col = ['ID']
# NOTE(review): named int_col, but '부채_대비_소득_비율' holds decimal values in the preview above.
int_col = ['대출금액', '연간소득', '부채_대비_소득_비율', '총계좌수', '최근_2년간_연체_횟수', '총상환원금', '총상환이자', '총연체금액', '연체계좌수']
# Columns that still need separate handling: '대출기간', '근로기간'
str_col = ['주택소유상태', '대출목적']
y_col = ['대출등급']

# Split off the prediction target (y_col is a list, so y is a one-column DataFrame)
y = train_df[y_col]
train_df.drop(columns=y_col, inplace=True)

# Keep the test IDs for the submission, then drop the ID column from both frames.
# NOTE(review): `id` shadows the Python builtin; a later cell reads it when building `submission`.
# NOTE(review): the in-place drops mutate train_df/test_df, so re-running this cell
# without reloading the CSVs presumably fails because the columns are already gone.
id = test_df['ID']
train_df.drop(columns = drop_col, inplace=True)
test_df.drop(columns = drop_col, inplace=True)

In [5]:
# Loan-term encoding: a 36-month loan becomes 1, every other value becomes 2.
# The text is whitespace-stripped before comparing, so ' 36 months' and
# '36 months' are both recognised. The comparison is vectorised, and `.str`
# raises if the column is no longer text (e.g. the cell is run a second time)
# instead of silently turning every row into 2.
for frame in (train_df, test_df):
    is_36_months = frame['대출기간'].str.strip().eq('36 months')
    frame['대출기간'] = is_36_months.map({True: 1, False: 2})

# Drop 근로기간 and 부채_대비_소득_비율 from both frames
for frame in (train_df, test_df):
    frame.drop(columns = ['근로기간', '부채_대비_소득_비율'], inplace=True)

In [7]:
#스케일링, 인코딩 함수 만들기 Robustscaler 사용
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split


encoder = LabelEncoder()
scaler = RobustScaler()


def scaling_df(train_df, test_df):
    """Label-encode the two categorical columns, then robust-scale every column.

    The categorical columns are encoded on the stacked train+test frame so both
    parts share the same integer code per category. The module-level `encoder`
    is re-fitted for each column (it ends up fitted on '대출목적'), and the
    module-level `scaler` is fitted on the training rows only.

    Parameters
    ----------
    train_df, test_df : DataFrames with identical columns, including
        '주택소유상태' and '대출목적'.

    Returns
    -------
    tuple
        (scaled train matrix, scaled test matrix) as returned by the scaler.
    """
    n_train = len(train_df)

    stacked = pd.concat([train_df, test_df], axis=0)
    for categorical in ('주택소유상태', '대출목적'):
        stacked[categorical] = encoder.fit_transform(stacked[categorical])

    # Split the stacked frame back into its train and test parts by position
    train_part, test_part = stacked.iloc[:n_train, :], stacked.iloc[n_train:, :]

    # Shape check: the parts must match the inputs
    print('사이즈 확인 :', train_df.shape, train_part.shape, test_df.shape, test_part.shape)

    # Scale all columns; fit on train, reuse the fitted statistics for test
    return scaler.fit_transform(train_part), scaler.transform(test_part)

In [8]:
train_df_scaled, test_df_scaled = scaling_df(train_df, test_df)

사이즈 확인 : (96294, 11) (96294, 11) (64197, 11) (64197, 11)


In [11]:
# Encode the target grades as integers
label_order = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}

import numpy as np

# Fit the shared encoder on the explicit grade list rather than on the data.
# Assigning `encoder.classes_` by hand and then calling fit_transform (as before)
# had no effect, because fitting recomputes classes_ from whatever it is given.
grade_labels = sorted(label_order, key=label_order.get)
encoder.fit(np.array(grade_labels, dtype=object))

# The encoder sorts its classes itself; verify the result agrees with label_order
assert list(encoder.classes_) == grade_labels, "encoder class order differs from label_order"

# Apply the encoding. An unexpected grade raises here instead of silently
# becoming an extra class.
y_array = np.array(y['대출등급'])
y_array_encoded = encoder.transform(y_array)

# Rebind y to a new frame instead of assigning into a slice of train_df
y = y.assign(**{'대출등급': y_array_encoded})

In [12]:
y.value_counts() / len(y) #비율 체크

대출등급
1       0.299261
2       0.286861
0       0.174175
3       0.138679
4       0.076370
5       0.020292
6       0.004362
dtype: float64

- 예측모델 진행

In [17]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
# NOTE(review): no `stratify=` is passed although the rarest grade is ~0.4% of rows
# (see the ratio table above) — consider stratifying on y; confirm before changing,
# since it alters the reported metrics.
xtr, xval, ytr, yval = train_test_split(train_df_scaled, y, test_size = 0.2, random_state=123)

In [21]:
# Balance the grade classes by random oversampling. Only the training split is
# resampled; xval/yval are left as they came out of the split.
oversampler = RandomOverSampler(random_state=123)
X_resampled, y_resampled = oversampler.fit_resample(xtr, ytr) # oversampling is applied to the training data only

In [27]:
X_resampled

array([[ 0.52173913,  0.        , -0.5       , ...,  0.29391185,
         0.        ,  0.        ],
       [-0.60869565,  0.        , -0.5       , ..., -0.40044077,
         0.        ,  0.        ],
       [ 0.08695652,  0.        ,  0.5       , ..., -0.36415978,
         0.        ,  0.        ],
       ...,
       [ 1.2826087 ,  1.        ,  0.5       , ...,  4.36286501,
         0.        ,  0.        ],
       [ 1.82608696,  1.        , -0.5       , ...,  7.69553719,
         0.        ,  0.        ],
       [-0.43478261,  0.        ,  0.5       , ..., -0.0946281 ,
         0.        ,  0.        ]])

In [32]:
#모델학습
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score


# Baseline: XGBoost with library defaults, trained on the oversampled training split
model = xgb.XGBClassifier().fit(X_resampled, y_resampled)
yval_pred = model.predict(xval)

# Classification metrics on the validation split
val_accuracy = accuracy_score(yval, yval_pred)
val_f1_weighted = f1_score(yval, yval_pred, average = "weighted")
val_confusion = confusion_matrix(yval, yval_pred)

print(f'정확도 : {val_accuracy}')
print(f'f1_score : {val_f1_weighted}')
print(f'혼동행렬 : {val_confusion}')
print(classification_report(yval, yval_pred))

정확도 : 0.8407497793239525
f1_score : 0.8411463780035248
혼동행렬 : [[3058  234   16    3    2    1    0]
 [ 501 4705  364   36   30    9    0]
 [ 102  353 4737  298   69   11    0]
 [  24   49  166 2224  305   23    2]
 [   5   15   20  213 1130   73    4]
 [   0    1    8   13   57  292   16]
 [   0    0    0    7    9   28   46]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      3314
           1       0.88      0.83      0.86      5645
           2       0.89      0.85      0.87      5570
           3       0.80      0.80      0.80      2793
           4       0.71      0.77      0.74      1460
           5       0.67      0.75      0.71       387
           6       0.68      0.51      0.58        90

    accuracy                           0.84     19259
   macro avg       0.78      0.78      0.77     19259
weighted avg       0.84      0.84      0.84     19259



In [33]:
# Hyper-parameter search with GridSearchCV

from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline

# Oversampling happens inside the pipeline, so each CV fold resamples only its
# own training part. Searching on the already-oversampled X_resampled (as before)
# puts copies of the same row in both the fit and the scoring part of a fold,
# which inflates best_score_.
search_pipeline = ImbPipeline([
    ('oversample', RandomOverSampler(random_state=123)),
    ('clf', xgb.XGBClassifier()),
])

# Same grid as before; the 'clf__' prefix routes each parameter to the XGB step
param_grid = {
    'clf__learning_rate': [0.01, 0.1, 0.2],
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__min_child_weight': [1, 3, 5]
}

grid_search = GridSearchCV(search_pipeline, param_grid, scoring='f1_weighted', cv=5)
# Fit on the un-resampled training split; flatten the target to 1-D
grid_search.fit(xtr, ytr.to_numpy().ravel())

# best_model is the refitted pipeline; the tuned XGB estimator itself is
# best_model.named_steps['clf']
best_model = grid_search.best_estimator_
yval_pred = best_model.predict(xval)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best F1 Score: {grid_search.best_score_}')
print(f'Accuracy: {accuracy_score(yval, yval_pred)}')
print(f'F1 Score: {f1_score(yval, yval_pred, average="weighted")}')
print(f'Confusion Matrix: {confusion_matrix(yval, yval_pred)}')

In [34]:
# Rank the input columns by the importance scores of `model`, largest first
feature_names = train_df.columns
importances = model.feature_importances_
feature_importance = pd.DataFrame(
    dict(Feature=feature_names, Importance=importances)
).sort_values(by='Importance', ascending=False)

print(feature_importance)

         Feature  Importance
1           대출기간    0.461265
8          총상환이자    0.178799
7          총상환원금    0.151801
0           대출금액    0.040791
3           연간소득    0.038416
5           대출목적    0.031028
6   최근_2년간_연체_횟수    0.027499
2         주택소유상태    0.023782
4           총계좌수    0.021829
10         연체계좌수    0.013415
9          총연체금액    0.011374


In [43]:
# Predict grades for the scaled test set and turn the integer codes back into letters.
# NOTE(review): this uses `model`, not the `best_model` produced by the grid-search
# cell — confirm that is intended.
# `encoder` must currently be fitted on the grade labels for inverse_transform
# to return letters.
y_pred = model.predict(test_df_scaled)
prediction = encoder.inverse_transform(y_pred)
prediction

array(['B', 'A', 'A', ..., 'D', 'C', 'A'], dtype=object)

In [44]:
df_pred = pd.DataFrame(prediction, columns = ['대출등급'])

In [45]:
df_pred

Unnamed: 0,대출등급
0,B
1,A
2,A
3,C
4,C
...,...
64192,D
64193,D
64194,D
64195,C


In [52]:
submission = pd.concat([id, df_pred], axis=1)

### [Private 1위] Stacking(ET+XGB+DT+RF)

In [5]:
import numpy as np
import random
import os

def seed_everything(seed: int = 42):
    """Seed Python's `random`, NumPy's global RNG, and set PYTHONHASHSEED.

    Parameters
    ----------
    seed : int, default 42
        Value passed to both seeders and written (as text) to the
        PYTHONHASHSEED environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is only written to os.environ here; whether it
    # affects the already-running interpreter should be confirmed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
seed_everything()

from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import RFECV

import warnings
warnings.filterwarnings('ignore')

시각화는 다른 분들이 충분히 올려주셨기에 생략하고,
성능 개선에 도움을 주었던 4가지 요소들을 중심으로 설명하도록 하겠습니다.

1. 파생변수 생성
- EDA를 통해 '총상환원금', '총상환이자'가 주요한 영향을 미치는 것을 파악했고, 두 변수의 결합('총상환원금'/'총상환이자')으로 파생변수를 생성했습니다.
- 신용점수 평가 기준을 찾아보니 상환능력이 중요한 것 같더군요. 따라서, '총상환원금'/'대출금액'으로 '상환비율' 변수를 생성했습니다.
- 두 변수 생성 이후, 0.83에서 0.951로 점수가 가장 큰 폭으로 상승했습니다.

2. 변수 제거 (RFECV, Feature Importance)
- Feature Importance를 찍어보니 값이 0에 가까운 변수들이 몇 개 있어서 아예 제거하기로 했습니다.
- 변수 선택법인 RFECV를 활용, 변수를 3개로 줄였습니다. ('대출기간', '총상환원금/총상환이자', '상환비율')
- 이 방법으로 0.02 정도 점수가 오른 것 같습니다.

3. 하이퍼파라미터 튜닝 (Optuna)
- 튜닝 전, 성능이 괜찮게 나왔던 ET, RF, DT, XGB 총 4가지 모델들에 대해 튜닝을 진행했습니다.
- XGB의 경우 tree_method에 따라 점수가 많이 달라지는 것을 확인해 튜닝 작업에 추가했습니다.
- 튜닝 이후, 0.951에서 0.953으로 점수가 상승했습니다.

4. 앙상블 모델 (Stacking)
- Voting과 Stacking, 그리고 여러 모델끼리 조합을 해보면서 가장 성능이 좋은 조합을 찾아갔습니다.
- Stacking(ET+XGB+DT+RF)을 사용한 결과, 0.953에서 0.955로 점수가 증가해 최종 모델로 선정했습니다.

In [6]:
def _engineer_features(frame):
    """Return a copy of `frame` without 'ID', with the loan term in years and two ratio features."""
    out = frame.drop(columns=['ID'])

    # Keep only the digits of e.g. ' 36 months' and convert months to whole years.
    # regex=True is required: without it, newer pandas treats the pattern as a
    # literal string, nothing is removed and astype(int) fails.
    out['대출기간'] = out['대출기간'].str.replace(r'[^0-9]', '', regex=True).astype(int) // 12

    # Derived features. The +1 in the first denominator avoids dividing by zero
    # when no interest has been repaid yet.
    out['총상환원금/총상환이자'] = out['총상환원금'] / (out['총상환이자'] + 1)
    out['상환비율'] = out['총상환원금'] / out['대출금액']
    return out


def preprocess(train, test):
    """Apply the shared feature engineering to train and test and encode the target.

    Both frames lose the 'ID' column, get '대출기간' converted to an integer
    number of years, and gain '총상환원금/총상환이자' and '상환비율'. The target
    column '대출등급' of the training frame is label-encoded to integers.
    The input frames are not modified.

    Parameters
    ----------
    train : DataFrame containing 'ID', '대출기간', '대출등급', '총상환원금',
        '총상환이자' and '대출금액'.
    test : DataFrame with the same columns except '대출등급'.

    Returns
    -------
    tuple
        (processed train frame, processed test frame).
    """
    train = _engineer_features(train)
    test = _engineer_features(test)

    # Encode the target grades as integers (train only; test has no target)
    le = LabelEncoder()
    train['대출등급'] = le.fit_transform(train['대출등급'])

    return train, test

In [7]:
# Re-read the raw CSVs for this solution. By this point train_df/test_df have
# been modified in place by the first solution (ID and 대출등급 dropped, 대출기간
# already integer-coded), so passing them to preprocess fails on a
# top-to-bottom run.
train, test = preprocess(pd.read_csv("./data/train.csv"), pd.read_csv("./data/test.csv"))

In [9]:
train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,총상환원금/총상환이자,상환비율
0,12480000,3,6 years,RENT,72000000,18.90,15,부채 통합,0,0,0.0,0.0,0.0,2,0.000000,0.000000
1,14400000,5,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,1,1.596045,0.025943
2,12000000,3,5 years,MORTGAGE,96000000,8.60,14,부채 통합,0,928644,151944.0,0.0,0.0,0,6.111711,0.077387
3,14400000,3,8 years,MORTGAGE,132000000,15.09,15,부채 통합,0,325824,153108.0,0.0,0.0,2,2.128053,0.022627
4,18000000,5,Unknown,RENT,71736000,25.39,19,주요 구매,0,228540,148956.0,0.0,0.0,1,1.534268,0.012697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96289,14400000,3,10+ years,MORTGAGE,210000000,9.33,33,신용 카드,0,974580,492168.0,0.0,0.0,2,1.980173,0.067679
96290,28800000,5,10+ years,MORTGAGE,132000000,5.16,25,주택 개선,0,583728,855084.0,0.0,0.0,4,0.682655,0.020268
96291,14400000,3,1 year,MORTGAGE,84000000,11.24,22,신용 카드,0,1489128,241236.0,0.0,0.0,0,6.172884,0.103412
96292,15600000,3,5 years,MORTGAGE,66330000,17.30,21,부채 통합,2,1378368,818076.0,0.0,0.0,3,1.684888,0.088357


In [8]:
# Keep only the three selected features; the target is the encoded grade
X = train.loc[:, ['대출기간', '총상환원금/총상환이자', '상환비율']]
y = train.loc[:, '대출등급']
test = test.loc[:, ['대출기간', '총상환원금/총상환이자', '상환비율']]

In [11]:
X

Unnamed: 0,대출기간,총상환원금/총상환이자,상환비율
0,3,0.000000,0.000000
1,5,1.596045,0.025943
2,3,6.111711,0.077387
3,3,2.128053,0.022627
4,5,1.534268,0.012697
...,...,...,...
96289,3,1.980173,0.067679
96290,5,0.682655,0.020268
96291,3,6.172884,0.103412
96292,3,1.684888,0.088357


In [12]:
test

Unnamed: 0,대출기간,총상환원금/총상환이자,상환비율
0,3,2.692214,0.023494
1,3,0.000000,0.000000
2,3,6.340833,0.103413
3,3,2.374741,0.046460
4,3,2.033117,0.045292
...,...,...,...
64192,3,1.712817,0.043584
64193,5,0.771419,0.032020
64194,3,1.626093,0.021490
64195,3,2.225612,0.116279


In [13]:
# Base learners for the stacking ensemble, each with fixed hyper-parameters.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=305,
    criterion='gini',
    max_depth=62,
    min_samples_split=7,
    min_samples_leaf=1,
)
dt = DecisionTreeClassifier(
    random_state=42,
    criterion='entropy',
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
)
et = ExtraTreesClassifier(
    random_state=42,
    n_estimators=930,
    criterion='entropy',
    max_depth=65,
    min_samples_split=6,
    min_samples_leaf=1,
)
# NOTE(review): `xgb` is also the alias an earlier cell gives the xgboost module
# (`import xgboost as xgb`); after this line the name refers to this estimator.
xgb = XGBClassifier(
    random_state=42,
    n_estimators=665,
    reg_lambda=0.04614513317156364,
    reg_alpha=0.8831857977740336,
    tree_method="exact",
    colsample_bytree=0.7664006730032823,
    subsample=0.6579847353498132,
    learning_rate=0.4046062291148477,
    max_depth=64,
    min_child_weight=2,
)

In [14]:
# Standardise the selected features. The scaler is fitted on the training
# features only and the same fitted scaler is then applied to the test features.
# Both names are rebound to the scaler's output (shown as arrays in the cells below).
scale = StandardScaler()
X = scale.fit_transform(X)
test = scale.transform(test)

In [15]:
X

array([[-0.70243609, -0.00887973, -0.99974863],
       [ 1.42361706, -0.00885038, -0.45762403],
       [-0.70243609, -0.00876735,  0.61741996],
       ...,
       [-0.70243609, -0.00876623,  1.1612616 ],
       [-0.70243609, -0.00884875,  0.8466602 ],
       [-0.70243609, -0.00883986,  0.44212644]])

In [16]:
# Stack the four tuned base learners under a logistic-regression final estimator,
# fit on the full training matrix and predict the test matrix.
estimators = list(zip(('et', 'xgb', 'dt', 'rf'), (et, xgb, dt, rf)))
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    verbose=1,
)
stack.fit(X, y)
pred = stack.predict(test)

In [20]:
test

array([[-0.70243609, -0.00883023, -0.50879969],
       [-0.70243609, -0.00887973, -0.99974863],
       [-0.70243609, -0.00876314,  1.16129353],
       ...,
       [-0.70243609, -0.00884983, -0.55066454],
       [-0.70243609, -0.00883881,  1.43015582],
       [-0.70243609, -0.00876373,  0.07603721]])

In [18]:
# Fill the sample submission with the predicted grades, decoded from 0..6 to 'A'..'G'
grade_by_code = dict(enumerate('ABCDEFG'))
sub = pd.read_csv('./data/sample_submission.csv')
sub['대출등급'] = pd.Series(pred, index=sub.index).map(grade_by_code)
sub.head()

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C


### Private 17위, 결정트리 모델

- 색다른 전개방식

In [21]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [22]:
#전처리

# Normalise inconsistent 근로기간 spellings in one vectorised pass (whole-value
# replacement) instead of a per-row Python loop with chained indexing.
tenure_fixes = {
    '< 1 year': '<1 year',
    '3': '3 years',
    '10+years': '10+ years',
    '1 years': '1 year',
}
train['근로기간'] = train['근로기간'].replace(tenure_fixes)

# Remove rows whose 주택소유상태 is 'ANY'. Filtering by value rather than by the
# hard-coded row label 28730 keeps this correct if the row order changes.
train = train[train['주택소유상태'] != 'ANY'].reset_index(drop=True)

# Remember the original column layout without the last column
# NOTE(review): assumes the target '대출등급' is the last column of the CSV — confirm.
train_columns_original = train.columns[:-1]

# Derived features: repaid principal and repaid interest relative to the loan amount
train['원금/대출'] = train['총상환원금']/train['대출금액']
train['이자/대출'] = train['총상환이자']/train['대출금액']

# Convert the categorical target to integers
credit_score_map={'A':0,'B':1,'C':2,'D':3,'E':4,'F':5,'G':6}
train['대출등급']=train['대출등급'].map(credit_score_map)

# Separate the target and drop columns that are not features
target = train['대출등급']
train=train.drop(['ID','대출등급',],axis=1)

In [23]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot output. Newer scikit-learn releases call the keyword
# `sparse_output` and no longer accept `sparse`; older releases only know
# `sparse`. Try the new spelling first and fall back so the cell runs on both.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Training data: fit the encoder on the four categorical columns
train_encoded = onehot_encoder.fit_transform(train[['근로기간','대출기간','주택소유상태','대출목적']])

In [24]:
train_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [25]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns (including the two derived ratios).
# NOTE(review): `scaler` is the same name the first solution used for its
# RobustScaler; from here on it refers to this StandardScaler.
scaler = StandardScaler()

# The same column list is repeated inside test_processing; keep the two in sync.
numeric_columns = ['대출금액', '연간소득', '부채_대비_소득_비율', '총계좌수', '최근_2년간_연체_횟수', '총상환원금', '총상환이자', '연체계좌수','총연체금액','원금/대출','이자/대출']
train_numeric_scaled = scaler.fit_transform(train[numeric_columns])

In [26]:
train_numeric_scaled

array([[-0.5638364 , -0.22021706, -0.01428697, ..., -0.03843818,
        -0.99975901, -1.45132485],
       [-0.37794863,  0.37032992,  0.08788893, ..., -0.03843818,
        -0.45763441, -0.40596012],
       [-0.61030834,  0.02082253, -0.32111257, ..., -0.03843818,
         0.61740957, -0.63698546],
       ...,
       [-0.37794863, -0.09969727, -0.24246989, ..., -0.03843818,
         1.16125121, -0.37391048],
       [-0.26176877, -0.27716266, -0.0619492 , ..., -0.03843818,
         0.84664982,  1.92133225],
       [-0.93561193, -0.43715268, -0.22578811, ..., -0.03843818,
         0.44211606,  0.59536763]])

In [27]:
#오케이 그냥 합치는구나
train_combined = np.hstack((train_numeric_scaled, train_encoded))
train_combined

array([[-0.5638364 , -0.22021706, -0.01428697, ...,  0.        ,
         0.        ,  0.        ],
       [-0.37794863,  0.37032992,  0.08788893, ...,  0.        ,
         1.        ,  0.        ],
       [-0.61030834,  0.02082253, -0.32111257, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.37794863, -0.09969727, -0.24246989, ...,  0.        ,
         0.        ,  0.        ],
       [-0.26176877, -0.27716266, -0.0619492 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.93561193, -0.43715268, -0.22578811, ...,  0.        ,
         0.        ,  0.        ]])

In [28]:
#머신러닝 파트

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# 5-fold shuffled cross-validation of the decision tree, scored with macro-averaged F1
k_fold=KFold(n_splits=5,random_state=0,shuffle=True)

# NOTE(review): the tree has no random_state, so repeated runs may differ slightly.
dtc=DecisionTreeClassifier(max_depth=16, min_samples_leaf= 3, min_samples_split= 7)
score=cross_val_score(dtc,train_combined,target,cv=k_fold,n_jobs=1,scoring='f1_macro')
# The scorer is f1_macro, so label the number accordingly (it was printed as "Accuracy")
print(f"\nMean macro-F1: {np.mean(score)}")


Mean Accuracy: 0.9433677790893175


In [29]:
dtc.fit(train_combined,target)

In [30]:
#여긴 신기한 게 train 다 끝나고 test를 수정함, onehot을 df 합치지 않고도 할 수 있다는 걸 알게 됨
#np.array_equal로 합치기가 안되는 위치를 잡아냄

def test_processing(test_df):
    # 트레인셋에서 대출등급을 제외한 컬럼과 테스트셋이 같은지 확인
    if np.array_equal(train_columns_original,test_df.columns) != True:
        print("컬럼이 맞지 않습니다.")
        return test_df.head(3)

    # 근로기간
    else:
        for i in range(len(test_df['근로기간'])):
            if test_df['근로기간'][i] == '< 1 year':
                test_df.loc[i,'근로기간'] = '<1 year'
            elif test_df['근로기간'][i] == '3':
                test_df.loc[i,'근로기간'] = '3 years'
            elif test_df['근로기간'][i] == '10+years':
                test_df.loc[i,'근로기간'] = '10+ years'
            elif test_df['근로기간'][i] == '1 years':
                test_df.loc[i,'근로기간'] = '1 year'

    # 대출목적 특이케이스 수정
    purpose_list = train['대출목적'].unique()
    for i in range(len(test_df['대출목적'])):
        if test_df['대출목적'][i] not in purpose_list:
            test_df.loc[i,'대출목적']='기타'

    # 트레인셋의 범주형 데이터와 테스트셋의 범주형 데이터가 동일한지 확인
    if np.array_equal(np.sort(train['근로기간'].unique()),np.sort((test_df['근로기간']).unique())) != True:
        print("근로기간 범주가 맞지 않습니다.")
        return test_df.head(3)

    if np.array_equal(np.sort(train['주택소유상태'].unique()),np.sort(test_df['주택소유상태'].unique())) != True:
        print("주택소유상태 범주가 맞지 않습니다.")
        return test_df.head(3)

    if np.array_equal(np.sort(train['대출목적'].unique()),np.sort(test_df['대출목적'].unique())) != True:
        print("대출목적 범주가 맞지 않습니다.")
        return test_df.head(3)
    
    else:
        test_tmp = test_df.copy()
        
        test_tmp['원금/대출'] = test_tmp['총상환원금']/test_tmp['대출금액']
        test_tmp['이자/대출'] = test_tmp['총상환이자']/test_tmp['대출금액']

        test_tmp = test_tmp.drop(['ID'],axis=1)

        # 테스트 데이터 인코딩 #앞에 요소를 다 맞춰놔서 test만 인코딩 transform 해도 오류가 뜨지 않음!
        test_encoded = onehot_encoder.transform(test_tmp[['근로기간','대출기간','주택소유상태','대출목적']])

        numeric_columns = ['대출금액', '연간소득', '부채_대비_소득_비율', '총계좌수', '최근_2년간_연체_횟수', '총상환원금', '총상환이자', '연체계좌수','총연체금액','원금/대출','이자/대출']
        test_numeric_scaled = scaler.transform(test_tmp[numeric_columns])

        test_combined = np.hstack((test_numeric_scaled, test_encoded)) #행끼리 옆으로 붙이기기

        return test_combined

In [31]:
# Build the test feature matrix and predict integer grade codes with the fitted tree
test_combined = test_processing(test)
predictions = dtc.predict(test_combined)

# Reverse mapping from integer code back to grade letter
credit_score_map_rev = dict(zip(range(7), 'ABCDEFG'))

# Assemble the submission table, then decode the codes into letters
submission = pd.DataFrame({"ID": test['ID'], '대출등급': predictions})
submission['대출등급'] = submission['대출등급'].map(credit_score_map_rev)

In [32]:
submission

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C


### 또다른 방식