# 문제 정의
- 은행 정보로 신용 등급을 예측하시오.
 - 제공된 데이터 목록: score_train.csv, score_test.csv
 - 예측할 컬럼: Credit_Score(Good, Standard, Low)
- 학습용 데이터(train)를 이용해 신용 등급을 예측하는 모델을 만든 후 이를 평가용 데이터(test)에 적용해 얻은 예측값을 다음과 같은 형식의 CSV 파일로 생성하시오.

제출 파일은 다음 1개의 컬럼을 포함해야 한다.
 - pred: 예측값
 - 제출 파일명: 'result.csv'
제출한 모델의 성능은 f1-macro 평가지표에 따라 채점한다.

In [1]:
# 데이터 업로드
from google.colab import files
upload = files.upload()

Saving score_train.csv to score_train.csv
Saving score_test.csv to score_test.csv


In [2]:
# 데이터 불러오기
import pandas as pd
train = pd.read_csv('score_train.csv')
test = pd.read_csv('score_test.csv')

In [3]:
# 탐색적 데이터 분석(EDA)
print('===== Data Size =====')
print('train:', train.shape, 'test:', test.shape)

print('\n===== dtype =====')
print(train.info())

print('\n==== Object =====')
print(train.describe(include = 'O'))

print('\n===== int/float =====')
print(train.describe())

print('\n===== Missing Value =====')
print('train:', train.isnull().sum())
print('test:', test.isnull().sum())

print('\n===== Target =====')
print(train['Credit_Score'].value_counts())

===== Data Size =====
train: (4198, 21) test: (1499, 20)

===== dtype =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4198 entries, 0 to 4197
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Delay_from_due_date       4198 non-null   float64
 1   Num_of_Delayed_Payment    4198 non-null   float64
 2   Num_Credit_Inquiries      4198 non-null   float64
 3   Credit_Utilization_Ratio  4198 non-null   float64
 4   Credit_History_Age        4198 non-null   float64
 5   Payment_of_Min_Amount     4198 non-null   object 
 6   Amount_invested_monthly   4198 non-null   float64
 7   Monthly_Balance           4198 non-null   float64
 8   Credit_Mix                4198 non-null   object 
 9   Payment_Behaviour         4198 non-null   object 
 10  Age                       4198 non-null   float64
 11  Annual_Income             4198 non-null   float64
 12  Num_Bank_Accounts         4198 non-null   

In [4]:
# target 데이터 추출
target = train.pop('Credit_Score')

In [5]:
# 데이터 전처리(결측치, 이상치 X)
# Label Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

cols = train.select_dtypes('object').columns

for col in cols:
  train[col] = le.fit_transform(train[col])
  test[col] = le.transform(test[col])

# 변환 확인
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4198 entries, 0 to 4197
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Delay_from_due_date       4198 non-null   float64
 1   Num_of_Delayed_Payment    4198 non-null   float64
 2   Num_Credit_Inquiries      4198 non-null   float64
 3   Credit_Utilization_Ratio  4198 non-null   float64
 4   Credit_History_Age        4198 non-null   float64
 5   Payment_of_Min_Amount     4198 non-null   int64  
 6   Amount_invested_monthly   4198 non-null   float64
 7   Monthly_Balance           4198 non-null   float64
 8   Credit_Mix                4198 non-null   int64  
 9   Payment_Behaviour         4198 non-null   int64  
 10  Age                       4198 non-null   float64
 11  Annual_Income             4198 non-null   float64
 12  Num_Bank_Accounts         4198 non-null   float64
 13  Num_Credit_Card           4198 non-null   float64
 14  Interest

(None, None)

In [6]:
# 데이터 추출
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size = 0.2,
    random_state = 0
)

datas = [X_train, X_val, y_train, y_val]
for data in datas:
  print(data.shape)

(3358, 20)
(840, 20)
(3358,)
(840,)


In [7]:
# 모델 학습 및 평가
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state= 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)

from sklearn.metrics import f1_score
f1 = f1_score(y_val, pred, average='macro')
f1

0.6853171856067161

# 성능 개선

In [8]:
depths = [3,5,7]
n_estimators = [200,300,400,500]
best_f1 = 0
best_depth = None
best_n_estimator = None


for depth in depths:
  for n_estimator in n_estimators:
    rf = RandomForestClassifier(random_state= 0, max_depth = depth, n_estimators = n_estimator)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_val)
    f1 = f1_score(y_val, pred, average= 'macro')
    print('depth', depth, 'n_estimator', n_estimator, 'f1_score:', f1)
    if f1> best_f1:
      best_f1 = f1
      best_depth = depth
      best_n_estimator = n_estimator
print('best combination:', best_depth, '&', best_n_estimator)

depth 3 n_estimator 200 f1_score: 0.6241776598571459
depth 3 n_estimator 300 f1_score: 0.6281402925798337
depth 3 n_estimator 400 f1_score: 0.6264860790477021
depth 3 n_estimator 500 f1_score: 0.6209862303651144
depth 5 n_estimator 200 f1_score: 0.6841649698453068
depth 5 n_estimator 300 f1_score: 0.678067735044034
depth 5 n_estimator 400 f1_score: 0.6785691190300666
depth 5 n_estimator 500 f1_score: 0.6796169877272313
depth 7 n_estimator 200 f1_score: 0.6935646419574878
depth 7 n_estimator 300 f1_score: 0.6936624311599533
depth 7 n_estimator 400 f1_score: 0.688433734939759
depth 7 n_estimator 500 f1_score: 0.6854751713414272
best combination: 7 & 300


In [10]:
# LIghtGBM(randomforest가 더 높은성능)
# import lightgbm as lgb
# lgbmc = lgb.LGBMClassifier(random_state=0,verbose=-1)
# lgbmc.fit(X_train, y_train)
# pred = lgbmc.predict(X_val)
# f1 = f1_score(y_val, pred, average = 'macro')
# f1

0.6805742291502206

In [12]:
# 최종 모델 선정
model = RandomForestClassifier(random_state= 0, max_depth = 7, n_estimators= 300)
model.fit(X_train, y_train)
pred = model.predict(X_val)

from sklearn.metrics import f1_score
f1 = f1_score(y_val, pred, average='macro')
f1

0.6936624311599533

In [13]:
# 파일 생성
submit = pd.DataFrame({'pred':pred})
submit.to_csv('result.csv', index=False)

In [14]:
pd.read_csv('result.csv')

Unnamed: 0,pred
0,Standard
1,Standard
2,Poor
3,Poor
4,Standard
...,...
835,Good
836,Standard
837,Standard
838,Standard
