# 문제 정의
- 신용카드 신청자의 채무 불이행을 예측하시오.
  - 제공된 데이터 목록: creditcard_train.csv, creditcard_test.csv
  - 예측할 컬럼: STATUS(0: 채무 이행, 1: 채무 불이행)
- 학습용 데이터(train)를 이용해 신용카드 신청자의 데이터를 바탕으로 미래의 채무 불이행을 예측하는 모델을 만든 후 이를 평가용 데이터(test)에 적용해 얻은 예측값을 다음과 같은 형식의 CSV 파일로 생성하시오.

제출 파일은 다음 1개의 컬럼을 포함해야 한다.
- pred: 예측값
- 제출 파일명: 'result.csv'

제출한 모델의 성능은 f1 평가지표에 따라 채점한다.

In [1]:
# Upload the train/test CSV files into the Colab runtime.
# This is an interactive step; the cell output below shows each selected file
# being saved under its original name, which the later pd.read_csv calls rely on.
from google.colab import files
uploads = files.upload()

Saving creditcard_test.csv to creditcard_test.csv
Saving creditcard_train.csv to creditcard_train.csv


In [47]:
# Load the training and evaluation datasets from the working directory.
import pandas as pd

# Read both CSV files with the same reader and unpack them in order.
train, test = map(pd.read_csv, ('creditcard_train.csv', 'creditcard_test.csv'))

In [48]:
# Exploratory data analysis: print a quick overview of both datasets.
print('===== 데이터 크기 =====')
print('train:', train.shape, 'test:', test.shape)

# Each entry pairs a section title with a zero-argument callable that builds
# the report. Using callables keeps evaluation lazy, so every report is
# computed right after its title is printed.
eda_sections = [
    # info() prints its own report and returns None, so "None" is printed too.
    ('\n===== Data type =====', train.info),
    ('\n===== Object =====', lambda: train.describe(include='O')),
    ('\n===== int/float =====', train.describe),
    ('\n===== 결측치(train) =====', lambda: train.isnull().sum()),
    ('\n===== 결측치(test) =====', lambda: test.isnull().sum()),
    ('\n===== target info =====', lambda: train['STATUS'].value_counts()),
]

for title, report in eda_sections:
    print(title)
    print(report())

===== 데이터 크기 =====
train: (25519, 19) test: (7591, 18)

===== Data type =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25519 entries, 0 to 25518
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   25519 non-null  int64  
 1   CODE_GENDER          25519 non-null  object 
 2   FLAG_OWN_CAR         25519 non-null  object 
 3   FLAG_OWN_REALTY      25519 non-null  object 
 4   CNT_CHILDREN         25519 non-null  int64  
 5   AMT_INCOME_TOTAL     25519 non-null  float64
 6   NAME_INCOME_TYPE     25519 non-null  object 
 7   NAME_EDUCATION_TYPE  25519 non-null  object 
 8   NAME_FAMILY_STATUS   25519 non-null  object 
 9   NAME_HOUSING_TYPE    25519 non-null  object 
 10  DAYS_BIRTH           25519 non-null  int64  
 11  DAYS_EMPLOYED        25519 non-null  int64  
 12  FLAG_MOBIL           25519 non-null  int64  
 13  FLAG_WORK_PHONE      25519 non-null  int64  
 14  FLAG_PHO

In [49]:
# Impute missing occupations with an explicit placeholder category.
# The same fill is applied to train AND test so both frames go through
# identical preprocessing: a NaN left in test would otherwise reach the label
# encoder as a value it never saw during fit. For a frame without missing
# values this is a no-op.
for frame in (train, test):
    frame['OCCUPATION_TYPE'] = frame['OCCUPATION_TYPE'].fillna('Missing_value')

# Display the remaining null counts of the training frame as the cell result.
train.isnull().sum()

Unnamed: 0,0
ID,0
CODE_GENDER,0
FLAG_OWN_CAR,0
FLAG_OWN_REALTY,0
CNT_CHILDREN,0
AMT_INCOME_TOTAL,0
NAME_INCOME_TYPE,0
NAME_EDUCATION_TYPE,0
NAME_FAMILY_STATUS,0
NAME_HOUSING_TYPE,0


In [50]:
# Label-encode every object-typed column of the training frame and apply the
# same mapping to the matching column of the test frame.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = train.select_dtypes('object').columns

for col in cols:
    # Re-fitting the shared encoder replaces the previous column's classes,
    # so each column gets its own mapping learned from train only.
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [51]:
# Verify the encoding: the former object columns should now be integer-typed.
# Each info() call prints its own report and returns None, which is why the
# cell value shown below the reports is (None, None).
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25519 entries, 0 to 25518
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   25519 non-null  int64  
 1   CODE_GENDER          25519 non-null  int64  
 2   FLAG_OWN_CAR         25519 non-null  int64  
 3   FLAG_OWN_REALTY      25519 non-null  int64  
 4   CNT_CHILDREN         25519 non-null  int64  
 5   AMT_INCOME_TOTAL     25519 non-null  float64
 6   NAME_INCOME_TYPE     25519 non-null  int64  
 7   NAME_EDUCATION_TYPE  25519 non-null  int64  
 8   NAME_FAMILY_STATUS   25519 non-null  int64  
 9   NAME_HOUSING_TYPE    25519 non-null  int64  
 10  DAYS_BIRTH           25519 non-null  int64  
 11  DAYS_EMPLOYED        25519 non-null  int64  
 12  FLAG_MOBIL           25519 non-null  int64  
 13  FLAG_WORK_PHONE      25519 non-null  int64  
 14  FLAG_PHONE           25519 non-null  int64  
 15  FLAG_EMAIL           25519 non-null 

(None, None)

In [52]:
# Hold out 20% of the training data as a validation set.
from sklearn.model_selection import train_test_split

# pop() removes STATUS from `train` in place, leaving only feature columns.
# NOTE(review): `train` still contains the ID column here, so it is passed to
# the model as a feature -- confirm that this is intended.
target = train.pop('STATUS')

# NOTE(review): the split is not stratified; if STATUS is imbalanced,
# `stratify=target` may give a more representative validation set -- verify.
split_options = {'test_size': 0.2, 'random_state': 0}
X_train, X_val, y_train, y_val = train_test_split(train, target, **split_options)

In [53]:
# Sanity-check the split by printing the shape of each piece, one per line.
datas = (X_train, X_val, y_train, y_val)
print(*(part.shape for part in datas), sep='\n')

(20415, 18)
(5104, 18)
(20415,)
(5104,)


In [54]:
# Baseline model: a random forest with default hyper-parameters.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict the hold-out set and display the predictions as the cell result.
pred = rf.predict(X_val)
pred

array([0, 0, 0, ..., 0, 0, 0])

In [55]:
# Score the validation predictions with F1, the metric named in the task.
from sklearn.metrics import f1_score

f1 = f1_score(y_true=y_val, y_pred=pred)
f1

0.2742857142857143

# 성능 개선

In [58]:
# depths = [3,5,7]
# n_estimators = [200,300,400,500]
# best_f1 = 0
# best_depth = None
# best_n_estimator = None

# for n_estimator in n_estimators:
#     rf = RandomForestClassifier(random_state=0, n_estimators = n_estimator)
#     rf.fit(X_train, y_train)
#     pred = rf.predict(X_val)
#     f1 = f1_score(y_val, pred)
#     print(f1, 'n_estimator:', n_estimator)
#     if f1 > best_f1:
#       best_f1 = f1
#       best_n_estimator = n_estimator

# print('best_n_estimator:', best_n_estimator)
# 성능개선 실패

0.2727272727272727 n_estimator: 200
0.2727272727272727 n_estimator: 300
0.2727272727272727 n_estimator: 400
0.2727272727272727 n_estimator: 500
best_n_estimator: 200


In [61]:
# # 불균형 데이터 조정 (결과:성능저하)

# rf = RandomForestClassifier(random_state=0, class_weight = 'balanced')

# rf.fit(X_train, y_train)

# pred = rf.predict(X_val)

# f1 = f1_score(y_val, pred)

# f1

0.26436781609195403

In [63]:
# lightgbm (결과: 더 낮은 성능)
# import lightgbm as lgb
# lgbmc = lgb.LGBMClassifier(random_state=0, verbose = -1)
# lgbmc.fit(X_train, y_train)

# pred = lgbmc.predict(X_val)

# f1 = f1_score(y_val, pred)
# f1

0.20155038759689922

In [64]:
# Final model: the default random forest, which scored best among the
# variants tried in the cells above.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Re-score on the validation set and display the F1 value as the cell result.
pred = rf.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=pred)
f1

0.2742857142857143

In [65]:
# Predict the evaluation data and write the submission file.
pred = rf.predict(test)

# The submission must hold a single column named 'pred'; the index is omitted.
submit = pd.DataFrame(data=pred, columns=['pred'])
submit.to_csv('result.csv', index=False)

In [66]:
# Read the submission back to confirm it contains the single 'pred' column.
pd.read_csv('result.csv')

Unnamed: 0,pred
0,0
1,0
2,0
3,0
4,0
...,...
7586,0
7587,0
7588,0
7589,0
