# K-Fold 교차 검증 활용

In [114]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from matplotlib import rc 
# Plot configuration.
# Keep an ASCII hyphen for negative tick labels instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
# Font family for all figure text (presumably chosen so Korean labels render; confirm the font is installed).
plt.rcParams['font.family'] = 'NanumMyeongjo'

# Load the training set, the test set and the submission template, in that order.
train, test, submission = map(
    pd.read_csv,
    (
        'card_dataset/train.csv',
        'card_dataset/test.csv',
        'card_dataset/sample_submission.csv',
    ),
)

In [115]:
# Store the flag columns as object dtype so that pd.get_dummies later treats
# them as categories rather than numbers, then work on a copy of the frame.
train_flag_columns = ['work_phone', 'phone', 'email', 'FLAG_MOBIL']
train[train_flag_columns] = train[train_flag_columns].astype('object')
new_train = train.copy()

## 결측치 처리 

In [116]:
## 1. Pensioners: set occyp_type to 'Retired' and DAYS_EMPLOYED to 0

train_pensioner_rows = new_train['income_type'] == 'Pensioner'
new_train.loc[train_pensioner_rows, 'occyp_type'] = 'Retired'
new_train.loc[train_pensioner_rows, 'DAYS_EMPLOYED'] = 0

## 2. Missing occyp_type with income_type 'State servant': fill with 'State servant'

train_state_servant_rows = new_train['occyp_type'].isna() & (new_train['income_type'] == 'State servant')
new_train.loc[train_state_servant_rows, 'occyp_type'] = 'State servant'

## 3. Missing occyp_type with income_type in ('Working', 'Commercial associate', 'Student'):
##    fill with 'Extra staff' (the missing mask is recomputed after rule 2)

train_extra_staff_income = ['Working', 'Commercial associate', 'Student']
train_extra_staff_rows = new_train['occyp_type'].isna() & new_train['income_type'].isin(train_extra_staff_income)
new_train.loc[train_extra_staff_rows, 'occyp_type'] = 'Extra staff'

In [117]:
# Same object-dtype cast as applied to the training flags, so both frames are encoded alike.
test_flag_columns = ['work_phone', 'phone', 'email', 'FLAG_MOBIL']
test[test_flag_columns] = test[test_flag_columns].astype('object')

In [118]:
# Apply the same three missing-value rules to a copy of the test frame.
new_test = test.copy()

## 1. Pensioners: set occyp_type to 'Retired' and DAYS_EMPLOYED to 0

test_pensioner_rows = new_test['income_type'].eq('Pensioner')
new_test.loc[test_pensioner_rows, 'occyp_type'] = 'Retired'
new_test.loc[test_pensioner_rows, 'DAYS_EMPLOYED'] = 0

## 2. Missing occyp_type with income_type 'State servant': fill with 'State servant'

test_state_servant_rows = new_test['occyp_type'].isna() & new_test['income_type'].eq('State servant')
new_test.loc[test_state_servant_rows, 'occyp_type'] = 'State servant'

## 3. Missing occyp_type with income_type in ('Working', 'Commercial associate', 'Student'):
##    fill with 'Extra staff' (the missing mask is recomputed after rule 2)

test_extra_staff_rows = (
    new_test['occyp_type'].isna()
    & new_test['income_type'].isin(['Working', 'Commercial associate', 'Student'])
)
new_test.loc[test_extra_staff_rows, 'occyp_type'] = 'Extra staff'

## 학습 데이터 전처리 

In [119]:
# 1. Remove the FLAG_MOBIL column from both the training and the test frame
m_train, m_test = (
    frame.drop(columns='FLAG_MOBIL') for frame in (new_train, new_test)
)

In [120]:
# 2. Separate the 'credit' target from the training features

m_train_x = m_train.drop(columns='credit')
target_credit = m_train['credit']

In [121]:
# 3. Dummy variables
# Encode train and test together so both frames end up with exactly the same
# dummy columns in the same order. Encoding them separately can produce
# different column sets (a category present in only one split) and can make
# drop_first discard a different level in each frame, which misaligns the
# features passed to the model at prediction time.
combined_x = pd.concat([m_train_x, m_test], keys=['train', 'test'])
combined_dummies = pd.get_dummies(combined_x, drop_first = True)

# Split back by the outer key; each frame keeps its original row index.
dum_train_x = combined_dummies.loc['train']
dum_test_x = combined_dummies.loc['test']

In [124]:
# Convert the target labels to integers and display the resulting Series.
target_credit = target_credit.astype(int)
target_credit

0        1
1        1
2        2
3        0
4        2
        ..
26452    1
26453    2
26454    2
26455    2
26456    2
Name: credit, Length: 26457, dtype: int64

## 모델 학습 

In [125]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [126]:
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

In [127]:
# Stratified 75/25 hold-out split of the encoded training data.
# Note: the K-fold loop further down rebinds X_train / X_val / y_train / y_val
# on every fold, so the values produced here are replaced before any model is fit.
X_train, X_val, y_train, y_val = train_test_split(
    dum_train_x,
    target_credit,
    stratify = target_credit,
    test_size = 0.25,
    random_state = 503,
)

In [128]:
# 5-fold stratified splitter; rows are shuffled with a fixed seed so the folds are reproducible.
folds = StratifiedKFold(5, shuffle = True, random_state = 503)

In [129]:
# Display the integer-cast target labels again before running cross-validation.
target_credit

0        1
1        1
2        2
3        0
4        2
        ..
26452    1
26453    2
26454    2
26455    2
26456    2
Name: credit, Length: 26457, dtype: int64

In [130]:
# Stratified K-fold training.
# For every fold: fit a random forest on the training part, score it on the
# validation part with multiclass log loss, and accumulate its class
# probabilities for the test set. After the loop, `sub` holds the average of
# the per-fold test predictions and `output_logloss` the per-fold scores.
output_logloss = []
n_splits = folds.get_n_splits()          # derive the divisor from the splitter instead of hard-coding 5
n_classes = target_credit.nunique()      # one probability column per target class
sub = np.zeros((dum_test_x.shape[0], n_classes))

for n_fold, (train_index, val_index) in enumerate(folds.split(dum_train_x, target_credit)):

    X_train, X_val = dum_train_x.iloc[train_index], dum_train_x.iloc[val_index]
    y_train, y_val = target_credit.iloc[train_index], target_credit.iloc[val_index]

    clf = RandomForestClassifier(n_estimators = 100,
                                max_depth = 12,
                                min_samples_leaf = 8,
                                min_samples_split = 20,
                                random_state = 0,
                                n_jobs = -1)

    clf.fit(X_train, y_train)
    prediction = clf.predict_proba(X_val)

    # Passing the label order explicitly ties the probability columns to
    # clf.classes_ without one-hot encoding y_val.
    logloss = log_loss(y_val, prediction, labels = clf.classes_)
    # Store the computed score (the original appended the log_loss function itself).
    output_logloss.append(logloss)

    sub += clf.predict_proba(dum_test_x)


# Average the accumulated test probabilities over the folds.
sub = sub / n_splits

print(f'fold log loss: {output_logloss}')
print(f'mean CV log loss: {np.mean(output_logloss):.5f}')

In [133]:
# Overwrite every column after the first with the fold-averaged class probabilities.
# Assumes the first column is the row identifier and the remaining columns follow the classifier's class order (TODO confirm).
submission.iloc[:,1:] = sub

In [136]:
# Write the submission file without the DataFrame index column.
# to_csv does not create missing directories, so 'submission/' must already exist.
submission.to_csv('submission/5fold_prediction.csv',index =False)