# XGBoost + 5folds 모델 

In [8]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
train = pd.read_csv('card_dataset/train.csv')
test = pd.read_csv('card_dataset/test.csv')
submission =pd.read_csv('card_dataset/sample_submission.csv')

In [3]:
new_train = train.copy()
new_test = test.copy()

new_train[['work_phone','phone','email','FLAG_MOBIL']] = new_train[['work_phone','phone','email','FLAG_MOBIL']].astype('object')
new_test[['work_phone','phone','email','FLAG_MOBIL']] = new_test[['work_phone','phone','email','FLAG_MOBIL']].astype('object')

## 1. 결측치 처리 

In [4]:
## 1. income_type = 'Pensioner' 일때 occyp_type = 'Retired' and DAYS_EMPLOYED 
new_train.loc[new_train['income_type'] == 'Pensioner', 'occyp_type'] = 'Retired'
new_train.loc[new_train['income_type'] == 'Pensioner', 'DAYS_EMPLOYED'] = 0 

## 2. occyp_type = NaN & income_type = 'State servant'인 경우 occyp_type ='State servant' 로 변경
new_train.loc[(new_train.occyp_type.isna())  & (new_train.income_type == 'State servant'), 'occyp_type'] = 'State servant'

## 3. occyp_type = NaN & income_type  in ('working', 'Commercial associate', 'Student) 일 경우 occyp_type ='Extra staff'로 변경 
new_train.loc[(new_train.occyp_type.isna())  & (new_train.income_type.isin(['Working','Commercial associate','Student'])), 'occyp_type'] = 'Extra staff'


In [5]:
## 1. income_type = 'Pensioner' 일때 occyp_type = 'Retired' and DAYS_EMPLOYED 
new_test.loc[new_test['income_type'] == 'Pensioner', 'occyp_type'] = 'Retired'
new_test.loc[new_test['income_type'] == 'Pensioner', 'DAYS_EMPLOYED'] = 0 

## 2. occyp_type = NaN & income_type = 'State servant'인 경우 occyp_type ='State servant' 로 변경
new_test.loc[(new_test.occyp_type.isna())  & (new_test.income_type == 'State servant'), 'occyp_type'] = 'State servant' 

## 3. occyp_type = NaN & income_type  in ('working', 'Commercial associate', 'Student) 일 경우 occyp_type ='Extra staff'로 변경 
new_test.loc[(new_test.occyp_type.isna())  & (new_test.income_type.isin(['Working','Commercial associate','Student'])), 'occyp_type'] = 'Extra staff' 

## 2. 학습 데이터 처리 

In [6]:
# 1. FLAG_MOBIL 변수 삭제
m_train = new_train.drop('FLAG_MOBIL', axis = 1)
m_test = new_test.drop('FLAG_MOBIL', axis =1)


# 2. credit 변수 분할 
target_credit = m_train['credit']
target_credit = target_credit.astype('int')

m_train_x = m_train.drop('credit', axis = 1)


# 3. 더미변수 
dum_train_x = pd.get_dummies(m_train_x, drop_first = True)
dum_test_x = pd.get_dummies(m_test, drop_first = True)

## 3. 학습 모델링 
### 3-1. 기본 XGBoost 모델 

In [17]:
xgb = XGBClassifier()

In [18]:
train_x, val_x, train_y, val_y = train_test_split(dum_train_x, target_credit,
                                                 test_size = 0.25,
                                                 stratify = target_credit)

In [19]:
xgb.fit(train_x, train_y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [20]:
pred = xgb.predict_proba(val_x)

In [21]:
logloss = log_loss(val_y, pred)

In [22]:
logloss

0.7667897782498972

In [23]:
# 전체 데이터 학습 및 test 데이터로 예측 결과 

xgb.fit(dum_train_x, target_credit)
pred_target = xgb.predict_proba(dum_test_x)





In [31]:
submission.iloc[:,1:]=pred_target

In [32]:
submission.to_csv('submission/basic_xgb.csv', index = False)

### 3-2 Xgboost 모델 파라미터 조정 

In [33]:
from sklearn.model_selection import GridSearchCV

In [37]:
evals = [(val_x, val_y)]
params = { 'n_estimators' : [50, 100, 200, 400],
          'max_depth' : [3, 6, 8, 10, 12]
            }
xgbc = XGBClassifier(learning_rate = 0.03, 
                     objective = 'multi:softprob',
                 random_state = 504)
grid_cv = GridSearchCV(xgbc, param_grid = params, cv = 3)
grid_cv.fit(train_x, train_y, eval_metric ='mlogloss',
           eval_set = evals, early_stopping_rounds = 10, verbose = 0)



GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.03, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None,
                                     objective='multi:softprob',
                                     random_state=504, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
      

In [38]:
print('최적 하이퍼 파라미터 : ', grid_cv.best_params_)
print('최적의 logloss : {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터 :  {'max_depth': 8, 'n_estimators': 400}
최적의 logloss : 0.6952


In [39]:
xgbc = XGBClassifier(learning_rate = 0.03, 
                     objective = 'multi:softprob',
                     n_estimators = 400, 
                     max_depth = 8,
                 random_state = 504)

xgbc.fit(train_x, train_y, eval_metric = 'mlogloss', 
        eval_set = evals, early_stopping_rounds =10, verbose = 0)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=504, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [40]:
pred = xgbc.predict_proba(dum_test_x)

In [41]:
pred.shape

(10000, 3)

In [42]:
submission.iloc[:,1:] = pred 
submission.to_csv('submission/xgb_adjustParams.csv', index = False)

### 3-3. K-Folds 평균값 

In [43]:
folds = StratifiedKFold(n_splits = 5,
                       shuffle = True,
                       random_state = 504)

In [44]:
outcomes = []
sub = np.zeros((dum_test_x.shape[0], 3))



for n_folds, (train_index, val_index) in enumerate(folds.split(dum_train_x, target_credit)):
    
    X_train, X_val = dum_train_x.iloc[train_index], dum_train_x.iloc[val_index]
    y_train, y_val = target_credit.iloc[train_index], target_credit.iloc[val_index]
    
    xgb = XGBClassifier(learning_rate = 0.03, 
                     objective = 'multi:softprob',
                     n_estimators = 400, 
                     max_depth = 8,
                 random_state = 504)
    
    evals = [(X_val, y_val)]
    xgb.fit(train_x, train_y, eval_metric = 'mlogloss', 
        eval_set = evals, early_stopping_rounds =10, verbose = 0)
    
    prediction = xgb.predict_proba(X_val)
    y_val_onehot = pd.get_dummies(y_val)
    
    logloss = log_loss(y_val_onehot, prediction)
    outcomes.append(logloss)
    
    sub+= xgb.predict_proba(dum_test_x)
    
sub = sub/5



In [46]:
np.mean(outcomes)

0.6376004132250175

In [47]:
submission.iloc[:,1:]= sub
submission.to_csv('submission/xgb_5folds.csv', index =False)