# LightGBM 모델 활용 

In [10]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from lightgbm import LGBMClassifier


In [2]:
# Read the training set, the test set and the sample submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'card_dataset/train.csv',
        'card_dataset/test.csv',
        'card_dataset/sample_submission.csv',
    )
)

In [4]:
# Work on copies so the frames loaded from disk are left untouched.
new_train, new_test = train.copy(), test.copy()

# Cast the phone/email flag columns to object dtype in both frames
# (pd.get_dummies one-hot encodes object columns by default).
flag_columns = ['work_phone', 'phone', 'email', 'FLAG_MOBIL']
for frame in (new_train, new_test):
    frame[flag_columns] = frame[flag_columns].astype('object')

## 결측치 처리

In [5]:
## 1. When income_type == 'Pensioner': set occyp_type = 'Retired' and DAYS_EMPLOYED = 0
train_pensioner = new_train['income_type'].eq('Pensioner')
new_train.loc[train_pensioner, 'occyp_type'] = 'Retired'
new_train.loc[train_pensioner, 'DAYS_EMPLOYED'] = 0

## 2. When occyp_type is NaN and income_type == 'State servant': set occyp_type = 'State servant'
train_missing_servant = (
    new_train['occyp_type'].isna()
    & new_train['income_type'].eq('State servant')
)
new_train.loc[train_missing_servant, 'occyp_type'] = 'State servant'

## 3. When occyp_type is NaN and income_type is one of
##    ('Working', 'Commercial associate', 'Student'): set occyp_type = 'Extra staff'
train_missing_other = (
    new_train['occyp_type'].isna()
    & new_train['income_type'].isin(['Working', 'Commercial associate', 'Student'])
)
new_train.loc[train_missing_other, 'occyp_type'] = 'Extra staff'

In [6]:
## 1. When income_type == 'Pensioner': set occyp_type = 'Retired' and DAYS_EMPLOYED = 0
test_pensioner = new_test['income_type'] == 'Pensioner'
new_test.loc[test_pensioner, 'occyp_type'] = 'Retired'
new_test.loc[test_pensioner, 'DAYS_EMPLOYED'] = 0

## 2. / 3. Fill the remaining missing occyp_type values from income_type:
##    'State servant'                               -> 'State servant'
##    'Working', 'Commercial associate', 'Student'  -> 'Extra staff'
for income_types, fallback_occupation in (
    (['State servant'], 'State servant'),
    (['Working', 'Commercial associate', 'Student'], 'Extra staff'),
):
    # The mask is recomputed per rule so each rule sees the fills made before it.
    unfilled = new_test['occyp_type'].isna() & new_test['income_type'].isin(income_types)
    new_test.loc[unfilled, 'occyp_type'] = fallback_occupation

## 학습 데이터 처리

In [7]:
# 1. Drop the FLAG_MOBIL column from both frames.
m_train = new_train.drop('FLAG_MOBIL', axis=1)
m_test = new_test.drop('FLAG_MOBIL', axis=1)

# 2. Split the 'credit' target off the training features.
target_credit = m_train['credit'].astype('int')
m_train_x = m_train.drop('credit', axis=1)

# 3. One-hot encode train and test TOGETHER.
#    Encoding each frame separately with drop_first=True lets the two
#    matrices disagree (different columns, or a different dropped level)
#    whenever a category level is missing from one of them, and the model
#    fitted on the train columns would then see misaligned test features.
#    Encoding the stacked frame guarantees identical columns in both.
n_train_rows = len(m_train_x)
all_dummies = pd.get_dummies(
    pd.concat([m_train_x, m_test], axis=0),
    drop_first=True,
)

# Split back by position; the original row indices are preserved.
dum_train_x = all_dummies.iloc[:n_train_rows].copy()
dum_test_x = all_dummies.iloc[n_train_rows:].copy()

## LGBMClassifier

In [13]:
# Shuffled, stratified 5-fold splitter with a fixed seed for repeatable folds.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [17]:
random.seed(42)

# Fitted model per fold, keyed by the 0-based fold index.
lgb_models = {}

# Running sum of the per-fold class-probability predictions on the test set
# (one row per test sample, one column per credit class).
sub = np.zeros((dum_test_x.shape[0], 3))

for n_folds, (train_index, val_index) in enumerate(folds.split(dum_train_x, target_credit)):

    print(f'===================================={n_folds+1}============================================')

    X_train, X_val = dum_train_x.iloc[train_index], dum_train_x.iloc[val_index]
    y_train, y_val = target_credit.iloc[train_index], target_credit.iloc[val_index]

    # random.seed() above only seeds Python's `random` module, so the
    # estimator is seeded explicitly as well.
    lgb = LGBMClassifier(n_estimators=1000, random_state=42)

    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were
    # removed in newer LightGBM releases in favour of callbacks -- confirm the
    # installed version before upgrading.
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            early_stopping_rounds=30,
            verbose=100)

    # Key by this loop's fold index. The original used the undefined name
    # `fold`, which only worked through a variable left over in the kernel
    # and made every fold overwrite the same dictionary entry.
    lgb_models[n_folds] = lgb
    sub += lgb.predict_proba(dum_test_x)

    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.645431	valid_1's multi_logloss: 0.760628
[200]	training's multi_logloss: 0.551831	valid_1's multi_logloss: 0.748481
[300]	training's multi_logloss: 0.478893	valid_1's multi_logloss: 0.744012
Early stopping, best iteration is:
[273]	training's multi_logloss: 0.496233	valid_1's multi_logloss: 0.743269


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.643829	valid_1's multi_logloss: 0.771873
[200]	training's multi_logloss: 0.549639	valid_1's multi_logloss: 0.763654
Early stopping, best iteration is:
[251]	training's multi_logloss: 0.51072	valid_1's multi_logloss: 0.762198


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.648877	valid_1's multi_logloss: 0.768813
[200]	training's multi_logloss: 0.55049	valid_1's multi_logloss: 0.760161
Early stopping, best iteration is:
[199]	training's multi_logloss: 0.5

In [18]:
# `sub` holds the SUM of the per-fold test probabilities; divide by the number
# of folds actually configured on the splitter (rather than a hard-coded 5) so
# the average stays correct if n_splits is changed.
submission.iloc[:, 1:] = sub / folds.get_n_splits()
submission.to_csv('submission/lgbm_base.csv', index=False)

In [19]:
# Show how many training rows fall into each credit class.
target_credit.value_counts()

2    16968
1     6267
0     3222
Name: credit, dtype: int64