# 데이터 증강 

credit 변수의 분포가 균일하지 않음.    
2 : 16968, 1 : 6267, 0 : 3222


데이터의 특성 상 imbalanced 할 수 밖에 없음.    
그러나 이러한 데이터셋을 그대로 사용할 경우, 많은 수를 가진 레이블을 지정할 가능성이 높음. 이를 해결하기 위해 증강 기법을 사용해보고자 함.   

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score,roc_auc_score



from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [6]:
train = pd.read_csv('card_dataset/train.csv')
test = pd.read_csv('card_dataset/test.csv')
submission =pd.read_csv('card_dataset/sample_submission.csv')

In [7]:
new_train = train.copy()
new_test = test.copy()

new_train[['work_phone','phone','email','FLAG_MOBIL']] = new_train[['work_phone','phone','email','FLAG_MOBIL']].astype('object')
new_test[['work_phone','phone','email','FLAG_MOBIL']] = new_test[['work_phone','phone','email','FLAG_MOBIL']].astype('object')

In [8]:
## 1. income_type = 'Pensioner' 일때 occyp_type = 'Retired' and DAYS_EMPLOYED 
new_train.loc[new_train['income_type'] == 'Pensioner', 'occyp_type'] = 'Retired'
new_train.loc[new_train['income_type'] == 'Pensioner', 'DAYS_EMPLOYED'] = 0 

## 2. occyp_type = NaN & income_type = 'State servant'인 경우 occyp_type ='State servant' 로 변경
new_train.loc[(new_train.occyp_type.isna())  & (new_train.income_type == 'State servant'), 'occyp_type'] = 'State servant'

## 3. occyp_type = NaN & income_type  in ('working', 'Commercial associate', 'Student) 일 경우 occyp_type ='Extra staff'로 변경 
new_train.loc[(new_train.occyp_type.isna())  & (new_train.income_type.isin(['Working','Commercial associate','Student'])), 'occyp_type'] = 'Extra staff'


## 1. income_type = 'Pensioner' 일때 occyp_type = 'Retired' and DAYS_EMPLOYED 
new_test.loc[new_test['income_type'] == 'Pensioner', 'occyp_type'] = 'Retired'
new_test.loc[new_test['income_type'] == 'Pensioner', 'DAYS_EMPLOYED'] = 0 

## 2. occyp_type = NaN & income_type = 'State servant'인 경우 occyp_type ='State servant' 로 변경
new_test.loc[(new_test.occyp_type.isna())  & (new_test.income_type == 'State servant'), 'occyp_type'] = 'State servant' 

## 3. occyp_type = NaN & income_type  in ('working', 'Commercial associate', 'Student) 일 경우 occyp_type ='Extra staff'로 변경 
new_test.loc[(new_test.occyp_type.isna())  & (new_test.income_type.isin(['Working','Commercial associate','Student'])), 'occyp_type'] = 'Extra staff' 

In [43]:
# 1. FLAG_MOBIL 변수 삭제
m_train = new_train.drop('FLAG_MOBIL', axis = 1)
m_test = new_test.drop(['FLAG_MOBIL','index'], axis =1)


# 2. credit 변수 분할 
target_credit = m_train['credit']
target_credit = target_credit.astype('int')

m_train_x = m_train.drop(['index','credit'], axis = 1)


# 3. 더미변수 
dum_train_x = pd.get_dummies(m_train_x, drop_first = True)
dum_test_x = pd.get_dummies(m_test, drop_first = True)

In [44]:
target_credit=target_credit.astype('int')
train_x, test_x, train_y, test_y = train_test_split(dum_train_x, target_credit, 
                                                    random_state = 507,
                                                    test_size = 0.25)

In [45]:
folds = StratifiedKFold(n_splits=5, 
                        shuffle=True, random_state=42)

In [48]:
%time
lgb_models={}
loss = []
sub = np.zeros((dum_test_x.shape[0], 3))

for n_folds, (train_index, val_index) in enumerate(folds.split(dum_train_x, target_credit)):
    
    print(f'===================================={n_folds+1}============================================')

    X_train, X_val = dum_train_x.iloc[train_index], dum_train_x.iloc[val_index]
    y_train, y_val = target_credit.iloc[train_index], target_credit.iloc[val_index]
    
    
    
    lgb = LGBMClassifier(n_estimators=5000,
                        is_unbalance = True, n_jobs = -1,
                        num_leaves = 64,
                        max_depth = 10,
                        random_state = 507,
                        learning_rate = 0.003)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], 
            early_stopping_rounds=30,
           verbose=1000)
    y_val_onehot = pd.get_dummies(y_val)
    prediction = lgb.predict_proba(X_val)
    loss.append(log_loss(y_val_onehot, prediction))
    sub+= lgb.predict_proba(dum_test_x)
    
    
    print(f'================================================================================\n\n')

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.2 µs
Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.706519	valid_1's multi_logloss: 0.770599
[2000]	training's multi_logloss: 0.640601	valid_1's multi_logloss: 0.750577
[3000]	training's multi_logloss: 0.590134	valid_1's multi_logloss: 0.740195
[4000]	training's multi_logloss: 0.549255	valid_1's multi_logloss: 0.734773
[5000]	training's multi_logloss: 0.512804	valid_1's multi_logloss: 0.730157
Did not meet early stopping. Best iteration is:
[5000]	training's multi_logloss: 0.512804	valid_1's multi_logloss: 0.730157


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.701369	valid_1's multi_logloss: 0.780179
[2000]	training's multi_logloss: 0.635666	valid_1's multi_logloss: 0.762242
[3000]	training's multi_logloss: 0.585638	valid_1's multi_logloss: 0.754024
Early stopping, best iteration is:
[3451]	training's multi_logloss: 0.56597	v

In [49]:
np.mean(loss)

0.7412896726455214

In [50]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_train_over,y_train_over = smote.fit_sample(dum_train_x,target_credit)

In [59]:
%time
lgb_models={}
loss = []
sub = np.zeros((dum_test_x.shape[0], 3))

for n_folds, (train_index, val_index) in enumerate(folds.split(X_train_over, y_train_over)):
    
    print(f'===================================={n_folds+1}============================================')

    X_train, X_val = X_train_over.iloc[train_index], X_train_over.iloc[val_index]
    y_train, y_val = y_train_over.iloc[train_index], y_train_over.iloc[val_index]
    
    
    
    lgb = LGBMClassifier(n_estimators=3000,
                        is_unbalance = True, n_jobs = -1,
                        num_leaves = 64,
                        max_depth = 14,
                        random_state = 507,
                        learning_rate = 0.01)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], 
            early_stopping_rounds=30,
           verbose=500)
    y_val_onehot = pd.get_dummies(y_val)
    prediction = lgb.predict_proba(X_val)
    loss.append(log_loss(y_val_onehot, prediction))
    sub+= lgb.predict_proba(dum_test_x)
    
    
    print(f'================================================================================\n\n')

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs
Training until validation scores don't improve for 30 rounds
[500]	training's multi_logloss: 0.690662	valid_1's multi_logloss: 0.737887
[1000]	training's multi_logloss: 0.56486	valid_1's multi_logloss: 0.647068
[1500]	training's multi_logloss: 0.478613	valid_1's multi_logloss: 0.588451
[2000]	training's multi_logloss: 0.421223	valid_1's multi_logloss: 0.556468
[2500]	training's multi_logloss: 0.378916	valid_1's multi_logloss: 0.536918
[3000]	training's multi_logloss: 0.344483	valid_1's multi_logloss: 0.522703
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.344483	valid_1's multi_logloss: 0.522703


Training until validation scores don't improve for 30 rounds
[500]	training's multi_logloss: 0.690503	valid_1's multi_logloss: 0.742245
[1000]	training's multi_logloss: 0.561129	valid_1's multi_logloss: 0.642458
[1500]	training's multi_logloss: 0.48162	valid_1's multi_logloss: 0.589006
[2000]	t

In [60]:
np.mean(loss)

0.5244076009775325

In [62]:
submission.iloc[:,1:] = sub/5

In [63]:
submission.to_csv('submission/smote.csv', index =False)

# 수정 

In [65]:
edu_list = {'Academic degree':5,
           'Higher education':4,
           'Incomplete higher':3,
           'Secondary / secondary special':2,
           'Lower secondary':1}


new_train['edu_type'] = train['edu_type'].map(edu_list)
new_test['edu_type'] = test['edu_type'].map(edu_list)

In [66]:
# 1. FLAG_MOBIL 변수 삭제
m_train = new_train.drop('FLAG_MOBIL', axis = 1)
m_test = new_test.drop(['FLAG_MOBIL','index'], axis =1)


# 2. credit 변수 분할 
target_credit = m_train['credit']
target_credit = target_credit.astype('int')

m_train_x = m_train.drop(['index','credit'], axis = 1)


# 3. 더미변수 
dum_train_x = pd.get_dummies(m_train_x, drop_first = True)
dum_test_x = pd.get_dummies(m_test, drop_first = True)

In [67]:
folds = StratifiedKFold(n_splits=10, 
                        shuffle=True, random_state=42)

In [68]:
smote = SMOTE(random_state=0)
X_train_over,y_train_over = smote.fit_sample(dum_train_x,target_credit)

In [69]:
loss = []
sub = np.zeros((dum_test_x.shape[0], 3))

for n_folds, (train_index, val_index) in enumerate(folds.split(X_train_over, y_train_over)):
    
    print(f'===================================={n_folds+1}============================================')

    X_train, X_val = X_train_over.iloc[train_index], X_train_over.iloc[val_index]
    y_train, y_val = y_train_over.iloc[train_index], y_train_over.iloc[val_index]
    
    
    lgb = LGBMClassifier(n_estimators=3000,
                        is_unbalance = True, n_jobs = -1,
                        num_leaves = 128,
                        max_depth = 16,
                        random_state = 507,
                        learning_rate = 0.005)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], 
            early_stopping_rounds=30,
           verbose=1000)
    y_val_onehot = pd.get_dummies(y_val)
    prediction = lgb.predict_proba(X_val)
    loss.append(log_loss(y_val_onehot, prediction))
    sub+= lgb.predict_proba(dum_test_x)
    
    
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.590267	valid_1's multi_logloss: 0.663464
[2000]	training's multi_logloss: 0.447592	valid_1's multi_logloss: 0.573876
[3000]	training's multi_logloss: 0.364263	valid_1's multi_logloss: 0.530792
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.364263	valid_1's multi_logloss: 0.530792


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.593209	valid_1's multi_logloss: 0.670981
[2000]	training's multi_logloss: 0.450928	valid_1's multi_logloss: 0.574692
[3000]	training's multi_logloss: 0.364676	valid_1's multi_logloss: 0.52628
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.364676	valid_1's multi_logloss: 0.52628


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.594447	valid_1's multi_logloss: 0.664883
[2000]	training's multi_logloss: 0.

In [70]:
np.mean(loss)

0.526040539861962

In [71]:
len(loss)

10

In [72]:
submission.iloc[:,1:] = sub/10

In [74]:
submission.to_csv('submission/smote2.csv', index = False)