# Scaling: MinMax-scaled features with SMOTE oversampling and a 5-fold LightGBM classifier

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler


In [3]:
# Read the three input files from the local `card_dataset/` folder:
#   * sample_submission.csv -> template whose columns after the first are
#                              overwritten with predicted probabilities later
#   * test.csv              -> rows to predict
#   * train.csv             -> rows used for fitting (includes `credit`)
submission = pd.read_csv('card_dataset/sample_submission.csv')
test = pd.read_csv('card_dataset/test.csv')
train = pd.read_csv('card_dataset/train.csv')

In [4]:
# Columns holding contact flags. They are re-typed to `object` below --
# presumably so the later pd.get_dummies call treats them as categorical
# instead of numeric (TODO confirm intent).
flag_columns = ['work_phone', 'phone', 'email', 'FLAG_MOBIL']

# Work on copies so the raw `train` / `test` frames stay untouched.
new_train = train.copy()
new_train[flag_columns] = new_train[flag_columns].astype('object')

new_test = test.copy()
new_test[flag_columns] = new_test[flag_columns].astype('object')

In [5]:
def fill_occupation(frame):
    """Fill ``occyp_type`` (and ``DAYS_EMPLOYED`` for pensioners) from ``income_type``.

    The frame is modified in place. Rules, applied in order:

    1. ``income_type == 'Pensioner'``: ``occyp_type`` is set to ``'Retired'``
       and ``DAYS_EMPLOYED`` to ``0`` (existing values are overwritten).
    2. ``occyp_type`` is NaN and ``income_type == 'State servant'``:
       ``occyp_type`` is set to ``'State servant'``.
    3. ``occyp_type`` is NaN and ``income_type`` is ``'Working'``,
       ``'Commercial associate'`` or ``'Student'``: ``occyp_type`` is set to
       ``'Extra staff'``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``income_type``, ``occyp_type`` and
        ``DAYS_EMPLOYED``.

    Returns
    -------
    None
        Nothing is returned, so calling this as the last statement of a cell
        does not render the frame.
    """
    # 1. Pensioners: fixed occupation label and zero employment days.
    is_pensioner = frame['income_type'] == 'Pensioner'
    frame.loc[is_pensioner, 'occyp_type'] = 'Retired'
    frame.loc[is_pensioner, 'DAYS_EMPLOYED'] = 0

    # Rules 2 and 3 target disjoint income types, so the missing-value mask
    # taken after rule 1 is valid for both of them.
    occupation_missing = frame['occyp_type'].isna()

    # 2. Missing occupation + state servant income -> 'State servant'.
    is_state_servant = frame['income_type'] == 'State servant'
    frame.loc[occupation_missing & is_state_servant, 'occyp_type'] = 'State servant'

    # 3. Missing occupation + working / commercial associate / student income
    #    -> 'Extra staff'.
    is_other_earner = frame['income_type'].isin(['Working', 'Commercial associate', 'Student'])
    frame.loc[occupation_missing & is_other_earner, 'occyp_type'] = 'Extra staff'


# Apply the identical imputation rules to both splits.
fill_occupation(new_train)
fill_occupation(new_test)

In [6]:
# Education levels listed from lowest to highest; the ordinal code is the
# 1-based position in this tuple (Lower secondary -> 1 ... Academic degree -> 5).
education_levels = (
    'Lower secondary',
    'Secondary / secondary special',
    'Incomplete higher',
    'Higher education',
    'Academic degree',
)
edu_list = {level: rank for rank, level in enumerate(education_levels, start=1)}

# Map from the raw frames (not the working copies) so re-running this cell
# always starts from the original string labels. Labels absent from
# `edu_list` become NaN.
new_test['edu_type'] = test['edu_type'].map(edu_list)
new_train['edu_type'] = train['edu_type'].map(edu_list)

In [9]:
# 1. Drop the FLAG_MOBIL column (and the `index` identifier from the test set).
m_train = new_train.drop('FLAG_MOBIL', axis=1)
m_test = new_test.drop(['FLAG_MOBIL', 'index'], axis=1)


# 2. Separate the `credit` target from the training features.
target_credit = m_train['credit'].astype('int')

m_train_x = m_train.drop(['index', 'credit'], axis=1)


# 3. One-hot encode.
# Train and test are encoded together: calling get_dummies on each frame
# separately lets `drop_first=True` drop a different category per frame, and
# produces different columns whenever one split lacks a category. Encoding
# the stacked frame guarantees both matrices share the same columns, in the
# same order, with the same reference category.
assert set(m_train_x.columns) == set(m_test.columns), \
    "train and test must expose the same feature columns before encoding"

dum_all = pd.get_dummies(
    pd.concat([m_train_x, m_test], keys=['train', 'test']),
    drop_first=True,
)
# Selecting on the outer key restores each split with its original row index.
dum_train_x = dum_all.loc['train']
dum_test_x = dum_all.loc['test']

In [10]:
# Feature scaler; it is fitted in a later cell.
scaler = MinMaxScaler()

# 5-fold stratified splitter with a fixed shuffle seed for reproducibility.
folds = StratifiedKFold(n_splits=5,
                        shuffle=True, random_state=42)

# Oversample the minority classes of the encoded training set.
# `fit_resample` is the supported name; the old `fit_sample` alias was
# removed from imbalanced-learn, so the original call fails on current
# releases.
# NOTE(review): the oversampling happens BEFORE the cross-validation split,
# so validation folds built from `X_train_over` contain synthetic rows and
# the reported CV log-loss is likely optimistic. Consider resampling inside
# each training fold instead.
smote = SMOTE(random_state=0)
X_train_over, y_train_over = smote.fit_resample(dum_train_x, target_credit)

In [13]:
# Learn the per-feature min/max from the (oversampled) training data and
# scale it. The result is a NumPy array, not a DataFrame.
X_train_over = scaler.fit_transform(X_train_over)

# Scale the test set with the ranges learned from the TRAINING data.
# Using fit_transform here would re-fit the scaler on the test set, putting
# train and test on different scales and making the model's split
# thresholds meaningless for the test rows.
dum_test_x = scaler.transform(dum_test_x)

In [17]:
# Per-fold validation log-loss, and the running SUM of test-set class
# probabilities (3 columns, matching the width of `predict_proba` output
# that is added to it below). The sum is averaged in a later cell.
loss = []
sub = np.zeros((dum_test_x.shape[0], 3))

# The same hyper-parameters are used for every fold, so declare them once.
lgb_params = dict(
    n_estimators=3000,
    is_unbalance=True,
    n_jobs=-1,
    num_leaves=128,
    max_depth=16,
    random_state=507,
    learning_rate=0.005,
)

for n_folds, (train_index, val_index) in enumerate(folds.split(X_train_over, y_train_over)):

    print(f'===================================={n_folds+1}============================================')

    # Features are positionally indexed as a NumPy array; the target uses .iloc.
    X_train = X_train_over[train_index]
    X_val = X_train_over[val_index]
    y_train = y_train_over.iloc[train_index]
    y_val = y_train_over.iloc[val_index]

    # A fresh model per fold. The very same X_train / y_train objects are
    # passed in eval_set so the first eval entry is the training data itself.
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keyword
    # arguments are only accepted by older LightGBM releases -- confirm the
    # pinned version before upgrading.
    lgb = LGBMClassifier(**lgb_params)
    lgb.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        early_stopping_rounds=30,
        verbose=1000,
    )

    # Score the held-out fold against a one-hot version of its labels.
    prediction = lgb.predict_proba(X_val)
    y_val_onehot = pd.get_dummies(y_val)
    loss.append(log_loss(y_val_onehot, prediction))

    # Accumulate this fold's test-set probabilities.
    sub += lgb.predict_proba(dum_test_x)

    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.586066	valid_1's multi_logloss: 0.66891
[2000]	training's multi_logloss: 0.443405	valid_1's multi_logloss: 0.581662
[3000]	training's multi_logloss: 0.35409	valid_1's multi_logloss: 0.535356
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.35409	valid_1's multi_logloss: 0.535356


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.586308	valid_1's multi_logloss: 0.669882
[2000]	training's multi_logloss: 0.434991	valid_1's multi_logloss: 0.569073
[3000]	training's multi_logloss: 0.350214	valid_1's multi_logloss: 0.524967
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.350214	valid_1's multi_logloss: 0.524967


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.586276	valid_1's multi_logloss: 0.66758
[2000]	training's multi_logloss: 0.44

In [18]:
# Average validation log-loss across the folds (displayed as the cell output).
np.asarray(loss).mean()

0.5324307789858999

In [19]:
# `sub` is the SUM of the test-set probabilities over the trained folds.
# Divide by the number of folds that actually contributed (the training loop
# appends exactly one log-loss entry per fold) instead of a hard-coded 5, so
# the average stays correct if `n_splits` changes or the loop stops early.
n_fold_models = len(loss)

# Overwrite every column after the first with the averaged probabilities.
submission.iloc[:, 1:] = sub / n_fold_models

In [None]:
# Persist the filled-in submission frame; the DataFrame row index is not written.
# The `submission/` directory must already exist.
submission_path = 'submission/scaling.csv'
submission.to_csv(submission_path, index=False)