In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE


In [10]:
train = pd.read_csv('card_dataset/train.csv')
test = pd.read_csv('card_dataset/test.csv')
submission =pd.read_csv('card_dataset/sample_submission.csv')

In [11]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [12]:
train['gen_car_real'] = (train.gender.astype(str) 
                         + '_' + train.car.astype(str) +'_' 
                         + train.reality.astype(str)
                        )
train['phones'] = (train.work_phone.astype(str) +'_' +
                  train.phone.astype(str))



test['gen_car_real'] = (test.gender.astype(str) 
                         + '_' + test.car.astype(str) +'_' 
                         + test.reality.astype(str)
                        )
test['phones'] = (test.work_phone.astype(str) +'_' +
                  test.phone.astype(str))


In [14]:
new_train = train.drop(['index','gender','car','reality','FLAG_MOBIL','work_phone','phone'],axis = 1)
new_test = test.drop(['index','gender','car','reality','FLAG_MOBIL','work_phone','phone'],axis = 1)

In [15]:
## 1. income_type = 'Pensioner' 일때 occyp_type = 'Retired' and DAYS_EMPLOYED 
new_train.loc[new_train['income_type'] == 'Pensioner', 'occyp_type'] = 'Retired'
new_train.loc[new_train['income_type'] == 'Pensioner', 'DAYS_EMPLOYED'] = 0 

## 2. occyp_type = NaN & income_type = 'State servant'인 경우 occyp_type ='State servant' 로 변경
new_train.loc[(new_train.occyp_type.isna())  & (new_train.income_type == 'State servant'), 'occyp_type'] = 'State servant'

## 3. occyp_type = NaN & income_type  in ('working', 'Commercial associate', 'Student) 일 경우 occyp_type ='Extra staff'로 변경 
new_train.loc[(new_train.occyp_type.isna())  & (new_train.income_type.isin(['Working','Commercial associate','Student'])), 'occyp_type'] = 'Extra staff'


## 1. income_type = 'Pensioner' 일때 occyp_type = 'Retired' and DAYS_EMPLOYED 
new_test.loc[new_test['income_type'] == 'Pensioner', 'occyp_type'] = 'Retired'
new_test.loc[new_test['income_type'] == 'Pensioner', 'DAYS_EMPLOYED'] = 0 

## 2. occyp_type = NaN & income_type = 'State servant'인 경우 occyp_type ='State servant' 로 변경
new_test.loc[(new_test.occyp_type.isna())  & (new_test.income_type == 'State servant'), 'occyp_type'] = 'State servant' 

## 3. occyp_type = NaN & income_type  in ('working', 'Commercial associate', 'Student) 일 경우 occyp_type ='Extra staff'로 변경 
new_test.loc[(new_test.occyp_type.isna())  & (new_test.income_type.isin(['Working','Commercial associate','Student'])), 'occyp_type'] = 'Extra staff' 

In [23]:
edu_list = {'Academic degree':5,
           'Higher education':4,
           'Incomplete higher':3,
           'Secondary / secondary special':2,
           'Lower secondary':1}


new_train['edu_type'] = train['edu_type'].map(edu_list)
new_test['edu_type'] = test['edu_type'].map(edu_list)

In [24]:
# 2. credit 변수 분할 
target_credit = new_train['credit']
target_credit = target_credit.astype('int')

m_train_x = new_train.drop(['credit'], axis = 1)


# 3. 더미변수 
dum_train_x = pd.get_dummies(m_train_x, drop_first = True)
dum_test_x = pd.get_dummies(new_test, drop_first = True)

In [25]:
smote = SMOTE(random_state=0)
X_train_over,y_train_over = smote.fit_sample(dum_train_x,target_credit)

In [26]:
folds = StratifiedKFold(n_splits=5, 
                        shuffle=True, random_state=42)

In [27]:
loss = []
sub = np.zeros((dum_test_x.shape[0], 3))

for n_folds, (train_index, val_index) in enumerate(folds.split(X_train_over, y_train_over)):
    
    print(f'===================================={n_folds+1}============================================')

    X_train, X_val = X_train_over.iloc[train_index], X_train_over.iloc[val_index]
    y_train, y_val = y_train_over.iloc[train_index], y_train_over.iloc[val_index]
    
    
    lgb = LGBMClassifier(n_estimators=3000,
                        is_unbalance = True, n_jobs = -1,
                        num_leaves = 128,
                        max_depth = 16,
                        random_state = 507,
                        learning_rate = 0.005)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], 
            early_stopping_rounds=30,
           verbose=1000)
    y_val_onehot = pd.get_dummies(y_val)
    prediction = lgb.predict_proba(X_val)
    loss.append(log_loss(y_val_onehot, prediction))
    sub+= lgb.predict_proba(dum_test_x)
    
    
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.590093	valid_1's multi_logloss: 0.672272
[2000]	training's multi_logloss: 0.445304	valid_1's multi_logloss: 0.582015
[3000]	training's multi_logloss: 0.356076	valid_1's multi_logloss: 0.536321
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.356076	valid_1's multi_logloss: 0.536321


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.58285	valid_1's multi_logloss: 0.667125
[2000]	training's multi_logloss: 0.440784	valid_1's multi_logloss: 0.57519
[3000]	training's multi_logloss: 0.356476	valid_1's multi_logloss: 0.533988
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.356476	valid_1's multi_logloss: 0.533988


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.581268	valid_1's multi_logloss: 0.665475
[2000]	training's multi_logloss: 0.

In [28]:
np.mean(loss)

0.5350495418474781

In [29]:
submission.iloc[:,1:] = sub/5

In [30]:
submission.to_csv('submission/grouping.csv' , index =False)