# Label Encoder 

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier 
from imblearn.over_sampling import SMOTE

pd.set_option('max_columns', None)

In [39]:
train = pd.read_csv('card_dataset/train.csv')
test = pd.read_csv('card_dataset/test.csv')
submission = pd.read_csv('card_dataset/sample_submission.csv')

In [40]:
train.fillna('NAN', inplace =True)
test.fillna('NAN', inplace = True)

In [41]:
from sklearn.preprocessing import LabelEncoder

In [42]:
cat_cols = [x for x in train.columns if train[x].dtype == 'object']
num_cols = [x for x in train.columns if train[x].dtype != 'object']

In [43]:
cat_cols 

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type']

In [44]:
num_cols

['index',
 'child_num',
 'income_total',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'FLAG_MOBIL',
 'work_phone',
 'phone',
 'email',
 'family_size',
 'begin_month',
 'credit']

In [45]:
le = LabelEncoder()

In [46]:
for i in cat_cols:
    print(i)
    train[i] = le.fit_transform(train[i].astype(str))
    test[i] =le.fit_transform(test[i].astype(str))

gender
car
reality
income_type
edu_type
family_type
house_type
occyp_type


In [47]:
train

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,0,0,0,0,202500.0,0,1,1,2,-13899,-4709,1,0,0,0,12,2.0,-6.0,1.0
1,1,0,0,1,1,247500.0,0,4,0,1,-11380,-1540,1,0,0,1,8,3.0,-5.0,1.0
2,2,1,1,1,0,450000.0,4,1,1,1,-19087,-4434,1,0,1,0,10,2.0,-22.0,2.0
3,3,0,0,1,0,202500.0,0,4,1,1,-15088,-2092,1,0,1,0,15,2.0,-37.0,0.0
4,4,0,1,1,0,157500.0,2,1,1,1,-15037,-2105,1,0,0,0,10,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,26452,0,0,0,2,225000.0,2,4,1,1,-12079,-1984,1,0,0,0,3,4.0,-2.0,1.0
26453,26453,0,0,1,1,180000.0,4,1,2,1,-15291,-2475,1,0,0,0,12,2.0,-47.0,2.0
26454,26454,0,1,0,0,292500.0,4,4,0,5,-10082,-2015,1,0,0,0,3,2.0,-25.0,2.0
26455,26455,1,0,1,0,171000.0,4,2,3,1,-10145,-107,1,0,0,0,8,1.0,-59.0,2.0


In [51]:
y = train['credit']
x = train.drop(['index','FLAG_MOBIL','credit'], axis =1)
test_x = test.drop(['index','FLAG_MOBIL'], axis = 1 )

In [52]:
folds = StratifiedKFold(n_splits = 5, 
                       shuffle = True , random_state = 511)
smote = SMOTE(random_state = 0 )
X_train_over, y_train_over = smote.fit_sample(x, y)

In [53]:
loss = []
sub = np.zeros((test_x.shape[0], 3 ))


for n_folds, (train_index, val_index) in enumerate(folds.split(X_train_over, y_train_over)):
    
    print(f'======================================={n_folds + 1 }==========================================')
    
    X_train, X_val = X_train_over.iloc[train_index], X_train_over.iloc[val_index]
    y_train, y_val = y_train_over.iloc[train_index], y_train_over.iloc[val_index]
    
    
    lgb = LGBMClassifier(n_estimators = 10000,
                        n_jobs = -1, 
                        num_leaves = 2^20,
                        max_depth = 32,
                        random_state = 512,
                        learning_rate = 0.03)
    
    lgb.fit(X_train, y_train,
           eval_set = [(X_train, y_train), (X_val, y_val)],
           early_stopping_rounds = 100,
           verbose  = 1000)
    
    y_val_onehot = pd.get_dummies(y_val)
    prediction = lgb.predict_proba(X_val)
    loss.append(log_loss(y_val_onehot, prediction))
    
    sub+= lgb.predict_proba(test_x)
    
    
    print(f'=============================================================================================\n\n')

Training until validation scores don't improve for 100 rounds
[1000]	training's multi_logloss: 0.555847	valid_1's multi_logloss: 0.628943
[2000]	training's multi_logloss: 0.430339	valid_1's multi_logloss: 0.559407
[3000]	training's multi_logloss: 0.351438	valid_1's multi_logloss: 0.525728
[4000]	training's multi_logloss: 0.293369	valid_1's multi_logloss: 0.505351
[5000]	training's multi_logloss: 0.250005	valid_1's multi_logloss: 0.495088
[6000]	training's multi_logloss: 0.215872	valid_1's multi_logloss: 0.489965
Early stopping, best iteration is:
[6740]	training's multi_logloss: 0.194533	valid_1's multi_logloss: 0.488181


Training until validation scores don't improve for 100 rounds
[1000]	training's multi_logloss: 0.553975	valid_1's multi_logloss: 0.628925
[2000]	training's multi_logloss: 0.430429	valid_1's multi_logloss: 0.559158
[3000]	training's multi_logloss: 0.352611	valid_1's multi_logloss: 0.52685
[4000]	training's multi_logloss: 0.295393	valid_1's multi_logloss: 0.508663
[500

In [54]:
np.mean(loss)

0.48930819288163957

In [56]:
submission.iloc[:,1:] = sub/5

In [57]:
submission.to_csv('submission/encodering.csv', index = False)