In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss


pd.set_option('display.max_columns',None)

In [2]:
train = pd.read_csv('card_dataset/train.csv')
test = pd.read_csv('card_dataset/test.csv')
submission = pd.read_csv('card_dataset/sample_submission.csv')

In [3]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

In [5]:
train.fillna('NAN', inplace = True)
test.fillna('NAN', inplace = True)

In [13]:
dum_train = pd.get_dummies(train)

In [15]:
dum_test = pd.get_dummies(test)

In [17]:
y = dum_train['credit']

In [19]:
x = dum_train.drop(['index', 'credit'], axis = 1)

In [20]:
test_x = dum_test.drop(['index'], axis = 1)

In [22]:
folds = StratifiedKFold(n_splits=5, 
                        shuffle=True, random_state=42)
smote = SMOTE(random_state=0)
X_train_over,y_train_over = smote.fit_sample(x, y)

In [23]:
loss = []
sub = np.zeros((test_x.shape[0], 3))

for n_folds, (train_index, val_index) in enumerate(folds.split(X_train_over, y_train_over)):
    
    print(f'===================================={n_folds+1}============================================')

    X_train, X_val = X_train_over.iloc[train_index], X_train_over.iloc[val_index]
    y_train, y_val = y_train_over.iloc[train_index], y_train_over.iloc[val_index]
    
    
    lgb = LGBMClassifier(n_estimators=3000,
                        is_unbalance = True, n_jobs = -1,
                        num_leaves = 128,
                        max_depth = 16,
                        random_state = 507,
                        learning_rate = 0.005)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], 
            early_stopping_rounds=30,
           verbose=1000)
    y_val_onehot = pd.get_dummies(y_val)
    prediction = lgb.predict_proba(X_val)
    loss.append(log_loss(y_val_onehot, prediction))
    
    sub+= lgb.predict_proba(test_x)
    
    
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.558994	valid_1's multi_logloss: 0.644678
[2000]	training's multi_logloss: 0.421976	valid_1's multi_logloss: 0.55967
[3000]	training's multi_logloss: 0.345135	valid_1's multi_logloss: 0.526317
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.345135	valid_1's multi_logloss: 0.526317


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.55903	valid_1's multi_logloss: 0.644281
[2000]	training's multi_logloss: 0.426568	valid_1's multi_logloss: 0.56494
[3000]	training's multi_logloss: 0.341929	valid_1's multi_logloss: 0.520585
Did not meet early stopping. Best iteration is:
[3000]	training's multi_logloss: 0.341929	valid_1's multi_logloss: 0.520585


Training until validation scores don't improve for 30 rounds
[1000]	training's multi_logloss: 0.56101	valid_1's multi_logloss: 0.646946
[2000]	training's multi_logloss: 0.42

In [24]:
np.mean(loss)

0.5236442575450597

In [25]:
submission.iloc[:,1:] = sub/5

In [None]:
submission.to_csv('submission/NoPreprocessing.csv', index = False)