In [1]:
import gc

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [2]:
# Read the train and test application tables from the local ./data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/application_train.csv', './data/application_test.csv')
)

In [3]:
def get_categorical_variables(dataframe):
    """Return the names of the string-like (categorical) columns of a frame.

    A column qualifies when its dtype is ``object`` or a pandas string dtype.
    Checking only ``dtype == 'object'`` misses text columns stored with
    pandas' dedicated string dtype, which would then never be encoded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose column dtypes are inspected; it is not modified.

    Returns
    -------
    list
        Column labels, in the frame's column order.
    """
    is_string_dtype = pd.api.types.is_string_dtype
    # Iterate over the dtypes Series instead of indexing column by column:
    # it is a single lookup and also works when column labels are duplicated.
    return [
        feature
        for feature, dtype in dataframe.dtypes.items()
        if dtype == 'object' or is_string_dtype(dtype)
    ]

In [4]:
def encode_categorical_variables(train, test):
    """Integer-encode the categorical columns of ``train`` and ``test`` in place.

    The columns to encode are those reported by
    ``get_categorical_variables(train)``. For each one, ``pd.factorize``
    builds the code table from the training values, and the test column is
    mapped through that same table with ``Index.get_indexer``, which yields
    -1 for any value that is not in the table.

    Both frames are modified in place; nothing is returned.
    """
    for column in get_categorical_variables(train):
        codes, categories = pd.factorize(train[column])
        train[column] = codes
        # Reuse the training code table so both frames share one encoding.
        test[column] = categories.get_indexer(test[column])

In [5]:
# Make sure automatic garbage collection is switched on for the rest of the run.
gc.enable()
# Replace the categorical columns of both frames with integer codes (in place).
encode_categorical_variables(train, test)

In [7]:
# Select the target vector and the feature matrix.
Y = train['TARGET']
# Use the `columns=` keyword: passing the axis positionally (`drop([...], 1)`)
# raises a TypeError on pandas >= 2.0.
X_train = train.drop(columns=['TARGET'])

# SK_ID_CURR is kept out of the model inputs; it is only used as the row key
# of the submission file.
excluded_feats = ['SK_ID_CURR']
features = [feature for feature in X_train.columns if feature not in excluded_feats]

In [12]:
# Train one LGBM classifier per cross-validation fold. Validation-fold
# predictions are stored as out-of-fold predictions for the training set, and
# the test-set predictions of all folds are averaged into the submission.
SEED = 42  # fixed so the fold assignment, reported AUC and submission are reproducible
folds = KFold(n_splits=5, shuffle=True, random_state=SEED)

oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(test.shape[0])

# The column selection does not depend on the fold, so do it once up front.
train_features = X_train[features]
test_features = test[features]

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_features)):
    trn_x, trn_y = train_features.iloc[trn_idx], Y.iloc[trn_idx]
    val_x, val_y = train_features.iloc[val_idx], Y.iloc[val_idx]

    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
    # so as written no row subsampling takes place - confirm whether bagging
    # was intended before changing the model.
    clf = LGBMClassifier(
        n_estimators=500,
        num_leaves=15,
        colsample_bytree=.8,
        subsample=.8,
        max_depth=7,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01
    )

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose` keywords of `fit` were removed in
    # LightGBM 4.0.
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric='auc',
        callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=10)],
    )

    # Score the held-out fold with the best iteration found by early stopping.
    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]

    # Each fold contributes an equal share to the averaged test prediction.
    sub_preds += clf.predict_proba(test_features, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    print(f'Fold {n_fold+1} AUC : {roc_auc_score(val_y, oof_preds[val_idx])}')
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()

# Release the hoisted feature copies now that every fold has been processed.
del train_features, test_features
gc.collect()

print(f'Full AUC score {roc_auc_score(Y, oof_preds)}')

test['TARGET'] = sub_preds

test[['SK_ID_CURR', 'TARGET']].to_csv('submission.csv', index=False, float_format='%.8f')

Training until validation scores don't improve for 30 rounds.
[10]	training's auc: 0.724969	valid_1's auc: 0.721867
[20]	training's auc: 0.730379	valid_1's auc: 0.725963
[30]	training's auc: 0.739955	valid_1's auc: 0.734655
[40]	training's auc: 0.74752	valid_1's auc: 0.741072
[50]	training's auc: 0.754397	valid_1's auc: 0.746728
[60]	training's auc: 0.759351	valid_1's auc: 0.750556
[70]	training's auc: 0.762768	valid_1's auc: 0.753255
[80]	training's auc: 0.766007	valid_1's auc: 0.754727
[90]	training's auc: 0.768626	valid_1's auc: 0.755623
[100]	training's auc: 0.770917	valid_1's auc: 0.756343
[110]	training's auc: 0.773307	valid_1's auc: 0.757008
[120]	training's auc: 0.775603	valid_1's auc: 0.757567
[130]	training's auc: 0.777628	valid_1's auc: 0.757892
[140]	training's auc: 0.779672	valid_1's auc: 0.758229
[150]	training's auc: 0.781806	valid_1's auc: 0.758304
[160]	training's auc: 0.783599	valid_1's auc: 0.75861
[170]	training's auc: 0.785777	valid_1's auc: 0.758822
[180]	training