# Model Feature selection, hyperparameter tuning, and training

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
app_train = pd.read_csv('app_train.csv')
app_test = pd.read_csv('app_test.csv')

In [11]:
print('Training shape: ', app_train.shape)
print('Testing shape: ', app_test.shape)

Training shape:  (307511, 361)
Testing shape:  (48744, 360)


## Remove collinear features

collinear features merupakan feature yang sangat berkorelasi satu dengan yang lainnya. Adanya feature ini menyebabkan learning model menjadi overfit karena varians dari model meningkat, sehingga model akan sangat sensitif terhadap perubahan kecil.
Saya set threshold pada nilai 0.9, jika terdapat nilai korelasi lebih dari 0.9, maka feature tersebut dihapus.

In [3]:
correlation = app_train.corr()

upper = correlation.corr().where(np.triu(np.ones(correlation.shape), k=1).astype(np.bool))
upper.head()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH_x,DAYS_EMPLOYED,DAYS_REGISTRATION,...,STATUS_0_MEAN,STATUS_1_MEAN,STATUS_2_MEAN,STATUS_3_MEAN,STATUS_4_MEAN,STATUS_5_MEAN,STATUS_C_MEAN,STATUS_X_MEAN,STATUS_nan_MEAN,PREVIOUS_LOAN_COUNTS
SK_ID_CURR,,-0.011773,-0.009829,-0.010272,-0.012384,-0.01032,-0.001071,-0.008336,-0.001024,-0.009581,...,0.003357,-0.00185,-0.010326,-0.013329,-0.015889,-0.017766,-0.000335,-0.001882,,0.001789
CNT_CHILDREN,,,0.06665,-0.081345,0.006415,-0.087053,-0.180259,0.73933,0.433684,0.673977,...,0.184563,0.197814,0.02227,0.017723,0.02165,0.016981,-0.172132,-0.017152,,-0.011579
AMT_INCOME_TOTAL,,,,0.339276,0.414387,0.348522,0.371289,0.081557,-0.025511,0.11318,...,-0.014646,-0.018543,-0.034895,-0.047507,-0.04786,-0.02714,-0.033076,0.098954,,0.153967
AMT_CREDIT,,,,,0.928168,0.998997,0.332148,-0.204316,-0.301116,-0.133019,...,-0.188297,-0.201939,-0.062915,-0.068763,-0.066161,-0.041964,0.15546,0.062682,,0.132703
AMT_ANNUITY,,,,,,0.930591,0.381766,-0.081644,-0.206107,-0.027237,...,-0.17647,-0.156915,-0.059184,-0.065268,-0.06357,-0.040668,0.146155,0.052523,,0.113175


In [4]:
threshold = 0.9
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print(to_drop)
print()
print(f"terdapat {len(to_drop)} feature yang saling berkorelasi")

['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT_W_CITY', 'LIVE_REGION_NOT_WORK_REGION', 'LIVE_CITY_NOT_WORK_CITY', 'BASEMENTAREA_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_EMPLOYED_ANOM', 'NAME_INCOME_TYPE_Pension

In [5]:
train = app_train.drop(columns = to_drop)
test = app_test.drop(columns = to_drop)

print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)

Training shape:  (307511, 266)
Testing shape:  (48744, 265)


In [6]:
app_train.to_csv('train_deletefeaute.csv')
app_test.to_csv('test_deletefeaute.csv')

## Bayesian Optimization for Hyperparameter Tuning

In [16]:
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score, roc_curve

In [7]:
train = pd.read_csv('train_deletefeaute.csv')
test = pd.read_csv('test_deletefeaute.csv')

In [17]:
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=1000, learning_rate=0.05, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y)
    # parameters
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction,  max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {'application':'binary','num_iterations': n_estimators, 'learning_rate':learning_rate, 'early_stopping_round':100, 'metric':'auc'}
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])
    # range 
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 50),
                                            }, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    # output optimization process
    if output_process==True: lgbBO.points_to_csv("bayes_opt_result.csv")
    
    # return best parameters
    return lgbBO.max

In [10]:
del opt_params

NameError: name 'opt_params' is not defined

In [9]:
import re
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

X = train.drop('TARGET', axis=1)
y = train['TARGET']

opt_params = bayes_parameter_opt_lgb(X, y, init_round=5, opt_round=10, n_folds=3, random_seed=6, n_estimators=100, learning_rate=0.05)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------




[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] Start training from score -2.432484
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[Light



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] St



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] Start training from score -2.432484
[LightGBM] [Info] [bin



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] Start training from score -2.432484
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[Light



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] Start training from score -2.432484
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[Light



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:Bo



| [0m 7       [0m | [0m 0.757   [0m | [0m 1.0     [0m | [0m 0.1     [0m | [0m 5.0     [0m | [0m 0.0     [0m | [0m 5.0     [0m | [0m 41.66   [0m | [0m 0.1     [0m | [0m 38.75   [0m |




[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] St



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] St



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] Start training from score -2.432484
[LightGBM] [Info] [binary:Boos



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] Start training from score -2.432484
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[Light



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [Info] Start training from score -2.432484
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[Light



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432484
[LightGBM] [



[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 345
[LightGBM] [Info] Number of positive: 16550, number of negative: 188458
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32716
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 345
[LightGBM] [Info] [binary:Bo

In [6]:
print(opt_params)

{'target': 0.7655963032907338, 'params': {'bagging_fraction': 0.8040436794880652, 'feature_fraction': 0.7660958764383504, 'lambda_l1': 3.8907837547492523, 'lambda_l2': 2.6100364447404574, 'max_depth': 8.904687185508728, 'min_child_weight': 40.96213538975256, 'min_split_gain': 0.04668645686304026, 'num_leaves': 40.39111270201556}}


# Build Final Model

In [21]:
from sklearn.model_selection import KFold
import gc


def train_model(features, test_features, n_folds = 5):
    
    """Train and test a light gradient boosting model using
    cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the TARGET column.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. 
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
            n_folds (int, default = 5): number of folds to use for cross validation
        
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    """
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
   
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    #optimal threshold
    optimal_proba_cutoff = []
    
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Iterate through each fold
    for i, (train_indices, valid_indices) in enumerate(k_fold.split(features)):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(
                    nthread=4,
                    n_estimators=100, 
                    learning_rate=0.05,
                    bagging_fraction= 0.8040436794880652, 
                    feature_fraction= 0.7660958764383504, 
                    lambda_l1= 3.8907837547492523, 
                    lambda_l2= 2.6100364447404574, 
                    max_depth= 9, 
                    min_child_weight= 40.96213538975256, 
                    min_split_gain= 0.04668645686304026, 
                    is_unbalance = True,
                    num_leaves= 40)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'],
                  early_stopping_rounds = 100, verbose = 200)
               
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
     
        false_pos_rate, true_pos_rate, proba = roc_curve(valid_labels, out_of_fold[valid_indices])

        optimal_proba_cutoff.append(sorted(list(zip(np.abs(true_pos_rate - false_pos_rate), proba)), key=lambda i: i[0], reverse=True)[0][1])
        
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()

      
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores,
                            }) 
    
    return metrics

In [22]:
metrics = train_model(features = train, test_features=test)
print('Baseline metrics')
print(metrics)

Training Data Shape:  (307511, 360)
Testing Data Shape:  (48744, 360)
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.800546	train's binary_logloss: 0.548492	valid's auc: 0.765424	valid's binary_logloss: 0.55664
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.800107	train's binary_logloss: 0.548922	valid's auc: 0.766538	valid's binary_logloss: 0.557823
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.799196	train's binary_logloss: 0.549944	valid's auc: 0.771015	valid's binary_logloss: 0.558259
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	train's auc: 0.799194	train's binary_logloss: 0.549569	valid's auc: 0.767705	valid's binary_logloss: 0.556715
Training until validation s