# Notebook for Modelling

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import gc
import warnings
import os
import timeit

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

# Using Selected Features

In [28]:
# read in data & selected features from p7_notebook_exploration.ipynb
feats_importance = pd.read_csv('../my_csv_files/feats_importance_model_lgbm.csv')
feats_miss_under30 = pd.read_csv('../my_csv_files/feats_missing_vals_under30_perc.csv')

In [29]:
feats_miss_under30.drop("Unnamed: 0", axis = 1)

Unnamed: 0,0
0,ACTIVE_AMT_CREDIT_SUM_MEAN
1,ACTIVE_AMT_CREDIT_SUM_MAX
2,ACTIVE_DAYS_CREDIT_MEAN
3,ACTIVE_DAYS_CREDIT_UPDATE_MEAN
4,ACTIVE_CREDIT_DAY_OVERDUE_MAX
...,...
541,ORGANIZATION_TYPE_Electricity
542,ORGANIZATION_TYPE_Culture
543,ORGANIZATION_TYPE_Construction
544,ORGANIZATION_TYPE_Cleaning


In [30]:
feats_miss_under30.columns  = ['index1', 'name']
feats_miss_under30 = feats_miss_under30.drop("index1", axis = 1)

In [31]:
final_selected_feats = feats_importance[feats_importance["feature"].isin(feats_miss_under30["name"])]

In [32]:
final_selected_feats = final_selected_feats.reset_index(drop=True)
final_selected_feats

Unnamed: 0,feature,importance
0,EXT_SOURCE_2,23.0
1,EXT_SOURCE_3,22.0
2,INSTAL_DPD_MEAN,7.0
3,DAYS_EMPLOYED_PERC,6.0
4,AMT_ANNUITY,6.0
...,...,...
79,BURO_CREDIT_ACTIVE_Closed_MEAN,1.0
80,APPROVED_DAYS_DECISION_MAX,1.0
81,ACTIVE_DAYS_CREDIT_MEAN,1.0
82,APPROVED_AMT_ANNUITY_MAX,1.0


In [72]:
# the code below is from a previous run, to show that final_selected_feats is equal to feats_combined
# from the csv file : ('../my_csv_files/feats_import_combined_lgbm128_mssing.csv')
# we will overwrite that file in the cell below
#feats_combined.equals(final_selected_feats)

True

In [7]:
final_selected_feats.to_csv('../my_csv_files/feats_import_combined_lgbm128_mssing.csv')

In [33]:
feats_for_modelling = list(final_selected_feats["feature"]) + ['TARGET', 'SK_ID_CURR']

# Undersampling 
#### (SMOTE did not improve results in exploration phase) 

In [34]:
merged_all_files_csv = pd.read_csv('../my_csv_files/MY_merged_all_files.csv')

In [35]:
merged_all_files = merged_all_files_csv.iloc[:,1:]
merged_with_target = merged_all_files[merged_all_files['TARGET'].notnull()]
merged_no_inf = merged_with_target[~merged_with_target.isin([np.inf, -np.inf]).any(1)]

In [36]:
merged_no_inf

Unnamed: 0,index,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,...,,,,,,,,,,
1,1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,...,,,,,,,,,,
2,2,100004,0.0,0,1,0,0,67500.0,135000.0,6750.0,...,,,,,,,,,,
3,3,100006,0.0,1,0,0,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,100007,0.0,0,0,0,0,121500.0,513000.0,21865.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,307506,456251,0.0,0,0,1,0,157500.0,254700.0,27558.0,...,,,,,,,,,,
307503,307507,456252,0.0,1,0,0,0,72000.0,269550.0,12001.5,...,,,,,,,,,,
307504,307508,456253,0.0,1,0,0,0,153000.0,677664.0,29979.0,...,,,,,,,,,,
307505,307509,456254,1.0,1,0,0,0,171000.0,370107.0,20205.0,...,,,,,,,,,,


In [37]:
feats_for_modelling

['EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'INSTAL_DPD_MEAN',
 'DAYS_EMPLOYED_PERC',
 'AMT_ANNUITY',
 'DAYS_EMPLOYED',
 'PAYMENT_RATE',
 'ACTIVE_DAYS_CREDIT_MAX',
 'APPROVED_CNT_PAYMENT_MEAN',
 'ANNUITY_INCOME_PERC',
 'PREV_CNT_PAYMENT_MEAN',
 'DAYS_BIRTH',
 'CLOSED_DAYS_CREDIT_MAX',
 'POS_MONTHS_BALANCE_SIZE',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'INSTAL_DAYS_ENTRY_PAYMENT_MAX',
 'POS_SK_DPD_DEF_MAX',
 'PREV_APP_CREDIT_PERC_MEAN',
 'APPROVED_AMT_CREDIT_MIN',
 'BURO_AMT_CREDIT_SUM_DEBT_MEAN',
 'APPROVED_AMT_ANNUITY_MEAN',
 'DAYS_ID_PUBLISH',
 'INSTAL_PAYMENT_PERC_SUM',
 'BURO_CREDIT_TYPE_Microloan_MEAN',
 'INSTAL_AMT_PAYMENT_SUM',
 'NAME_FAMILY_STATUS_Married',
 'INSTAL_PAYMENT_DIFF_SUM',
 'REGION_RATING_CLIENT_W_CITY',
 'ACTIVE_AMT_CREDIT_SUM_MEAN',
 'POS_MONTHS_BALANCE_MEAN',
 'INSTAL_AMT_INSTALMENT_MAX',
 'PREV_NAME_TYPE_SUITE_nan_MEAN',
 'INSTAL_AMT_INSTALMENT_SUM',
 'INSTAL_COUNT',
 'FLAG_DOCUMENT_3',
 'INSTAL_DAYS_ENTRY_PAYMENT_MEAN',
 'FLAG_DOCUMENT_18',
 'CODE_GENDER',
 'DEF_30_CNT_SOCI

In [38]:
final_selected_feats["feature"]

0                       EXT_SOURCE_2
1                       EXT_SOURCE_3
2                    INSTAL_DPD_MEAN
3                 DAYS_EMPLOYED_PERC
4                        AMT_ANNUITY
                   ...              
79    BURO_CREDIT_ACTIVE_Closed_MEAN
80        APPROVED_DAYS_DECISION_MAX
81           ACTIVE_DAYS_CREDIT_MEAN
82          APPROVED_AMT_ANNUITY_MAX
83    ACTIVE_AMT_CREDIT_SUM_DEBT_SUM
Name: feature, Length: 84, dtype: object

In [39]:
data_select_feats = merged_no_inf[feats_for_modelling]

In [40]:
import re

# Divide the data into training/validation and test data

data_select_feats = data_select_feats.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


In [41]:
# Function to calculate missing values by column
def missing_values_table(df):
        """
        df : data for which you want to know the missing values
        """
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns.sort_values('% of Total Values', 
                                                                          ascending=False).round(1)
        #mis_val_table_ren_columns = mis_val_table_ren_columns[
        #    mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        #'% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
               "There are " + str(mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:,1] != 0].shape[0]) +
               " columns that have missing values.")
        
        #print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        #    "There are " + str(mis_val_table_ren_columns.shape[0]) +
        #      " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns
    

In [42]:
missing_values_table(data_select_feats).tail(25)

Your selected dataframe has 86 columns.
There are 71 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
INSTAL_AMT_INSTALMENT_SUM,15868,5.2
INSTAL_AMT_INSTALMENT_MAX,15868,5.2
INSTAL_PAYMENT_DIFF_SUM,15868,5.2
INSTAL_AMT_PAYMENT_SUM,15868,5.2
INSTAL_PAYMENT_PERC_SUM,15868,5.2
INSTAL_DPD_MEAN,15868,5.2
INSTAL_COUNT,15868,5.2
DEF_30_CNT_SOCIAL_CIRCLE,1021,0.3
EXT_SOURCE_2,660,0.2
AMT_GOODS_PRICE,278,0.1


In [43]:
# when a feature has few missing values, remove the individuals for which the value is missing
# if under 0.5% of the rows
data_select_feats = data_select_feats[data_select_feats['AMT_GOODS_PRICE'].notnull()]
data_select_feats = data_select_feats[data_select_feats['EXT_SOURCE_2'].notnull()]
data_select_feats = data_select_feats[data_select_feats['DEF_30_CNT_SOCIAL_CIRCLE'].notnull()]
data_select_feats = data_select_feats[data_select_feats['ANNUITY_INCOME_PERC'].notnull()]
data_select_feats = data_select_feats[data_select_feats['AMT_ANNUITY'].notnull()]
data_select_feats = data_select_feats[data_select_feats['PAYMENT_RATE'].notnull()]

In [44]:
missing_values_table(data_select_feats).head(35)

Your selected dataframe has 86 columns.
There are 68 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
ACTIVE_AMT_CREDIT_SUM_MEAN,89602,29.3
ACTIVE_AMT_CREDIT_SUM_MAX,89602,29.3
ACTIVE_AMT_CREDIT_SUM_DEBT_SUM,89601,29.3
ACTIVE_DAYS_CREDIT_MEAN,89601,29.3
ACTIVE_DAYS_CREDIT_MAX,89601,29.3
ACTIVE_DAYS_CREDIT_UPDATE_MEAN,89601,29.3
CLOSED_DAYS_CREDIT_MIN,76708,25.1
CLOSED_AMT_CREDIT_SUM_MAX,76708,25.1
CLOSED_DAYS_CREDIT_MAX,76708,25.1
EXT_SOURCE_3,60392,19.8


#### Code for the Undersampling

In [45]:
data_select_feats.shape

(305522, 86)

In [46]:
data_select_feats[~data_select_feats.isin([np.inf, -np.inf]).any(1)].shape

(305522, 86)

In [47]:
data_select_feats["TARGET"].value_counts()

0.0    280808
1.0     24714
Name: TARGET, dtype: int64

In [48]:
# get random indices of individuals for which TARGET = 0

random_indices = np.random.choice(list(data_select_feats.\
                                       TARGET[data_select_feats["TARGET"] == 0.0].index),
                                  25000, replace=False)

In [49]:
len(random_indices)

25000

In [50]:
df_ft_selec_underbal = pd.concat([data_select_feats.loc[list(random_indices)], 
                                  data_select_feats[data_select_feats["TARGET"] == 1.0]])

In [51]:
df_ft_selec_underbal.to_csv("../my_csv_files/df_ft_selec_underbal.csv")
df_ft_selec_underbal

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,INSTAL_DPD_MEAN,DAYS_EMPLOYED_PERC,AMT_ANNUITY,DAYS_EMPLOYED,PAYMENT_RATE,ACTIVE_DAYS_CREDIT_MAX,APPROVED_CNT_PAYMENT_MEAN,ANNUITY_INCOME_PERC,...,APPROVED_AMT_APPLICATION_MAX,BURO_CREDIT_ACTIVE_Active_MEAN,APPROVED_CNT_PAYMENT_SUM,BURO_CREDIT_ACTIVE_Closed_MEAN,APPROVED_DAYS_DECISION_MAX,ACTIVE_DAYS_CREDIT_MEAN,APPROVED_AMT_ANNUITY_MAX,ACTIVE_AMT_CREDIT_SUM_DEBT_SUM,TARGET,SK_ID_CURR
271728,0.649531,0.420611,0.000000,0.204794,47911.5,-2777.0,0.097680,-313.0,24.000000,0.354900,...,356400.0,1.000000,24.0,0.000000,-727.0,-521.666667,20197.170,256450.500,0.0,415015
146490,0.365630,0.579727,0.454545,0.035096,24318.0,-365.0,0.031568,-647.0,12.000000,0.216160,...,29205.0,0.500000,12.0,0.500000,-1632.0,-647.000000,3654.135,318802.500,0.0,269855
280340,0.768628,0.553165,0.000000,0.097303,20250.0,-1425.0,0.079505,-130.0,21.000000,0.112500,...,94500.0,0.400000,42.0,0.600000,-182.0,-346.000000,10749.375,539514.000,0.0,424771
37526,0.520049,0.336062,0.000000,0.071389,28278.0,-912.0,0.054247,-388.0,12.000000,0.179543,...,76950.0,0.250000,12.0,0.750000,-846.0,-440.500000,6880.545,3223962.000,0.0,143471
127703,0.436026,,,0.015072,9000.0,-193.0,0.050000,,,0.050000,...,,,,,,,,,0.0,248113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307444,0.329708,0.360613,0.000000,0.307320,32746.5,-3048.0,0.072770,-227.0,9.000000,0.158196,...,70641.0,0.363636,18.0,0.636364,-410.0,-355.250000,5364.945,4728279.015,1.0,456186
307471,0.583214,0.424130,0.172589,0.179264,46809.0,-2405.0,0.035919,-170.0,17.777778,0.325062,...,900000.0,0.400000,160.0,0.600000,-894.0,-477.750000,50106.735,747031.500,1.0,456215
307477,0.713524,0.511892,6.522843,0.152441,19975.5,-3147.0,0.067258,-354.0,30.000000,0.088780,...,270000.0,0.666667,60.0,0.333333,-942.0,-879.500000,17900.910,2797449.840,1.0,456225
307485,0.615261,0.397946,0.035714,0.017364,23089.5,-286.0,0.044294,-203.0,6.000000,0.102620,...,81810.0,1.000000,12.0,0.000000,-307.0,-203.000000,9115.560,223943.895,1.0,456233


In [52]:
# Create arrays and dataframes to store results
feats = [f for f in data_select_feats.columns if f not in ['TARGET', 'Unnamed: 0', 'Unnamed0',
                                                           'SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV',
                                                           'index']]

from sklearn.model_selection import train_test_split
# Load and split dataset
x_train, x_test, y_train, y_test = train_test_split(df_ft_selec_underbal[feats], 
                                                    df_ft_selec_underbal['TARGET'], 
                                                    test_size=0.3, random_state = 22)

# SMOTE 
See tries in Pycaret.

# Hyperparameter Search: from hyperopt

In [40]:
from sklearn.metrics import confusion_matrix

def my_comp_score(y_true, y_pred):

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    tn_weight = 1
    fp_weight = 0
    fn_weight = -10
    tp_weight = 0

    # gain function for company, true positives and false positives don't matter that much to us
    # we want to penalise the false negatives, and we want to say yes to true negatives
    gain = tp*(tp_weight) + tn*(tn_weight) + fp*(fp_weight) + fn*(fn_weight)
    
    # best represents scenario where there are no false negatives or false postives
    # so all false positives would be correctly shown as negative
    # and all false negatives would be correctly show as positive
    best = (tn + fp) * tn_weight + (tp + fn) * tp_weight
    
    # baseline is a naive model that predicts non default(negative) for everyone
    # but all true positives and false negatives would be incorrectly shown as negative
    baseline = (tn + fp) * tn_weight + (tp + fn) * fn_weight
    
    score = (gain - baseline) / (best - baseline)
    
    return score

def my_comp_score_probs(y_true, y_pred):
    
    # for now y_pred is the probability that the value is 1
    y_use = [1 if i >= 0.5 else 0 for i in y_pred]
    
    y_use = pd.Series(y_use)

    tn, fp, fn, tp = confusion_matrix(y_true, y_use).ravel()
    
    tn_weight = 1
    fp_weight = 0
    fn_weight = -10
    tp_weight = 0

    # gain function for company, true positives and false positives don't matter that much to us
    # we want to penalise the false negatives, and we want to say yes to true negatives
    gain = tp*(tp_weight) + tn*(tn_weight) + fp*(fp_weight) + fn*(fn_weight)
    
    # best represents scenario where there are no false negatives or false postives
    # so all false positives would be correctly shown as negative
    # and all false negatives would be correctly show as positive
    best = (tn + fp) * tn_weight + (tp + fn) * tp_weight
    
    # baseline is a naive model that predicts non default(negative) for everyone
    # so all true positives and false negatives would be incorrectly shown as negative
    baseline = (tn + fp) * tn_weight + (tp + fn) * fn_weight
    
    score = ((gain - baseline) / (best - baseline))

    return score



In [41]:
from sklearn.metrics import fbeta_score, make_scorer
from hyperopt import fmin, tpe, hp, anneal, Trials
from hyperopt import tpe
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [42]:
kf = KFold(n_splits = 5)

def gb_mse_cv(params, random_state = 22, cv = kf, X = x_train, y= y_train):
    # the function gets a set of variable parameters in "param"
    params = {'learning_rate': params['learning_rate'],
              'num_leaves': int(params['num_leaves']),
              'subsample' : params['subsample'],
              'max_depth': int(params['max_depth']),
              'reg_alpha' : params['reg_alpha'],
              'reg_lambda' : params['reg_lambda'],
              'min_child_weight': int(params['min_child_weight']),
              'min_data_in_leaf': int(params['min_data_in_leaf']),
              'min_split_gain' :  params['min_split_gain']
              }
    #              'min_data_in_leaf': int(params['min_data_in_leaf']),
    
    # we use this params to create a new LGBM Regressor
    model = LGBMClassifier(random_state = random_state, **params)
    
    # and then conduct the cross validation with the same folds as before
    score = -cross_val_score(model, X, y, cv=cv, 
                             scoring= make_scorer(my_comp_score_probs, 
                                                                      needs_proba= True), 
                             n_jobs=-1).mean()

    return score

In [43]:
%%time

n_iter = 50

# possible values of parameters
space = {'learning_rate': hp.loguniform('learning_rate', -5, 0),
         'num_leaves' : hp.quniform('num_leaves', 2, 40, 1),
         'subsample' : hp.loguniform('subsample', -4, 0),
         'max_depth' : hp.quniform('max_depth', 2, 14, 1),
         'reg_alpha' : hp.loguniform('reg_alpha', -4, 0),
         'reg_lambda' : hp.loguniform('reg_lambda', -4, 0),
         'min_child_weight': hp.loguniform('min_child_weight', -9, -4),
         'min_data_in_leaf': hp.quniform('min_data_in_leaf', 2, 40, 1),
         'min_split_gain' : hp.loguniform('min_split_gain', -4, 0)
        }

# trials will contain logging information
trials = Trials()

best=fmin(fn=gb_mse_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(58) # fixing random state for the reproducibility
         )

# computing the score on the test set
model = LGBMClassifier(random_state = 54,
                       learning_rate = best['learning_rate'],
                       num_leaves = int(best['num_leaves']), 
                       subsample = best['subsample'],
                       max_depth = int(best['max_depth']),
                       reg_alpha = best['reg_alpha'],
                       reg_lambda = best['reg_lambda'],
                       min_child_weight = int(best['min_child_weight']),
                       min_data_in_leaf = int(best['min_data_in_leaf']),
                       min_split_gain = best['min_split_gain']
                       )

model.fit(x_train, y_train)

print("Best  {:.3f} params {}".format( gb_mse_cv(best), best))


100%|██████████| 50/50 [02:10<00:00,  2.61s/trial, best loss: -0.6651776152021613]
Best  -0.665 params {'learning_rate': 0.057407968892762215, 'max_depth': 11.0, 'min_child_weight': 0.0006033060730689025, 'min_data_in_leaf': 12.0, 'min_split_gain': 0.8373311160535064, 'num_leaves': 24.0, 'reg_alpha': 0.5455970807970137, 'reg_lambda': 0.6320635105816337, 'subsample': 0.3929027551466778}
CPU times: user 9.41 s, sys: 1.68 s, total: 11.1 s
Wall time: 2min 15s


# Launching LightGBM

In [53]:
def my_comp_score_lgbm(y_true, y_pred):
    
    # for now y_pred is the probability that the value is 1
    y_use = [1 if i >= 0.5 else 0 for i in y_pred]
    
    y_use = pd.Series(y_use)

    tn, fp, fn, tp = confusion_matrix(y_true, y_use).ravel()
    
    tn_weight = 1
    fp_weight = 0
    fn_weight = -10
    tp_weight = 0

    # gain function for company, true positives and false positives don't matter that much to us
    # we want to penalise the false negatives, and we want to say yes to true negatives
    gain = tp*(tp_weight) + tn*(tn_weight) + fp*(fp_weight) + fn*(fn_weight)
    
    # best represents scenario where there are no false negatives or false postives
    # so all false positives would be correctly shown as negative
    # and all false negatives would be correctly show as positive
    best = (tn + fp) * tn_weight + (tp + fn) * tp_weight
    
    # baseline is a naive model that predicts non default(negative) for everyone
    # but all true positives and false negatives would be incorrectly shown as negative
    baseline = (tn + fp) * tn_weight + (tp + fn) * fn_weight
    
    score = ((gain - baseline) / (best - baseline))

    return 'my score', score, True

In [47]:
num_folds= 10
stratified= False

# Cross validation model
if stratified:
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
else:
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    
# Create arrays and dataframes to store results
oof_preds = np.zeros(df_ft_selec_underbal.shape[0])
#sub_preds = np.zeros(test_df.shape[0])
feature_importance_df = pd.DataFrame()

In [55]:
df_ft_selec_underbal[feats].to_csv("../my_csv_files/MY_train_x.csv")

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_ft_selec_underbal[feats], 
                                                            df_ft_selec_underbal['TARGET'])):
    train_x, train_y = df_ft_selec_underbal[feats].iloc[train_idx], df_ft_selec_underbal['TARGET'].iloc[train_idx]
    valid_x, valid_y = df_ft_selec_underbal[feats].iloc[valid_idx], df_ft_selec_underbal['TARGET'].iloc[valid_idx]
    
    valid_x.to_csv("../my_csv_files/MY_valid_x.csv")
    valid_y.to_csv("../my_csv_files/MY_valid_y.csv")
    
    # Light GBM parameters found by my hyperparameter search
    clf = LGBMClassifier(
        nthread = 4,
        n_estimators = 10000,
        learning_rate = 0.12406563722973285,
        num_leaves = 29,
        max_depth = 8,
        reg_alpha = 0.8267847013350155,
        reg_lambda = 0.2632332988417265,
        min_child_weight = 21,
        subsample = 0.35048202158006775,
        min_data_in_leaf = 16,
        min_split_gain = 0.10758484903193334,
        silent = -1,
        verbose = -1, )
    
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            verbose= 200, early_stopping_rounds= 200, eval_metric = my_comp_score_lgbm)

    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    
    valid_preds = clf.predict(valid_x, num_iteration = clf.best_iteration_)
    valid_preds_proba = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis = 0)
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
    del train_x, train_y, 
    #valid_x, valid_y
    gc.collect()

Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.454481	training's my score: 0.775794	valid_1's binary_logloss: 0.572555	valid_1's my score: 0.673953
Early stopping, best iteration is:
[96]	training's binary_logloss: 0.5066	training's my score: 0.727508	valid_1's binary_logloss: 0.568983	valid_1's my score: 0.666948
Fold  1 AUC : 0.774643
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.451595	training's my score: 0.779144	valid_1's binary_logloss: 0.57074	valid_1's my score: 0.659331
Early stopping, best iteration is:
[57]	training's binary_logloss: 0.532214	training's my score: 0.707241	valid_1's binary_logloss: 0.573283	valid_1's my score: 0.666465
Fold  2 AUC : 0.771371
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.452401	training's my score: 0.775988	valid_1's binary_logloss: 0.576862	valid_1's my score: 0.664753
Early stopping, best it

In [54]:
import pickle
import joblib

In [57]:
joblib.dump(clf, "../Models/lgbm__trained_myscore_final.sav")
pickle.dump(clf, open("../Models/lgbm_trained_myscore_final.pickle", 'wb'))


In [55]:
clf = joblib.load("../Models/lgbm__trained_myscore_final.sav")

# Results : Scores 

In [58]:
from sklearn.metrics import *
recall_score(valid_y, valid_preds)

0.6981818181818182

In [59]:
from sklearn.metrics import confusion_matrix
conf_lgm = confusion_matrix(valid_y, valid_preds)
conf_lgm

array([[1758,  738],
       [ 747, 1728]])

In [60]:
my_comp_score(valid_y, valid_preds)

0.6683636363636364

# Results : SHAP

In [62]:
import shap

explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(df_ft_selec_underbal[feats])

LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


In [98]:
# Visualize a single prediction
# The shap plot shows features that contribute to pushing the output from the base value to the 
# actual predicted value
# Red colour pushes the predictions higher (towards positive)
# Blue colour indicates the predicitions lower (towards negative)
shap.force_plot(explainer.expected_value[1], shap_values[1][950,:], df_ft_selec_underbal[feats].iloc[950,:])