# Notebook for Modelling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import gc
import warnings
import os
import timeit

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

# Using Selected Features

In [28]:
# read in data & selected features from p7_notebook_exploration.ipynb
feats_importance = pd.read_csv('../my_csv_files/feats_importance_model_lgbm.csv')
feats_miss_under30 = pd.read_csv('../my_csv_files/feats_missing_vals_under30_perc.csv')

In [29]:
feats_miss_under30.drop("Unnamed: 0", axis = 1)

Unnamed: 0,0
0,ACTIVE_AMT_CREDIT_SUM_MEAN
1,ACTIVE_AMT_CREDIT_SUM_MAX
2,ACTIVE_DAYS_CREDIT_MEAN
3,ACTIVE_DAYS_CREDIT_UPDATE_MEAN
4,ACTIVE_CREDIT_DAY_OVERDUE_MAX
...,...
541,ORGANIZATION_TYPE_Electricity
542,ORGANIZATION_TYPE_Culture
543,ORGANIZATION_TYPE_Construction
544,ORGANIZATION_TYPE_Cleaning


In [30]:
feats_miss_under30.columns  = ['index1', 'name']
feats_miss_under30 = feats_miss_under30.drop("index1", axis = 1)

In [31]:
final_selected_feats = feats_importance[feats_importance["feature"].isin(feats_miss_under30["name"])]

In [32]:
final_selected_feats = final_selected_feats.reset_index(drop=True)
final_selected_feats

Unnamed: 0,feature,importance
0,EXT_SOURCE_2,23.0
1,EXT_SOURCE_3,22.0
2,INSTAL_DPD_MEAN,7.0
3,DAYS_EMPLOYED_PERC,6.0
4,AMT_ANNUITY,6.0
...,...,...
79,BURO_CREDIT_ACTIVE_Closed_MEAN,1.0
80,APPROVED_DAYS_DECISION_MAX,1.0
81,ACTIVE_DAYS_CREDIT_MEAN,1.0
82,APPROVED_AMT_ANNUITY_MAX,1.0


In [72]:
# the code below is from a previous run, to show that final_selected_feats is equal to feats_combined
# from the csv file : ('../my_csv_files/feats_import_combined_lgbm128_mssing.csv')
# we will overwrite that file in the cell below
#feats_combined.equals(final_selected_feats)

True

In [7]:
final_selected_feats.to_csv('../my_csv_files/feats_import_combined_lgbm128_mssing.csv')

In [33]:
feats_for_modelling = list(final_selected_feats["feature"]) + ['TARGET', 'SK_ID_CURR']

# Undersampling 
#### (SMOTE did not improve results in exploration phase, see pycaret) 

In [2]:
merged_all_files_csv = pd.read_csv('../my_csv_files/MY_merged_all_files.csv')

In [3]:
merged_all_files_csv

Unnamed: 0.1,Unnamed: 0,index,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,0,100002,1.0,0,0,0,0,202500.0,406597.5,...,,,,,,,,,,
1,1,1,100003,0.0,1,0,1,0,270000.0,1293502.5,...,,,,,,,,,,
2,2,2,100004,0.0,0,1,0,0,67500.0,135000.0,...,,,,,,,,,,
3,3,3,100006,0.0,1,0,0,0,135000.0,312682.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,4,100007,0.0,0,0,0,0,121500.0,513000.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356246,356250,48739,456221,,1,0,0,0,121500.0,412560.0,...,,,,,,,,,,
356247,356251,48740,456222,,1,0,1,2,157500.0,622413.0,...,,,,,,,,,,
356248,356252,48741,456223,,1,1,0,1,202500.0,315000.0,...,,,,,,,,,,
356249,356253,48742,456224,,0,0,1,0,225000.0,450000.0,...,,,,,,,,,,


In [4]:
merged_all_files = merged_all_files_csv.iloc[:,1:]
merged_with_target = merged_all_files[merged_all_files['TARGET'].notnull()]
merged_no_inf = merged_with_target[~merged_with_target.isin([np.inf, -np.inf]).any(1)]

In [5]:
merged_no_inf

Unnamed: 0,index,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,...,,,,,,,,,,
1,1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,...,,,,,,,,,,
2,2,100004,0.0,0,1,0,0,67500.0,135000.0,6750.0,...,,,,,,,,,,
3,3,100006,0.0,1,0,0,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,100007,0.0,0,0,0,0,121500.0,513000.0,21865.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,307506,456251,0.0,0,0,1,0,157500.0,254700.0,27558.0,...,,,,,,,,,,
307503,307507,456252,0.0,1,0,0,0,72000.0,269550.0,12001.5,...,,,,,,,,,,
307504,307508,456253,0.0,1,0,0,0,153000.0,677664.0,29979.0,...,,,,,,,,,,
307505,307509,456254,1.0,1,0,0,0,171000.0,370107.0,20205.0,...,,,,,,,,,,


In [37]:
feats_for_modelling

['EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'INSTAL_DPD_MEAN',
 'DAYS_EMPLOYED_PERC',
 'AMT_ANNUITY',
 'DAYS_EMPLOYED',
 'PAYMENT_RATE',
 'ACTIVE_DAYS_CREDIT_MAX',
 'APPROVED_CNT_PAYMENT_MEAN',
 'ANNUITY_INCOME_PERC',
 'PREV_CNT_PAYMENT_MEAN',
 'DAYS_BIRTH',
 'CLOSED_DAYS_CREDIT_MAX',
 'POS_MONTHS_BALANCE_SIZE',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'INSTAL_DAYS_ENTRY_PAYMENT_MAX',
 'POS_SK_DPD_DEF_MAX',
 'PREV_APP_CREDIT_PERC_MEAN',
 'APPROVED_AMT_CREDIT_MIN',
 'BURO_AMT_CREDIT_SUM_DEBT_MEAN',
 'APPROVED_AMT_ANNUITY_MEAN',
 'DAYS_ID_PUBLISH',
 'INSTAL_PAYMENT_PERC_SUM',
 'BURO_CREDIT_TYPE_Microloan_MEAN',
 'INSTAL_AMT_PAYMENT_SUM',
 'NAME_FAMILY_STATUS_Married',
 'INSTAL_PAYMENT_DIFF_SUM',
 'REGION_RATING_CLIENT_W_CITY',
 'ACTIVE_AMT_CREDIT_SUM_MEAN',
 'POS_MONTHS_BALANCE_MEAN',
 'INSTAL_AMT_INSTALMENT_MAX',
 'PREV_NAME_TYPE_SUITE_nan_MEAN',
 'INSTAL_AMT_INSTALMENT_SUM',
 'INSTAL_COUNT',
 'FLAG_DOCUMENT_3',
 'INSTAL_DAYS_ENTRY_PAYMENT_MEAN',
 'FLAG_DOCUMENT_18',
 'CODE_GENDER',
 'DEF_30_CNT_SOCI

In [38]:
final_selected_feats["feature"]

0                       EXT_SOURCE_2
1                       EXT_SOURCE_3
2                    INSTAL_DPD_MEAN
3                 DAYS_EMPLOYED_PERC
4                        AMT_ANNUITY
                   ...              
79    BURO_CREDIT_ACTIVE_Closed_MEAN
80        APPROVED_DAYS_DECISION_MAX
81           ACTIVE_DAYS_CREDIT_MEAN
82          APPROVED_AMT_ANNUITY_MAX
83    ACTIVE_AMT_CREDIT_SUM_DEBT_SUM
Name: feature, Length: 84, dtype: object

In [6]:
# use to perform feature selection
#data_select_feats = merged_no_inf[feats_for_modelling]

# use to keep all features 
data_select_feats = merged_no_inf

In [7]:
import re

# Divide the data into training/validation and test data

data_select_feats = data_select_feats.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


In [9]:
# Function to calculate missing values by column
def missing_values_table(df):
        """
        df : data for which you want to know the missing values
        """
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns.sort_values('% of Total Values', 
                                                                          ascending=False).round(1)
        #mis_val_table_ren_columns = mis_val_table_ren_columns[
        #    mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        #'% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
               "There are " + str(mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:,1] != 0].shape[0]) +
               " columns that have missing values.")
        
        #print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        #    "There are " + str(mis_val_table_ren_columns.shape[0]) +
        #      " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns
    

In [41]:
# Function to calculate missing values by column
def missing_values_table(df):
        """
        df : data for which you want to know the missing values
        """
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns.sort_values('% of Total Values', 
                                                                          ascending=False).round(1)
        #mis_val_table_ren_columns = mis_val_table_ren_columns[
        #    mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        #'% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
               "There are " + str(mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:,1] != 0].shape[0]) +
               " columns that have missing values.")
        
        #print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        #    "There are " + str(mis_val_table_ren_columns.shape[0]) +
        #      " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns
    

In [27]:
missing_values_table(data_select_feats).tail(190)

Your selected dataframe has 798 columns.
There are 610 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
EXT_SOURCE_2,660,0.2
AMT_GOODS_PRICE,278,0.1
AMT_ANNUITY,12,0.0
ANNUITY_INCOME_PERC,12,0.0
PAYMENT_RATE,12,0.0
...,...,...
ORGANIZATION_TYPE_Electricity,0,0.0
ORGANIZATION_TYPE_Culture,0,0.0
ORGANIZATION_TYPE_Construction,0,0.0
ORGANIZATION_TYPE_Cleaning,0,0.0


In [28]:
# when a feature has few missing values, remove the individuals for which the value is missing
# if under 0.5% of the rows
data_select_feats = data_select_feats[data_select_feats['AMT_GOODS_PRICE'].notnull()]
data_select_feats = data_select_feats[data_select_feats['EXT_SOURCE_2'].notnull()]
data_select_feats = data_select_feats[data_select_feats['DEF_30_CNT_SOCIAL_CIRCLE'].notnull()]
data_select_feats = data_select_feats[data_select_feats['DEF_60_CNT_SOCIAL_CIRCLE'].notnull()]
data_select_feats = data_select_feats[data_select_feats['OBS_30_CNT_SOCIAL_CIRCLE'].notnull()]
data_select_feats = data_select_feats[data_select_feats['ANNUITY_INCOME_PERC'].notnull()]
data_select_feats = data_select_feats[data_select_feats['AMT_ANNUITY'].notnull()]
data_select_feats = data_select_feats[data_select_feats['PAYMENT_RATE'].notnull()]

#### Code for the Undersampling

In [29]:
data_select_feats.shape

(305522, 798)

In [31]:
data_select_feats["TARGET"].value_counts()

0.0    280808
1.0     24714
Name: TARGET, dtype: int64

In [32]:
# get random indices of individuals for which TARGET = 0

random_indices = np.random.choice(list(data_select_feats.\
                                       TARGET[data_select_feats["TARGET"] == 0.0].index),
                                  25000, replace=False)

In [33]:
len(random_indices)

25000

In [34]:
df_ft_selec_underbal = pd.concat([data_select_feats.loc[list(random_indices)], 
                                  data_select_feats[data_select_feats["TARGET"] == 1.0]])

In [131]:
#df_ft_selec_underbal.to_csv("../my_csv_files/df_ft_selec_underbal.csv")
df_ft_selec_underbal

Unnamed: 0,index,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
207582,207586,340612,0.0,1,1,1,0,135000.0,339241.5,16627.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
152296,152299,276529,0.0,1,1,0,0,67500.0,900000.0,32458.5,...,,,,,,,,,,
231323,231327,367950,0.0,0,1,1,0,180000.0,545040.0,27814.5,...,,,,,,,,,,
290900,290904,437002,0.0,0,0,1,0,360000.0,781920.0,28084.5,...,,,,,,,,,,
45165,45167,152318,0.0,1,1,1,0,225000.0,269550.0,21163.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307444,307448,456186,1.0,0,0,1,1,207000.0,450000.0,32746.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0
307471,307475,456215,1.0,1,0,1,1,144000.0,1303200.0,46809.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.0
307477,307481,456225,1.0,0,0,0,0,225000.0,297000.0,19975.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0
307485,307489,456233,1.0,1,0,0,0,225000.0,521280.0,23089.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0


In [132]:
# Create arrays and dataframes to store results
feats = [f for f in data_select_feats.columns if f not in ['TARGET', 'Unnamed: 0', 'Unnamed0',
                                                           'SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV',
                                                           'index']]

from sklearn.model_selection import train_test_split
# Load and split dataset
x_train, x_test, y_train, y_test = train_test_split(df_ft_selec_underbal[feats], 
                                                    df_ft_selec_underbal['TARGET'], 
                                                    test_size=0.3, random_state = 22)

In [37]:
keep_track = pd.DataFrame(index = ['No Feature Selection, Undersampling', 'Feature Selection, Undersampling'], 
                  columns=['AUC', 'Recall', 'My Scorer'])

In [57]:
keep_track_threshs = pd.DataFrame(index = ['Undersampling, threshold : 0.5', 
                                           'Undersampling, threshold : 0.45',
                                           'Undersampling, threshold : 0.4'], 
                                  columns=['AUC', 'Recall', 'My Scorer'])

# SMOTE 
See tries in Pycaret.

# Hyperparameter Search: from hyperopt

In [133]:
from sklearn.metrics import confusion_matrix

def my_comp_score(y_true, y_pred):

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    tn_weight = 1
    fp_weight = 0
    fn_weight = -10
    tp_weight = 0

    # gain function for company, true positives and false positives don't matter that much to us
    # we want to penalise the false negatives, and we want to say yes to true negatives
    gain = tp*(tp_weight) + tn*(tn_weight) + fp*(fp_weight) + fn*(fn_weight)
    
    # best represents scenario where there are no false negatives or false postives
    # so all false positives would be correctly shown as negative
    # and all false negatives would be correctly show as positive
    best = (tn + fp) * tn_weight + (tp + fn) * tp_weight
    
    # baseline is a naive model that predicts non default(negative) for everyone
    # but all true positives and false negatives would be incorrectly shown as negative
    baseline = (tn + fp) * tn_weight + (tp + fn) * fn_weight
    
    score = (gain - baseline) / (best - baseline)
    
    return score

def my_comp_score_probs(y_true, y_pred):
    
    # for now y_pred is the probability that the value is 1
    y_use = [1 if i >= 0.4 else 0 for i in y_pred]
    
    y_use = pd.Series(y_use)

    tn, fp, fn, tp = confusion_matrix(y_true, y_use).ravel()
    
    tn_weight = 1
    fp_weight = 0
    fn_weight = -10
    tp_weight = 0

    # gain function for company, true positives and false positives don't matter that much to us
    # we want to penalise the false negatives, and we want to say yes to true negatives
    gain = tp*(tp_weight) + tn*(tn_weight) + fp*(fp_weight) + fn*(fn_weight)
    
    # best represents scenario where there are no false negatives or false postives
    # so all false positives would be correctly shown as negative
    # and all false negatives would be correctly show as positive
    best = (tn + fp) * tn_weight + (tp + fn) * tp_weight
    
    # baseline is a naive model that predicts non default(negative) for everyone
    # so all true positives and false negatives would be incorrectly shown as negative
    baseline = (tn + fp) * tn_weight + (tp + fn) * fn_weight
    
    score = ((gain - baseline) / (best - baseline))

    return score



In [134]:
from sklearn.metrics import fbeta_score, make_scorer
from hyperopt import fmin, tpe, hp, anneal, Trials
from hyperopt import tpe
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [135]:
kf = KFold(n_splits = 5)

def gb_mse_cv(params, random_state = 22, cv = kf, X = x_train, y= y_train):
    # the function gets a set of variable parameters in "param"
    params = {'learning_rate': params['learning_rate'],
              'num_leaves': int(params['num_leaves']),
              'subsample' : params['subsample'],
              'max_depth': int(params['max_depth']),
              'reg_alpha' : params['reg_alpha'],
              'reg_lambda' : params['reg_lambda'],
              'min_child_weight': params['min_child_weight'],
              'min_split_gain' :  params['min_split_gain']
              }
    #          'min_data_in_leaf': int(params['min_data_in_leaf']),
    
    # we use this params to create a new LGBM Regressor
    model = LGBMClassifier(random_state = random_state, **params)
    
    # and then conduct the cross validation with the same folds as before
    score = -cross_val_score(model, X, y, cv=cv, 
                             scoring= make_scorer(my_comp_score_probs, 
                                                                      needs_proba= True), 
                             n_jobs=-1).mean()

    return score

In [137]:
%%time

n_iter = 50

# possible values of parameters
space = {'learning_rate': hp.loguniform('learning_rate', -4, 0),
         'num_leaves' : hp.quniform('num_leaves', 2, 40, 1),
         'subsample' : hp.loguniform('subsample', -4, 0),
         'max_depth' : hp.quniform('max_depth', 2, 14, 1),
         'reg_alpha' : hp.loguniform('reg_alpha', -4, 0),
         'reg_lambda' : hp.loguniform('reg_lambda', -4, 0),
         'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
         'min_split_gain' : hp.loguniform('min_split_gain', -4, 0)
        }
# 'min_data_in_leaf': hp.quniform('min_data_in_leaf', 2, 40, 1),
#  "min_child_weight": hp.quniform('min_child_weight', 2, 50, 2),
#  'min_child_weight': hp.loguniform('min_child_weight', -5, 0),

# trials will contain logging information
trials = Trials()

best=fmin(fn=gb_mse_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(58) # fixing random state for the reproducibility
         )

# computing the score on the test set
model = LGBMClassifier(random_state = 54,
                       learning_rate = best['learning_rate'],
                       num_leaves = int(best['num_leaves']), 
                       subsample = best['subsample'],
                       max_depth = int(best['max_depth']),
                       reg_alpha = best['reg_alpha'],
                       reg_lambda = best['reg_lambda'],
                       min_child_weight = best['min_child_weight'],
                       min_split_gain = best['min_split_gain']
                       )
# min_data_in_leaf = int(best['min_data_in_leaf']),

model.fit(x_train, y_train)

print("Best  {:.3f} params {}".format( gb_mse_cv(best), best))


100%|██████████| 50/50 [10:54<00:00, 13.10s/trial, best loss: -0.8305753240484757]
Best  -0.831 params {'learning_rate': 0.021244673062841227, 'max_depth': 13.0, 'min_child_weight': 5, 'min_split_gain': 0.21240150614336456, 'num_leaves': 2.0, 'reg_alpha': 0.14478355702201645, 'reg_lambda': 0.916518960046144, 'subsample': 0.21165336323057224}
CPU times: user 11.7 s, sys: 7.18 s, total: 18.9 s
Wall time: 11min 4s


In [138]:
x_train
#x_test


Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
256855,1,0,0,1,90000.0,269550.0,21294.0,225000.0,0.006296,-10624,...,,,,,,,,,,
73026,1,0,0,0,112500.0,675000.0,19071.0,675000.0,0.028663,-19907,...,,,,,,,,,,
255015,0,0,0,1,144000.0,309420.0,11794.5,202500.0,0.018850,-14734,...,,,,,,,,,,
71338,0,1,0,0,112500.0,772686.0,25056.0,553500.0,0.020713,-18970,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87.0
65003,1,0,0,1,157500.0,611095.5,33282.0,486000.0,0.008474,-10726,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49581,1,0,0,0,126000.0,427500.0,20056.5,427500.0,0.022800,-21853,...,,,,,,,,,,
127860,1,0,1,0,135000.0,519633.0,37944.0,481500.0,0.035792,-9003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
83337,1,0,0,1,180000.0,270000.0,13500.0,270000.0,0.046220,-13698,...,,,,,,,,,,
55878,1,0,0,0,405000.0,900000.0,38263.5,900000.0,0.022800,-11790,...,,,,,,,,,,


In [97]:
y_train
#y_test

256855    0.0
73026     1.0
255015    0.0
71338     1.0
65003     1.0
         ... 
49581     0.0
127860    0.0
83337     1.0
55878     1.0
234143    1.0
Name: TARGET, Length: 34799, dtype: float64

# Launching LightGBM

In [148]:
def my_comp_score_lgbm(y_true, y_pred):
    
    # for now y_pred is the probability that the value is 1
    y_use = [1 if i >= 0.5 else 0 for i in y_pred]
    
    y_use = pd.Series(y_use)

    tn, fp, fn, tp = confusion_matrix(y_true, y_use).ravel()
    
    tn_weight = 1
    fp_weight = 0
    fn_weight = -10
    tp_weight = 0

    # gain function for company, true positives and false positives don't matter that much to us
    # we want to penalise the false negatives, and we want to say yes to true negatives
    gain = tp*(tp_weight) + tn*(tn_weight) + fp*(fp_weight) + fn*(fn_weight)
    
    # best represents scenario where there are no false negatives or false postives
    # so all false positives would be correctly shown as negative
    # and all false negatives would be correctly show as positive
    best = (tn + fp) * tn_weight + (tp + fn) * tp_weight
    
    # baseline is a naive model that predicts non default(negative) for everyone
    # but all true positives and false negatives would be incorrectly shown as negative
    baseline = (tn + fp) * tn_weight + (tp + fn) * fn_weight
    
    score = ((gain - baseline) / (best - baseline))

    return 'my score', score, True

In [143]:
num_folds= 10
stratified= False

# Cross validation model
if stratified:
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
else:
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    
# Create arrays and dataframes to store results
oof_preds = np.zeros(df_ft_selec_underbal.shape[0])
#sub_preds = np.zeros(test_df.shape[0])
feature_importance_df = pd.DataFrame()

In [144]:
#df_ft_selec_underbal[feats].to_csv("../my_csv_files/MY_train_x.csv")

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_ft_selec_underbal[feats], 
                                                            df_ft_selec_underbal['TARGET'])):
    train_x, train_y = df_ft_selec_underbal[feats].iloc[train_idx], df_ft_selec_underbal['TARGET'].iloc[train_idx]
    valid_x, valid_y = df_ft_selec_underbal[feats].iloc[valid_idx], df_ft_selec_underbal['TARGET'].iloc[valid_idx]
    
    valid_x.to_csv("../my_csv_files/MY_valid_x.csv")
    valid_y.to_csv("../my_csv_files/MY_valid_y.csv")
    
    # Light GBM parameters found by my hyperparameter search
    clf = LGBMClassifier(
        nthread = 4,
        n_estimators = 10000,
        learning_rate = 0.021244673062841227,
        num_leaves = 2,
        subsample = 0.21165336323057224,
        max_depth = 13,
        reg_alpha = 0.14478355702201645,
        reg_lambda = 0.916518960046144,
        min_split_gain = 0.21240150614336456,
        min_child_weight = 5,
        silent = -1,
        verbose = -1, )
    # min_data_in_leaf = 18,
    
     #clf = LGBMClassifier(
     #       nthread=4,
     #       n_estimators=10000,
     #       learning_rate=0.02,
     #       num_leaves=34,
           ## colsample_bytree=0.9497036,
     #       subsample=0.8715623,
     #       max_depth=8,
     #       reg_alpha=0.041545473,
     #       reg_lambda=0.0735294,
     #       min_split_gain=0.0222415,
     #       min_child_weight=39.3259775,
     #       silent=-1,
     #       verbose=-1, )
#
    
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            verbose= 200, early_stopping_rounds= 200, eval_metric = my_comp_score_lgbm)

    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    
    valid_preds = clf.predict(valid_x, num_iteration = clf.best_iteration_)
    valid_preds_proba = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis = 0)
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
    del train_x, train_y, 
    #valid_x, valid_y
    gc.collect()

Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.622672	training's my score: 0.814197	valid_1's binary_logloss: 0.624964	valid_1's my score: 0.825161
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.691924	training's my score: 0.898731	valid_1's binary_logloss: 0.691922	valid_1's my score: 0.899839
Fold  1 AUC : 0.620167
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.622486	training's my score: 0.815769	valid_1's binary_logloss: 0.626214	valid_1's my score: 0.813704
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.691902	training's my score: 0.898759	valid_1's binary_logloss: 0.692	valid_1's my score: 0.899597
Fold  2 AUC : 0.610847
Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.623229	training's my score: 0.812681	valid_1's binary_logloss: 0.621816	valid_1's my score: 0.815072
Early stopping, best iter

In [162]:
model.predict(x_train)

array([1., 0., 0., ..., 1., 1., 0.])

In [164]:
valid_preds = model.predict(valid_x)
#valid_preds_proba = clf.predict_proba(model)

In [51]:
import pickle
import joblib

In [153]:
joblib.dump(clf, "../Models/lgbm__trained_myscore4.sav")
pickle.dump(clf, open("../Models/lgbm_trained_myscore4.pickle", 'wb'))


In [55]:
#clf = joblib.load("../Models/lgbm__trained_myscore_final.sav")

# 0.45 thresh 50/50 [10:54<00:00, 13.10s/trial, best loss: -0.8305753240484757]
Best  -0.831 params {'learning_rate': 0.021244673062841227, 'max_depth': 13.0, 'min_child_weight': 5, 'min_split_gain': 0.21240150614336456, 'num_leaves': 2.0, 'reg_alpha': 0.14478355702201645, 'reg_lambda': 0.916518960046144, 'subsample': 0.21165336323057224}
CPU times: user 11.7 s, sys: 7.18 s, total: 18.9 s
Wall time: 11min 4s

In [165]:
valid_y

129       0.0
7672      0.0
220842    0.0
273149    0.0
63205     0.0
         ... 
306460    1.0
306647    1.0
306782    1.0
307264    1.0
307406    1.0
Name: TARGET, Length: 4971, dtype: float64

In [166]:
pd.DataFrame(valid_preds).value_counts()

0.0    2570
1.0    2401
dtype: int64

In [167]:
from sklearn.metrics import *
recall_score(valid_y, valid_preds)

0.6395959595959596

In [168]:
roc_auc_score(valid_y, valid_preds)

0.6559358003108002

In [169]:
my_comp_score(valid_y, valid_preds)

0.6065454545454545

In [152]:
from sklearn.metrics import confusion_matrix
conf_lgm = confusion_matrix(valid_y, valid_preds)
conf_lgm

array([[2496,    0],
       [2475,    0]])

In [130]:
pd.DataFrame(conf_lgm, index = ["0.0", "1.0"], columns = ["0", "1.0"])

Unnamed: 0,0,1.0
0.0,2496,0
1.0,2475,0


# 0.5 thresh 100% 50/50 [13:11<00:00, 15.84s/trial, best loss: -0.6788864362975149]
[LightGBM] [Warning] min_data_in_leaf is set=6, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=6
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).
Best  -0.679 params {'learning_rate': 0.06918858038426348, 'max_depth': 10.0, 'min_child_weight': 0.0023797618435441262, 'min_data_in_leaf': 6.0, 'min_split_gain': 0.784632540519317, 'num_leaves': 31.0, 'reg_alpha': 0.224542379959367, 'reg_lambda': 0.7244741599689299, 'subsample': 0.04934324763844042}
CPU times: user 49.7 s, sys: 9.68 s, total: 59.4 s
Wall time: 13min 40s

In [53]:
from sklearn.metrics import *
recall_score(valid_y, valid_preds)

0.7038383838383838

In [54]:
roc_auc_score(valid_y, valid_preds)

0.7086900252525253

In [56]:
my_comp_score(valid_y, valid_preds)

0.674949494949495

In [55]:
from sklearn.metrics import confusion_matrix
conf_lgm = confusion_matrix(valid_y, valid_preds)
conf_lgm

array([[1781,  715],
       [ 733, 1742]])

In [72]:
pd.DataFrame(conf_lgm, index = ["0.0", "1.0"], columns = ["0", "1.0"])

Unnamed: 0,0,1.0
0.0,1781,715
1.0,733,1742


In [70]:
pd.crosstab(valid_y, valid_preds)

col_0,0.0,1.0
TARGET,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1781,715
1.0,733,1742


## Keep tracks

In [68]:
keep_track_threshs

Unnamed: 0,AUC,Recall,My Scorer
"Undersampling, threshold : 0.5",0.70869,0.703838,0.674949
"Undersampling, threshold : 0.45",,,
"Undersampling, threshold : 0.4",,,


In [67]:
keep_track_threshs.loc["Undersampling, threshold : 0.45",'My Scorer'] = my_comp_score(valid_y, valid_preds)

# Results : Scores 

In [58]:
from sklearn.metrics import *
recall_score(valid_y, valid_preds)

0.6981818181818182

In [59]:
from sklearn.metrics import confusion_matrix
conf_lgm = confusion_matrix(valid_y, valid_preds)
conf_lgm

array([[1758,  738],
       [ 747, 1728]])

In [60]:
my_comp_score(valid_y, valid_preds)

0.6683636363636364

# Results : SHAP

In [62]:
import shap

explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(df_ft_selec_underbal[feats])

LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


In [98]:
# Visualize a single prediction
# The shap plot shows features that contribute to pushing the output from the base value to the 
# actual predicted value
# Red colour pushes the predictions higher (towards positive)
# Blue colour indicates the predicitions lower (towards negative)
shap.force_plot(explainer.expected_value[1], shap_values[1][950,:], df_ft_selec_underbal[feats].iloc[950,:])