# Exploration Notebook : Pycaret

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import gc
import warnings
import os
import re

from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
print(os.listdir("../Projet+Mise+en+prod+-+home-credit-default-risk"))

['application_test.csv', '.DS_Store', 'HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'application_train.csv', 'bureau.csv', 'previous_application.csv', 'bureau_balance.csv', 'sample_submission.csv']


In [3]:
# read in csv file from p7_notebook_exploration.ipynb
merged_all = pd.read_csv('../my_csv_files/MY_merged_all_files.csv')

In [4]:
# get an idea of what the data is like, print it's shape and the few first rows
print('Training data shape: ', merged_all.shape)
merged_all.head()

Training data shape:  (356251, 799)


Unnamed: 0.1,Unnamed: 0,index,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,0,100002,1.0,0,0,0,0,202500.0,406597.5,...,,,,,,,,,,
1,1,1,100003,0.0,1,0,1,0,270000.0,1293502.5,...,,,,,,,,,,
2,2,2,100004,0.0,0,1,0,0,67500.0,135000.0,...,,,,,,,,,,
3,3,3,100006,0.0,1,0,0,0,135000.0,312682.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,4,100007,0.0,0,0,0,0,121500.0,513000.0,...,,,,,,,,,,


In [6]:
# remove columns we don't need for modelling, rename when needed
# and remove rows where TARGET value is missing
merged_all = merged_all.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
train_df = merged_all[merged_all['TARGET'].notnull()]
test_df = merged_all[merged_all['TARGET'].isnull()]

# create a list of features that will be used for modelling
feats = [f for f in train_df.columns if f not in ['SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV',
                                                  'Unnamed: 0', 'Unnamed0',
                                                  'index']]
train_df[feats]

Unnamed: 0,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,1.0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,0.018801,...,,,,,,,,,,
1,0.0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,,,,,,,,,,
2,0.0,0,1,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,...,,,,,,,,,,
3,0.0,1,0,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,0.0,0,0,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0.0,0,0,1,0,157500.0,254700.0,27558.0,225000.0,0.032561,...,,,,,,,,,,
307503,0.0,1,0,0,0,72000.0,269550.0,12001.5,225000.0,0.025164,...,,,,,,,,,,
307504,0.0,1,0,0,0,153000.0,677664.0,29979.0,585000.0,0.005002,...,,,,,,,,,,
307505,1.0,1,0,0,0,171000.0,370107.0,20205.0,319500.0,0.005313,...,,,,,,,,,,


## Classification with Pycaret, with all features and all rows (no undersampling, no SMOTE)

In [4]:
# import classification module 
from pycaret.classification import * 

#intialize the setup (in Notebook env)
exp_clf = setup(train_df[feats], target = 'TARGET', fold_shuffle = True,
                train_size = 0.7, data_split_shuffle = True, session_id = 2)


Unnamed: 0,Description,Value
0,session_id,2
1,Target,TARGET
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(307507, 797)"
5,Missing Values,True
6,Numeric Features,563
7,Categorical Features,233
8,Ordinal Features,False
9,High Cardinality Features,False


In [5]:
compare_models(include = ["gbc", "lightgbm", "ada", "ridge", "svm", "nb"])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9197,0.772,0.0215,0.5566,0.0415,0.0357,0.0977,308.633
lightgbm,Light Gradient Boosting Machine,0.9197,0.7811,0.0364,0.5402,0.0683,0.0587,0.1247,17.092
ada,Ada Boost Classifier,0.9189,0.7612,0.035,0.4683,0.0651,0.0545,0.1106,63.804
ridge,Ridge Classifier,0.9187,0.0,0.0024,0.1867,0.0046,0.0027,0.01,15.506
svm,SVM - Linear Kernel,0.865,0.0,0.0746,0.0971,0.0708,0.0089,0.0108,59.113
nb,Naive Bayes,0.2178,0.5648,0.8766,0.0839,0.1532,0.0069,0.0274,3.747


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=2, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
compare_models(include = ["ridge", "rf", "qda", "ada", "lda", "catboost"])

IntProgress(value=0, description='Processing: ', max=34)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9194,0.7145,0.0015,0.76,0.003,0.0027,0.0302,2652.027
ridge,Ridge Classifier,0.9187,0.0,0.0024,0.1867,0.0046,0.0027,0.01,13.572


In [7]:
compare_models(exclude = ["knn", "rf", "ada", "ridge", "qda", "catboost", "et", "lda"])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9197,0.772,0.0215,0.5566,0.0415,0.0357,0.0977,311.866
lightgbm,Light Gradient Boosting Machine,0.9197,0.7811,0.0364,0.5402,0.0683,0.0587,0.1247,14.773
xgboost,Extreme Gradient Boosting,0.9185,0.772,0.0628,0.4614,0.1105,0.093,0.1472,1575.449
svm,SVM - Linear Kernel,0.865,0.0,0.0746,0.0971,0.0708,0.0089,0.0108,60.715
dt,Decision Tree Classifier,0.8548,0.5424,0.1699,0.1492,0.1589,0.0798,0.08,864.213
nb,Naive Bayes,0.2178,0.5648,0.8766,0.0839,0.1532,0.0069,0.0274,4.443
lr,Logistic Regression,0.0919,0.0586,0.0001,0.01,0.0001,0.0,0.0002,698.929


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=2, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

## Classification with Pycaret, with all features and SMOTE (oversampling)

In [4]:
#import classification module, this time using the fix_imbalance integrated function in the 
# pycaret setup
from pycaret.classification import * 

#intialize the setup (in Notebook env)
exp_clf = setup(train_df[feats], target = 'TARGET', fold_shuffle = True,
                train_size = 0.7, data_split_shuffle=True, session_id = 2,
                fix_imbalance = True)

Unnamed: 0,Description,Value
0,session_id,2
1,Target,TARGET
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(307507, 797)"
5,Missing Values,True
6,Numeric Features,563
7,Categorical Features,233
8,Ordinal Features,False
9,High Cardinality Features,False


In [5]:
del feats
del train_df
gc.collect()

0

In [None]:
compare_models(include = ["nb", "lightgbm", "ridge", "svm"])  

IntProgress(value=0, description='Processing: ', max=24)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.1657,0.5478,0.9212,0.0824,0.1512,0.0037,0.0189,223.555


In [None]:
compare_models(include = ["ada", "ridge", "svm", "nb"])  

IntProgress(value=0, description='Processing: ', max=24)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9122,0.697,0.0409,0.2364,0.0692,0.0471,0.0677,399.021


In [None]:
compare_models(include = ["gbc", "lightgbm", "ada", "ridge", "svm", "nb"])

IntProgress(value=0, description='Processing: ', max=34)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9192,0.7408,0.0062,0.4457,0.0123,0.0101,0.0447,1085.358


# Classification with Pycaret, with undersampling

In [10]:
df_undersampl = train_df[feats]

In [8]:
# check how many instances we have where TARGET == 1
# this will help us choose how many rows we will keep where TARGET == 0
sum(train_df['TARGET'] == 1.0)

24825

In [12]:
np.random.choice(df_undersampl.TARGET[df_undersampl["TARGET"] == 0.0],
                 25000, replace=False)

array([0., 0., 0., ..., 0., 0., 0.])

In [13]:
np.random.choice(list(df_undersampl.TARGET[df_undersampl["TARGET"] == 0.0].index), 
                 25000, replace=False)

array([288536, 287210,  15453, ..., 178375,  59616, 182193])

In [14]:
# get indexes of rows we want to keep for the undersampling
random_indices = np.random.choice(list(df_undersampl.TARGET[df_undersampl["TARGET"] == 0.0].index), 
                 25000, replace=False)

In [16]:
# combine all the rows where TARGET == 0 and the rows we kept where TARGET == 0
df_us = pd.concat([df_undersampl.loc[list(random_indices)], 
           df_undersampl[df_undersampl["TARGET"] == 1.0]])

In [17]:
#import classification module 
from pycaret.classification import * 

#intialize the setup (in Notebook env)
exp_clf = setup(df_us, target = 'TARGET', fold_shuffle = True,
                train_size = 0.7, data_split_shuffle=True, session_id = 2)

Unnamed: 0,Description,Value
0,session_id,2
1,Target,TARGET
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(49825, 796)"
5,Missing Values,True
6,Numeric Features,548
7,Categorical Features,247
8,Ordinal Features,False
9,High Cardinality Features,False


In [18]:
compare_models(include = ["lightgbm", "gbc", "ada", "ridge", "svm", "nb"])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7101,0.7795,0.7101,0.7082,0.7091,0.4202,0.4202,2.395
gbc,Gradient Boosting Classifier,0.7046,0.7742,0.7041,0.7028,0.7035,0.4092,0.4093,34.361
ridge,Ridge Classifier,0.7032,0.0,0.7013,0.702,0.7016,0.4064,0.4064,0.569
ada,Ada Boost Classifier,0.6959,0.7618,0.6872,0.6973,0.6922,0.3918,0.3918,6.675
nb,Naive Bayes,0.5186,0.5656,0.8857,0.5094,0.6468,0.0406,0.0596,0.298
svm,SVM - Linear Kernel,0.503,0.0,0.5815,0.518,0.4586,0.0068,0.0139,1.012


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=2, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)