In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import gc
import warnings
import os
import time

from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

#warnings.simplefilter(action='ignore', category=FutureWarning)

print(os.listdir("../Projet+Mise+en+prod+-+home-credit-default-risk"))

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))



['application_test.csv', '.DS_Store', 'HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'application_train.csv', 'bureau.csv', 'previous_application.csv', 'bureau_balance.csv', 'sample_submission.csv']


In [2]:
# load training data from combined csv files
# this is the df as is right after combining all the csv files
# the null targets are still in this df
app_train = pd.read_csv('../my_csv_files/MY_df_for_pycaret.csv')
print('Training data shape: ', app_train.shape)
app_train.head()

Training data shape:  (356251, 799)


Unnamed: 0.1,Unnamed: 0,index,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,0,100002,1.0,0,0,0,0,202500.0,406597.5,...,,,,,,,,,,
1,1,1,100003,0.0,1,0,1,0,270000.0,1293502.5,...,,,,,,,,,,
2,2,2,100004,0.0,0,1,0,0,67500.0,135000.0,...,,,,,,,,,,
3,3,3,100006,0.0,1,0,0,0,135000.0,312682.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,4,100007,0.0,0,0,0,0,121500.0,513000.0,...,,,,,,,,,,


In [3]:
import re

app_train = app_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
train_df = app_train[app_train['TARGET'].notnull()]
test_df = app_train[app_train['TARGET'].isnull()]
#feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
feats = [f for f in train_df.columns if f not in ['SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV',
                                                  'Unnamed: 0',
                                                  'index']]
train_df[feats]
train_df['TARGET']

0         1.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
307502    0.0
307503    0.0
307504    0.0
307505    1.0
307506    0.0
Name: TARGET, Length: 307507, dtype: float64

In [4]:
print(train_df[feats].shape)
print(train_df['TARGET'].shape)

(307507, 797)
(307507,)


## Classification with PyCaret, with all features and all rows (no undersampling, no SMOTE)

In [5]:
#import classification module 
from pycaret.classification import * 

#intialize the setup (in Notebook env)
exp_clf = setup(train_df[feats], target = 'TARGET', fold_shuffle = True,
                train_size = 0.7, data_split_shuffle = True, session_id = 4)
#imputation_type='iterative')


Unnamed: 0,Description,Value
0,session_id,4
1,Target,TARGET
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(307507, 797)"
5,Missing Values,True
6,Numeric Features,563
7,Categorical Features,233
8,Ordinal Features,False
9,High Cardinality Features,False


In [6]:
def my_comp_score(y_true, y_pred):
    
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    tn_weight = 1
    fp_weight = 0
    fn_weight = -10
    tp_weight = 0

    # gain function for company, true positives and false positives don't matter that much to us
    # we want to penalise the false negatives, and we want to say yes to true negatives
    gain = tp*(tp_weight) + tn*(tn_weight) + fp*(fp_weight) + fn*(fn_weight)
    
    # best represents scenario where there are no false negatives or false postives
    # so all false positives would be correctly shown as negative
    # and all false negatives would be correctly show as positive
    best = (tn + fp) * tn_weight + (tp + fn) * tp_weight
    
    # baseline is a naive model that predicts non default(negative) for everyone
    # but all true positives and false negatives would be incorrectly shown as negative
    baseline = (tn + fp) * tn_weight + (tp + fn) * fn_weight
    
    score = (gain - baseline) / (best - baseline)
    
    return score

def my_comp_score_probs(y_true, y_pred):
    
    # for now y_pred is the probability that the value is 1
    y_use = [1 if i >= 0.5 else 0 for i in y_pred]
    
    y_use = pd.Series(y_use)

    tn, fp, fn, tp = confusion_matrix(y_true, y_use).ravel()
    
    tn_weight = 1
    fp_weight = 0
    fn_weight = -10
    tp_weight = 0

    # gain function for company, true positives and false positives don't matter that much to us
    # we want to penalise the false negatives, and we want to say yes to true negatives
    gain = tp*(tp_weight) + tn*(tn_weight) + fp*(fp_weight) + fn*(fn_weight)
    
    # best represents scenario where there are no false negatives or false postives
    # so all false positives would be correctly shown as negative
    # and all false negatives would be correctly show as positive
    best = (tn + fp) * tn_weight + (tp + fn) * tp_weight
    
    # baseline is a naive model that predicts non default(negative) for everyone
    # so all true positives and false negatives would be incorrectly shown as negative
    baseline = (tn + fp) * tn_weight + (tp + fn) * fn_weight
    
    score = ((gain - baseline) / (best - baseline))

    return score

In [7]:
add_metric("my_own_score", name = "My Custom Scorer", score_func = my_comp_score)

Name                                           My Custom Scorer
Display Name                                   My Custom Scorer
Score Function       <function my_comp_score at 0x7f8eb1546e50>
Scorer                               make_scorer(my_comp_score)
Target                                                     pred
Args                                                         {}
Greater is Better                                          True
Multiclass                                                 True
Custom                                                     True
Name: my_own_score, dtype: object

In [None]:
compare_models(include = ["gbc", "lightgbm", "ada", "ridge", "svm", "nb"])

IntProgress(value=0, description='Processing: ', max=34)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,My Custom Scorer,TT (Sec)


In [7]:
compare_models(exclude = ["knn", "rf", "ada", "ridge", "qda", "catboost", "et", "lda"])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9197,0.772,0.0215,0.5566,0.0415,0.0357,0.0977,311.866
lightgbm,Light Gradient Boosting Machine,0.9197,0.7811,0.0364,0.5402,0.0683,0.0587,0.1247,14.773
xgboost,Extreme Gradient Boosting,0.9185,0.772,0.0628,0.4614,0.1105,0.093,0.1472,1575.449
svm,SVM - Linear Kernel,0.865,0.0,0.0746,0.0971,0.0708,0.0089,0.0108,60.715
dt,Decision Tree Classifier,0.8548,0.5424,0.1699,0.1492,0.1589,0.0798,0.08,864.213
nb,Naive Bayes,0.2178,0.5648,0.8766,0.0839,0.1532,0.0069,0.0274,4.443
lr,Logistic Regression,0.0919,0.0586,0.0001,0.01,0.0001,0.0,0.0002,698.929


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=2, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# for other second pycaret notebook
#compare_models(include = ["gbc", "lightgbm", "ada", "ridge", "svm", "nb"])