In [2]:
import pycaret as pc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
from pycaret.classification import *

In [8]:
data = pd.read_csv('credit_risk_dataset.csv')
data.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [6]:
clf = setup(data, target='loan_status')

Unnamed: 0,Description,Value
0,session_id,1038
1,Target,loan_status
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(32581, 12)"
5,Missing Values,True
6,Numeric Features,7
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


Age — numerical variable; age in years

Income — numerical variable; annual income in dollars

Home status — categorical variable; “rent”, “mortgage” or “own”

Employment length — numerical variable; employment length in years

Loan intent — categorical variable; “education”, “medical”, “venture”, “home improvement”, “personal” or “debt consolidation”

Loan amount — numerical variable; loan amount in dollars

Loan grade — categorical variable; “A”, “B”, “C”, “D”, “E”, “F” or “G”

Interest rate — numerical variable; interest rate in percentage

Loan to income ratio — numerical variable; between 0 and 1

Historical default — binary, categorical variable; “Y” or “N”

Loan status — binary, numerical variable; 0 (no default) or 1 (default) → this is going to be our target variable

In [9]:
# Removing missing rows

data = data.dropna(axis=0)
data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [10]:
# Encoding Categorical (Not ORDINAL data)
# One hot encoding of categorical variables
df = pd.get_dummies(data=data,columns=['person_home_ownership','loan_intent','loan_grade','cb_person_default_on_file'])
df

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,22,59000,123.0,35000,16.02,1,0.59,3,0,0,...,0,0,0,0,1,0,0,0,0,1
1,21,9600,5.0,1000,11.14,0,0.10,2,0,0,...,0,0,1,0,0,0,0,0,1,0
2,25,9600,1.0,5500,12.87,1,0.57,3,1,0,...,0,0,0,1,0,0,0,0,1,0
3,23,65500,4.0,35000,15.23,1,0.53,2,0,0,...,0,0,0,1,0,0,0,0,1,0
4,24,54400,8.0,35000,14.27,1,0.55,4,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0,0.11,30,1,0,...,0,0,0,1,0,0,0,0,1,0
32577,54,120000,4.0,17625,7.49,0,0.15,19,1,0,...,0,1,0,0,0,0,0,0,1,0
32578,65,76000,3.0,35000,10.99,1,0.46,28,0,0,...,0,0,1,0,0,0,0,0,1,0
32579,56,150000,5.0,15000,11.48,0,0.10,26,1,0,...,0,0,1,0,0,0,0,0,1,0


In [12]:
len(df.columns)

27

In [11]:
clf = setup(df, target='loan_status')

Unnamed: 0,Description,Value
0,session_id,2255
1,Target,loan_status
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(28638, 27)"
5,Missing Values,False
6,Numeric Features,26
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [14]:
clf = setup(df, target='loan_status', normalize=True, normalize_method='maxabs', remove_outliers= True)

Unnamed: 0,Description,Value
0,session_id,3933
1,Target,loan_status
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(28638, 27)"
5,Missing Values,False
6,Numeric Features,26
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [15]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.9404,0.9392,0.7154,0.9786,0.8264,0.7916,0.8058,7.58
lightgbm,Light Gradient Boosting Machine,0.9389,0.9418,0.7112,0.974,0.8219,0.7862,0.8004,0.152
xgboost,Extreme Gradient Boosting,0.9382,0.9473,0.7271,0.9498,0.8235,0.7869,0.7973,12.225
rf,Random Forest Classifier,0.9374,0.9278,0.7022,0.975,0.8163,0.7797,0.7951,0.858
gbc,Gradient Boosting Classifier,0.9332,0.9257,0.6953,0.9559,0.8049,0.7659,0.7801,1.018
et,Extra Trees Classifier,0.9246,0.9108,0.6667,0.9347,0.7781,0.7342,0.7495,0.783
knn,K Neighbors Classifier,0.9001,0.854,0.5961,0.8578,0.7028,0.6451,0.6609,0.672
dt,Decision Tree Classifier,0.8937,0.8424,0.7575,0.7208,0.7386,0.672,0.6724,0.055
ada,Ada Boost Classifier,0.8927,0.8973,0.6313,0.7854,0.6998,0.6354,0.6411,0.336
lr,Logistic Regression,0.8744,0.8665,0.5225,0.7711,0.6223,0.5504,0.5657,0.407


<catboost.core.CatBoostClassifier at 0x7f6500cace20>

  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  _warn_prf(average, mo

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [18]:
dt = create_model('dt')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8971,0.8492,0.7698,0.7275,0.7481,0.6835,0.6839
1,0.8898,0.8417,0.7619,0.7059,0.7328,0.6635,0.6643
2,0.8913,0.8466,0.7725,0.707,0.7383,0.6699,0.671
3,0.8929,0.8393,0.7507,0.7201,0.7351,0.668,0.6682
4,0.895,0.8416,0.7533,0.7263,0.7396,0.6738,0.674
5,0.9018,0.8541,0.7751,0.7418,0.7581,0.6965,0.6968
6,0.8986,0.8502,0.7698,0.733,0.751,0.6874,0.6877
7,0.8887,0.8251,0.7196,0.7196,0.7196,0.6501,0.6501
8,0.8855,0.8311,0.7407,0.7,0.7198,0.6479,0.6483
9,0.896,0.8456,0.7619,0.7273,0.7442,0.679,0.6793


In [19]:
catb = create_model('catboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9491,0.956,0.7513,0.9895,0.8541,0.824,0.8355
1,0.9454,0.9459,0.7434,0.9757,0.8438,0.8115,0.8225
2,0.9396,0.9412,0.7116,0.9782,0.8239,0.7886,0.8031
3,0.9375,0.9274,0.695,0.985,0.8149,0.7787,0.7958
4,0.9375,0.9318,0.7029,0.9743,0.8166,0.7802,0.7953
5,0.9475,0.9466,0.7434,0.9894,0.8489,0.8179,0.8302
6,0.9391,0.9391,0.6984,0.9925,0.8199,0.7845,0.8021
7,0.9359,0.9209,0.6984,0.9706,0.8123,0.7749,0.7902
8,0.9333,0.9446,0.6878,0.9665,0.8037,0.7649,0.781
9,0.9396,0.9384,0.7222,0.9647,0.826,0.7904,0.8025


In [20]:
lightg = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.947,0.9533,0.7513,0.9759,0.849,0.8175,0.8278
1,0.9412,0.9476,0.7354,0.9586,0.8323,0.7974,0.8077
2,0.9407,0.9463,0.7116,0.9853,0.8264,0.7918,0.807
3,0.9354,0.9303,0.687,0.9811,0.8081,0.7707,0.7884
4,0.9354,0.9372,0.6976,0.9669,0.8105,0.7728,0.7877
5,0.9449,0.9503,0.7381,0.9789,0.8416,0.809,0.8209
6,0.938,0.9443,0.6958,0.9887,0.8168,0.7808,0.7983
7,0.9354,0.9248,0.6958,0.9705,0.8105,0.7728,0.7884
8,0.9338,0.9449,0.6931,0.9632,0.8062,0.7675,0.7826
9,0.9375,0.9384,0.7063,0.9709,0.8178,0.7812,0.7956


In [21]:
xgb = create_model('xgboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.947,0.9622,0.754,0.9727,0.8495,0.8179,0.8277
1,0.9412,0.9452,0.746,0.9463,0.8343,0.7992,0.8075
2,0.937,0.947,0.7222,0.9479,0.8198,0.7825,0.7931
3,0.9354,0.9392,0.7003,0.9635,0.8111,0.7733,0.7876
4,0.9338,0.944,0.7188,0.9313,0.8114,0.7721,0.7816
5,0.9417,0.9594,0.7381,0.9588,0.8341,0.7994,0.8095
6,0.9428,0.9463,0.7249,0.9821,0.8341,0.8004,0.8139
7,0.9307,0.9329,0.709,0.9241,0.8024,0.7612,0.7711
8,0.9349,0.9475,0.7169,0.941,0.8138,0.7752,0.7857
9,0.9375,0.9495,0.7407,0.9302,0.8247,0.7873,0.7949


In [22]:
rf = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9444,0.9432,0.7328,0.9823,0.8394,0.8066,0.8193
1,0.9402,0.9265,0.7222,0.9681,0.8273,0.792,0.8044
2,0.9402,0.9319,0.7169,0.9748,0.8262,0.7911,0.8047
3,0.9354,0.9182,0.687,0.9811,0.8081,0.7707,0.7884
4,0.9307,0.927,0.679,0.9588,0.795,0.7548,0.771
5,0.9422,0.9302,0.7196,0.9855,0.8318,0.7979,0.8123
6,0.9359,0.9321,0.6931,0.9776,0.8111,0.7739,0.7905
7,0.9333,0.9134,0.6772,0.9808,0.8013,0.7628,0.7817
8,0.9322,0.9336,0.6825,0.9663,0.8,0.7607,0.7773
9,0.9391,0.9219,0.7116,0.9746,0.8226,0.7869,0.8011


In [25]:
dt_tuned = tune_model(dt, optimize='Recall')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9323,0.8825,0.6825,0.9663,0.8,0.7607,0.7773
1,0.9302,0.8814,0.6931,0.9391,0.7976,0.7565,0.7693
2,0.9297,0.8792,0.6561,0.9841,0.7873,0.7472,0.7692
3,0.9244,0.8562,0.6419,0.9641,0.7707,0.7276,0.7492
4,0.927,0.8596,0.6499,0.9722,0.779,0.7373,0.7588
5,0.9375,0.8832,0.7037,0.9744,0.8172,0.7807,0.7957
6,0.9317,0.8618,0.672,0.9769,0.7962,0.7569,0.776
7,0.9265,0.8589,0.6481,0.9722,0.7778,0.7358,0.7575
8,0.9223,0.8656,0.6455,0.9457,0.7673,0.7226,0.7416
9,0.927,0.8754,0.6561,0.965,0.7811,0.7392,0.759


In [24]:
catb_tuned = tune_model(catb, optimize='Recall')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9449,0.9466,0.7487,0.9659,0.8435,0.8107,0.8204
1,0.9428,0.9326,0.7487,0.9529,0.8385,0.8044,0.813
2,0.9375,0.9366,0.7249,0.9481,0.8216,0.7845,0.7949
3,0.937,0.9195,0.7029,0.9707,0.8154,0.7786,0.7933
4,0.938,0.9294,0.7188,0.9576,0.8212,0.7846,0.7965
5,0.9422,0.9442,0.7407,0.9589,0.8358,0.8015,0.8113
6,0.9386,0.9304,0.7169,0.9644,0.8225,0.7863,0.7989
7,0.9354,0.9102,0.7037,0.9603,0.8122,0.7743,0.788
8,0.9349,0.9401,0.7116,0.9472,0.8127,0.7742,0.7858
9,0.938,0.9252,0.7275,0.9483,0.8234,0.7866,0.7967


In [26]:
lightg_tuned = tune_model(lightg, optimize='Recall')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9381,0.953,0.7381,0.9362,0.8254,0.7884,0.7967
1,0.9333,0.9419,0.7407,0.9061,0.8151,0.775,0.7808
2,0.9328,0.9388,0.709,0.9371,0.8072,0.7675,0.7784
3,0.9333,0.9296,0.7003,0.9496,0.8061,0.7669,0.7799
4,0.9286,0.9328,0.7029,0.917,0.7958,0.7534,0.7632
5,0.938,0.9479,0.7434,0.9305,0.8265,0.7893,0.7967
6,0.9317,0.9383,0.6984,0.9429,0.8024,0.7623,0.7748
7,0.9275,0.9216,0.6931,0.9225,0.7915,0.7487,0.76
8,0.9244,0.9368,0.6772,0.9209,0.7805,0.7361,0.7488
9,0.9296,0.9342,0.7143,0.9122,0.8012,0.7592,0.7676


In [27]:
xgb_tuned = tune_model(xgb, optimize='recall')

IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.








Traceback (most recent call last):
  File "/home/leonardo/.virtualenvs/imageProcess/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/leonardo/.virtualenvs/imageProcess/lib/python3.8/site-packages/pycaret/internal/pipeline.py", line 118, in fit
    result = super().fit(X, y=y, **fit_kwargs)
  File "/home/leonardo/.virtualenvs/imageProcess/lib/python3.8/site-packages/imblearn/pipeline.py", line 281, in fit
    self._final_estimator.fit(Xt, yt, **fit_params)
  File "/home/leonardo/.virtualenvs/imageProcess/lib/python3.8/site-packages/catboost/core.py", line 4673, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/home/leonardo/.virtualenvs/imageProcess/lib/python3.8/site-packages/catboost/core.py", line 1978, in _fit
    train_params = self._prepare_train_params(
  File "/hom





KeyboardInterrupt: 

In [28]:
rf_tuned = tune_model(rf, optimize='F1')

IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.5s


KeyboardInterrupt: 