In [1]:
import pandas as pd
import numpy as np

from pycaret.classification import *

In [2]:
# Workaround: the latest pycaret release has a bug with newer scikit-learn versions,
# so scikit-learn must be reverted to 1.2.1 (uncomment and run once if needed).
#%pip install --user --force-reinstall scikit-learn==1.2.1

In [2]:
# Load the raw loan-status export into memory.
# compression=None tells pandas not to decompress, so the file is read as plain
# text despite its ".gzip" name; low_memory=False parses the file in one pass
# rather than in chunks, which avoids mixed-dtype inference within a column.
raw_df = pd.read_csv("./Loan_status_2007-2020Q3.gzip", compression=None, low_memory=False)

### I. Preprocessing Step

In [3]:
# Keep only the rows that have a loan amount recorded.
accepted_df = raw_df.loc[raw_df["loan_amnt"].notna()]

In [4]:
# Work on an independent copy so the cleaning steps below never modify accepted_df.
df = accepted_df.copy()

In [5]:
# Keep only columns that are at least 90% populated, i.e. drop every column with
# more than 10% missing values. `thresh` is the minimum number of non-null
# entries a column needs in order to be kept, so it is rounded up to an integer.
min_non_null = int(np.ceil(len(df) * 0.9))
df = df.dropna(axis=1, thresh=min_non_null)

# Drop the rows that still have a missing value in any of the remaining columns.
df = df.dropna()

In [6]:
# Interest rate: strip the trailing "%" and store the value as a fraction
# (for example "10.99%" becomes 0.1099).
rate_text = df["int_rate"].str.rstrip("%")
df["int_rate"] = rate_text.astype("float") / 100.0

# Employment length: keep the first run of digits found in the text and convert
# it to a number; entries that contain no digit end up as NaN.
years_text = df["emp_length"].str.extract(r"(\d+)", expand=False)
df["emp_length"] = pd.to_numeric(years_text, errors="coerce")

In [7]:
# Application purpose: turn the underscore-separated codes into readable labels
# (underscores become spaces, then every word is capitalised).
purpose_words = df["purpose"].str.replace("_", " ")
df["purpose"] = purpose_words.str.title()

In [8]:
# Application purpose: fold several purposes into two broader buckets.
# Any value not listed below is left unchanged.
major_purchase_purposes = ["Car", "Vacation", "Small Business", "Wedding"]
other_purposes = ["Moving", "House", "Educational", "Renewable Energy", "Medical"]

df["purpose"] = np.where(
    df["purpose"].isin(major_purchase_purposes), "Major Purchase", df["purpose"]
)
df["purpose"] = np.where(
    df["purpose"].isin(other_purposes), "Other", df["purpose"]
)

In [9]:
# Grade: merge the grades D, E, F and G into one "D and Below" level;
# the remaining grades are kept as they are.
low_grades = ["D", "E", "F", "G"]
df["grade"] = np.where(df["grade"].isin(low_grades), "D and Below", df["grade"])

In [10]:
# Home ownership type: merge the "ANY" and "NONE" levels into "OTHER".
df["home_ownership"] = np.where(
    df["home_ownership"].isin(["ANY", "NONE"]), "OTHER", df["home_ownership"]
)

### II. Supervised Learning - Set Up

In [11]:
# Target flag for delinquency in the last 2 years:
# 0 when delinq_2yrs equals 0, otherwise 1.
df["flag_delinq2yrs"] = (df["delinq_2yrs"] != 0).astype(int)

In [12]:
# Columns kept for the supervised model: eight predictors plus the target.
# Note that "flag_delinq2yrs" (listed last) is the target passed to setup(),
# not a predictor, even though this list is named "features".
supervised_features = [
    "loan_amnt", 
    "term", 
    "int_rate", 
    "grade", 
    "home_ownership", 
    "annual_inc", 
    "purpose", 
    "emp_length", 
    "flag_delinq2yrs"
]

In [13]:
# Restrict the cleaned frame to the modelling columns (predictors and target).
df_fit = df.loc[:, supervised_features]

In [14]:
# Exclude home_ownership == "OTHER": it accounts for only about 0.1% of the rows.
is_other_ownership = df_fit["home_ownership"] == "OTHER"
df_fit = df_fit[~is_other_ownership]

In [15]:
# Preview the first rows of the modelling frame.
df_fit.head()

Unnamed: 0,loan_amnt,term,int_rate,grade,home_ownership,annual_inc,purpose,emp_length,flag_delinq2yrs
42537,4800.0,36 months,0.1099,B,MORTGAGE,39600.0,Home Improvement,2,0
42538,27050.0,36 months,0.1099,B,OWN,55000.0,Debt Consolidation,10,0
42539,12000.0,36 months,0.0762,A,MORTGAGE,96500.0,Debt Consolidation,3,0
42540,14000.0,36 months,0.1285,B,RENT,88000.0,Debt Consolidation,4,1
42541,12000.0,36 months,0.0662,A,MORTGAGE,105000.0,Debt Consolidation,10,0


In [24]:
# Show the training features as they look after PyCaret's preprocessing.
# NOTE: get_config reads the experiment created by setup(), so this cell only
# works once the setup(...) cell further down has been run. It was executed as
# In [24], i.e. after setup (In [17]), and fails on a top-to-bottom run.
get_config("X_train_transformed")

Unnamed: 0,loan_amnt,term,int_rate,grade_C,grade_D and Below,grade_A,grade_B,home_ownership_OWN,home_ownership_MORTGAGE,home_ownership_RENT,annual_inc,purpose_Debt Consolidation,purpose_Credit Card,purpose_Home Improvement,purpose_Other,purpose_Major Purchase,emp_length
2811302,0.548718,0.000000,0.376947,1.00000,0.0,0.0,0.00000,1.00000,0.00000,0.0,0.011364,1.0,0.000000,0.000000,0.000000,0.0,0.000000
2096270,0.247436,0.000000,0.454829,0.00000,1.0,0.0,0.00000,0.00000,1.00000,0.0,0.004728,1.0,0.000000,0.000000,0.000000,0.0,0.000000
96814,0.102564,0.000000,0.385903,1.00000,0.0,0.0,0.00000,0.00000,1.00000,0.0,0.009364,1.0,0.000000,0.000000,0.000000,0.0,0.111111
1810677,0.435897,0.000000,0.087617,0.00000,0.0,1.0,0.00000,0.00000,1.00000,0.0,0.007728,0.0,1.000000,0.000000,0.000000,0.0,1.000000
2156189,0.487179,0.000000,0.053349,0.00000,0.0,1.0,0.00000,0.00000,1.00000,0.0,0.011092,0.0,0.000000,1.000000,0.000000,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3982968,0.230769,0.000000,0.571560,0.00000,1.0,0.0,0.00000,0.00000,1.00000,0.0,0.003637,0.0,0.162526,0.000000,0.837474,0.0,0.555556
3982969,0.564103,1.000000,0.336872,0.89484,0.0,0.0,0.10516,0.89484,0.10516,0.0,0.005910,1.0,0.000000,0.000000,0.000000,0.0,1.000000
3982970,0.316466,0.078239,0.532609,0.00000,1.0,0.0,0.00000,0.00000,1.00000,0.0,0.003637,1.0,0.000000,0.000000,0.000000,0.0,1.000000
3982971,0.487179,0.000000,0.087023,0.00000,0.0,1.0,0.00000,0.00000,1.00000,0.0,0.008819,0.0,0.531344,0.468656,0.000000,0.0,1.000000


In [16]:
# Check whether the target is balanced: the mean of the 0/1 flag is the share
# of positive labels, so a value far from 0.5 indicates an imbalanced target.
df_fit["flag_delinq2yrs"].mean()

0.18002257886186437

In [17]:
# Initialise the PyCaret classification experiment on df_fit.
# - categorical_features: columns PyCaret should treat as categorical and encode.
# - normalize / normalize_method="minmax": rescale the features to the 0-1 range.
# - fix_imbalance=True: resample the training data, since only ~18% of the labels
#   are positive (see the target mean computed above).
# - session_id: seed that makes the experiment reproducible.
supervised_setup = setup(
    df_fit, 
    target = 'flag_delinq2yrs', 
    categorical_features = ["term", "grade", "home_ownership", "purpose"], 
    normalize = True,
    normalize_method = "minmax",
    fix_imbalance = True,
    session_id = 221
)

Unnamed: 0,Description,Value
0,Session id,221
1,Target,flag_delinq2yrs
2,Target type,Binary
3,Original data shape,"(2360615, 9)"
4,Transformed data shape,"(3418095, 18)"
5,Transformed train set shape,"(2709910, 18)"
6,Transformed test set shape,"(708185, 18)"
7,Ordinal features,1
8,Numeric features,4
9,Categorical features,4


### III. Supervised Learning - Training

In [19]:
# Cross-validate the five listed estimators and rank them by Accuracy. Despite
# its plural name, `supervised_models` receives a single estimator: the top-ranked one.
# NOTE(review): only ~18% of the labels are positive, so a model that always
# predicts 0 already reaches ~0.82 accuracy. Consider ranking by "AUC" or "F1"
# instead, which do not reward majority-class predictions.
supervised_models = compare_models(include = ["lr", "ridge", "lda", "xgboost", "rf"], sort = "Accuracy")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.8158,0.6238,0.0188,0.3076,0.0354,0.015,0.035,154.262
rf,Random Forest Classifier,0.7791,0.5722,0.1002,0.2343,0.1404,0.0365,0.0408,256.1
lr,Logistic Regression,0.5633,0.6127,0.6073,0.23,0.3336,0.0981,0.1238,74.281
ridge,Ridge Classifier,0.5583,0.0,0.6058,0.2273,0.3306,0.0931,0.1182,42.263
lda,Linear Discriminant Analysis,0.5583,0.6073,0.6056,0.2273,0.3305,0.0931,0.1181,39.179


Processing:   0%|          | 0/25 [00:00<?, ?it/s]

In [20]:
# pull() returns the most recently displayed score grid, so this cell must run
# directly after compare_models. The index is turned into a regular column
# before the table is written to CSV.
save_compare_models = pull()
comparison_table = save_compare_models.reset_index()
comparison_table.to_csv("table_compare_supervised_models.csv", index=False)

In [22]:
# Print the estimator returned by compare_models together with its hyperparameters.
print(supervised_models)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=221, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='auto',
              use_label_encoder=True, validate_parameters=1, verbosity=0)


In [23]:
# Train an XGBoost classifier through PyCaret; the score grid displayed by this
# call lists the cross-validation metrics for each fold.
xgb = create_model("xgboost")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.816,0.6239,0.018,0.3107,0.0341,0.0146,0.0349
1,0.8155,0.6218,0.0191,0.3029,0.0359,0.0149,0.0342
2,0.8163,0.6247,0.0186,0.3236,0.0352,0.0159,0.0382
3,0.8158,0.621,0.0206,0.3196,0.0387,0.0173,0.0393
4,0.816,0.6239,0.0195,0.318,0.0368,0.0163,0.038
5,0.8155,0.625,0.0204,0.3097,0.0383,0.0164,0.037
6,0.8153,0.6265,0.0191,0.298,0.0358,0.0145,0.0331
7,0.8161,0.6251,0.0161,0.2987,0.0306,0.0124,0.0306
8,0.8169,0.6234,0.0169,0.3318,0.0321,0.015,0.038
9,0.8154,0.6245,0.0186,0.2983,0.035,0.0142,0.0328


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
# Persist the trained model together with the transformation pipeline under the
# name 'xgb_model', then print the estimator and its hyperparameters.
save_model(xgb, 'xgb_model')
print(xgb)

Transformation Pipeline and Model Successfully Saved
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=221, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='auto',
              use_label_encoder=True, validate_parameters=1, verbosity=0)


In [18]:
# Reload the saved transformation pipeline and model from "xgb_model", so the
# training cells above do not have to be re-run in a new session.
# NOTE(review): this rebinds `xgb` to whatever load_model returns; presumably the
# evaluation cells below still need setup() to have been run first. Confirm.
xgb = load_model("xgb_model")

Transformation Pipeline and Model Successfully Loaded


### IV. Supervised Learning - Performance Metrics

In [19]:
# Open PyCaret's interactive evaluation widget for the model; the plot shown is
# picked with the "Plot Type" toggle buttons.
# NOTE(review): the selected plot lives only in widget state and is not
# reproducible from code; plot_model(xgb, plot=...) would record it explicitly.
evaluate_model(xgb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [20]:
# Same call as the preceding evaluate_model cell (second of five identical cells).
# NOTE(review): if each copy is meant to show a different plot type, an explicit
# plot_model(xgb, plot=...) call per plot would make that visible in the code.
evaluate_model(xgb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [21]:
# Same call as the preceding evaluate_model cell (third of five identical cells).
# NOTE(review): if each copy is meant to show a different plot type, an explicit
# plot_model(xgb, plot=...) call per plot would make that visible in the code.
evaluate_model(xgb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [22]:
# Same call as the preceding evaluate_model cell (fourth of five identical cells).
# NOTE(review): if each copy is meant to show a different plot type, an explicit
# plot_model(xgb, plot=...) call per plot would make that visible in the code.
evaluate_model(xgb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [23]:
# Same call as the preceding evaluate_model cell (last of five identical cells).
# NOTE(review): if each copy is meant to show a different plot type, an explicit
# plot_model(xgb, plot=...) call per plot would make that visible in the code.
evaluate_model(xgb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### V. Supervised Learning - Hyperparameter Tuning

In [25]:
#tuned_xgb = tune_model(xgb)

In [None]:
#save_model(tuned_xgb, 'tuned_xgb_model')
#print(tuned_xgb)

In [None]:
#evaluate_model(tuned_xgb)

In [None]:
#evaluate_model(tuned_xgb)