# Goal: predict whether a loan will end up with maximum profits or not

---
#### Target variable: `outcome` 
* Type: **Categorical** 
* Model type: Classification 
* Sourced from: `zeroBalCode` (this notebook passes the `zeroBalCode` column to PyCaret directly as the target)
* Data: 
    - "0" means "Closed" (i.e. a successful outcome for Fannie Mae)
    - "1" means "Default" (i.e. a negative outcome)

---
#### This Notebook:
* Input required: the output file from the "Scott - Data Pre - 2 - 50 50 split train test" notebook
    - ../data/DataPre-2-5050-split-2011-test.csv
* Outputs generated: Decision on what model to use

#### Expected Workflow
1. Scott - Data Pre - 1 - Feature EEE
2. Scott - Data Pre - 2 - 50 50 split train test
3. Scott - Model - 1- PyCaret Setup and Create Model

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pycaret.classification import *
#!pip install pycaret

from sklearn.feature_selection import VarianceThreshold

import winsound

# Tell Jupyter to display all text, not just "the last" and print()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

%pwd

def DoneNotice(duration_ms = 1000):
    """Play an audible beep, e.g. to signal that a long-running cell finished.

    Parameters
    ----------
    duration_ms : int, default 1000
        Length of the beep in milliseconds.

    Notes
    -----
    The tone is fixed at 440 Hz and is produced with ``winsound.Beep``,
    so this helper relies on the ``winsound`` module imported by the notebook.
    """
    beep_frequency_hz = 440
    winsound.Beep(beep_frequency_hz, duration_ms)

from IPython.display import Markdown, display
def Important(html_tag, message, color):
    """Display `message` in the cell output as a colored HTML element.

    Parameters
    ----------
    html_tag : str
        Name of the HTML element to wrap the message in (e.g. "h1").
    message : str
        Text placed inside the element.
    color : str
        CSS color value written into the element's inline `style`.

    The values are interpolated into the markup as-is (no escaping) and the
    resulting string is rendered through IPython's Markdown display.
    """
    opening_tag = f"<{html_tag} style='color:{color}'>"
    closing_tag = f"</{html_tag}>"
    display(Markdown(f"{opening_tag}{message}{closing_tag}"))

'C:\\Users\\Scott\\Desktop\\Project3_2\\Scott\\ML EDA'

# Importing the data

In [3]:
# Load the 50/50-split 2011 test file listed as this notebook's required input.
df = pd.read_csv("../data/DataPre-2-5050-split-2011-test.csv")

# Remove the leftover 'Unnamed: 0' column (presumably an index column written
# when the CSV was saved -- TODO confirm against the Data Pre - 2 notebook).
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally is deprecated in pandas and rejected by pandas 2.x.
# Re-assigning (rather than inplace=True) keeps the statement chain-friendly.
df = df.drop(columns=['Unnamed: 0'])

# Sanity check: number of rows and columns after the drop.
print(df.shape)

# Eyeball a few random rows.
df.sample(5)

(17228, 12)


Unnamed: 0,origChannel,origIntRate,origUPB,origLTV,numBorrowers,origDebtIncRatio,loanPurp,worstCreditScore,bankNumber,stateNumber,mSA,zeroBalCode
14827,3,4.375,134000,80,1,39,2,684,54,10,0,1
2551,1,5.25,138000,70,1,40,1,652,4,46,41620,0
16123,1,5.0,146000,78,2,22,2,642,80,36,0,1
10907,1,6.375,141000,80,1,36,1,625,4,39,37980,1
9805,3,4.875,275000,74,1,41,2,682,32,13,19780,0


# Pycaret - Setup with categorical definition

#### Normalization
https://pycaret.org/normalization/

> `normalize: bool, default = False` - When set to True, the feature space is transformed using the normalized_method param. **Generally, linear algorithms perform better with normalized data** however, the results may vary and it is advised to run multiple experiments to evaluate the benefit of normalization.

In [8]:
%%time

# Initialise the PyCaret classification experiment on `df`.
#
# session_id is pinned so that re-running this cell reproduces the same
# train/test split (and therefore comparable model scores). 4660 is the id
# PyCaret reported for the run whose output is recorded in this notebook.
# Without it PyCaret picks a new random id on every run.
model_setup = setup(
    df,
    target = 'zeroBalCode',  # PyCaret will list this as "Label"
    session_id = 4660,
    pca = False,
    # ignore_low_variance = True,  # Variance is calculated using the ratio of unique values to the number of samples, and the ratio of the most common value to the frequency of the second most common value.
    normalize = True,  # see the "Normalization" note above: linear algorithms generally benefit
    ignore_features = None,
    # remove_outliers = True,  # outliers from the training data are removed using PCA linear dimensionality reduction using the Singular Value Decomposition technique.
    silent = True,  # do not prompt for confirmation of the inferred data types
    profile = False,
    # Integer-coded columns that are really categories, declared explicitly
    # so PyCaret does not infer them as numeric.
    categorical_features = [
        'origChannel',
        'loanPurp',
        'bankNumber',
        'stateNumber',
        'mSA',
    ],
    numeric_features = [
        'origIntRate',
        'origUPB',
        'origLTV',
        'numBorrowers',
        'origDebtIncRatio',
        'worstCreditScore',
    ],
)

# Audible + visual notification that setup has finished.
DoneNotice(2000)

Important("h1", "PyCaret setup completed", 'blue')

# session_id - passing the same session_id to setup() later re-runs the setup
#      using the same split of test/train, so results can be reprinted.

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,4660
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(17228, 12)"
4,Missing Values,False
5,Numeric Features,6
6,Categorical Features,5
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


<h1 style='color:blue'>PyCaret setup completed</h1>

Wall time: 3.02 s


# Decide which model to use

In [15]:


# Cross-validate the available classifiers and rank them by Recall.
model_results = compare_models(
    fold = 10,
    blacklist = None,  # no models excluded by hand
    round = 4,  # number of decimal places to round to. 4 is default
    sort = 'Recall',
    # True by default. Auto blacklists models that have longer training times.
    # When True, rbfsvm, gpc and mlp are excluded due to longer training times.
    # If you set to False, could take a long time
    turbo = True,
)
model_results

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Gaussian Process Classifier,0.7437,0.8117,0.769,0.7098,0.7382,0.4879
1,CatBoost Classifier,0.7409,0.816,0.7602,0.7093,0.7338,0.4819
2,Light Gradient Boosting Machine,0.7337,0.8107,0.7494,0.7035,0.7256,0.4675
3,Extreme Gradient Boosting,0.7194,0.7922,0.739,0.6875,0.7123,0.4392
4,SVM - Radial Kernel,0.6998,0.7754,0.7386,0.6619,0.698,0.4013
5,Extra Trees Classifier,0.7555,0.8458,0.7285,0.7455,0.7369,0.5086
6,Gradient Boosting Classifier,0.7174,0.7924,0.7275,0.6887,0.7075,0.4345
7,MLP Classifier,0.7208,0.7769,0.7226,0.6956,0.7087,0.4408
8,Decision Tree Classifier,0.6991,0.7,0.7156,0.6679,0.6909,0.3984
9,Ada Boost Classifier,0.7088,0.7775,0.7132,0.6819,0.6972,0.4171


# Results
Full test set: Extra Trees Classifier

In [10]:
%%time
# Run time can be reduced by lowering the number of folds (10 is the default)
# or by excluding models through `blacklist`.
#
# Regression has about 21 models
# Classification has about 15 models
#
# Model ids accepted by `blacklist`:
#   Logistic Regression            - 'lr'
#   K Nearest Neighbour            - 'knn'
#   Naives Bayes                   - 'nb'
#   Decision Tree                  - 'dt'
#   SVM (Linear)                   - 'svm'
#   Ridge Classifier               - 'ridge'
#   Random Forest                  - 'rf'
#   Quadratic Disc. Analysis       - 'qda'
#   AdaBoost                       - 'ada'
#   Gradient Boosting Classifier   - 'gbc'
#   Linear Disc. Analysis          - 'lda'
#   Extra Trees Classifier         - 'et'
#   Extreme Gradient Boosting      - 'xgboost'
#   Light Gradient Boosting        - 'lightgbm'
#   Cat Boost Classifier           - 'catboost'
# Off by default (enable w "turbo = False"):
#   Gaussian Process               - 'gpc'
#   Multi Level Perceptron         - 'mlp'
#   SVM (RBF)                      - 'rbfsvm'
#
# Example:
#   compare_models(blacklist=['catboost', 'xgboost', 'lightgbm'])
#
# Slow:
#    - Gaussian Process Classifier

# Models left out of this comparison run.
excluded_models = ['nb', 'knn', 'svm', 'qda', 'rf', 'lda', 'ridge']

model_results = compare_models(
    fold = 10,
    blacklist = excluded_models,
    round = 4,  # number of decimal places to round to. 4 is default
    sort = 'Recall',
    # True by default. Auto blacklists models that have longer training times.
    # When True, rbfsvm, gpc and mlp are excluded due to longer training times.
    # If you set to False, could take a long time
    turbo = True,
)
model_results

# Beep once the comparison has finished.
DoneNotice(2000)

IntProgress(value=0, description='Processing: ')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Decision Tree Classifier,0.907,0.9069,0.993,0.8473,0.9144,0.8139
1,Extra Trees Classifier,0.9797,0.997,0.9889,0.971,0.9799,0.9594
2,Light Gradient Boosting Machine,0.8316,0.9066,0.9065,0.7888,0.8434,0.6631
3,Extreme Gradient Boosting,0.7439,0.8152,0.8065,0.7172,0.7591,0.4878
4,Gradient Boosting Classifier,0.7448,0.8183,0.8056,0.7186,0.7595,0.4897
5,Logistic Regression,0.7372,0.8085,0.7773,0.7199,0.7474,0.4744
6,Ada Boost Classifier,0.7109,0.7837,0.7365,0.701,0.7182,0.4218


KeyboardInterrupt: 

In [9]:
# Extra Trees Classifier has best balance, so train it on its own
# with 10-fold cross-validation.
et = create_model(
    'et',
    ensemble = False,
    method = None,  # if ensemble, choose 'Bagging' or 'Boosting'
    fold = 10,
    round = 4,  # decimal places
    verbose = True,
)

IntProgress(value=0, description='Processing: ', max=14)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa


NameError: name 'X_train' is not defined