# ML Foreclosure Binary Classification - EPOCH2 sample dataset
Only "resolved" outcomes - Either paid off or default 

    target = zeroBalCode (set to object dtype) 
    Binary classification sets zeroBalCode to either "0" or "1" for the 7 possible outcomes
    
    "0" = "Loan Success"
        * 01 = Prepaid or Matured
        * 06 = Repurchased
        * 16 = Reperforming Loan Sale

    "1" =  "Loan Failure" - Most likely loss of money (but not necessarily) one of below conditions:
        * 02 = Third Party Sale
        * 03 = Short Sale
        * 09 = Deed-in-Lieu,REO
        * 15 = Note Sale


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
#!pip install pycaret
from imblearn.over_sampling import RandomOverSampler
from pycaret.classification import *
#from pycaret.regression import *
pd.set_option('display.max_columns', None)


# Importing the data - don't use *.ML.csv files!!!
If you have *.MLReady.csv then [click here](#MLReady)

In [2]:
df=pd.read_csv("data/FM_Acq2_Perf_FMAC_EPOCH2_loanAge.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,origChannel,sellerName,origIntRate,origUPB,origLoanTerm,origDate,origLTV,origCLTV,numBorrowers,origDebtIncRatio,loanPurp,propState,zipCode,pMIperct,mortInsType,bestCreditScore,worstCreditScore,avgCreditScore,loanAge,monMatur,mSA,zeroBalCode,deliqGood,deliqBad,deliqMax,fmacRateMax,fmacRateMin,fmacRateAvg,fmacRateVolatility,fredRate,rateDiffAbove,rateDiffBelow,rateDiffAvg,rateDiffAbovePct,rateDiffBelowPct,rateDiffAvgPct
0,2307189,100012021911,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",4.375,417000,180,2009-02-01,27.0,27.0,2.0,14.0,R,NJ,76,0.0,0.0,759.0,756.0,757.5,43,137.0,35620,1.0,39,4,1.0,5.25,5.04,5.13,0.041667,2.87,-0.875,0.665,-0.755,-0.166667,0.131944,-0.147173
1,2307190,100209956357,B,"WELLS FARGO BANK, N.A.",5.125,348000,360,2009-02-01,87.0,87.0,1.0,50.0,R,VT,51,25.0,1.0,689.0,689.0,689.0,90,293.0,0,9.0,75,15,7.0,5.25,5.04,5.13,0.041667,2.87,-0.125,-0.085,-0.005,-0.02381,-0.016865,-0.000975
2,2307191,100250194205,R,OTHER,5.125,65000,360,2009-02-01,95.0,95.0,1.0,17.0,P,IL,613,25.0,1.0,738.0,738.0,738.0,38,322.0,0,9.0,18,20,16.0,5.25,5.04,5.13,0.041667,2.87,-0.125,-0.085,-0.005,-0.02381,-0.016865,-0.000975
3,2307192,100366856388,R,OTHER,4.875,37000,360,2009-02-01,55.0,55.0,1.0,23.0,P,IL,626,0.0,0.0,777.0,777.0,777.0,48,312.0,0,6.0,44,4,5.0,5.25,5.04,5.13,0.041667,2.87,-0.375,0.165,-0.255,-0.071429,0.032738,-0.049708
4,2307193,100392775700,C,"BANK OF AMERICA, N.A.",4.625,195000,360,2009-02-01,52.0,52.0,2.0,54.0,C,NJ,82,0.0,0.0,810.0,703.0,756.5,69,291.0,12100,1.0,68,1,1.0,5.25,5.04,5.13,0.041667,2.87,-0.625,0.415,-0.505,-0.119048,0.082341,-0.098441


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240948 entries, 0 to 240947
Data columns (total 38 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          240948 non-null  int64  
 1   id                  240948 non-null  int64  
 2   origChannel         240948 non-null  object 
 3   sellerName          240948 non-null  object 
 4   origIntRate         240948 non-null  float64
 5   origUPB             240948 non-null  int64  
 6   origLoanTerm        240948 non-null  int64  
 7   origDate            240948 non-null  object 
 8   origLTV             240948 non-null  float64
 9   origCLTV            240948 non-null  float64
 10  numBorrowers        240948 non-null  float64
 11  origDebtIncRatio    240948 non-null  float64
 12  loanPurp            240948 non-null  object 
 13  propState           240948 non-null  object 
 14  zipCode             240948 non-null  int64  
 15  pMIperct            240948 non-nul

## Remove columns not involved in 'Foreclosure or Not' query

In [4]:
df.drop(['Unnamed: 0','id','fredRate',\
         'bestCreditScore','worstCreditScore','monMatur',\
         'deliqGood','deliqBad','deliqMax',\
        'fmacRateMax','fmacRateMin','fmacRateVolatility',\
        'rateDiffAbove','rateDiffBelow','rateDiffAbovePct','rateDiffBelowPct',\
        ],1,inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240948 entries, 0 to 240947
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   origChannel       240948 non-null  object 
 1   sellerName        240948 non-null  object 
 2   origIntRate       240948 non-null  float64
 3   origUPB           240948 non-null  int64  
 4   origLoanTerm      240948 non-null  int64  
 5   origDate          240948 non-null  object 
 6   origLTV           240948 non-null  float64
 7   origCLTV          240948 non-null  float64
 8   numBorrowers      240948 non-null  float64
 9   origDebtIncRatio  240948 non-null  float64
 10  loanPurp          240948 non-null  object 
 11  propState         240948 non-null  object 
 12  zipCode           240948 non-null  int64  
 13  pMIperct          240948 non-null  float64
 14  mortInsType       240948 non-null  float64
 15  avgCreditScore    240948 non-null  float64
 16  loanAge           24

### Check if any columns have a null value

In [6]:
df.apply(lambda x: x.isnull().sum(), axis=0)

origChannel         0
sellerName          0
origIntRate         0
origUPB             0
origLoanTerm        0
origDate            0
origLTV             0
origCLTV            0
numBorrowers        0
origDebtIncRatio    0
loanPurp            0
propState           0
zipCode             0
pMIperct            0
mortInsType         0
avgCreditScore      0
loanAge             0
mSA                 0
zeroBalCode         0
fmacRateAvg         0
rateDiffAvg         0
rateDiffAvgPct      0
dtype: int64

## Engineer originDate into Date features

In [7]:
df['origDate'] = pd.to_datetime(df['origDate'])

In [8]:
df['origYear'] = df['origDate'].dt.year
df['origMonth'] = df['origDate'].dt.month

In [9]:
df.drop(['origDate'],1,inplace=True)

In [10]:
df[['origYear','origMonth']].head()

Unnamed: 0,origYear,origMonth
0,2009,2
1,2009,2
2,2009,2
3,2009,2
4,2009,2


# Turn zero Balance Code Into binary "0" GOOD and "1" BAD

In [11]:
df.zeroBalCode.unique()

array([ 1.,  9.,  6., 15.,  3., 16.,  2.])

In [12]:
df['zeroBalCode'] = df.zeroBalCode.replace([1,6,16],0)
df['zeroBalCode'] = df.zeroBalCode.replace([9,3,2,15],1)

In [13]:
df.zeroBalCode.unique()

array([0., 1.])

In [14]:
#Checking datatypes of individual feature
df.head()

Unnamed: 0,origChannel,sellerName,origIntRate,origUPB,origLoanTerm,origLTV,origCLTV,numBorrowers,origDebtIncRatio,loanPurp,propState,zipCode,pMIperct,mortInsType,avgCreditScore,loanAge,mSA,zeroBalCode,fmacRateAvg,rateDiffAvg,rateDiffAvgPct,origYear,origMonth
0,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",4.375,417000,180,27.0,27.0,2.0,14.0,R,NJ,76,0.0,0.0,757.5,43,35620,0.0,5.13,-0.755,-0.147173,2009,2
1,B,"WELLS FARGO BANK, N.A.",5.125,348000,360,87.0,87.0,1.0,50.0,R,VT,51,25.0,1.0,689.0,90,0,1.0,5.13,-0.005,-0.000975,2009,2
2,R,OTHER,5.125,65000,360,95.0,95.0,1.0,17.0,P,IL,613,25.0,1.0,738.0,38,0,1.0,5.13,-0.005,-0.000975,2009,2
3,R,OTHER,4.875,37000,360,55.0,55.0,1.0,23.0,P,IL,626,0.0,0.0,777.0,48,0,0.0,5.13,-0.255,-0.049708,2009,2
4,C,"BANK OF AMERICA, N.A.",4.625,195000,360,52.0,52.0,2.0,54.0,C,NJ,82,0.0,0.0,756.5,69,12100,0.0,5.13,-0.505,-0.098441,2009,2


# Review values for dates and slice (if needed) the timeframe you want

In [15]:
df.origYear.unique()

array([2009, 2010, 2011, 2012, 2013], dtype=int64)

In [16]:
df['origMonth'].unique()

array([ 2,  3,  5,  4,  6,  7,  8,  9, 10, 11, 12,  1], dtype=int64)

In [17]:
df.head()

Unnamed: 0,origChannel,sellerName,origIntRate,origUPB,origLoanTerm,origLTV,origCLTV,numBorrowers,origDebtIncRatio,loanPurp,propState,zipCode,pMIperct,mortInsType,avgCreditScore,loanAge,mSA,zeroBalCode,fmacRateAvg,rateDiffAvg,rateDiffAvgPct,origYear,origMonth
0,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",4.375,417000,180,27.0,27.0,2.0,14.0,R,NJ,76,0.0,0.0,757.5,43,35620,0.0,5.13,-0.755,-0.147173,2009,2
1,B,"WELLS FARGO BANK, N.A.",5.125,348000,360,87.0,87.0,1.0,50.0,R,VT,51,25.0,1.0,689.0,90,0,1.0,5.13,-0.005,-0.000975,2009,2
2,R,OTHER,5.125,65000,360,95.0,95.0,1.0,17.0,P,IL,613,25.0,1.0,738.0,38,0,1.0,5.13,-0.005,-0.000975,2009,2
3,R,OTHER,4.875,37000,360,55.0,55.0,1.0,23.0,P,IL,626,0.0,0.0,777.0,48,0,0.0,5.13,-0.255,-0.049708,2009,2
4,C,"BANK OF AMERICA, N.A.",4.625,195000,360,52.0,52.0,2.0,54.0,C,NJ,82,0.0,0.0,756.5,69,12100,0.0,5.13,-0.505,-0.098441,2009,2


# Coerce the column dtype (int in this case)

In [18]:
df = df.astype({'origLTV':'int','origCLTV':'int','numBorrowers':'int','origDebtIncRatio':'int',\
               'mortInsType':'int','loanAge':'int',\
               'avgCreditScore':'int','zeroBalCode':'int'})

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240948 entries, 0 to 240947
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   origChannel       240948 non-null  object 
 1   sellerName        240948 non-null  object 
 2   origIntRate       240948 non-null  float64
 3   origUPB           240948 non-null  int64  
 4   origLoanTerm      240948 non-null  int64  
 5   origLTV           240948 non-null  int32  
 6   origCLTV          240948 non-null  int32  
 7   numBorrowers      240948 non-null  int32  
 8   origDebtIncRatio  240948 non-null  int32  
 9   loanPurp          240948 non-null  object 
 10  propState         240948 non-null  object 
 11  zipCode           240948 non-null  int64  
 12  pMIperct          240948 non-null  float64
 13  mortInsType       240948 non-null  int32  
 14  avgCreditScore    240948 non-null  int32  
 15  loanAge           240948 non-null  int32  
 16  mSA               24

<a id='MLReady'></a>

# Train Test Split to create holdout dataset outside of PyCaret

In [20]:
good = df.zeroBalCode.value_counts()[0]
bad = df.zeroBalCode.value_counts()[1]
perct_bad = round(bad/(good + bad)*100,2)
print(f'We have {perct_bad}% Foreclosures in our dataset')

We have 8.96% Foreclosures in our dataset


In [21]:
from sklearn.model_selection import train_test_split

training_features, test_features, \
training_target, test_target, = train_test_split(
    df.drop(['zeroBalCode'], axis=1)
    , df['zeroBalCode']
    , test_size = .1
    , random_state=12
)

In [22]:
# Further split the training data into training/test
x_train, x_val, y_train, y_val = train_test_split(
    training_features
    , training_target
    , test_size = .1
    ,random_state=12
)

In [23]:
# For the training data, randomly sample 
# ros = RandomOverSampler(sampling_strategy='minority')
# x_train_res, y_train_res = ros.fit_sample(x_train, y_train)

In [24]:
print('#############################################')
print('Before oversampling: "Closed" crushes "Default" and causes issues:')
print(training_target.value_counts())
# print('')
# print('Before oversampling: "Closed" and "Default" are equal')
# print(y_train_res.value_counts())

#############################################
Before oversampling: "Closed" crushes "Default" and causes issues:
0    197398
1     19455
Name: zeroBalCode, dtype: int64


# Recombine X_train_res, y_train_res so PyCaret can deal with one df

In [25]:
y_train.shape

(195167,)

In [26]:
df = x_train.copy()

In [27]:
df['zeroBalCode'] = y_train

In [28]:
df.shape

(195167, 23)

# Review metrics last time before run (to have record of starting values)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195167 entries, 166485 to 166828
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   origChannel       195167 non-null  object 
 1   sellerName        195167 non-null  object 
 2   origIntRate       195167 non-null  float64
 3   origUPB           195167 non-null  int64  
 4   origLoanTerm      195167 non-null  int64  
 5   origLTV           195167 non-null  int32  
 6   origCLTV          195167 non-null  int32  
 7   numBorrowers      195167 non-null  int32  
 8   origDebtIncRatio  195167 non-null  int32  
 9   loanPurp          195167 non-null  object 
 10  propState         195167 non-null  object 
 11  zipCode           195167 non-null  int64  
 12  pMIperct          195167 non-null  float64
 13  mortInsType       195167 non-null  int32  
 14  avgCreditScore    195167 non-null  int32  
 15  loanAge           195167 non-null  int32  
 16  mSA            

In [30]:
df.head()

Unnamed: 0,origChannel,sellerName,origIntRate,origUPB,origLoanTerm,origLTV,origCLTV,numBorrowers,origDebtIncRatio,loanPurp,propState,zipCode,pMIperct,mortInsType,avgCreditScore,loanAge,mSA,fmacRateAvg,rateDiffAvg,rateDiffAvgPct,origYear,origMonth,zeroBalCode
166485,B,OTHER,4.25,178000,360,54,54,1,50,C,MA,27,0.0,0,712,84,39300,4.106,0.144,0.035071,2011,9,0
121042,R,"WELLS FARGO BANK, N.A.",4.625,92000,180,80,80,1,38,C,SC,298,0.0,0,763,55,12260,4.564,0.061,0.013365,2010,7,0
139151,C,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",4.375,163000,240,54,87,2,32,R,CA,917,0.0,0,676,42,31080,4.3,0.075,0.017442,2010,11,0
86969,C,"GMAC MORTGAGE, LLC",4.375,180000,360,80,80,1,36,P,CA,946,0.0,0,777,81,41860,4.93,-0.555,-0.112576,2009,12,0
22472,R,"WELLS FARGO BANK, N.A.",4.25,53000,180,36,36,2,37,R,NC,272,0.0,0,810,44,24660,5.0025,-0.7525,-0.150425,2009,3,0


# Pycaret - Setup with categorical definition

### Use complex setup 

In [31]:
model_setup = setup(
    df
    , target = 'zeroBalCode' # PyCaret will list this as "Label"
    , normalize = True
    , transformation = False
    , pca = False 
    , ignore_low_variance = False # Variance is calculated using the ratio of unique values to the number of samples, and the ratio of the most common value to the frequency of the second most common value. 
    , ignore_features = None
    , handle_unknown_categorical = True
    , remove_outliers = True
    , remove_multicollinearity = False
    , polynomial_features = False
    , trigonometry_features = False
    , feature_selection = False
    , feature_interaction = False
    , fix_imbalance = False
    , silent = True
    , profile = False
    # outliers from the training data are removed using PCA linear dimensionality reduction using the Singular Value Decomposition technique.
    , bin_numeric_features = [
        'avgCreditScore'
        ] # Set to True to bin numerics using K Means
    , categorical_features = [
         'origChannel'
        , 'loanPurp'
        , 'origYear'
        , 'origMonth'
        , 'mortInsType'
        , 'origLoanTerm'
        ]
    , high_cardinality_features = [
         'propState'
        , 'sellerName'
        , 'mSA'
        , 'zipCode'
    ]
    , high_cardinality_method = 'clustering'
    , numeric_features = [
        'origIntRate'
        , 'numBorrowers'
        , 'origUPB'
        , 'origLTV'
        , 'origCLTV'
        , 'pMIperct'
        , 'loanAge'
        , 'origDebtIncRatio'
        , 'avgCreditScore'
        , 'rateDiffAvg'
        , 'rateDiffAvgPct'
        , 'fmacRateAvg'
    ]
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,7425
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(195167, 23)"
4,Missing Values,False
5,Numeric Features,14
6,Categorical Features,8
7,Ordinal Features,False
8,High Cardinality Features,True
9,High Cardinality Method,clustering


## Compare Models with no blacklist exclusions

In [32]:
model_results=compare_models(sort='Recall') #blacklist = ['tr','ransac'],'lar','par','huber','llar','lasso','en','ridge','omp','br','svm'])
model_results

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Naive Bayes,0.714,0.7379,0.6047,0.1754,0.2719,0.1564,0.203,0.0975
1,Quadratic Discriminant Analysis,0.6814,0.5981,0.4143,0.1235,0.181,0.0592,0.0779,0.4158
2,Decision Tree Classifier,0.8469,0.5598,0.211,0.1827,0.1958,0.1117,0.1121,1.3694
3,K Neighbors Classifier,0.9022,0.6477,0.0643,0.273,0.104,0.0727,0.0948,3.6708
4,Extreme Gradient Boosting,0.9113,0.7985,0.0596,0.4804,0.106,0.0882,0.1453,6.2595
5,CatBoost Classifier,0.9117,0.8059,0.0501,0.5003,0.091,0.0761,0.1369,15.0821
6,Light Gradient Boosting Machine,0.9121,0.8051,0.0406,0.5327,0.0754,0.0637,0.1288,0.6263
7,Ada Boost Classifier,0.9109,0.7917,0.0367,0.4453,0.0678,0.0551,0.1076,5.6767
8,Random Forest Classifier,0.91,0.704,0.0346,0.3904,0.0636,0.0499,0.0945,0.3708
9,Gradient Boosting Classifier,0.9119,0.8025,0.0202,0.5324,0.039,0.0328,0.0907,22.7855


GaussianNB(priors=None, var_smoothing=1e-09)

## Create models 

### NB Classifier

In [33]:
nb = create_model('nb', fold=10)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7112,0.7335,0.5908,0.1711,0.2654,0.1488,0.1933
1,0.7091,0.7236,0.5742,0.1668,0.2584,0.1409,0.1828
2,0.7133,0.7357,0.5977,0.1736,0.2691,0.1532,0.1988
3,0.7152,0.7345,0.6112,0.1774,0.275,0.1599,0.2075
4,0.7171,0.7493,0.6234,0.1808,0.2803,0.166,0.2155
5,0.7138,0.738,0.6065,0.1756,0.2723,0.1569,0.2038
6,0.7187,0.7388,0.6065,0.1784,0.2757,0.1613,0.208
7,0.7092,0.7418,0.6222,0.1759,0.2742,0.1583,0.208
8,0.7164,0.743,0.6108,0.1779,0.2756,0.1608,0.2083
9,0.7157,0.7407,0.6038,0.1762,0.2728,0.1577,0.2041


## Evaluate Models

In [35]:
evaluate_model(nb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

## Tune Model

In [36]:
tuned_nb = tune_model(nb, optimize = 'Recall')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7112,0.7335,0.5908,0.1711,0.2654,0.1488,0.1933
1,0.7091,0.7236,0.5742,0.1668,0.2584,0.1409,0.1828
2,0.7133,0.7357,0.5977,0.1736,0.2691,0.1532,0.1988
3,0.7152,0.7345,0.6112,0.1774,0.275,0.1599,0.2075
4,0.7171,0.7493,0.6234,0.1808,0.2803,0.166,0.2155
5,0.7138,0.738,0.6065,0.1756,0.2723,0.1569,0.2038
6,0.7187,0.7388,0.6065,0.1784,0.2757,0.1613,0.208
7,0.7092,0.7418,0.6222,0.1759,0.2742,0.1583,0.208
8,0.7164,0.743,0.6108,0.1779,0.2756,0.1608,0.2083
9,0.7157,0.7407,0.6038,0.1762,0.2728,0.1577,0.2041


In [37]:
evaluate_model(tuned_nb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [38]:
final_nb = finalize_model(tuned_nb)

In [39]:
evaluate_model(final_nb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

# Predict models

In [40]:
predicted_final_nb = predict_model(final_nb)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.7148,0.7383,0.6126,0.1773,0.2751,0.16,0.2079


In [41]:
predicted_nb = predict_model(nb)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.7144,0.7378,0.6142,0.1774,0.2752,0.1602,0.2083


In [42]:
predicted_nb_tuned = predict_model(tuned_nb)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.7144,0.7378,0.6142,0.1774,0.2752,0.1602,0.2083


# The golden moment... put 10% sample test data through the models...

# Recombine test_features, test_target at first train/test/split
 This "test" set was split BEFORE Oversampling. PyCaret takes the "train" from this split and splits it internally again. Since we have no control of how PyCaret does this we needed to do a train/test split before we input the training set into PyCaret. Now we will recombine the test_features and test_target into one new df

In [43]:
test_features.shape

(24095, 22)

In [44]:
test_target.shape

(24095,)

In [45]:
test_df = test_features.copy()

In [46]:
test_df['zeroBalCode'] = test_target

In [47]:
test_df.shape

(24095, 23)

In [48]:
totalTestCount = test_df.zeroBalCode.value_counts().sum()
posTestCount = test_df.zeroBalCode.value_counts()[1]
print('#############################################')
print('This "Test" set has not been oversamplied')
print(test_df.zeroBalCode.value_counts())
print(f'Positive samples are {round(posTestCount/totalTestCount*100,2)}%')

#############################################
This "Test" set has not been oversamplied
0    21971
1     2124
Name: zeroBalCode, dtype: int64
Positive samples are 8.82%


## Create Confusion Matrix output functions

In [49]:
def calc_confusion(row):
    if ((row['zeroBalCode'] == 0) & (row['Label'] == 0)):
        value = 'TrueNegative'
    elif ((row['zeroBalCode'] == 0) & (row['Label'] == 1)):
        value = 'FalsePositive'
    elif ((row['zeroBalCode'] == 1) & (row['Label'] == 1)):
        value = 'TruePositive'
    elif ((row['zeroBalCode'] == 1) & (row['Label'] == 0)):
        value = 'FalseNegative'
    else:
        value = 'Undefined'
    return value

In [50]:
def cM(df):
    print(f'           ##############################')
    print(f'           #             #              #')
    print(f'         0 #    {df["TrueNegative"]}     #     {df["FalsePositive"]}      #')
    print(f'  True     #             #              #')
    print(f'  Class    ##############################')
    print(f'           #             #              #')
    print(f'         1 #    {df["FalseNegative"]}      #     {df["TruePositive"]}      #')
    print(f'           #             #              #')
    print(f'           ##############################')
    print(f'                  0              1        ')
    print(f'                  Predicted Class           ')


In [51]:
def cMCalc():
    # 0 or 2 = Predict equals truth, 1 = Predict doesn't equal truth
    totalNegative = confusionMatrix['TrueNegative'] + confusionMatrix['FalseNegative']
    totalPositive = confusionMatrix['TruePositive'] + confusionMatrix['FalsePositive']
    total = totalNegative + totalPositive
    trueNegativePct = confusionMatrix['TrueNegative'] / total * 100
    falseNegativePct = confusionMatrix['FalseNegative'] / total * 100
    truePositivePct = confusionMatrix['TruePositive'] / total * 100
    falsePositivePct = confusionMatrix['FalsePositive'] / total * 100

    # Accuracy Calculation
    accuracy = (confusionMatrix['TruePositive'] + confusionMatrix['TrueNegative'])/(totalPositive + totalNegative)
    # Recall Calculation
    recall = (confusionMatrix['TruePositive']/(confusionMatrix['TruePositive'] + confusionMatrix['FalseNegative']))
    # Precision Calculation
    precision = (confusionMatrix['TruePositive']/(confusionMatrix['TruePositive'] + confusionMatrix['FalsePositive']))

    print(f'TrueNegative:  {round(trueNegativePct,2)}%')
    print(f'FalseNegative:  {round(falseNegativePct,2)}%')
    print(f'TruePositive:   {round(truePositivePct,2)}%')
    print(f'FalsePositive: {round(falsePositivePct,2)}%')
    print(f'#####################################################')
    print(f'Accuracy: {round(accuracy,2)*100}%   Recall: {round(recall,2)*100}%   Precision: {round(precision,2)*100}%')
    print(f'#####################################################')

# Evaluate Models

In [52]:
predictions = predict_model(final_nb, data=test_df)

In [53]:
pd.set_option('max_columns',None)
predictions.head()

Unnamed: 0,origChannel,sellerName,origIntRate,origUPB,origLoanTerm,origLTV,origCLTV,numBorrowers,origDebtIncRatio,loanPurp,propState,zipCode,pMIperct,mortInsType,avgCreditScore,loanAge,mSA,fmacRateAvg,rateDiffAvg,rateDiffAvgPct,origYear,origMonth,zeroBalCode,Label,Score
0,R,"WELLS FARGO BANK, N.A.",4.75,248000,360,90,90,1,44,R,MD,210,25.0,1,703,72,12580,4.8575,-0.1075,-0.022131,2009,5,0,1,0.8417
1,C,"BANK OF AMERICA, N.A.",4.875,289000,360,89,89,2,40,R,RI,28,25.0,1,776,70,39300,4.8575,0.0175,0.003603,2009,5,0,0,0.1697
2,R,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",4.875,230000,360,80,80,2,41,C,NJ,77,0.0,0,784,68,35620,4.8575,0.0175,0.003603,2009,5,0,0,0.001
3,R,PHH MORTGAGE CORPORATION,3.75,140000,180,80,80,1,22,C,TN,373,0.0,0,670,71,0,3.675,0.075,0.020408,2012,6,0,1,0.6661
4,R,OTHER,4.0,220000,360,80,80,2,39,C,MI,491,0.0,0,745,83,35660,3.602,0.398,0.110494,2012,8,1,1,0.7747


In [54]:
results = predictions[['zeroBalCode','Label','Score']]

In [55]:
predictions.Label.value_counts()

0    18273
1     5822
Name: Label, dtype: int64

In [56]:
results['Confusion'] = results.apply(calc_confusion, axis=1)

In [57]:
confusionMatrix = results.Confusion.value_counts().to_dict()

In [58]:
cM(confusionMatrix)

           ##############################
           #             #              #
         0 #    17202     #     4769      #
  True     #             #              #
  Class    ##############################
           #             #              #
         1 #    1071      #     1053      #
           #             #              #
           ##############################
                  0              1        
                  Predicted Class           


In [59]:
cMCalc()

TrueNegative:  71.39%
FalseNegative:  4.44%
TruePositive:   4.37%
FalsePositive: 19.79%
#####################################################
Accuracy: 76.0%   Recall: 50.0%   Precision: 18.0%
#####################################################


In [82]:
#save_model(final_nb, 'np.final5pct_092020')