In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import numpy as np

model = '../models/justinTest'
test = '../data/MLReady/FM_FULL_EPOCH3_MLReady.csv'
target = 'zeroBalCode'
predictors = ['origIntRate', 'origUPB', 'origLTV', 'origDebtIncRatio', 'stateNumber', 'fredRate']

In [22]:
%%time 

from pycaret.classification import *
the_model = load_model(model)

print(the_model)

Transformation Pipeline and Model Sucessfully Loaded
[Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      ml_usecase='classification',
                                      numerical_features=['origLTV',
                                                          'origDebtIncRatio'],
                                      target='zeroBalCode', time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                numeric_strategy='mean',
                                target_variable=None)),
                (...
                ('group', Empty()), ('nonliner', Empty()), ('scaling', Empty()),
                ('P_transform', Empty()), ('pt_target', Empty()),
                ('binn', Empty()), ('rem_outliers', Empty()),
                ('cl

# Load in random 100 rows of test data from Epoch 3

In [43]:
%%time

import random

random_rows = 1000

n = sum(1 for line in open(test)) - 1 # number of records in file (excludes header)
skip = sorted(random.sample(range(1,n+1),n-random_rows)) # the 0-indexed header will not be included in the skip list

dfTest = pd.read_csv(test, skiprows=skip)

dfTest.head()

Wall time: 98.8 ms


Unnamed: 0.1,Unnamed: 0,origChannel,origIntRate,origUPB,origLTV,numBorrowers,origDebtIncRatio,borrCreditScore,loanPurp,zipCode,pMIperct,mortInsType,bestCreditScore,worstCreditScore,avgCreditScore,bankNumber,stateNumber,mSA,zeroBalCode,fmacRateMax,fmacRateMin,fmacRateAvg,fmacRateVolatility,fredRate,rateDiffAbove,rateDiffBelow,rateDiffAvg,rateDiffAbovePct,rateDiffBelowPct,rateDiffAvgPct,origYear,origMonth
0,3,3,4.49,170000,54,2,39,649,1,117,0.0,0,675,649,662,26,35,35620,0,4.35,4.16,4.255,0.045673,2.65,0.14,-0.33,0.235,0.032184,-0.079327,0.055229,2013,11
1,20,3,4.75,100000,80,2,25,678,2,360,0.0,0,678,657,667,54,1,0,0,4.35,4.16,4.255,0.045673,2.65,0.4,-0.59,0.495,0.091954,-0.141827,0.116334,2013,11
2,40,2,4.742,292000,80,1,41,668,2,606,0.0,0,668,668,668,26,15,16980,0,4.35,4.16,4.255,0.045673,2.65,0.392,-0.582,0.487,0.090115,-0.139904,0.114454,2013,11
3,51,3,4.875,117000,95,1,33,757,2,539,30.0,2,757,757,757,29,51,0,0,4.35,4.16,4.255,0.045673,2.65,0.525,-0.715,0.62,0.12069,-0.171875,0.145711,2013,11
4,84,1,5.5,162000,74,2,18,651,1,704,0.0,0,657,651,654,80,19,35380,0,4.35,4.16,4.255,0.045673,2.65,1.15,-1.34,1.245,0.264368,-0.322115,0.292597,2013,11


In [44]:
# Drop the previous index column
dfTest.drop(['Unnamed: 0'], axis=1, inplace=True)

In [45]:
# Get just the model inputs
def select_columns(data_frame, column_names):
    new_frame = data_frame.loc[:, column_names]
    return new_frame

final_columns = np.append(predictors, target)

dfTestData = select_columns(dfTest, final_columns)

dfTestData.sample(5)

Unnamed: 0,origIntRate,origUPB,origLTV,origDebtIncRatio,stateNumber,fredRate,zeroBalCode
552,4.25,352000,80,37,24,2.05,0
415,3.625,190000,78,37,16,2.12,0
373,4.99,454000,79,41,4,2.0,0
23,4.5,115000,70,40,19,2.65,0
145,4.25,288000,74,43,4,2.48,0


# Predict!
Notice the last two columns 'Label' and 'Score'. 
- Label is the prediction 
- Score is the probability of the prediction
The predicted results are concatenated to the original dataset while all transformations including imputation of missing values (in this case None), categorical encoding, feature extraction etc. are performed automatically under the hood and you do not have to manage the pipeline manually.

In [46]:
%%time 

unseen_predictions = predict_model(model, data=dfTestData)
unseen_predictions.head()

Wall time: 1.14 s


Unnamed: 0,origIntRate,origUPB,origLTV,origDebtIncRatio,stateNumber,fredRate,zeroBalCode,Label,Score
0,4.49,170000,54,39,35,2.65,0,0,0.0696
1,4.75,100000,80,25,1,2.65,0,0,0.2674
2,4.742,292000,80,41,15,2.65,0,0,0.2179
3,4.875,117000,95,33,51,2.65,0,0,0.3947
4,5.5,162000,74,18,19,2.65,0,0,0.1361


In [47]:
results = unseen_predictions[[target,'Label','Score']]

In [48]:
def calc_confusion(row):
    if ((row[target] == 0) & (row['Label'] == 0)):
        value = 'TrueNegative'
    elif ((row[target] == 0) & (row['Label'] == 1)):
        value = 'FalseNegative'
    elif ((row[target] == 1) & (row['Label'] == 1)):
        value = 'TruePositive'
    elif ((row[target] == 1) & (row['Label'] == 0)):
        value = 'FalsePositive'
    else:
        value = 'Undefined'
    return value

results['Confusion'] = results.apply(calc_confusion, axis=1)

confusionMatrix = results.Confusion.value_counts().to_dict()

confusionMatrix

{'TrueNegative': 934,
 'FalsePositive': 58,
 'FalseNegative': 7,
 'TruePositive': 1}

In [60]:
print(f'           |-------------|--------------|')
print(f'           | (TN)        |         (FP) |')
print(f'    True 0 |    {confusionMatrix["TrueNegative"]}      |     {confusionMatrix["FalsePositive"]}       |')
print(f'   Class   |             |              |')
print(f'           --------------|--------------|')
print(f'           |             |              |')
print(f'         1 |    {confusionMatrix["FalseNegative"]}        |     {confusionMatrix["TruePositive"]}        |')
print(f'           | (FN)        |         (TP) |')
print(f'           |-------------|--------------|')
print(f'                0              1         ')
print(f'                  Predicted Class           ')

           |-------------|--------------|
           | (TN)        |         (FP) |
    True 0 |    934      |     58       |
   Class   |             |              |
           --------------|--------------|
           |             |              |
         1 |    7        |     1        |
           | (FN)        |         (TP) |
           |-------------|--------------|
                0              1         
                  Predicted Class           


In [None]:
origIntRate
origUPB
origLTV
origDebtIncRatio
stateNumber
fredRate

print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(Xtest)