# Evaluating Classifiers

In [None]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import plot_roc_curve, auc, roc_curve

### Loading Penguins again

In [None]:
# All steps up to the feature engineering were moved out of the notebook
# into the penguin_dataset module.
from penguin_dataset import preprocess_penguins

# The call is unpacked into six values, named here as feature/target pairs
# for the train, validation and test portions.
(Xtrain, ytrain,
 Xval, yval,
 Xtest, ytest) = preprocess_penguins()

### Predictions for the test data

In [None]:
# Load the previously trained model from disk. The context manager closes the
# file handle even if unpickling fails (a bare open() call never closes it).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../models/penguin_forest.pkl', 'rb') as model_file:
    m = pickle.load(model_file)

# Class predictions for the held-out test features.
ypred = m.predict(Xtest)

In [None]:
# predict_proba returns one column per class, in the order of m.classes_.
# Labelling the columns from the model itself avoids a hand-written list that
# could mislabel the probabilities (or fail) if the classes ever change.
probs = pd.DataFrame(m.predict_proba(Xtest), columns=m.classes_)
probs

### Confusion Matrix

In [None]:
# Confusion matrix of the test predictions: rows are the actual classes,
# columns the predicted classes.
conf = confusion_matrix(y_true=ytest, y_pred=ypred)
conf

In [None]:
# Render the confusion matrix as an annotated heatmap; both axes are labelled
# with the classes that occur in the test targets.
class_labels = np.unique(ytest)

ax = sns.heatmap(
    conf,
    xticklabels=class_labels,
    yticklabels=class_labels,
    cmap='Blues',
    annot=True,
    fmt='g',  # fmt is used to switch off scientific notation
)

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

### Accuracy

* The fraction of correct predictions
* 0.0 worst - 1.0 best
* simple way of assessing the model
* misleading with imbalanced classes (always predicting the majority class already scores high)

$acc = \dfrac{TP+TN}{TP+TN+FP+FN}$


In [None]:
# Compare the predicted labels with the actual labels of the test set.
accuracy_score(y_true=ytest, y_pred=ypred)

### Precision

* High Precision = more relevant than irrelevant results returned (at the expense of missing some relevant ones)
* Also called Positive Predictive Value (not to be confused with the False Positive Rate, $\dfrac{FP}{FP+TN}$)
* High when false positives are low

$precision = \dfrac{TP}{TP+FP}$
    
### Recall
* High Recall = most of the relevant results returned (at the expense of including bad results)
* Also called True Positive Rate or Sensitivity
* High when false negatives are low

$recall = \dfrac{TP}{TP+FN}$

* Remember the two scenarios! Which one suits which measure?

### F1 Score 

$F1 = 2 \cdot \dfrac{precision \cdot recall}{precision + recall}$

In [None]:
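# With the default average='binary' these only work for binary classification;
# for multiclass targets pass e.g. average='macro' (or use classification_report below).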
# precision_score(ytest, ypred)
# recall_score(ytest, ypred)

In [None]:
# Per-class precision, recall and F1 in one text table; print() keeps the
# table's line breaks intact.
report = classification_report(ytest, ypred, zero_division='warn')
print(report)

## Receiver Operating Characteristic

**ROC plots True Positive Rate (TPR, y axis) against False Positive Rate (FPR, x axis).**

In [None]:
# Build targets for a binary (Adelie vs. everything else) classification:
# True where the species is 'Adelie', False otherwise.
ytest_adelie = (ytest == 'Adelie')
ytrain_adelie = (ytrain == 'Adelie')

In [None]:
# Refit the already-loaded model on the boolean Adelie target.
# NOTE(review): fit() is called on `m` itself, so (assuming the usual
# scikit-learn refit-in-place semantics) `m` is the binary classifier from
# here on; re-running the earlier multiclass cells needs the model reloaded.
m.fit(Xtrain, ytrain_adelie)

In [None]:
probs=m.predict_proba(Xtest) 
fpr, tpr, threshold = roc_curve(ytest_adelie, probs[:,0]);

plot_roc_curve(m, Xtest, ytest_adelie)  
plt.title("Precision-Recall vs Threshold Chart")

plt.ylabel("TPR, Recall")
plt.xlabel("FPR")
plt.legend(loc="lower left")
plt.ylim([0,1.05])

### Area Under Curve

In [None]:
# > 0.7 acceptable. Better > 0.8
auc_score = auc(tpr, fpr)
print(f"Area under the curve: {auc_score:5.3}")