In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import *

## Purpose
This notebook will compare some of the sklearn evaluation criteria against fake model outputs
(Threshold for positive class is >= 0.5)

### positive class and negative class are balanced

In [102]:
balanced_y = [0] * 10 + [1] * 10
outputs = [
    {'output': [0.1] * 10 + [0.9] * 10,
    'des' : 'highly confident correct predictions'},
    {'output': [0.4] * 10 + [0.6] * 10,
    'des' : 'marginally confident correct predictions'},
    {'output': [0.2] * 4 + [0.4] * 6 + [0.6] * 6 + [0.8] * 4,
    'des' : 'distributed correct predictions'},
    {'output': [0.9] * 10 + [0.1] * 10,
    'des' : 'highly confident incorrect predictions'},
    {'output': [0.6] * 10 + [0.4] * 10,
    'des' : 'marginally confident incorrect predictions'},
    {'output': [0.8] * 4 + [0.6] * 6 + [0.4] * 6 + [0.2] * 4,
    'des' : 'distributed incorrect predictions'},
    {'output': [0.1] * 18 + [0.9] * 2,
    'des' : 'highly confident skewed negative'},
    {'output': [0.1] * 2 + [0.9] * 18,
    'des' : 'highly confident skewed positive'},
    {'output': [0.4] * 18 + [0.6] * 2,
    'des' : 'marginally confident skewed negative'},
    {'output': [0.4] * 2 + [0.6] * 18,
    'des' : 'marginally confident skewed positive'},
    {'output': [0.4999] * 10 + [1] * 10,
    'des' : 'highly confident positive marginally confident negative'},
    {'output': [0.0001] * 10 + [0.51] * 10,
    'des' : 'marginally confident positive highly confident negative'}
]

In [109]:
def precision(y_true, y_pred):
    y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
    paired = list(zip(y_pred, y_true))
    TP = float(len([x for x in paired if x[0] == 1 and x[1] == 1]))
    FP = float(len([x for x in paired if x[0] == 1 and x[1] == 0]))
    return 0.0 if (TP + FP) == 0 else TP/(TP + FP)

def recall(y_true, y_pred):
    y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
    paired = list(zip(y_pred, y_true))
    TP = float(len([x for x in paired if x[0] == 1 and x[1] == 1]))
    FN = float(len([x for x in paired if x[0] == 0 and x[1] == 1]))
    return 0.0 if (TP + FN) == 0 else TP/(TP + FN)

def f1_score(p,r):
    return 0.0 if p+r == 0 else (2*p*r)/(p+r)

def logloss(y_true, y_pred):
    return -np.sum(y_true * np.log(y_pred))
    
def matt_coeff(y_true, y_pred):
    y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
    paired = list(zip(y_pred, y_true))
    TP = float(len([x for x in paired if x[0] == 1 and x[1] == 1]))
    TN = float(len([x for x in paired if x[0] == 0 and x[1] == 0]))
    FP = float(len([x for x in paired if x[0] == 1 and x[1] == 0]))
    FN = float(len([x for x in paired if x[0] == 0 and x[1] == 1]))
    return -1 if (np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))) == 0 else (TP * TN - FP * FN) / (np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)))

def specificity(y_true, y_pred):
    y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
    paired = list(zip(y_pred, y_true))
    TN = float(len([x for x in paired if x[0] == 0 and x[1] == 0]))
    FP = float(len([x for x in paired if x[0] == 1 and x[1] == 0]))
    return TN/(TN + FP)

def accuracy(y_true, y_pred):
    y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
    paired = list(zip(y_pred, y_true))
    return float(len([x for x in paired if x[0] == x[1]]))/float(len(y_true))
    
results = []
for output in outputs:
    p = precision(balanced_y, output['output'])
    r = recall(balanced_y, output['output'])
    results.append({
        'Description' : output['des'],
        'Precision' : p,
        'Recall/Sensitivity' : r,
        'F1' : f1_score(p,r),
        'LogLoss' : logloss(balanced_y, output['output']),
        'Matt_corr' : matt_coeff(balanced_y, output['output']),
        'Specificity' : specificity(balanced_y, output['output']),
        'roc_auc' : roc_auc_score(balanced_y, [1 if x >= 0.5 else 0 for x in output['output']]),
        'accuracy' : accuracy(balanced_y, output['output'])
    })
pd.DataFrame(results)

Unnamed: 0,Description,F1,LogLoss,Matt_corr,Precision,Recall/Sensitivity,Specificity,accuracy,roc_auc
0,highly confident correct predictions,1.0,0.316082,1.0,1.0,1.0,1.0,1.0,1.0
1,marginally confident correct predictions,1.0,1.532477,1.0,1.0,1.0,1.0,1.0,1.0
2,distributed correct predictions,1.0,1.244795,1.0,1.0,1.0,1.0,1.0,1.0
3,highly confident incorrect predictions,0.0,6.907755,-1.0,0.0,0.0,0.0,0.0,0.0
4,marginally confident incorrect predictions,0.0,2.748872,-1.0,0.0,0.0,0.0,0.0,0.0
5,distributed incorrect predictions,0.0,3.442019,-1.0,0.0,0.0,0.0,0.0,0.0
6,highly confident skewed negative,0.0,6.907755,-1.0,0.0,0.0,1.0,0.85,0.5
7,highly confident skewed positive,0.26087,0.316082,-1.0,0.15,1.0,0.0,0.15,0.5
8,marginally confident skewed negative,0.0,2.748872,-1.0,0.0,0.0,1.0,0.85,0.5
9,marginally confident skewed positive,0.26087,1.532477,-1.0,0.15,1.0,0.0,0.15,0.5


### minor positive major negative

In [110]:
balanced_y = [0] * 17 + [1] * 3
outputs = [
    {'output': [0.1] * 17 + [0.9] * 3,
    'des' : 'highly confident correct predictions'},
    {'output': [0.4] * 17 + [0.6] * 3,
    'des' : 'marginally confident correct predictions'},
    {'output': [0.2] * 7 + [0.4] * 10 + [0.6] * 2 + [0.8] * 1,
    'des' : 'distributed correct predictions'},
    {'output': [0.9] * 17 + [0.1] * 3,
    'des' : 'highly confident incorrect predictions'},
    {'output': [0.6] * 17 + [0.4] * 3,
    'des' : 'marginally confident incorrect predictions'},
    {'output': [0.8] * 7 + [0.6] * 10 + [0.4] * 2 + [0.2] * 1,
    'des' : 'distributed incorrect predictions'},
    {'output': [0.1] * 20,
    'des' : 'highly confident skewed negative'},
    {'output': [0.9] * 20,
    'des' : 'highly confident skewed positive'},
    {'output': [0.4] * 20,
    'des' : 'marginally confident skewed negative'},
    {'output': [0.6] * 20,
    'des' : 'marginally confident skewed positive'}
]

In [111]:
results = []
for output in outputs:
    p = precision(balanced_y, output['output'])
    r = recall(balanced_y, output['output'])
    results.append({
        'Description' : output['des'],
        'Precision' : p,
        'Recall/Sensitivity' : r,
        'F1' : f1_score(p,r),
        'LogLoss' : logloss(balanced_y, output['output']),
        'Matt_corr' : matt_coeff(balanced_y, output['output']),
        'Specificity' : specificity(balanced_y, output['output']),
        'roc_auc' : roc_auc_score(balanced_y, [1 if x >= 0.5 else 0 for x in output['output']]),
        'accuracy' : accuracy(balanced_y, output['output'])
    })
pd.DataFrame(results)

Unnamed: 0,Description,F1,LogLoss,Matt_corr,Precision,Recall/Sensitivity,Specificity,accuracy,roc_auc
0,highly confident correct predictions,1.0,0.316082,1.0,1.0,1.0,1.0,1.0,1.0
1,marginally confident correct predictions,1.0,1.532477,1.0,1.0,1.0,1.0,1.0,1.0
2,distributed correct predictions,1.0,1.244795,1.0,1.0,1.0,1.0,1.0,1.0
3,highly confident incorrect predictions,0.0,6.907755,-1.0,0.0,0.0,0.0,0.0,0.0
4,marginally confident incorrect predictions,0.0,2.748872,-1.0,0.0,0.0,0.0,0.0,0.0
5,distributed incorrect predictions,0.0,3.442019,-1.0,0.0,0.0,0.0,0.0,0.0
6,highly confident skewed negative,0.0,6.907755,-1.0,0.0,0.0,1.0,0.85,0.5
7,highly confident skewed positive,0.26087,0.316082,-1.0,0.15,1.0,0.0,0.15,0.5
8,marginally confident skewed negative,0.0,2.748872,-1.0,0.0,0.0,1.0,0.85,0.5
9,marginally confident skewed positive,0.26087,1.532477,-1.0,0.15,1.0,0.0,0.15,0.5
