# Evaluation Metrics for Classification

Only the accuracy score for a classification model gives an incomplete picture of your model’s performance. The following evaluation metrics should help gain perspective of the practical usability of a classifier model.

## Reference
* https://towardsdatascience.com/evaluation-metrics-for-classification-409568938a7d
* http://www.acheronanalytics.com/acheron-blog/how-do-machines-learn-bias-data-science
* http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
* 

## Data generation

In [63]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from scipy.stats import itemfreq

np.random.seed(666)

# Total sample size ------------------------------------------------------------
n = 100000
n_features = 20 # Total number of variables (features)
n_informative = 2 # The number of informative variables (features).
n_redundant = 2 # The number of redundant variables (features).
# Samples used for training the models
train_samples = 70000 

X, y = datasets.make_classification(n_samples=n, 
                                    n_features=n_features,
                                    n_informative=n_informative, 
                                    n_redundant=n_redundant)
X = ( X * 5 ) + 50

X_train = X[:train_samples]
X_test = X[train_samples:]
y_train = y[:train_samples]
y_test = y[train_samples:]

print("Shape of X_train:", X_train.shape )
print("Shape of y_train:", y_train.shape, "\n" )
print("Shape of X_test:", X_test.shape )
print("Shape of y_test:", y_test.shape, "\n" )

# Sample data
print("Sample y ", y_train[1:5])
print("Sample x \n", X_train[1:5], "\n" )
# Frequencia
print("Frequency \n", itemfreq( y )) 



Shape of X_train: (70000, 20)
Shape of y_train: (70000,) 

Shape of X_test: (30000, 20)
Shape of y_test: (30000,) 

Sample y  [1 1 1 0]
Sample x 
 [[64.94595194 53.13656135 53.01191613 56.21747762 55.32043575 44.5712013
  48.25344799 45.74158019 50.52065627 47.40033855 41.84799332 49.32500722
  43.73067361 43.02025042 50.78457529 43.09460704 44.32853147 50.86347644
  52.21223543 42.09577081]
 [44.50695934 57.99924053 53.93509367 63.61508376 42.217573   45.33012077
  53.09110498 46.79193816 42.99073914 51.0364739  54.01282072 50.82707778
  51.57016636 44.67104363 45.8775338  44.6275318  53.37676574 42.39480428
  47.71598573 55.71275738]
 [53.10043068 56.81066531 43.10342728 55.46104955 47.07898005 54.57684427
  51.65586498 58.33450312 50.04754331 50.96330756 49.23096942 57.67504612
  50.76146027 53.85844883 50.07657868 44.99553942 54.44174816 48.79662772
  46.97121116 48.17265511]
 [47.0155751  48.3385013  38.87871553 41.00694044 51.2573889  49.39894809
  48.60502111 51.23965781 56.5427

## Define functions for Evaluaton Metrics for Classification

This code defines Precision, Recell, F-measure, and Accuracy for Keras 2.

In [0]:
from keras import backend as K

def mcor(y_true, y_pred):
     #matthews_correlation
     y_pred_pos = K.round(K.clip(y_pred, 0, 1))
     y_pred_neg = 1 - y_pred_pos
  
     y_pos = K.round(K.clip(y_true, 0, 1))
     y_neg = 1 - y_pos
  
     tp = K.sum(y_pos * y_pred_pos)
     tn = K.sum(y_neg * y_pred_neg)
  
     fp = K.sum(y_neg * y_pred_pos)
     fn = K.sum(y_pos * y_pred_neg)
  
     numerator = (tp * tn - fp * fn)
     denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
  
     return numerator / (denominator + K.epsilon())

def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def accuracy(y_true, y_pred):
    """
        Accuracy metric.

        Only computes a Accuracy.

        Computes true positives, true negatives, false positives
        and false negatives.
        
        Accuracy = TP+TN/TP+FP+FN+TN
        
        REF: https://tryolabs.com/blog/2013/03/25/why-accuracy-alone-bad-measure-classification-tasks-and-what-we-can-do-about-it/
    """
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos
 
 
    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos
 
 
    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)
 
 
    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)
    
    accuracy = (tp+tn)/(tp+fp+fn+tn)
    return(accuracy)
  
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall))

# you can use it like this
# model.compile(loss='binary_crossentropy',
#               optimizer= "adam",
#               metrics=[mcor,recall, f1])

## Model 1
Model shows how to use, in Keras 2, custom defined functions ```metrics=[accuracy,precision,recall])``` . These functions are no more provided in Keras 2 by default.

In [65]:
model = Sequential()
model.add(Dense(64, input_dim=20, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=[accuracy,precision,recall])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=128)

score = model.evaluate(X_test, y_test, verbose=0, batch_size=128)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Prediction 
y_pred = model.predict(X_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 8/20
Epoch 9/20
Epoch 10/20

Epoch 11/20
Epoch 12/20
Epoch 13/20

Epoch 14/20
Epoch 15/20
Epoch 16/20

Epoch 17/20
Epoch 18/20
Epoch 19/20

Epoch 20/20
Test score: 0.24956810023387274
Test accuracy: 0.9088333333651225


## Model 2
This model uses ```metrics=['accuracy'])```, default accuracy measure of Keras 2.

In [66]:
model = Sequential()
model.add(Dense(64, input_dim=20, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          epochs=20,
          batch_size=128)

score = model.evaluate(X_test, y_test, verbose=0, batch_size=128)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Prediction 
y_pred = model.predict(X_test)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20

Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

Epoch 18/20
Epoch 19/20
Epoch 20/20
Test score: 0.23711982057491937
Test accuracy: 0.9077666666348775


## Model 3
This model implements the same previous model in a GPU.

In [67]:
import numpy as np
np.random.seed(666)

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from scipy.stats import itemfreq

import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

np.random.seed(666)

with tf.device('/gpu:0'):
  X, y = datasets.make_classification(n_samples=100000, n_features=20,
                                      n_informative=2, n_redundant=2)

  train_samples = 70000 # Samples used for training the models

  X_train = X[:train_samples]
  X_test = X[train_samples:]
  y_train = y[:train_samples]
  y_test = y[train_samples:]

  print("Shape of X_train:", X_train.shape )
  print("Shape of y_train:", y_train.shape, "\n" )
  print("Shape of X_test:", X_test.shape )
  print("Shape of y_test:", y_test.shape, "\n" )

  # Sample data
  print("Sample y ", y_train[1:5])
  print("Sample x \n", X_train[1:5], "\n" )
  # Frequencia
  print("Frequency \n", itemfreq( y )) 
    
  model = Sequential()
  model.add(Dense(64, input_dim=20, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(64, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1, activation='sigmoid'))

  model.compile(loss='binary_crossentropy',
                optimizer='rmsprop',
                metrics=['accuracy'])

  model.fit(X_train, y_train,
            epochs=20,
            batch_size=128)
  score = model.evaluate(X_test, y_test, batch_size=128)
  print(score)

Found GPU at: /device:GPU:0
Shape of X_train: (70000, 20)
Shape of y_train: (70000,) 

Shape of X_test: (30000, 20)
Shape of y_test: (30000,) 

Sample y  [1 1 1 0]
Sample x 
 [[ 2.98919039  0.62731227  0.60238323  1.24349552  1.06408715 -1.08575974
  -0.3493104  -0.85168396  0.10413125 -0.51993229 -1.63040134 -0.13499856
  -1.25386528 -1.39594992  0.15691506 -1.38107859 -1.13429371  0.17269529
   0.44244709 -1.58084584]
 [-1.09860813  1.59984811  0.78701873  2.72301675 -1.5564854  -0.93397585
   0.618221   -0.64161237 -1.40185217  0.20729478  0.80256414  0.16541556
   0.31403327 -1.06579127 -0.82449324 -1.07449364  0.67535315 -1.52103914
  -0.45680285  1.14255148]
 [ 0.62008614  1.36213306 -1.37931454  1.09220991 -0.58420399  0.91536885
   0.331173    1.66690062  0.00950866  0.19266151 -0.15380612  1.53500922
   0.15229205  0.77168977  0.01531574 -1.00089212  0.88834963 -0.24067446
  -0.60575777 -0.36546898]
 [-0.59688498 -0.33229974 -2.22425689 -1.79861191  0.25147778 -0.12021038
  -0

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20

Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.21178311232328415, 0.9160666666984558]


## Impact of Standardization
The model below, is equivalent to model 1. However, all data (predictors) are standardized.

REFERENCE: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [68]:
#mean_X = mean(X)
#mean_Y = mean(y)
#sd_X = sd(X)
#sd_Y = sd(Y)
from sklearn.preprocessing import StandardScaler

# Standardization of X
std = StandardScaler(copy=True, with_mean=True, with_std=True)
std = std.fit(X)
X_std = std.transform(X)
print(X_std[1:5])


X_train = X_std[:train_samples]
X_test = X_std[train_samples:]
y_train = y[:train_samples]
y_test = y[train_samples:]

print("Shape of X_train:", X_train.shape )
print("Shape of y_train:", y_train.shape, "\n" )
print("Shape of X_test:", X_test.shape )
print("Shape of y_test:", y_test.shape, "\n" )

# Sample data
print("Sample y ", y_train[1:5])
print("Sample x \n", X_train[1:5], "\n" )
# Frequencia
print("Frequency \n", itemfreq( y )) 

[[ 2.98996918  0.61800881  0.46626613  0.95613576  1.06147415 -1.08968617
  -0.35555945 -0.84174552  0.10518837 -0.51963654 -1.63628803 -0.11655378
  -1.2497653  -1.40117807  0.1601449  -1.39100283 -1.13605005  0.17237447
   0.44075883 -1.57697059]
 [-1.09537199  1.57270675  0.60947031  2.09153589 -1.55498828 -0.93762498
   0.61049051 -0.63242725 -1.39941584  0.20632752  0.80075038  0.1463433
   0.31511071 -1.07008951 -0.82063344 -1.08313307  0.67735211 -1.52142631
  -0.45994077  1.14330883]
 [ 0.62228909  1.33935179 -1.0707481   0.84003759 -0.5842318   0.91509836
   0.32388202  1.6678073   0.01065244  0.19171967 -0.1572209   1.34489608
   0.15368129  0.77256692  0.01863647 -1.00922311  0.89079059 -0.24101147
  -0.60913581 -0.36298516]
 [-0.59395045 -0.32400234 -1.72608942 -1.37840945  0.25013928 -0.12237282
  -0.28535248  0.25392673  1.30850669 -0.56654184 -0.4159032   1.19836344
   1.49776031  0.64009164 -0.06988056  1.40233901 -0.61643214  1.67408777
   0.7085058  -0.70506509]]
Shap

In [69]:


model = Sequential()
model.add(Dense(64, input_dim=20, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=[accuracy,precision,recall])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=128)

score = model.evaluate(X_test, y_test, verbose=0, batch_size=128)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Prediction 
y_pred = model.predict(X_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 8/20
Epoch 9/20
Epoch 10/20

Epoch 11/20
Epoch 12/20
Epoch 13/20

Epoch 14/20
Epoch 15/20
Epoch 16/20

Epoch 17/20
Epoch 18/20
Epoch 19/20

Epoch 20/20
Test score: 0.21285472310384115
Test accuracy: 0.9161666666984558


## Precision

Precision is the percentage of correctly classified true positives as a percentage of the positive predictions. High precision means that you correctly label as many of the true positives as possible. For example, a medical diagnostic tool should be very precise because not catching an illness can cause an illness to worsen.

## Recall

Recall on the other hand is the percentage of relevant elements returned. For example, if you search for Harry Potter books on Google, recall will be the number of Harry Potter titles returned divided by seven.

Ideally we will have a recall of 1. In this case, it might be a nuisance, and a terrible user experience to sift through irrelevant search results. Additionally, if a user does not see relevant results, they will likely not make any purchases, which eventually could hurt the bottom line. 

## Accuracy

Accuracy is a measure of all the correct predictions as a percentage of the total predictions. Accuracy does poorly as a measure of model performance especially where you have unbalanced classes.11
For precision, recall, accuracy, and confusion matrices to make sense to begin with, the training data should be representative of the population such that the model learns how to classify correctly. 

## F-1 score

The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is:

```
F1 = 2 * (precision * recall) / (precision + recall)
```
In the multi-class and multi-label case, this is the weighted average of the F1 score of each class.


## Extra code
Interesting, but not important code. Development.

In [0]:
from keras import backend as K

def matthews_correlation(y_true, y_pred):
    """Matthews correlation metric.
# Aliases

    It is only computed as a batch-wise average, not globally.

    Computes the Matthews correlation coefficient measure for quality
    of binary classification problems.
    """
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def fbeta_score(y_true, y_pred, beta=1):
    """Computes the F score.

    The F score is the weighted harmonic mean of precision and recall.
    Here it is only computed as a batch-wise average, not globally.

    This is useful for multi-label classification, where input samples can be
    classified as sets of labels. By only using accuracy (precision) a model
    would achieve a perfect score by simply assigning every class to every
    input. In order to avoid this, a metric should penalize incorrect class
    assignments as well (recall). The F-beta score (ranged from 0.0 to 1.0)
    computes this, as a weighted mean of the proportion of correct class
    assignments vs. the proportion of incorrect class assignments.

    With beta = 1, this is equivalent to a F-measure. With beta < 1, assigning
    correct classes becomes more important, and with beta > 1 the metric is
    instead weighted towards penalizing incorrect class assignments.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def fmeasure(y_true, y_pred):
    """Computes the f-measure, the harmonic mean of precision and recall.

    Here it is only computed as a batch-wise average, not globally.
    """
    return fbeta_score(y_true, y_pred, beta=1)


# aliases
fscore = f1score = fmeasure

In [0]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
np.random.seed(666)

# Generate dummy data ----------------------------------------------------------
n = 100000
perc_train = 0.9
perc_test = 1 - perc_train
n_variables = 20 # number of features (variables)

X_train = np.random.random((int(n*perc_train), 20))
y_train = np.random.randint(2, size=(int(n*perc_train), 1))
X_test = np.random.random((n - int(n*perc_train), 20))
y_test = np.random.randint(2, size=(n - int(n*perc_train), 1))

print("Shape of X_train:", X_train.shape )
print("Shape of y_train:", y_train.shape, "\n" )
print("Shape of X_test:", X_test.shape )
print("Shape of y_test:", y_test.shape, "\n" )

# Sample data
print("Sample y ", y_train[1:5])
print("Sample x \n", X_train[1:5], "\n" )
# Frequencia
print("Frequency \n", itemfreq( y )) 
