# Evaluation metrics (multiclass classification)


## Setting up
- Iris data
- 3 classes
- 2 features
- Logistic regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset object
dataObj = load_iris()

# Feature matrix: keep only columns 1 and 2 of the data (two features)
X = dataObj.data[:, [1, 2]]

# Target vector (class labels)
y = dataObj.target

# Show the distinct class labels
print(np.unique(y))

# Hold out 30% of the samples for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Pipeline: standardize the features ('scl'), then logistic regression ('clf').
# The step names matter: later cells address parameters as 'clf__<name>'.
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(random_state=0, C=1)),
])

pipe_lr.fit(X_train, y_train)

## Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted labels for the held-out test split; reused by the metric cells below
y_pred = pipe_lr.predict(X_test)

# Confusion matrix of true vs. predicted test labels (shown as the cell output)
confusion_matrix(y_test, y_pred)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# `ConfusionMatrixDisplay.from_estimator` (scikit-learn >= 1.0) is its replacement:
# it predicts on X_test with the fitted pipeline and draws the confusion matrix.
ConfusionMatrixDisplay.from_estimator(pipe_lr, X_test, y_test)
plt.show()

Compute class-wise (default) multilabel confusion matrix to evaluate the accuracy of a classification, and output confusion matrices for each class or sample.

Note that `np.flip` is applied to each 2×2 matrix to reverse the order of its elements (along both axes), so that the output is consistent with the layout we used previously.

In [None]:
import seaborn as sns
from sklearn.metrics import multilabel_confusion_matrix

# Multilabel confusion matrix: one 2x2 (one-vs-rest) matrix per class
matrices = multilabel_confusion_matrix(y_true=y_test, y_pred=y_pred)

nc = np.unique(y).shape[0]
fig, axs = plt.subplots(nc, 1, figsize=(5, nc*4))

# np.flip reverses both axes, so after flipping the first row/column is the
# "positive" case and the second is the "negative" case. Label the ticks
# explicitly; seaborn's default 0/1 tick labels would suggest the opposite.
tick_labels = ['positive', 'negative']
for idx, m in enumerate(matrices):
    sns.heatmap(np.flip(m), annot=True, cmap='Blues', ax=axs[idx],
                xticklabels=tick_labels, yticklabels=tick_labels)
    axs[idx].set_title(f'Class {idx}')
    axs[idx].set_xlabel('Predicted')
    axs[idx].set_ylabel('True')

# Keep the stacked subplots' titles and axis labels from overlapping
fig.tight_layout()


## Accuracy, Precision, Recall, F1

`Support` is the number of true instances for each label.

#### Summary using `classification_report`

In [None]:
from sklearn.metrics import classification_report
# Text summary of the classification metrics on the test split, 4 decimal places
report_text = classification_report(y_true=y_test, y_pred=y_pred, digits=4)
print(report_text)

#### Using `score` functions

Calculate average

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Averaging strategies compared for every metric, and the separator line
AVERAGES = ('macro', 'weighted', 'micro')
SEPARATOR = '-' * 20

# Accuracy (a single number, no averaging strategy involved)
ACC = accuracy_score(y_test, y_pred)
print(f"Accuracy:{ACC:6.3f}")
print(SEPARATOR)

# Precision under each averaging strategy
for average in AVERAGES:
    PRE = precision_score(y_test, y_pred, average=average)
    print(f"Precision ({average}):{PRE:6.3f}")
print(SEPARATOR)

# Recall under each averaging strategy
for average in AVERAGES:
    REC = recall_score(y_test, y_pred, average=average)
    print(f"Recall ({average}):{REC:6.3f}")
print(SEPARATOR)

# F1 under each averaging strategy
for average in AVERAGES:
    F1 = f1_score(y_test, y_pred, average=average)
    print(f"F1 Score ({average}):{F1:6.3f}")
print(SEPARATOR)

Calculate classwise values

In [None]:
# average=None returns one score per class instead of a single averaged value
PRE = precision_score(y_test, y_pred, average=None)
REC = recall_score(y_test, y_pred, average=None)
F1 = f1_score(y_test, y_pred, average=None)
print(PRE, REC, F1)

# One row per class, one column per metric
dft = pd.DataFrame({'precision': PRE, 'recall': REC, 'f1': F1})
dft.index.names = ['class']
dft

#### Using the `precision_recall_fscore_support` function

Calculate average

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Collect one record per averaging strategy, then tabulate them
arr = []
for average in ['macro', 'weighted', 'micro']:
    prfs = precision_recall_fscore_support(y_test, y_pred, average=average)
    print(prfs)
    # The result is indexed as (precision, recall, f1, ...); keep the first three
    precision, recall, f1 = prfs[:3]
    data = {'average': average, 'precision': precision, "recall": recall, "f1": f1}
    arr.append(data)

# Rows are indexed by the averaging strategy
dft = pd.DataFrame.from_records(arr, index='average')
display(dft)

Calculate classwise values

In [None]:
# Per-class precision, recall, F1 and support (average=None: no averaging)
prfs = precision_recall_fscore_support(y_test, y_pred, average=None)

# Build the table column by column so each metric keeps its own dtype.
# Stacking the four arrays as rows and transposing forces a single common
# dtype, which turns the integer support counts into floats (e.g. 15.0).
dft = pd.DataFrame(dict(zip(['precision', 'recall', 'f1', 'support'], prfs)))
dft.index.names = ['class']
display(dft)

## Using precision in grid search

In [None]:
from sklearn.metrics import make_scorer

# Wrap micro-averaged precision as a scorer object that can be passed
# to the `scoring` argument of the grid search below
scorer = make_scorer(
    precision_score,
    average='micro',
)

In [None]:
# Show the pipeline's parameters; the keys listed here (e.g. 'clf__C')
# are the names used in the grid-search `param_grid` below
pipe_lr.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the `C` parameter of the pipeline's 'clf' step
param_grid = {'clf__C': [0.001, 0.01, 0.1, 1]}

# 10-fold cross-validated grid search, scored with the custom `scorer`
# (see the `scoring` argument) and run on all available cores
gs = GridSearchCV(pipe_lr, param_grid, scoring=scorer, cv=10, n_jobs=-1)
gs.fit(X_train, y_train)

# Best cross-validated score and the parameter setting that produced it
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Tabulate the grid-search results, best-ranked parameter setting first
df = pd.DataFrame(gs.cv_results_).sort_values(by='rank_test_score')
display(df)