# Metrics

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import mglearn
import pandas as pd
from sklearn.datasets import load_iris,make_moons,make_blobs,make_regression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix,precision_score,recall_score,f1_score,
    roc_curve,roc_auc_score,precision_recall_curve,accuracy_score,classification_report,mean_squared_error,r2_score)

### Part 1: Regression

A simple linear regression problem:

In [None]:
# Draw a noisy one-feature regression sample.  The generator is seeded so the
# scatter plot and the scores printed further down are identical on every run,
# like the seeded make_moons samples used later in this file.
X, y = make_regression(n_samples=1000, n_features=1, noise=10.0, random_state=0)

# Show the raw sample: the single feature against the target.
plt.scatter(X, y)
plt.show()

In [None]:
# Fit a linear regression of the target on the single feature.
model = LinearRegression()
model.fit(X, y)

In [None]:
# Predictions of the fitted model on the same data it was trained on.
y_pred = model.predict(X)

# Faint data cloud, with the predictions drawn on top as tiny red dots.
plt.scatter(X, y, alpha=0.1)
plt.scatter(X, y_pred, s=0.1, color='red')
plt.show()

In [None]:
# Error metrics of the predictions against the targets they were fitted on.
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print("Mean squared error: {}".format(mse))
print("R2 score: {}".format(r2))

Rescaling the data:

In [None]:
# Shrink the target by a factor of 100; the feature values are left untouched.
y = y * 0.01

In [None]:
# Refit on the rescaled target and predict on the training data again.
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

# Same picture as before: faint data cloud plus the predictions in red.
plt.scatter(X, y, alpha=0.1)
plt.scatter(X, y_pred, s=0.1, color='red')
plt.show()

In [None]:
# Same two metrics, now computed on the rescaled target.
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print("Mean squared error: {}".format(mse))
print("R2 score: {}".format(r2))

### Part 2: Binary Classification

We use the synthetic 'moons' data:

In [None]:
# Synthetic 'moons' sample.  Both the generator and the train/test split are
# seeded so the confusion matrix, scores and ROC curve computed from this
# split are the same on every run.
X, y = make_moons(n_samples=1000, noise=0.25, random_state=4)
# Stratify on y so the split is made with respect to the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Training points only, marked by class.
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.show()

We look at the performance of an SVM classifier with an RBF kernel:

In [None]:
# RBF-kernel SVM trained on the training split and evaluated on the test split.
kernel_svm = SVC(kernel='rbf', gamma=0.1)
kernel_svm.fit(X_train, y_train)
predictions = kernel_svm.predict(X_test)

print("Confusion matrix:  \n{}\n".format(confusion_matrix(y_test, predictions)))

# The three binary scores all treat class 1 as the positive class.
for template, score in (
    ("Precision: \n{}\n", precision_score),
    ("Recall: \n{}\n", recall_score),
    ("F1: \n{}", f1_score),
):
    print(template.format(score(y_test, predictions, pos_label=1)))


Now looking at the ROC curve:

In [None]:
# Decision-function scores for the test set, computed once and reused for both
# the ROC curve and the area under it.
svm_scores = kernel_svm.decision_function(X_test)
fprSVM, tprSVM, thresh = roc_curve(y_test, svm_scores)

# Labelled axes and a legend entry so curves of further classifiers can be
# added to the same figure and told apart.
plt.plot(fprSVM, tprSVM, label="SVM (rbf)")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()

print("Area under curve: \n{}".format(roc_auc_score(y_test, svm_scores)))
plt.show()

**Exercise:** try some other classifier(s) on this dataset, and compare their ROC curves. Do you find a classifier that strictly dominates another, in the sense that its ROC curve is always above the other's?

### Part 3: Multiclass Metrics

We reconstruct the confusion matrix from the slides by building imaginary arrays of true and predicted labels:

In [None]:
# Imaginary 4-class example with 220 samples, written as runs of equal labels.
# True labels: 100 x class 1, 10 x class 2, 10 x class 3, 100 x class 4.
truelabels = np.repeat([1, 2, 3, 4], [100, 10, 10, 100])

# Predicted labels as (label, run length) pairs in sample order; each row
# below covers the samples of one true class.
pred_runs = [
    (1, 89), (2, 4), (3, 4), (4, 3),   # samples whose true class is 1
    (2, 3), (3, 3), (4, 4),            # samples whose true class is 2
    (1, 2), (3, 8),                    # samples whose true class is 3
    (1, 1), (3, 1), (4, 98),           # samples whose true class is 4
]
pred_values, pred_counts = zip(*pred_runs)
predlabels = np.repeat(pred_values, pred_counts)

In [None]:
# Confusion matrix and overall accuracy of the imaginary predictions.
cm = confusion_matrix(truelabels, predlabels)
acc = accuracy_score(truelabels, predlabels)
print("Confusion matrix:  \n{}\n".format(cm))
print("Accuracy: \n{}\n".format(acc))

Now let's look at the averaged binary scores:



In [None]:
# Classification report first, then the F1 score under the two averaging modes.
report = classification_report(truelabels, predlabels)
print("One-vs-all measures: \n{}\n".format(report))

for template, averaging in (
    ("Macro average F1: \n{}\n", 'macro'),
    ("Micro average F1: \n{}\n", 'micro'),
):
    print(template.format(f1_score(truelabels, predlabels, average=averaging)))

### Part 4: Calibration

We use a fairly big sample from the make_moons data generator:

In [None]:
# Large 'moons' sample (30000 points); generator and split are both seeded.
X, y = make_moons(
    n_samples=30000,
    noise=0.25,
    random_state=3,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    random_state=42,
)

# Training points only, marked by class.
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.show()

Learn a Naive Bayes and a Neural network model:

In [None]:
# Gaussian naive Bayes model.
nb = GaussianNB()
nb.fit(X_train, y_train)

# Neural network with a single hidden layer of 10 tanh units, trained with the
# 'lbfgs' solver and a fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=[10],
    activation='tanh',
    solver='lbfgs',
    max_iter=2000,
    random_state=0,
)
mlp.fit(X_train, y_train)

Construct histograms showing the distribution of the predicted probabilities for the positive class. Histograms that are more concentrated at the extreme ends represent classifiers that are more 'confident' in their predictions:

In [None]:
# Column 1 of predict_proba is used as the probability of the positive class.
nbposprobas = nb.predict_proba(X_test)[:, 1]
nnposprobas = mlp.predict_proba(X_test)[:, 1]

# Overlaid, semi-transparent 20-bin histograms of both models' probabilities.
pddf = pd.DataFrame({'NN': nnposprobas, 'NB': nbposprobas})
pddf.plot.hist(alpha=0.4, bins=20)
plt.show()

This distribution says nothing about calibration. For that, we create a graph that plots the value *b* of the predicted probability for the positive class against the ratio of actually positive datapoints among those whose predicted probability lies in the small interval [*b*, *b+binwidth*). We also plot a relative measure of how many datapoints fall into each bin.

In [None]:
# Calibration plot for both models.
#
# The probability axis [0, 1] is cut into bins of width `binwidth`.  For every
# bin we plot, at the bin's left edge, (a) the fraction of truly positive test
# points among those whose predicted probability falls into the bin and (b) the
# number of such points relative to the fullest bin.
binwidth = 0.05
n_bins = int(round(1 / binwidth))
# Explicit edges from linspace avoid the floating-point drift of stepping with
# np.arange; np.histogram closes the last bin on the right, so a predicted
# probability of exactly 1.0 is always counted.
edges = np.linspace(0, 1, n_bins + 1)
bins = edges[:-1]  # left edge of every bin, used as the x coordinate

for posprobas, label in zip((nbposprobas, nnposprobas), ("Naive Bayes", "Neural Net")):
    # Number of test points per bin, and the sum of their labels per bin
    # (assumes y_test holds 0/1 labels, so the sum counts the positives).
    binexamples, _ = np.histogram(posprobas, bins=edges)
    positives, _ = np.histogram(posprobas, bins=edges, weights=y_test)

    # Positive ratio per bin; bins without any prediction stay NaN (drawn as a
    # gap in the line) instead of going through a 0/0 division.
    predperc = np.full(n_bins, np.nan)
    filled = binexamples > 0
    predperc[filled] = positives[filled] / binexamples[filled]

    # Rescale the counts so the fullest bin has height 1 and shares the axes.
    binexamples = binexamples / np.max(binexamples)

    plt.plot(bins, predperc, label="Pos. ratio " + label)
    plt.plot(bins, binexamples, label="Num.pred. " + label)

# Diagonal reference line: predicted probability equal to the positive ratio.
plt.plot(bins, bins)
plt.legend()
plt.show()

We see that both the naive Bayes and the neural network models are fairly well calibrated here. Due to the relatively small number of cases with predicted probabilities in the middle range, there are some fluctuations in the positive ratios there.