**Chapter 3 – Classification**

_This notebook contains some of the sample code and solutions to the exercises in chapter 3 of the Hands-On Machine Learning with Scikit-Learn and TensorFlow textbook._

# Setup

First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and set the default font sizes for plot labels:

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# MNIST

In [None]:
# Load the MNIST dataset from a local MATLAB .mat file
# (the path is relative to the notebook's working directory)
from scipy.io import loadmat
mnist = loadmat('./datasets/mldata/mnist-original.mat')
# Display the loaded object; later cells read its "data" and "label" entries
mnist

In [None]:
# Pull the pixel matrix and the raw labels out of the loaded file.
# The two axes of the pixel matrix are swapped so that X[i] is one flattened image.
X = mnist["data"].swapaxes(0, 1)
y = mnist["label"]
print('Shape of X: ' + str(X.shape))

In [None]:
# Flatten the labels into a 1-D vector.
# The vector is built straight from mnist["label"] rather than from y itself so that
# this cell can be re-run safely: swapping axes 0 and 1 of an already-flattened
# (1-D) y raises an error.
y = np.ravel(np.swapaxes(mnist["label"], 0, 1))
print('Shape of y: ' + str(y.shape))

In [None]:
from math import sqrt
# Each row of X is a flattened square image (it is reshaped to 28x28 further down),
# so the side length in pixels is the square root of the number of features
print('Dimension of MNIST Images := sqrt(' + str(X.shape[1]) + ') = ' + str(sqrt(X.shape[1])) + ' pixels')

In [None]:
# Display one sample digit.
# matplotlib / pyplot are already imported (and inline plotting enabled) in the
# Setup cell, so they are not re-imported here.
some_digit = X[36000]  # fixed index, so the same digit is shown on every run
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap = matplotlib.cm.binary,
           interpolation="nearest")
plt.title('Random MNIST Digit')
plt.axis("off")
plt.show()

In [None]:
# Function to plot a single digit
def plot_digit(data):
    """Render one flattened digit as a 28x28 image with the axes hidden."""
    pixels = data.reshape(28, 28)
    plt.imshow(pixels, interpolation="nearest", cmap=matplotlib.cm.binary)
    plt.axis("off")

In [None]:
# Function to plot many digits
def plot_digits(instances, images_per_row=10, **options):
    """Draw a batch of flattened 28x28 digits as a single tiled image.

    Parameters
    ----------
    instances : sequence of arrays
        Each element must be reshapeable to (28, 28).
    images_per_row : int, default 10
        Maximum number of digits per row; capped at len(instances).
    **options
        Extra keyword arguments forwarded to plt.imshow.
    """
    size = 28
    n_instances = len(instances)
    if n_instances == 0:
        # Nothing to tile. Without this guard images_per_row would become 0 and
        # the row-count computation below would divide by zero.
        plt.axis("off")
        return
    # Clamp to at least one image per row for the same reason.
    images_per_row = max(1, min(n_instances, images_per_row))
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (n_instances - 1) // images_per_row + 1
    # Pad the last row with a blank strip so that every row has the same width.
    n_empty = n_rows * images_per_row - n_instances
    images.append(np.zeros((size, size * n_empty)))
    row_images = []
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap = matplotlib.cm.binary, **options)
    plt.axis("off")

In [None]:
# Show a grid (10 per row) of digits picked at regular strides from three index ranges of X
plt.figure(figsize=(9,9))
example_images = np.concatenate(
    [X[:12000:600], X[13000:30600:600], X[30600:60000:590]],
    axis=0,
)
plot_digits(example_images, images_per_row=10)
plt.show()

In [None]:
# The first 60000 rows form the training set; everything after them is the test set
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

In [None]:
# Shuffle training data: instances and labels are reordered with the same permutation.
# np is already imported in the Setup cell, and the permutation length is taken
# from the data instead of being hard-coded.
shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Binary classifier

Build a binary classifier that identifies if a digit is a 5 or not.

In [None]:
# Boolean targets for the binary task: True where the label is 5, False otherwise
y_train_5 = np.equal(y_train, 5)
y_test_5 = np.equal(y_test, 5)

In [None]:
# Create and train a Stochastic Gradient Descent (SGD) binary classifier.
# NOTE: no `loss` is passed, so the library default applies (hinge loss, i.e. a linear
# SVM, per the scikit-learn docs -- TODO confirm for the installed version). This is
# therefore not a logistic regression unless a logistic loss is requested explicitly.
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [None]:
# Remember: some_digit is a 5
# The single instance is wrapped in a list so that predict() receives a batch of one
sgd_clf.predict([some_digit])

In [None]:
# A 3-fold cross-validation on accuracy
# cross_val_score splits the training set into 3 folds, trains 3 models (each on two of
# the folds), and scores each model on the fold it was not trained on
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
# This accuracy seems good...

In [None]:
# Stratified 3-fold cross-validation written out by hand: fit a fresh clone on the
# training folds and print its accuracy on the held-out fold.
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# No random_state here: it only has an effect when shuffle=True, and recent
# scikit-learn versions raise a ValueError if it is set while shuffle is False.
skfolds = StratifiedKFold(n_splits=3)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)  # unfitted copy with the same hyperparameters
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Vectorised count of correct predictions (instead of Python's element-wise sum)
    n_correct = np.sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))  # accuracy on this fold

### Dumb Binary Classifier
The classifier below, `Never5Classifier`, classifies everything as `not-5`. Let's see how it does...

In [None]:
# Classifier that classifies everything to be not-5
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline estimator that predicts "not-5" (False) for every instance."""
    def fit(self, X, y=None):
        # Nothing to learn. Return self, as scikit-learn estimators are expected to,
        # so that chained calls such as clf.fit(X, y).predict(X) work.
        return self
    def predict(self, X):
        # One False per input row, returned as a column vector of shape (len(X), 1)
        return np.zeros((len(X), 1), dtype=bool)

In [None]:
# Score the always-"not-5" baseline with the same 3-fold accuracy cross-validation
# that was used for the SGD classifier
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")

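Amazingly, this very dumb classifier gets about 90% accuracy. This is because only about 10% of the images are 5s, so always predicting `not-5` is wrong only about 10% of the time. Accuracy alone is therefore a poor performance measure on a skewed dataset like this one.
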
### Metrics: Precision, Recall, and F1-Score

In [None]:
# Get a prediction for every training instance from 3-fold cross-validation
# (the predictions are used below to build the confusion matrix and the metrics)
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

In [None]:
# Confusion matrix of the SGD classifier (the one with ~95% cross-validated accuracy)
# It looks far less impressive than the 95% accuracy figure suggests
# Rows are the actual classes and columns the predicted classes, both ordered not-5, 5
from sklearn.metrics import confusion_matrix
print('  not-5\t 5')
print(confusion_matrix(y_train_5, y_train_pred))

In [None]:
# Pretend the classifier were perfect by using the true labels as the predictions
y_train_perfect_predictions = y_train_5

In [None]:
# What a perfect confusion matrix looks like
print('  not-5\t 5')
print(confusion_matrix(y_train_5, y_train_perfect_predictions))

In [None]:
# Precision of the 5 class: TP / (TP + FP)
from sklearn.metrics import precision_score, recall_score
# 4344 / (4344 + 1307) := 5s correctly predicted as 5, over everything predicted as 5
# (the correct 5s plus the not-5s incorrectly predicted as 5)
precision_score(y_train_5, y_train_pred)

In [None]:
# Recall of the 5 class: TP / (TP + FN)
# 4344 / (4344 + 1077) := 5s correctly predicted as 5, over all actual 5s
# (the correct 5s plus the 5s incorrectly predicted as not-5)
recall_score(y_train_5, y_train_pred)

In [None]:
from sklearn.metrics import f1_score
# F1 is the harmonic mean of precision and recall: TP / (TP + (FN + FP) / 2)
# 4344 / (4344 + (1077 + 1307)/2)
f1_score(y_train_5, y_train_pred)

In [None]:
# Fetch scores from SGD classifier to later calculate Precisions and Recalls at various thresholds
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method="decision_function")

Note: there is an [issue](https://github.com/scikit-learn/scikit-learn/issues/9589) introduced in Scikit-Learn 0.19.0 where the result of `cross_val_predict()` is incorrect in the binary classification case when using `method="decision_function"`, as in the code above. The resulting array is 2-D, with an extra first column full of 0s, instead of 1-D. We need to add this small hack for now to work around this issue:

In [None]:
# hack to work around issue #9589 introduced in Scikit-Learn 0.19.0
if y_scores.ndim == 2:
    y_scores = y_scores[:, 1]

In [None]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
# Plot the precision vs recall curve
# Ideally, you want a classifier that has both high precision and high recall
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y-axis) against recall (x-axis) on the unit square."""
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.axis([0, 1, 0, 1])
    plt.ylabel("Precision", fontsize=16)
    plt.xlabel("Recall", fontsize=16)

plt.figure(figsize=(8, 6))
plt.title('Precision VS Recall Curve')
plot_precision_vs_recall(precisions, recalls)
plt.show()

# ROC curves

In [None]:
# Compute the false positive rate (FPR) and true positive rate (TPR) of the SGD classifier from its scores
# The true positive rate is another name for recall
# The false positive rate is also known as the probability of false alarm
# NOTE: this rebinds `thresholds`, replacing the array returned by precision_recall_curve earlier
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

In [None]:
# Plot the ROC curve
# You want a high true positive rate and a low false positive rate
def plot_roc_curve(fpr, tpr, label=None):
    """Plot TPR against FPR, with a dashed diagonal drawn as a reference line."""
    plt.plot(fpr, tpr, linewidth=2, label=label)
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.axis([0, 1, 0, 1])

plt.figure(figsize=(8, 6))
plt.title('ROC Curve')
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train_5, y_scores)

# Multiclass classification
### SGD Classifier
Now we train our SGD classifier to recognize all 10 digit classes. Notice that the training takes significantly longer than training the binary classifier.

In [None]:
# Refit the same sgd_clf object on the full digit labels (y_train instead of y_train_5).
# From this cell on, sgd_clf is no longer the binary 5-detector trained earlier.
sgd_clf.fit(X_train, y_train)

In [None]:
# Remember: some_digit is a 5
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores

In [None]:
np.argmax(some_digit_scores)

In [None]:
# 3-fold cross-validation
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
# By scaling the image input, the accuracy increases. 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

In [None]:
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

In [None]:
def plot_confusion_matrix(matrix):
    """Show `matrix` in color on an 8x8-inch figure, with a colorbar."""
    figure = plt.figure(figsize=(8, 8))
    heatmap = figure.add_subplot(1, 1, 1).matshow(matrix)
    figure.colorbar(heatmap)

In [None]:
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums

In [None]:
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
# Compare two classes: digits classified correctly vs. digits mistaken for the other class
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

plt.figure(figsize=(8,8))

# Top-left: cl_a classified as cl_a
plt.subplot(2, 2, 1)
plt.title('Correctly Predicted ' + str(cl_a))
plot_digits(X_aa[:25], images_per_row=5)

# Top-right: cl_a classified as cl_b
plt.subplot(2, 2, 2)
plt.title(str(cl_a) + ' Predicted as ' + str(cl_b))
plot_digits(X_ab[:25], images_per_row=5)

# Bottom-left: cl_b classified as cl_a
plt.subplot(2, 2, 3)
plt.title(str(cl_b) + ' Predicted as ' + str(cl_a))
plot_digits(X_ba[:25], images_per_row=5)

# Bottom-right: cl_b classified as cl_b
plt.subplot(2, 2, 4)
plt.title('Correctly Predicted ' + str(cl_b))
plot_digits(X_bb[:25], images_per_row=5)

plt.show()

# Dummy (i.e. random) classifier

The ROC curve of a classifier that guesses at random is (approximately) the diagonal line from (0, 0) to (1, 1).

In [None]:
# Baseline that guesses labels at random, following the class distribution of the training labels.
# The strategy is set explicitly because DummyClassifier's default has changed across
# scikit-learn releases (from "stratified" to "prior"); "prior" always outputs the same
# probabilities, so it would not be the random guesser this section describes.
from sklearn.dummy import DummyClassifier
dmy_clf = DummyClassifier(strategy="stratified")
y_probas_dmy = cross_val_predict(dmy_clf, X_train, y_train_5, cv=3, method="predict_proba")
# Keep the second probability column as the score (presumably the positive "5" class -- confirm via classes_)
y_scores_dmy = y_probas_dmy[:, 1]

In [None]:
# ROC curve of the dummy classifier, drawn with the same helper used for the SGD classifier
fprr, tprr, thresholdsr = roc_curve(y_train_5, y_scores_dmy)
plot_roc_curve(fprr, tprr)