# Classification examples based on Chapter 3 of "Hands-On Machine Learning with Scikit-Learn and TensorFlow" by Aurélien Géron

### Read MNIST data using TensorFlow as it's easy

In [None]:
from tensorflow.examples.tutorials.mnist import input_data
# Load the MNIST splits from the 'MNIST_data' directory; one_hot=False asks
# for plain (non one-hot) labels, which the later `== 5` comparisons rely on.
mnist = input_data.read_data_sets(train_dir='MNIST_data', one_hot=False)

### Read training and test data and shuffle training set just in case

In [None]:
# Pull the training images and labels out of the loaded dataset object.
X_train, y_train = mnist.train.images, mnist.train.labels
# Format the shape tuples as a whole so the message works for any rank
# (the label array is used as a 1-D vector elsewhere in this notebook).
print('Shape of X_train: %s' % (X_train.shape,))
print('Shape of y_train: %s' % (y_train.shape,))

# Held-out test split; not shuffled.
X_test, y_test = mnist.test.images, mnist.test.labels

from sklearn.utils import shuffle
# Shuffle images and labels together; the fixed seed keeps the order
# reproducible between runs.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

### Plot one of the images

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Positions of every training image labelled 5; the first one is displayed.
indices_of_fives = [pos for pos, lbl in enumerate(y_train) if lbl == 5]
digit_index = indices_of_fives[0]

digit = X_train[digit_index]
digit_label = y_train[digit_index]
# Turn the flat pixel vector into a 28x28 grid so imshow can draw it.
digit_image = digit.reshape(28, 28)

plt.imshow(digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.title('Labelled as %d' % digit_label)
plt.axis('off')
plt.show()

## Binary classification
### Do binary classification, trying to classify images as "5" or "not a 5"

In [None]:
# Boolean targets: True where the label is the digit 5, False otherwise.
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

### Use a simple `SGDClassifier`

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with SGD on the "5 vs not 5" targets.
# NOTE(review): newer scikit-learn releases appear to spell this loss
# 'log_loss' -- confirm against the installed version before upgrading.
sgd_clf = SGDClassifier(
    loss='log',
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)
sgd_clf.fit(X_train, y_train_5)

In [None]:
# Classify the single example image; the flat pixel vector is passed as a
# one-row batch.
sgd_clf.predict(digit.reshape(1, -1))

### Evaluate performance with 5-fold cross validation using accuracy as scoring

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score

# Accuracy of the SGD classifier on each of 5 cross-validation folds,
# summarised as mean and standard deviation.
cv_scores = cross_val_score(sgd_clf, X_train, y_train_5, cv=5, scoring="accuracy")
print('Got mean accuracy %.2f with std of %.2f' % (np.mean(cv_scores), np.std(cv_scores)))

### Compare to silly "not 5" estimator that also gets 90% accuracy

In [None]:
import numpy as np
from sklearn.base import BaseEstimator


class Never5Classifier(BaseEstimator):
    """Baseline estimator that predicts "not a 5" for every sample."""

    def fit(self, X, y=None):
        """Learn nothing; return self so the usual estimator chaining works."""
        return self

    def predict(self, X):
        """Return a 1-D all-False boolean array with one entry per sample in X."""
        return np.zeros(len(X), dtype=bool)


# Score the baseline exactly like the real classifier for a fair comparison.
never_5_clf = Never5Classifier()
cv_scores_2 = cross_val_score(never_5_clf, X_train, y_train_5, cv=5, scoring="accuracy")
print('Got mean accuracy %.2f with std of %.2f' % (np.mean(cv_scores_2), np.std(cv_scores_2)))

### It seems that accuracy is not a good metric for a skewed dataset with many "not 5" images, so let's look at the confusion matrix instead. First compute predictions for all images in the training set:

In [None]:
from sklearn.model_selection import cross_val_predict
# Cross-validated class predictions for every training image (5 folds).
y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train_5, cv=5)

### Then compute the confusion matrix. The values go like this:
<table>
  <tr>
     <th>true negatives for not 5</th>
     <th>false positives for not 5</th> 
  </tr>
  <tr>
     <th>false negatives for a 5</th>
     <th>true positives for a 5</th> 
  </tr>
</table>
or generally for binary classifier
<table>
  <tr>
     <th>Class 0 predicted as 0</th>
     <th>Class 0 predicted as 1</th> 
  </tr>
  <tr>
     <th>Class 1 predicted as 0</th>
     <th>Class 1 predicted as 1</th> 
  </tr>
</table>

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns the predicted classes.
cf_matrix = confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)
cf_matrix

### Compute precision and recall defined as 
$$ precision = \frac{TP}{TP + FP} $$
and 
$$ recall = \frac{TP}{TP + FN} $$
- Precision depicts the probability that an image predicted to be a five actually was a five.
- Recall is the probability that an image labelled as five was correctly predicted to be a five.

The harmonic mean of precision and recall is the $F_1$ score:
$$ F_1 = \frac{2}{\frac{1}{precision} + \frac{1}{recall}} $$

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def print_metrics(y_train, y_pred):
    """Print precision, recall and F1 score, each to two decimals.

    y_train: true labels.
    y_pred: predicted labels to compare against the true labels.
    """
    precision = precision_score(y_train, y_pred)
    print('Got precision %.2f' % precision)
    recall = recall_score(y_train, y_pred)
    print('Got recall %.2f' % recall)
    f1 = f1_score(y_train, y_pred)
    print('Got F1 score %.2f' % f1)


print_metrics(y_train=y_train_5, y_pred=y_train_pred)

### One can tune the precision-recall tradeoff by adjusting the decision threshold. While the threshold cannot be set on `SGDClassifier` directly, one can access the decision function and classify using a custom threshold. Raising the threshold generally increases precision, as the classifier labels fewer images as fives, but it reduces recall, as more fives are missed.

In [None]:
class SGDClassifierWithThreshold(BaseEstimator):
    """Wrap a classifier and predict by thresholding its decision function."""

    def __init__(self, sgd_clf, threshold):
        """Store the wrapped classifier and the decision threshold."""
        self.sgd_clf = sgd_clf
        self.threshold = threshold

    def fit(self, X, y=None):
        """Fit the wrapped classifier on (X, y) and return this wrapper."""
        self.sgd_clf.fit(X, y)
        return self

    def predict(self, X):
        """Return True for each sample whose decision score exceeds the threshold."""
        scores = self.sgd_clf.decision_function(X)
        return scores > self.threshold

# Split the training images by target: fives versus everything else.
X_fives, X_not_fives = X_train[y_train_5], X_train[~y_train_5]

# NOTE(review): unlike the earlier sgd_clf, this classifier sets no
# random_state, so the printed metrics may vary between runs.
base_sgd = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_with_threshold = SGDClassifierWithThreshold(sgd_clf=base_sgd, threshold=3)
y_train_pred = cross_val_predict(sgd_with_threshold, X_train, y_train_5, cv=5)
print_metrics(y_train_5, y_train_pred)


### One can plot the precision-recall curve straightforwardly by computing the decision function values for all training instances

In [None]:
# Ask for decision-function scores instead of class labels, so that any
# threshold can be applied afterwards.
y_scores = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_5,
    cv=5,
    method='decision_function',
)
y_scores

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision and recall values together with the thresholds they correspond to.
pr_curve = precision_recall_curve(y_train_5, y_scores)
precisions, recalls, thresholds = pr_curve

### Plot the precision-recall curve

In [None]:
def plot_precision_recall_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    The last element of `precisions` and `recalls` is dropped so that both
    series are plotted against `thresholds` element for element.
    """
    curves = (
        (precisions, 'b--', 'Precision'),
        (recalls, 'g-', 'Recall'),
    )
    for values, line_format, series_label in curves:
        plt.plot(thresholds, values[:-1], line_format, label=series_label)
    plt.xlabel('Threshold')
    plt.legend()
    # Fixed viewing window: thresholds in [-10, 10], metric values in [0, 1].
    plt.xlim([-10, 10])
    plt.ylim([0, 1])


plot_precision_recall_threshold(precisions, recalls, thresholds)