# Chapter 5, Exercise 9

Train an SVM classifier on the MNIST dataset Since the SVM classifiers are binary classifiers, you need to use one-versus-all to classify all 10 digits.  You may want to tune the hyperparameters using small validation sets to speed up the process.  What accuracy can you reach?

## Imports

In [23]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import warnings
warnings.simplefilter('ignore')

## Load Data

In [2]:
mnist = fetch_mldata('MNIST Original')
mnist

{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}

In [5]:
X = mnist['data']
y = mnist['target']

In [6]:
# train / test split using indices (first 60000 are the training data)
X_train, y_train, X_test, y_test = X[:60000], y[:60000], X[60000:], y[60000:]

In [7]:
# shuffle the training data
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# create smaller sample to test with
X_train_sample, y_train_sample = X_train[:1000], y_train[:1000]

## Train binary classifier 

In [8]:
svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='linear'))
])

%timeit svm_clf.fit(X_train_sample, y_train_sample)



1 loop, best of 3: 474 ms per loop


In [9]:
# basic evaluation make predictions / create confusion matrix
y_pred = cross_val_predict(svm_clf, X_train_sample, y_train_sample, cv=3)
confusion_matrix(y_pred, y_train_sample)



array([[ 81,   0,   3,   4,   1,   3,   2,   1,   0,   2],
       [  0,  99,   2,   1,   0,   3,   1,   3,   0,   0],
       [  3,   0,  93,   5,   1,   0,   5,   3,   4,   2],
       [  1,   0,   3,  89,   0,   2,   0,   0,  10,   3],
       [  0,   1,   0,   0,  91,   3,   0,   5,   1,  13],
       [  0,   0,   0,   4,   0,  55,   0,   0,   6,   0],
       [  1,   0,   3,   0,   2,   2, 106,   0,   1,   0],
       [  0,   0,   4,   2,   3,   1,   0,  83,   1,   7],
       [  0,   2,   1,   6,   0,   5,   0,   0,  79,   0],
       [  0,   0,   0,   0,   6,   1,   0,   2,   1,  72]])

In [10]:
# precision / recall
from sklearn.metrics import precision_score, recall_score, f1_score
print(precision_score(y_pred, y_train_sample, average='weighted'))
print(recall_score(y_pred, y_train_sample, average='weighted'))
print(f1_score(y_pred, y_train_sample, average='weighted'))

0.854457286786
0.848
0.849419714986


## Fine-tune model using GridSearchCV

In [14]:
svm_clf.get_params().keys()

dict_keys(['memory', 'steps', 'scaler', 'svc', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'svc__C', 'svc__cache_size', 'svc__class_weight', 'svc__coef0', 'svc__decision_function_shape', 'svc__degree', 'svc__gamma', 'svc__kernel', 'svc__max_iter', 'svc__probability', 'svc__random_state', 'svc__shrinking', 'svc__tol', 'svc__verbose'])

In [24]:
from sklearn.model_selection import GridSearchCV
# first define a list and range of hyperparameters to search
param_grid = [
    {'svc__C': [0.1,0.3,1.0,10.0], 'svc__kernel':['rbf','linear','poly']}
]

grid_search = GridSearchCV(svm_clf, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_sample, y_train_sample)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'svc__C': [0.1, 0.3, 1.0, 10.0], 'svc__kernel': ['rbf', 'linear', 'poly']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [25]:
grid_search.best_estimator_

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [26]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(np.sqrt(-mean_score), params)

2.58398916406 {'svc__C': 0.1, 'svc__kernel': 'rbf'}
1.5703502794 {'svc__C': 0.1, 'svc__kernel': 'linear'}
3.1188138771 {'svc__C': 0.1, 'svc__kernel': 'poly'}
1.89763010094 {'svc__C': 0.3, 'svc__kernel': 'rbf'}
1.5703502794 {'svc__C': 0.3, 'svc__kernel': 'linear'}
4.06128058622 {'svc__C': 0.3, 'svc__kernel': 'poly'}
1.65801085642 {'svc__C': 1.0, 'svc__kernel': 'rbf'}
1.5703502794 {'svc__C': 1.0, 'svc__kernel': 'linear'}
4.06804621409 {'svc__C': 1.0, 'svc__kernel': 'poly'}
1.56748205731 {'svc__C': 10.0, 'svc__kernel': 'rbf'}
1.5703502794 {'svc__C': 10.0, 'svc__kernel': 'linear'}
2.0022487358 {'svc__C': 10.0, 'svc__kernel': 'poly'}
