Download dataset and split into a training set and a testing set.

In [1]:

##Prepare the dataset
from six.moves import urllib

print("Could not download MNIST data from mldata.org, trying alternative...")

# Alternative method to load MNIST, if mldata.org is down
from scipy.io import loadmat
mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
mnist_path = "./mnist-original.mat"
response = urllib.request.urlopen(mnist_alternative_url)
with open(mnist_path, "wb") as f:
    content = response.read()
    f.write(content)
mnist_raw = loadmat(mnist_path)
mnist = {
    "data": mnist_raw["data"].T,
    "target": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original",
}
print("Success!")

X,y = mnist['data'], mnist['target']

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]


Could not download MNIST data from mldata.org, trying alternative...
Success!


## PART ONE: MINIST classification using scikit learn
Set up environment:

In [27]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load scikit's support vector machine library
from sklearn.svm import SVC

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Set random seed
np.random.seed(0)

# Load scikit's grid search cross validation library
from sklearn.model_selection import GridSearchCV

# Import accuracy score library
from sklearn.metrics import accuracy_score

##### Perform a 3-fold grid search and train the Random Forest Classifier

In [9]:
#set grid search parameters
param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [100, 300, 784]}]
rfc = RandomForestClassifier()

#train rfc model across 3 folds
grid_search = GridSearchCV(rfc, param_grid, cv = 3)
grid_search.fit(X_train, y_train)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [100, 300, 784]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

Since 30 is the maximum value of n_estimators and 100 is the minimum value of max_features that were evaluated, we will continue to evaluate with different values crossing the max and min, since the score may continue to improve.

In [17]:
#set grid search parameters
param_grid = [{'n_estimators': [30, 40, 50], 'max_features': [30, 60, 100]}]
rfc = RandomForestClassifier()

#train rfc model across 3 folds
grid_search = GridSearchCV(rfc, param_grid, cv = 3)
grid_search.fit(X_train, y_train)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [30, 40, 50], 'max_features': [30, 60, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

Review best model

In [20]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=60, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
final_model = grid_search.best_estimator_
y_pred = final_model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.9674


##### Perform a 3-fold grid search and train the Support Vector Machine Classifier

In [29]:
#set grid parameters
param_grid2 = [
        {'kernel': ['linear'], 'C': [10., 30., 100.],'probability':[False]},
        {'kernel': ['poly'], 'C': [1.0, 3.0, 5.0],'gamma': [0.01, 0.03, 0.08],'probability':[False]},
        {'kernel': ['rbf'],'C': [2.8],'gamma': [0.0073],'probability': [False]}
]
svm = SVC()

#train rfc model across 3 folds
grid_search = GridSearchCV(svm, param_grid2, cv = 3)
grid_search.fit(X_train[:10000], y_train[:10000])

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['linear'], 'C': [10.0, 30.0, 100.0], 'probability': [False]}, {'kernel': ['poly'], 'C': [1.0, 3.0, 5.0], 'gamma': [0.01, 0.03, 0.08], 'probability': [False]}, {'kernel': ['rbf'], 'C': [2.8], 'gamma': [0.0073], 'probability': [False]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
grid_search.best_estimator_

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [None]:
svm_trained = SVC(C=10, kernel='linear', probability=False)
svm_trained.fit(X_train[10000:], y_train[10000:])