# Goal

Obtain over 97% accuracy on the test set of the MNIST dataset.

# Prepare dataset

In [3]:
from sklearn.datasets import fetch_openml
# Fetch the MNIST digits (784 pixel features per image) from OpenML.
# `as_frame=True` pins the return type to pandas objects, which the cells
# below rely on when extracting arrays, instead of depending on the
# version-specific default of `as_frame`.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

We know that the data comes already shuffled, so let's write a function that makes the train/test split for a given test proportion without shuffling:

In [13]:
def train_test_split(X, y, test_prop=.3):
    """Split ``X`` and ``y`` into a leading train part and a trailing test part.

    No shuffling is performed: the first ``round(len(X) * (1 - test_prop))``
    samples become the training set and the remaining ones the test set.

    Parameters
    ----------
    X, y : sliceable sequences of equal length (lists, NumPy arrays, ...)
        Features and targets; sample ``i`` of ``X`` pairs with ``y[i]``.
    test_prop : float, default 0.3
        Fraction of the samples assigned to the test set, within [0, 1].

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_prop`` is outside [0, 1].
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )
    # Also rejects NaN, for which both comparisons are False.
    if not 0 <= test_prop <= 1:
        raise ValueError(f"test_prop must be within [0, 1], got {test_prop!r}")

    train_size = round(len(X) * (1 - test_prop))
    return X[:train_size], X[train_size:], y[:train_size], y[train_size:]

In [14]:
# Inspect which fields the fetched dataset object exposes.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [15]:
# Pull features and labels out as NumPy arrays. `.to_numpy()` is the accessor
# pandas recommends; `.values` may instead hand back a pandas extension array
# (e.g. a Categorical for a categorical target column).
X = mnist["data"].to_numpy()
y = mnist["target"].to_numpy()

In [16]:
# Ordered (unshuffled) split using the helper's default test_prop of 0.3.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
# Sanity check: number of samples in the train and test splits.
len(X_train), len(X_test)

(49000, 21000)

In [57]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, then apply the
# same statistics to the test split so nothing from the test set influences
# the scaling.
scaler = MinMaxScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

# Train classifiers

### baseline result

In [58]:
from sklearn.neighbors import KNeighborsClassifier

In [59]:
# Baseline: k-nearest-neighbours with scikit-learn's default hyper-parameters.
# `fit` returns the estimator itself, so the construction and fit can be chained.
kn_clf = KNeighborsClassifier().fit(X_train, y_train)
kn_clf

KNeighborsClassifier()

In [60]:
from sklearn.metrics import accuracy_score

In [61]:
# Predict labels for the held-out test split with the baseline classifier.
y_pred = kn_clf.predict(X_test)

In [62]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9680952380952381

Almost there!

### grid search

In [63]:
from sklearn.model_selection import GridSearchCV

In [64]:
# List the tunable hyper-parameters on a throwaway instance, so the fitted
# baseline `kn_clf` is not overwritten by an unfitted estimator (which would
# break re-running the baseline prediction cell).
KNeighborsClassifier().get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [67]:
# Exhaustive search over 4 x 2 x 2 = 16 k-NN configurations, each scored by
# 3-fold cross-validated accuracy, with 3 parallel jobs.
# NOTE(review): X_train was min-max scaled as a whole before this search, so
# the CV folds share scaling statistics; wrapping scaler + classifier in a
# Pipeline would keep each fold strictly independent.
cv_results_ = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={
        "n_neighbors": [3, 5, 10, 20],
        "weights": ["uniform", "distance"],
        "p": [1, 2],
    },
    scoring="accuracy",
    cv=3,
    n_jobs=3,
)

In [68]:
# Run the grid search on the training split only; the test split stays
# untouched until the final evaluation.
cv_results_.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=3,
             param_grid={'n_neighbors': [3, 5, 10, 20], 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [69]:
# Show the best-scoring configuration found by the search.
cv_results_.best_estimator_

KNeighborsClassifier(n_neighbors=3, weights='distance')

In [70]:
# Keep a handle on the winning estimator for the final evaluation.
best_kn_clf = cv_results_.best_estimator_

In [71]:
# Final check against the goal: accuracy of the tuned classifier on the test
# split. For classifiers, `score` returns the mean accuracy of `predict(X)`
# against the given labels.
best_kn_clf.score(X_test, y_test)

0.9700952380952381