In [1]:
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
from sklearn.datasets import fetch_openml
# as_frame=False pins the return type to NumPy arrays. Newer scikit-learn
# releases hand back pandas objects for this dataset by default, which breaks
# the positional indexing (X_train[shuffle_index]) and the row.reshape(...)
# calls used later in this notebook.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
# Pull the feature matrix and the label vector out of the fetched dataset
X = mnist["data"]
y = mnist["target"]
print(X.shape, y.shape, sep="\n")
# Labels arrive with object dtype; store them as uint8 instead
y = y.astype(np.uint8)

(70000, 784)
(70000,)


In [4]:
# The first 60000 samples form the training set; everything after is the test set
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]
# Apply one random permutation to features and labels so they stay aligned
shuffle_index = np.random.permutation(60000)
X_train = X_train[shuffle_index]
y_train = y_train[shuffle_index]

In [5]:
import collections
# Tally how many training samples carry each label value (class-balance check)
collections.Counter(y_train)

Counter({7: 6265,
         3: 6131,
         8: 5851,
         9: 5949,
         5: 5421,
         4: 5842,
         2: 5958,
         6: 5918,
         1: 6742,
         0: 5923})

In [6]:
# 1
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# One search space: uniform neighbour weighting with k in {2, 10}.
# ("distance" weighting was noted as a candidate but is not part of the search.)
param_grid = [
    dict(weights=["uniform"], n_neighbors=[2, 10]),
]
knn_clf = KNeighborsClassifier()
# 2-fold cross-validation scored by accuracy; train scores are kept as well
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=2,
    scoring="accuracy",
    return_train_score=True,
)

In [None]:
# Run the cross-validated grid search on the shuffled training set
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score
grid_search.best_params_

In [None]:
# Estimator refit with the best parameters (GridSearchCV was built with the default refit setting)
grid_search.best_estimator_

In [None]:
# Report the mean cross-validated accuracy for every parameter combination.
# The search is scored with "accuracy", which is already a non-negative
# "higher is better" value, so it is printed as-is. Applying sqrt(-score)
# (the recipe for negated-MSE scorers) would yield NaN for every row.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the held-out test set with the fitted search object,
# then measure the fraction of labels predicted correctly
y_pred = grid_search.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# 2
from scipy.ndimage import shift

def shift_digit(row, dy, dx):
    """Return a flattened 28x28 image translated by (dy, dx) pixels.

    Parameters
    ----------
    row : array-like with 784 elements
        One flattened image; it is reshaped to 28x28 before shifting.
    dy : int or float
        Offset along axis 0 (rows). Positive values move content toward
        higher row indices.
    dx : int or float
        Offset along axis 1 (columns). Positive values move content toward
        higher column indices.

    Returns
    -------
    numpy.ndarray of shape (784,)
        The shifted image, flattened again. Pixels shifted in from outside
        the frame take scipy's default fill value (0).

    Notes
    -----
    ``dy`` and ``dx`` are ordinary parameters so the function can be called
    with keywords (e.g. through ``np.apply_along_axis(..., dx=dx, dy=dy)``);
    tuple parameters in a ``def`` signature are not valid Python 3 syntax.
    """
    return shift(row.reshape(28, 28), [dy, dx]).reshape(784)
# Start from the original training data, then add one shifted copy per direction
shift_offsets = ((1, 0), (-1, 0), (0, 1), (0, -1))
augmented_images = [X_train]
augmented_labels = [y_train]

for dy, dx in shift_offsets:
    # Shift every row (one flattened image) by the current offset
    augmented_images.append(
        np.apply_along_axis(shift_digit, axis=1, arr=X_train, dx=dx, dy=dy)
    )
    # Shifting does not change the label, so the label vector is reused unchanged
    augmented_labels.append(y_train)

# Stack originals and shifted copies into single arrays
X_train_expanded = np.concatenate(augmented_images)
y_train_expanded = np.concatenate(augmented_labels)

In [None]:
# Fresh KNN classifier with default hyperparameters, trained on the augmented set
# NOTE(review): this does not reuse grid_search.best_params_ — confirm that is intended
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_expanded, y_train_expanded)