In [None]:
from sklearn.datasets import fetch_openml
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from scipy.ndimage.interpolation import shift

In [None]:
# Fetch the MNIST dataset ('mnist_784', version 1) from OpenML.
# as_frame=True pins the return type to pandas objects instead of relying on
# the library default: later cells index the data with `.iloc`, which only
# works when 'data' and 'target' are a DataFrame / Series.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

### Question 1

In [None]:
# Baseline k-NN classifier that uses the 10 nearest neighbours; every other
# setting is left at the library default.
neigh = KNeighborsClassifier(n_neighbors=10)

In [None]:
# Split features and labels at row 60000: the first 60,000 rows form the
# training set, everything after that forms the test set.
X_train, X_test = mnist['data'][:60000], mnist['data'][60000:]
y_train, y_test = mnist['target'][:60000], mnist['target'][60000:]

In [None]:
# Fit the baseline classifier on the training split.
neigh.fit(X_train, y_train)

In [None]:
# Score the baseline classifier on the held-out test split; the value is shown
# as the cell output.
neigh.score(X_test, y_test)

In [None]:
# Use grid search to increase the accuracy to 97%.
# Both hyperparameters live in ONE dict so that GridSearchCV evaluates every
# (n_neighbors, weights) combination. Listing them in two separate dicts would
# define two independent grids, each varying a single parameter while the
# other stays at its default, so the combinations would never be tried.
grid_param = [
    {
        'n_neighbors': [3, 4, 5],
        'weights': ['uniform', 'distance'],
    },
]

In [None]:
# Start from a fresh, unfitted k-NN estimator (this rebinds `neigh`) and wrap
# it in a 5-fold cross-validated search over `grid_param`. verbose=3 turns on
# progress logging while the search runs.
neigh = KNeighborsClassifier()
grid_search = GridSearchCV(
    estimator=neigh,
    param_grid=grid_param,
    cv=5,
    verbose=3,
)

In [None]:
# Run the cross-validated grid search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Best cross-validation score found by the search (shown as the cell output).
grid_search.best_score_

In [None]:
# Predict the test labels with the model selected by the grid search.
y_pred = grid_search.predict(X_test)
# accuracy_score takes (y_true, y_pred): ground truth first, predictions
# second. Accuracy is symmetric, so the value is unchanged, but this order
# matches the documented signature and the final evaluation cell.
accuracy_score(y_test, y_pred)

### Question 2

For each image, create four shifted copies and add them to the training set.

In [None]:
def shift_image(image, dx, dy):
    """
    Shift a flattened 28x28 image by (dx, dy) pixels.

    The input is viewed as a 28x28 grid, shifted with scipy.ndimage's
    `shift` (using that function's default settings), and flattened again.

    @param image: flat array-like holding the 784 pixel values of the image
                  (NumPy array, list, pandas Series, ...)
    @param dx: shift to the x_axis direction (columns); a negative value
               moves the content to the left
    @param dy: shift to the y_axis direction (rows); a positive value moves
               the content down
    @return: 1-D NumPy array of 784 values containing the shifted image
    """
    # np.asarray lets callers pass any array-like, not only objects that
    # expose a .reshape method; an ndarray is passed through as-is.
    image_reshaped = np.asarray(image).reshape((28, 28))
    # One offset per axis, in (row, column) order -- hence [dy, dx].
    image_shifted = shift(image_reshaped, [dy, dx])
    return image_shifted.reshape([-1])

In [None]:
# Take training row 1000 as a flat NumPy vector and produce two demo shifts of
# it: 5 pixels to the left and 5 pixels down.
image = np.array(X_train.iloc[1000])
shift_image_left = shift_image(image, dx=-5, dy=0)
shift_image_down = shift_image(image, dx=0, dy=5)

In [None]:
# Show the original digit next to its two shifted versions, one panel each.
# A single loop over (title, pixels) pairs replaces three copy-pasted
# subplot/title/imshow stanzas.
panels = [
    ('Original', image),
    ('Shift left', shift_image_left),
    ('Shift down', shift_image_down),
]
fig, axes = plt.subplots(1, len(panels), figsize=(12, 3))
for ax, (title, pixels) in zip(axes, panels):
    ax.set_title(title, fontsize=14)
    ax.imshow(pixels.reshape(28, 28), interpolation='nearest', cmap='Greys')
# Render the figure explicitly so the cell does not end on an imshow call,
# whose return value would otherwise be echoed as the cell output.
plt.show()

The following code took a long time to run!

In [None]:
# Build the augmented training set: the original images followed by one copy
# of every image shifted by one pixel for each (dx, dy) pair below.
#
# Convert to NumPy once up front. Iterating directly over a DataFrame yields
# its column labels rather than its rows, and per-row `.iloc` lookups are slow,
# so all row-wise work is done on plain arrays.
X_train_np = np.asarray(X_train)
y_train_np = np.asarray(y_train)

X_parts = [X_train_np]
for dx, dy in ((0, 1), (0, -1), (-1, 0), (1, 0)):
    X_parts.append(np.array([shift_image(img, dx, dy) for img in X_train_np]))

X_train_augmented = np.concatenate(X_parts)
# Every shifted copy keeps the label of its source image and each part above
# preserves the original row order, so the label vector is simply repeated
# once per part.
y_train_augmented = np.tile(y_train_np, len(X_parts))

# Sanity check: images and labels must stay aligned one-to-one.
assert len(X_train_augmented) == len(y_train_augmented) == len(X_parts) * len(X_train_np)

In [None]:
# Shuffle images and labels with the SAME permutation so that every image
# stays paired with its label. The generator is seeded so the resulting order
# is reproducible after a kernel restart.
rng = np.random.default_rng(42)
shuffle_idx = rng.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

In [None]:
# New, unfitted k-NN classifier configured with the hyperparameters that the
# grid search selected (`best_params_` is unpacked as keyword arguments).
knn_clf = KNeighborsClassifier(**grid_search.best_params_)

In [None]:
# Fit the classifier on the augmented (original + shifted) training set.
knn_clf.fit(X_train_augmented, y_train_augmented)

In [None]:
# Evaluate the classifier trained on the augmented data against the untouched
# test split; the accuracy is shown as the cell output.
y_pred = knn_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)