# MNIST_784 Dataset

In [1]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the "mnist_784" dataset from OpenML. The returned object
# is indexed below by "data" (pixel columns) and "target" (digit labels).
mnist = fetch_openml("mnist_784", version=1)

  warn(


In [2]:
# Separate the pixel table from the label column.
X = mnist["data"]
y = mnist["target"]

In [3]:
# Dimensions of the feature table as (rows, columns).
X.shape

(70000, 784)

In [4]:
# Number of labels; this should match the row count of X.
y.shape

(70000,)

In [5]:
# Inspect the first sample: one value per pixel column.
X.iloc[0]

pixel1      0.0
pixel2      0.0
pixel3      0.0
pixel4      0.0
pixel5      0.0
           ... 
pixel780    0.0
pixel781    0.0
pixel782    0.0
pixel783    0.0
pixel784    0.0
Name: 0, Length: 784, dtype: float64

In [6]:
import numpy as np
# Convert the labels to unsigned 8-bit integers (the digit classes 0-9 fit easily).
y = y.astype(np.uint8)

In [7]:
# Class balance: how many samples there are of each digit.
y.value_counts()

class
1    7877
7    7293
3    7141
2    6990
9    6958
0    6903
6    6876
8    6825
4    6824
5    6313
Name: count, dtype: int64

## Split data

In [8]:
# The first 60,000 rows form the training set; everything after is held out for testing.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

## Train models

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, accuracy_score

In [10]:
# Random forest with a fixed seed so results are repeatable; n_jobs=4 runs it on 4 parallel jobs.
forest_clf = RandomForestClassifier(random_state=42, n_jobs=4)

In [11]:
# Out-of-fold predictions for every training sample via 3-fold cross-validation.
y_forest_pred = cross_val_predict(forest_clf, X_train, y_train, cv=3)

In [12]:
# Macro-averaged F1 of the random forest's cross-validated predictions.
f1_score(y_train, y_forest_pred, average="macro")

0.9642944143539414

In [13]:
# Accuracy of the random forest's cross-validated predictions.
accuracy_score(y_train, y_forest_pred)

0.9645833333333333

In [14]:
# k-nearest-neighbours classifier with library-default hyperparameters, using 4 parallel jobs.
knn_clf = KNeighborsClassifier(n_jobs=4)

In [15]:
# Out-of-fold k-NN predictions for every training sample via 3-fold cross-validation.
y_knn_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)

In [16]:
# Macro-averaged F1 of the k-NN cross-validated predictions.
f1_score(y_train, y_knn_pred, average="macro")

0.9672164755274896

In [17]:
# Accuracy of the k-NN cross-validated predictions.
accuracy_score(y_train, y_knn_pred)

0.9674166666666667

## Hyperparameter Optimization

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
# Search space for k-NN: neighbourhood size crossed with the voting scheme.
param_grid = [
    dict(
        n_neighbors=[3, 10, 20, 30],
        weights=["uniform", "distance"],
    )
]

In [20]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation and run on 4 parallel jobs.
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, scoring="accuracy", n_jobs=4)

In [21]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)



In [22]:
# The estimator configured with the best parameter combination found.
grid_search.best_estimator_

In [23]:
# Mean cross-validated accuracy achieved by that best combination.
grid_search.best_score_

0.9711166666666665

In [24]:
# Evaluate the estimator selected by the grid search on the held-out test set.
final_model = grid_search.best_estimator_
y_final_pred = final_model.predict(X_test)

In [25]:
# Macro-averaged F1 on the test set.
f1_score(y_test, y_final_pred, average="macro")

0.971528765919764

In [26]:
# Accuracy on the test set.
accuracy_score(y_test, y_final_pred)

0.9717

## Data augmentation

In [28]:
import pandas as pd

In [29]:
def shift_image(img: np.ndarray, dx: int, dy: int) -> pd.Series:
    """
    Shift a 2-D image by a pixel offset, filling the vacated area with zeros.

    :param img: The image to be shifted, as a 2-D array indexed (row, column).
        The input is not modified.
    :type img: np.ndarray
    :param dx: The number of pixels to shift along the x axis (columns).
        Positive moves the content right, negative moves it left.
    :type dx: int
    :param dy: The number of pixels to shift along the y axis (rows).
        Positive moves the content down, negative moves it up.
    :type dy: int
    :return: The shifted image flattened row by row into a one-dimensional Series.
    :rtype: pd.Series

    """
    # np.roll returns a new array, so the caller's image is left untouched.
    new_img = np.roll(img, dy, axis=0)
    new_img = np.roll(new_img, dx, axis=1)

    # np.roll wraps pixels around to the opposite edge; blank out the
    # wrapped-in columns/rows so the shift behaves like a translation.
    if dx > 0:
        new_img[:, :dx] = 0
    elif dx < 0:
        # Select the trailing columns on axis 1. A bare `[:dx]` would slice
        # rows and wipe out almost the whole image.
        new_img[:, dx:] = 0
    if dy > 0:
        new_img[:dy, :] = 0
    elif dy < 0:
        new_img[dy:, :] = 0

    return pd.Series(new_img.flatten())

In [37]:
# View the first sample's 784 pixel values as a 28x28 grid.
img = X.iloc[0].to_numpy().reshape(28, 28)

In [40]:
# Build four copies of the training set, each shifted by one pixel along a
# single axis. The (dx, dy) pairs are consumed in the same order as the
# transformed_1..transformed_4 names on the left.
transformed_1, transformed_2, transformed_3, transformed_4 = (
    X_train.apply(
        lambda row, dx=dx, dy=dy: shift_image(row.to_numpy().reshape(28, 28), dx, dy),
        axis=1,
    )
    for dx, dy in ((1, 0), (0, 1), (-1, 0), (0, -1))
)
# shift_image returns an unnamed Series, so restore the original pixel column labels.
transformed_1.columns = transformed_2.columns = X_train.columns
transformed_3.columns = transformed_4.columns = X_train.columns

In [42]:
# Stack the original training rows and the four shifted copies row-wise,
# discarding the duplicated row labels in favour of a fresh index.
X_train_augmented = pd.concat([X_train, transformed_1, transformed_2, transformed_3, transformed_4], axis=0, ignore_index=True)

In [43]:
# Expect five times the training row count (original plus four shifted copies).
X_train_augmented.shape

(300000, 784)

In [45]:
# Fresh k-NN classifier configured with the parameters the grid search selected.
# Note: unlike the earlier knn_clf, no n_jobs is passed here.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)

In [46]:
# 3-fold out-of-fold predictions for the tuned k-NN classifier.
# NOTE(review): this runs on the original X_train / y_train, so the
# X_train_augmented table is not used by this evaluation, and no label vector
# matching its 300,000 rows is built in the cells shown. If the goal is to
# measure the effect of augmentation, the model needs to be fitted on the
# augmented rows (with labels repeated to match) and scored on data that
# contains no shifted copies of the training images - TODO confirm intent.
y_knn_pred = cross_val_predict(knn_clf, X_train, y_train, cv=3)

In [47]:
# Macro-averaged F1 of the tuned k-NN cross-validated predictions.
f1_score(y_train, y_knn_pred, average="macro")

0.9691263353762144

In [48]:
# Accuracy of the tuned k-NN cross-validated predictions.
accuracy_score(y_train, y_knn_pred)

0.9693333333333334