# This Notebook Explores Computer Vision Using the MNIST Dataset.

In [None]:
# Required libraries are loaded. 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

In [None]:
# Files to be used.
# Both CSVs are read from the same directory (relative to the notebook's
# working directory) into pandas DataFrames.
path = "../../../Data/digit-recognizer/"
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")

In [None]:
# Preview the first rows of the training DataFrame.
train.head()

In [None]:
# Preview the first rows of the test DataFrame.
test.head()

In [None]:
# Draw one sample: take the first training row, skip its first column
# (presumably the label; the pixel columns follow it), and show the
# remaining 784 values as a 28x28 image without axes.
obs = train.iloc[0, 1:].to_numpy()
im = obs.reshape(28, 28)
plt.imshow(im, cmap="binary")
plt.axis("off")
plt.show()

In [None]:
# Working with numpy is easier on sklearn.
# Predictors are every column from "pixel0" onward; the target is "label".
X_train = train.loc[:, "pixel0":].to_numpy()
y_train = train["label"].to_numpy()
# `test` is rebound from a DataFrame to an ndarray here. np.asarray accepts
# either type, so re-running this cell does not fail the way a second
# `.to_numpy()` call on an ndarray would.
test = np.asarray(test)

### Grid Search KNeighbors for best parameter space.

In [None]:
# Base KNeighbors estimator with default settings; it is the estimator
# handed to the grid search.
knn_clf = KNeighborsClassifier()

In [None]:
# Hyperparameter grid for the search: each listed neighbor count is
# combined with each weighting scheme.
params = [
    {
        "n_neighbors": [2, 3, 4, 6],
        "weights": ["distance", "uniform"],
    }
]

In [None]:
# 5-fold cross-validated search over `params`, scored by accuracy, with
# verbose progress output.
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=params,
    scoring="accuracy",
    cv=5,
    verbose=3,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the mean cross-validated score of every parameter combination.
cvres = grid_search.cv_results_

# The loop variable is deliberately not called `params`: that name holds the
# parameter grid given to GridSearchCV, and overwriting it with the last
# candidate dict would break a re-run of the grid-search cell.
for mean_test_score, candidate_params in zip(cvres["mean_test_score"], cvres["params"]):
    print(f"The mean_test_score is: {mean_test_score} and params: {candidate_params}")

### Error Analysis

In [None]:
# Out-of-fold predictions for the tuned model, then its confusion matrix.
# The already-selected best estimator is passed instead of the GridSearchCV
# object: cross_val_predict refits whatever estimator it is given on every
# fold, so passing the search itself would repeat the whole grid search
# three times.
y_train_pred = cross_val_predict(grid_search.best_estimator_, X_train, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

Let's visualize the above matrix.

In [None]:
# Show the confusion matrix as a grayscale image (rows are the true
# classes, columns the predicted ones).
plt.matshow(conf_mx, cmap="gray")
plt.show()

The image shows that the classifier did a good job classifying the digits. However, let's focus on the errors.

In [None]:
# Divide every row by its total so each cell becomes a per-class rate
# rather than a raw count; keepdims keeps the sums as a column so the
# division broadcasts row by row.
row_sums = np.sum(conf_mx, axis=1, keepdims=True)
norm_conf_mx = np.divide(conf_mx, row_sums)
norm_conf_mx

In [None]:
# Zero the diagonal in place so only the misclassifications remain visible.
np.fill_diagonal(norm_conf_mx, 0)  # This eliminates correctly classified digits

In [None]:
# Plot the normalized confusion matrix as a grayscale image.
plt.matshow(norm_conf_mx, cmap="gray")
plt.show()

### Data Augmentation

In [None]:
from scipy.ndimage import shift

In [None]:
def shift_image(image, dx, dy):
    """
    Shift a flattened 28x28 image and return the result flattened again.

    image: array holding 784 values; it is reshaped to 28 rows x 28 columns.
    dx: horizontal shift; a positive value moves the content toward higher
        column indices (to the right).
    dy: vertical shift; a positive value moves the content toward higher
        row indices (down when displayed with the default orientation).

    Positions uncovered by the shift are filled with 0.
    """
    image = image.reshape(28, 28)
    # scipy's shift expects one offset per axis. Axis 0 indexes rows
    # (vertical) and axis 1 indexes columns (horizontal), so the horizontal
    # offset dx must come second.
    shifted_image = shift(image, [dy, dx], cval=0)
    return shifted_image.reshape([-1])

In [None]:
def augment_data(predictors, labels):
    """
    Return the data set enlarged with shifted copies of every image.

    The result holds the original images first, followed by one shifted
    copy of each image for every offset listed below. Each copy keeps the
    label of the image it was derived from. Both return values are numpy
    arrays.
    """
    # One-pixel offsets in each of the four directions.
    offsets = ((1, 0), (-1, 0), (0, -1), (0, 1))
    pairs = list(zip(predictors, labels))

    images = list(predictors)
    targets = list(labels)
    for dx, dy in offsets:
        images.extend(shift_image(pixels, dx, dy) for pixels, _ in pairs)
        targets.extend(target for _, target in pairs)

    return np.array(images), np.array(targets)
# Build the augmented training set from the original images and labels.
X_train_augmented, y_train_augmented = augment_data(X_train, y_train)

In [None]:
# Shuffle images and labels with the same random permutation so each image
# stays paired with its label. The permutation comes from numpy's global
# random state, which is not seeded here.
shuffle_idx = np.random.permutation(X_train_augmented.shape[0])
X_train_augmented, y_train_augmented = (
    X_train_augmented[shuffle_idx],
    y_train_augmented[shuffle_idx],
)

### Additional Preprocessing

In [None]:
# Standardize the augmented training set.
# The result gets its own name: rebinding X_train to it would leave X_train
# five times longer than y_train and break any re-run of the earlier cells
# that use the two together.
# NOTE(review): the final classifier is fit on the unscaled
# X_train_augmented. If the standardized features are meant to be used
# instead, `test` must also go through scaler.transform() before predicting.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_augmented)

### Train KNeighbors on best parameters and augmented data.

In [None]:
# Fresh classifier configured with the best hyperparameters found by the
# grid search.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)

In [None]:
# Fit on the augmented training set and predict labels for the test images.
# Both the training features and the test features used here are the raw
# pixel arrays; neither has been passed through the scaler.
knn_clf.fit(X_train_augmented, y_train_augmented)
predictions = knn_clf.predict(test)

In [None]:
#scores = cross_val_score(knn_clf, X_train_augmented, y_train_augmented, cv=5)
#print(np.mean(scores))

In [None]:
# Write the submission file: one row per prediction, with ImageId counting
# up from 1. The id range is derived from the number of predictions rather
# than a hard-coded 28000, so the lengths always match.
df = pd.DataFrame(
    {
        "ImageId": np.arange(1, len(predictions) + 1),
        "Label": predictions,
    }
)
df.to_csv("kneighbors_predictions.csv", index=False)