# Implementing K-NN on MNIST Dataset

In [16]:
# Data loader implementation for the MNIST dataset (CSV)

import numpy as np
import pandas as pd


def load_mnistcsv(path):
    """Load an MNIST-style CSV file into feature and label arrays.

    The first column of the file is used as the label and every remaining
    column as a pixel value; the pixel columns are scaled by ``Normalization``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A ``(data, data_labels)`` tuple of NumPy arrays: the scaled pixel
        columns and the label column.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        pandas.errors.EmptyDataError: If the file has no data to parse.
        pandas.errors.ParserError: If the file is not valid CSV.
    """
    try:
        df = pd.read_csv(path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Report which file failed and why, then re-raise: swallowing the
        # error would return None and make the caller's tuple unpacking fail
        # with an unrelated TypeError.
        print(f"Error: could not load {path!r} ({exc})")
        raise

    data = Normalization(df.iloc[:, 1:])
    data_labels = df.iloc[:, 0]
    print("Data Loaded Successfully. ")
    return np.array(data), np.array(data_labels)

# Normalize pixel values to [0, 1]
def Normalization(data):
    """Divide pixel values by 255.0 so inputs in 0-255 land in [0, 1].

    Args:
        data: A number or any object supporting division by a float
            (NumPy array, pandas DataFrame, ...).

    Returns:
        ``data / 255.0``, in whatever type that division produces.
    """
    max_pixel_value = 255.0
    return data / max_pixel_value

In [17]:
# Download dataset from -> https://github.com/iamavenger/Datasets/releases/download/DATASETS/MNIST_DATASET_CSV.zip

# Load the train and test splits from the MNIST_DATASET_CSV folder.
train_data, train_labels = load_mnistcsv(r"MNIST_DATASET_CSV/mnist_train.csv")
test_data, test_labels = load_mnistcsv(r"MNIST_DATASET_CSV/mnist_test.csv")

Data Loaded Successfully. 
Data Loaded Successfully. 


In [18]:
# Implementing the k-nearest neighbours algorithm from scratch -> using L2 distance (Euclidean distance)

from collections import Counter

class KNN():
    """Brute-force k-nearest-neighbours classifier using Euclidean (L2) distance.

    ``fit`` only stores the training set; all distance computation happens in
    ``predict``, one test sample at a time.
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` closest training samples.

        Args:
            k: Number of neighbours that take part in the vote; must be >= 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            # With no neighbours there is nothing to vote on; fail here rather
            # than with an IndexError deep inside predict().
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, train_data, train_labels):
        """Store the training samples and their labels.

        Args:
            train_data: Array-like with one training sample per row.
            train_labels: Array-like with one label per training sample.
        """
        # Coerce to ndarrays so predict() can index labels positionally with an
        # index array; lists or pandas objects would fail or index by label.
        self.train_data = np.asarray(train_data)
        self.train_labels = np.asarray(train_labels)

    def predict(self, test_data):
        """Predict a label for every sample in ``test_data``.

        Args:
            test_data: Array-like with one test sample per row, with the same
                number of features as the training samples.

        Returns:
            NumPy array holding one predicted label per test sample.
        """
        predictions = []
        for test_sample in np.asarray(test_data):
            # L2 distance from this sample to every stored training sample.
            distances = np.linalg.norm(self.train_data - test_sample, axis=1)
            nearest_neighbours = np.argsort(distances)[:self.k]
            nearest_labels = self.train_labels[nearest_neighbours]

            # Majority vote; on a tie Counter keeps the label it saw first,
            # i.e. the one belonging to the closer neighbour.
            most_common_label = Counter(nearest_labels).most_common(1)[0][0]
            predictions.append(most_common_label)
        return np.array(predictions)
    

In [19]:
# Fit a 3-nearest-neighbour classifier on the first 4000 training images.
knn = KNN(k=3)
knn.fit(train_data[:4000], train_labels[:4000])

# Score it on the first 1000 test images; accuracy is expressed as a percentage.
predictions = knn.predict(test_data[:1000])
accuracy = np.mean(predictions == test_labels[:1000]) * 100

print(f"Accuracy: {accuracy:.2f} %")


Accuracy: 91.10 %


In [20]:
%matplotlib inline

In [None]:
# Setting Hyperparameters
import matplotlib.pyplot as plt


def hyperpar(k, n_train=4000, n_test=1000):
    """Return the test accuracy (in percent) of a KNN classifier for a given ``k``.

    Args:
        k: Number of neighbours passed to ``KNN``.
        n_train: How many leading rows of ``train_data`` to fit on.
        n_test: How many leading rows of ``test_data`` to score on.

    Returns:
        Percentage of the selected test samples whose predicted label matches
        ``test_labels``.
    """
    # Use the k under evaluation; a fixed value would make every point of the
    # sweep identical.
    model = KNN(k=k)
    model.fit(train_data[:n_train], train_labels[:n_train])
    predictions = model.predict(test_data[:n_test])
    return np.mean(predictions == test_labels[:n_test]) * 100

# Evaluate k = 1..7 and collect the accuracy obtained for each value.
x_points = list(range(1, 8))
y_points = [hyperpar(k) for k in x_points]

plt.plot(x_points, y_points)
plt.xlabel('k- value')
plt.ylabel('Accuracy')
plt.title("K - Value vs Accuracy")

# The pyplot module is imported as `plt`; there is no `plot` name in scope.
plt.show()
