In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter

In [6]:
train_data = pd.read_csv('mnist_train.csv')
y_train_full = train_data.iloc[:, 0].values
X_train_full = train_data.iloc[:, 1:].values
 
test_data = pd.read_csv('mnist_test.csv')
y_test = test_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values
 
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0
 
 
X_combined = np.vstack((X_train_full, X_test))
y_combined = np.hstack((y_train_full, y_test))
 
 
X_train, X_remaining, y_train, y_remaining = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined
)
 
X_val, X_test, y_val, y_test = train_test_split(
    X_remaining, y_remaining, test_size=0.5, random_state=42, stratify=y_remaining
)
 
 
print(f"Training Data Shape: {X_train.shape}, Training Labels Shape: {y_train.shape}")
print(f"Validation Data Shape: {X_val.shape}, Validation Labels Shape: {y_val.shape}")
print(f"Test Data Shape: {X_test.shape}, Test Labels Shape: {y_test.shape}")


Training Data Shape: (55998, 784), Training Labels Shape: (55998,)
Validation Data Shape: (7000, 784), Validation Labels Shape: (7000,)
Test Data Shape: (7000, 784), Test Labels Shape: (7000,)


In [14]:
import numpy as np

def distance_calc(X_train, X_test, distance_metric='euclidean'):
    if distance_metric == 'euclidean':
        a_squared = np.sum(X_train ** 2, axis = 1)
        b_squared = np.sum(X_test ** 2, axis = 1).reshape(-1,1)

        a_dot_b = X_test @ X_train.T
        distances = np.sqrt(a_squared + b_squared - 2*a_dot_b)

    return distances


In [17]:
class KNNCustom:
    def __init__(self,k):
        self.k = k
    
    def predict(self,distance_matrix,y_train_labels):

        predictions = []

        for distances in distance_matrix:
            nearest_indices = np.argsort(distances)[:self.k]
            nearest_labels = y_train_labels[nearest_indices]
            frequency_labels = Counter(nearest_labels)
            winner = frequency_labels.most_common(1)[0][0]
            predictions.append(winner)

        return np.array(predictions)

In [18]:
distance_matrix_val = distance_calc(X_train, X_val, distance_metric='euclidean')
distance_matrix_test = distance_calc(X_train, X_test, distance_metric='euclidean')

knn = KNNCustom(k=5)

y_val_pred = knn.predict(distance_matrix_val, y_train)
y_test_pred = knn.predict(distance_matrix_test, y_train)

accuracy_val = np.mean(y_val_pred == y_val)
accuracy_test = np.mean(y_test_pred == y_test)

print(f"Validation Accuracy: {accuracy_val * 100:.2f}%")
print(f"Test Accuracy: {accuracy_test * 100:.2f}%")


Validation Accuracy: 97.19%
Test Accuracy: 96.96%
