In [15]:
import numpy as np
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier 
from sklearn import datasets, metrics
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Loading the MNIST data
mnist = fetch_mldata('MNIST original', data_home = os.getcwd())
print(mnist)

images = mnist.data
targets = mnist.target

X_data = images
Y = targets

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_data, Y, test_size=0.15, random_state=42)

{'DESCR': 'mldata.org dataset: mnist-original', 'COL_NAMES': ['label', 'data'], 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.]), 'data': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}


In [3]:
# Adding the y_train vector to the X_train matrix so that I can split the observations into clusters based
# on their labels
X_train_new = np.concatenate((y_train[:, np.newaxis], X_train), axis=1)
cluster_means = []

In [4]:
# Caculating the mean of each cluster
for i in range(10):
    sub_mat = X_train_new[np.where(X_train_new[:,0] == i)].mean(axis=0)
    cluster_means.append(sub_mat)
    cluster_means[i] = cluster_means[i].reshape(X_train_new.shape[1],1)

In [5]:
# Helper functions
def euclidean_distance(a,b):
    return np.linalg.norm(a-b)
def assign_cluster(a,centers):
    dists = np.array([euclidean_distance(a,x[1:]) for x in centers])
    return np.argmin(dists)

In [6]:
###### APPROACH 1 #########
correct = 0
for i in range(len(X_test)):
    c = assign_cluster(X_test[i,:],cluster_means)
    if c == y_test[i]:
        correct += 1

In [7]:
# The acccuracy rate by using the first method
acc_rate = correct/len(X_test)
acc_rate

0.1700952380952381

In [8]:
# Data required for testing to determine which k is the best
(train_data,val_data,train_labels,val_labels) = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [9]:
###### APPROACH 2 #########
percentages = []

# Testing which k gives the highest accuracy by testing the model on the validation data
for k in range(1,10,2):
    model = KNeighborsClassifier(n_neighbors = k)
    model.fit(train_data,train_labels)
    
    # evaluate the model and update the accuracies list
    score = model.score(val_data,val_labels)
    print('k=%d, accuracy=%.2f%%' % (k,score*100))
    percentages.append(score)

accuracies = np.array(percentages) 

k=1, accuracy=97.50%
k=3, accuracy=97.16%
k=5, accuracy=97.11%
k=7, accuracy=96.91%
k=9, accuracy=96.64%


In [20]:
# finding the value of k that has the highest accuracy
i = np.argmax(accuracies)
kVals = [i for i in range(1,10,2)]
print('k = %d achieved the highest accuracy of %.2f%% on the validation data' % (kVals[i], accuracies[i]*100))

k = 1 achieved the highest accuracy of 97.50% on the validation data


In [22]:
# Using the best k on the original training and test data to calculate the accuracy
model = KNeighborsClassifier(n_neighbors = 1)
model.fit(X_train,y_train)
predictions = model.predict(X_test)

In [21]:
# Show a final classification report that demonstrates the accuracy of the classifier 
# for each of the digits
print("Evaluation on testing data:")
print(classification_report(y_test,predictions))

Evaluation on testing data:
             precision    recall  f1-score   support

        0.0       0.98      0.99      0.99      1024
        1.0       0.97      1.00      0.99      1185
        2.0       0.99      0.96      0.98      1051
        3.0       0.97      0.97      0.97      1057
        4.0       0.98      0.97      0.97       964
        5.0       0.97      0.97      0.97       964
        6.0       0.98      0.99      0.99      1085
        7.0       0.96      0.98      0.97      1128
        8.0       0.98      0.95      0.96      1037
        9.0       0.95      0.95      0.95      1005

avg / total       0.97      0.97      0.97     10500

