# MNIST KNN CLassifier with 97% Accuracy

## Initialize

In [2]:
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)


## Fetch Data

In [3]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

## Peepare Data

In [4]:
X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

print('Training: ', X_train.shape)
print('Test: ', X_test.shape)


Training:  (60000, 784)
Test:  (10000, 784)


## Train KNN Model

In [5]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

## Accuracy Score

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
y_pred = knn_clf.predict(X_test)


knn_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
knn_prc = precision_score(y_true=y_test, y_pred=y_pred, average=None)
knn_rec = recall_score(y_true=y_test, y_pred=y_pred, average=None)
knn_f1 = f1_score(y_true=y_test, y_pred=y_pred, average=None)
knn_conf = confusion_matrix(y_true=y_test, y_pred=y_pred)

print('Accuracy: {:.4f}'.format(knn_acc))
print('\nPrecision: ',knn_prc)
print('\nRecall: ', knn_rec)
print('\nF1-score: ', knn_f1)
print('\nConfusion Matrix: ',knn_conf)


Accuracy: 0.9705

Precision:  [0.96626984 0.95773457 0.98224852 0.96347483 0.97535934 0.96625422
 0.98333333 0.96494645 0.98917749 0.96031746]

Recall:  [0.99387755 0.99823789 0.96511628 0.96633663 0.96741344 0.96300448
 0.98538622 0.96400778 0.93839836 0.95936571]

F1-score:  [0.97987928 0.97756687 0.97360704 0.96490361 0.97137014 0.96462661
 0.98435871 0.96447689 0.96311907 0.95984135]

Confusion Matrix:  [[ 974    1    1    0    0    1    2    1    0    0]
 [   0 1133    2    0    0    0    0    0    0    0]
 [  10    9  996    2    0    0    0   13    2    0]
 [   0    2    4  976    1   13    1    7    3    3]
 [   1    6    0    0  950    0    4    2    0   19]
 [   6    1    0   11    2  859    5    1    3    4]
 [   5    3    0    0    3    3  944    0    0    0]
 [   0   21    5    0    1    0    0  991    0   10]
 [   8    2    4   16    8   11    3    4  914    4]
 [   4    5    2    8    9    2    1    8    2  968]]


## Optimize Prarmeters

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

knn_clf2 = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf2, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)



Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... n_neighbors=3, weights=uniform, score=0.972, total=16.5min
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 16.5min remaining:    0.0s


[CV] ...... n_neighbors=3, weights=uniform, score=0.971, total=16.5min
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 33.0min remaining:    0.0s


[CV] ...... n_neighbors=3, weights=uniform, score=0.969, total=16.5min
[CV] n_neighbors=3, weights=uniform ..................................
[CV] ...... n_neighbors=3, weights=uniform, score=0.969, total=16.5min
[CV] n_neighbors=3, weights=uniform ..................................
[CV] ...... n_neighbors=3, weights=uniform, score=0.970, total=16.5min
[CV] n_neighbors=3, weights=distance .................................
[CV] ..... n_neighbors=3, weights=distance, score=0.972, total=16.5min
[CV] n_neighbors=3, weights=distance .................................
[CV] ..... n_neighbors=3, weights=distance, score=0.972, total=16.6min
[CV] n_neighbors=3, weights=distance .................................
[CV] ..... n_neighbors=3, weights=distance, score=0.970, total=16.6min
[CV] n_neighbors=3, weights=distance .................................
[CV] ..... n_neighbors=3, weights=distance, score=0.970, total=16.6min
[CV] n_neighbors=3, weights=distance .................................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 496.5min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'n_neighbors': [3, 4, 5],
                          'weights': ['uniform', 'distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [10]:
print("\nBest Parameters", grid_search.best_params_)
print("\nBest Score", grid_search.best_score_)


Best Parameters {'n_neighbors': 4, 'weights': 'distance'}

Best Score 0.9716166666666666


In [11]:
y_pred_opt = grid_search.predict(X_test)


In [12]:
knn_acc2 = accuracy_score(y_true=y_test, y_pred=y_pred_opt)
knn_prc2 = precision_score(y_true=y_test, y_pred=y_pred_opt, average=None)
knn_rec2 = recall_score(y_true=y_test, y_pred=y_pred_opt, average=None)
knn_f12 = f1_score(y_true=y_test, y_pred=y_pred_opt, average=None)
knn_conf2 = confusion_matrix(y_true=y_test, y_pred=y_pred_opt)

print('Accuracy: {:.4f}'.format(knn_acc2))
print('\nPrecision: ',knn_prc2)
print('\nRecall: ', knn_rec2)
print('\nF1-score: ', knn_f12)
print('\nConfusion Matrix: ',knn_conf2)

Accuracy: 0.9714

Precision:  [0.973      0.96834902 0.98417409 0.96819085 0.97535934 0.96312849
 0.97828335 0.95945946 0.98818475 0.95746785]

Recall:  [0.99285714 0.99735683 0.96414729 0.96435644 0.96741344 0.96636771
 0.9874739  0.96692607 0.94455852 0.95936571]

F1-score:  [0.98282828 0.98263889 0.97405776 0.96626984 0.97137014 0.96474538
 0.98285714 0.96317829 0.96587927 0.95841584]

Confusion Matrix:  [[ 973    1    1    0    0    1    3    1    0    0]
 [   0 1132    2    0    0    0    1    0    0    0]
 [  10    5  995    2    1    0    0   16    3    0]
 [   0    1    3  974    1   14    1    7    4    5]
 [   1    5    0    0  950    0    4    3    0   19]
 [   4    0    0    9    2  862    7    1    3    4]
 [   4    2    0    0    3    3  946    0    0    0]
 [   0   17    4    0    3    0    0  994    0   10]
 [   5    2    4   14    5   11    4    4  920    5]
 [   3    4    2    7    9    4    1   10    1  968]]
