In [1]:
import sklearn 
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
#fetching the mnist dataset, the dataset contains 28*28 images of digits and their respective labels
mnist = fetch_openml("mnist_784", version =1)

In [3]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [43]:
X, Y = mnist ["data"], mnist["target"]

In [44]:
Y = Y.astype(int)

In [45]:
#scaling of the images to make classifier work better
from sklearn.preprocessing import StandardScaler

In [46]:
scaler = StandardScaler()

In [47]:
scaler.fit(X)

StandardScaler()

In [48]:
X_new = scaler.transform(X)

In [49]:
# splitting into test and train dataset 
# with the mnist dataset the split is already set with [:60000], shuffling is not necessary

In [102]:
X_train_small, Y_train_small = X_new[:10000], Y[:10000]

In [103]:
X_test, Y_test = X_new[60000:], Y[60000:]

In [105]:
#grid search for optimal hyperparameters for n_neighbors and weights

In [106]:
from sklearn.neighbors import KNeighborsClassifier

In [107]:
classifier = KNeighborsClassifier()

In [108]:
from sklearn.model_selection import GridSearchCV

In [109]:
grid_param = [{"n_neighbors":[4, 5, 6], "weights": ["uniform", "distance"]}]

In [110]:
grid_Search = GridSearchCV(classifier, grid_param , return_train_score=True)

In [111]:
# the grid search on part on the first 10000 entries of the dataset yields better scores for weights = distance
# and optimal n neighbors in the range 4-6
# for the sake of training time I only trained with the whole dataset on n_neighbors = 5 and weights = distance
import time
start = time.time()
grid_Search.fit(X_train_small, Y_train_small)
end = time.time()
print(end-start)

47.261730909347534


In [112]:
grid_Search.cv_results_

{'mean_fit_time': array([0.01575203, 0.01219835, 0.01257   , 0.0122478 , 0.0122725 ,
        0.0123312 ]),
 'std_fit_time': array([0.00659185, 0.0001346 , 0.00071091, 0.00013488, 0.00020239,
        0.000175  ]),
 'mean_score_time': array([0.37373185, 0.3434937 , 0.35696449, 0.34688754, 0.36082969,
        0.34428358]),
 'std_score_time': array([0.01212464, 0.01531721, 0.01317216, 0.01526749, 0.01022908,
        0.01610887]),
 'param_n_neighbors': masked_array(data=[4, 4, 5, 5, 6, 6],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_weights': masked_array(data=['uniform', 'distance', 'uniform', 'distance',
                    'uniform', 'distance'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 4, 'weights': 'uniform'},
  {'n_neighbors': 4, 'weights': 'distance'},
  {'n_neighbors': 5, 'weights': 'uniform'},
  {'n_neig

In [None]:
# training the Kneighborsclassifier with n = 5 and weights = distance

In [113]:
classifier.set_params(n_neighbors = 5,weights = "distance")

KNeighborsClassifier(weights='distance')

In [115]:
#grid search takes a lot more time than fitting with pre-set parameters
# all the combinations of hyperparameters no longer need to be tested 
start = time.time()
classifier.fit(X_train_small,  Y_train_small)
end = time.time()
print(end-start)

0.010926961898803711


In [118]:
#train the classifier on the whole dataset 
X_train, Y_train = X_new[:60000], Y[:60000]
classifier.fit(X_train,  Y_train)

KNeighborsClassifier(weights='distance')

In [None]:
#Error analysis with the help of a confusion matrix

In [120]:
from sklearn.model_selection import cross_val_predict

Y_train_predict = cross_val_predict(classifier, X_train, Y_train, cv = 3)

In [124]:
classifier.classes_

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [123]:
# the confusion matrix shows how many digits are correctly predicted 
# e.g. row 0, column 0 shows how many 0 are predicted correctly 
# row 0, colum 1 shows how many 0's are wrongly predicted to be 1's
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_train, y_train_predict)
conf_matrix

array([[5820,    5,   16,    5,    1,   21,   42,    5,    5,    3],
       [   2, 6664,   18,   10,    8,    5,   17,   10,    4,    4],
       [  62,   63, 5519,   93,   35,   11,   54,   57,   40,   24],
       [  12,   26,   47, 5783,    4,   83,    8,   70,   61,   37],
       [   6,   69,   49,    7, 5408,   18,   27,   32,   10,  216],
       [  26,   13,   12,  153,   13, 4991,   96,   12,   51,   54],
       [  60,   13,   16,    4,   12,   46, 5757,    1,    9,    0],
       [  11,   57,   24,   16,   61,    2,    0, 5866,    4,  224],
       [  46,   83,   35,  104,   38,  192,   36,   20, 5216,   81],
       [  18,   10,   18,   48,  117,   13,    1,  208,   19, 5497]])

In [130]:
#receive the percentages for each confusion matrix entry
row_sums = conf_matrix.sum(axis = 1)
percent_conf = conf_matrix / row_sums 
percent_conf

array([[9.82610164e-01, 7.41619697e-04, 2.68546492e-03, 8.15527646e-04,
        1.71174255e-04, 3.87382402e-03, 7.09699223e-03, 7.98084597e-04,
        8.54554777e-04, 5.04286435e-04],
       [3.37666723e-04, 9.88430733e-01, 3.02114804e-03, 1.63105529e-03,
        1.36939404e-03, 9.22339052e-04, 2.87259209e-03, 1.59616919e-03,
        6.83643822e-04, 6.72381913e-04],
       [1.04676684e-02, 9.34440819e-03, 9.26317556e-01, 1.51688142e-02,
        5.99109894e-03, 2.02914591e-03, 9.12470429e-03, 9.09816441e-03,
        6.83643822e-03, 4.03429148e-03],
       [2.02600034e-03, 3.85642243e-03, 7.88855321e-03, 9.43239276e-01,
        6.84697022e-04, 1.53108283e-02, 1.35180804e-03, 1.11731844e-02,
        1.04255683e-02, 6.21953269e-03],
       [1.01300017e-03, 1.02343518e-02, 8.22423632e-03, 1.14173870e-03,
        9.25710373e-01, 3.32042059e-03, 4.56235215e-03, 5.10774142e-03,
        1.70910955e-03, 3.63086233e-02],
       [4.38966740e-03, 1.92821121e-03, 2.01409869e-03, 2.49551460e-02,
   

In [133]:
precent_correctly_predicted = {}
for i in range(10):
    precent_correctly_predicted[i] = percent_conf[i][i] * 100

In [134]:
precent_correctly_predicted

{0: 98.26101637683607,
 1: 98.8430732720261,
 2: 92.63175562269218,
 3: 94.323927581145,
 4: 92.57103731598768,
 5: 92.06788415421508,
 6: 97.27948631294356,
 7: 93.63128491620112,
 8: 89.14715433259272,
 9: 92.40208438393007}

In [None]:
from statistics import mean
average = mean(list(precent_correctly_predicted.values()))
average

In [None]:
# save model 
import pickle
file_name = 'mnist_Kneighbors_model.sav'
pickle.dump(classifier, open(file_name, 'wb'))

In [2]:
# load model 
import pickle
file_name = 'mnist_Kneighbors_model.sav'
classifier = pickle.load(open(file_name, 'rb')) 