# K-Nearest neighbours

In [62]:
import pandas as pd

# Importing the dataset which is images in a csv file
df = pd.read_csv("mnist_test.csv")

df.head()
# label is the number it represents. The rest is the pixel value from 0-255 for that pixel

Unnamed: 0,7,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Splitting the dataset into the dependent and independent variables
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Fitting K-NN to the Training set with 5 neighbors
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

In [4]:
# Predicting the Test set results
y_pred = model.predict(X_test)

In [5]:
from sklearn.metrics import classification_report, confusion_matrix

# Creating the Confusion Matrix and Classification report
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Classification report:\n", cr)
print("\nConfusion matrix:\n", cm)

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97       205
           1       0.90      1.00      0.95       218
           2       0.96      0.91      0.93       192
           3       0.91      0.96      0.93       201
           4       0.96      0.94      0.95       205
           5       0.96      0.92      0.94       198
           6       0.94      0.97      0.96       186
           7       0.93      0.96      0.95       193
           8       0.98      0.84      0.90       191
           9       0.94      0.93      0.94       211

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000


Confusion matrix:
 [[203   0   0   0   0   1   1   0   0   0]
 [  0 218   0   0   0   0   0   0   0   0]
 [  3   5 174   1   0   0   2   4   2   1]
 [  0   3   2 192   1   1   0   2   0   0]
 [  0   3   0   0 192   0

I can see we have pretty accuracy all over the model with 0.90+ for all metrics in all classes.


From the matrix I can see from the numbers on the diagonal line that most of the test images are interpreted correctly.

In [44]:
from PIL import Image
import numpy as np

# Importing the image using PIL.Image's open function and converting it to grayscale
img = Image.open("knn_test.png").convert('L')
# Convert the image to a numpy array
img_arr = np.array(img)
print(img_arr.shape)
print(X[0].shape)

(28, 28)
(784,)


In [59]:
# From the calculations above I see that the images in the dataset is flattened to a single array
# While the sample image is still a 2d image, so it needs to be flattened to 1d and reshaped
# to a 2d array with one line to fit the model.
img_reshaped = img_arr.flatten().reshape(1,-1)

In [61]:
# Predicting the class of the image
pred_proba = model.predict_proba(img_reshaped)
# Getting the "name" of the class using the argmax function
pred_class = np.argmax(pred_proba)

# The probability array gives us a value for the similarity fo alle the different classes
# The predicted class will be the highest number in this array.
print("The probability array:\n\t", pred_proba)
print("The predicted class of the model:\n\t", pred_class)

The probability array:
	 [[0.  0.  0.2 0.6 0.  0.2 0.  0.  0.  0. ]]
The predicted class of the model:
	 3


KNN looks at the amount of images closest to the sample image using the pixels of the image. I have 5 neighbors, which means it takes the 5 most similar images from the test set and counts how many we have of each class.


In this example I can see from the probability array that i have one image depicting "2", one depicting "4" and three depicting "3". Therefore the model predicts the class 3 for our image. which is correct