In [59]:
# reference https://www.youtube.com/watch?v=ngLyX54e1LU

In [1]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, classification_report
  
# Load the iris dataset and pull out the feature matrix and label vector
irisData = load_iris()
X, y = irisData.data, irisData.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit a 7-nearest-neighbour classifier on the training portion
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Mean accuracy on the held-out test set
print("Accuracy:", knn.score(X_test, y_test))

Accuracy: 0.9666666666666667


In [2]:
# Predict labels for the held-out samples with the fitted sklearn model
y_pred = knn.predict(X_test)

# Per-class metrics, a separator line, then the confusion matrix.
# One print call with sep='\n' emits exactly what four separate prints would.
print(
    classification_report(y_test, y_pred),
    '-'*55,
    'Confusion Matrix\n',
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30

-------------------------------------------------------
Confusion Matrix

[[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]


In [3]:
import numpy as np

In [4]:
def k_nearest_neighbors(X_train, y_train, X_test, y_test=None, k=3, verbose=True):
    """Predict a label for every row of ``X_test`` with a brute-force k-NN vote.

    For each test row the Euclidean distance to every training row is
    computed, the ``k`` closest training rows are selected, and the most
    frequent label among them becomes the prediction. If several labels are
    equally frequent, the smallest one (in sorted order) wins.

    Parameters
    ----------
    X_train : array-like of shape (n_train, n_features)
        Training feature rows.
    y_train : array-like of shape (n_train,)
        Training labels. Any dtype ``np.unique`` can sort is accepted
        (non-negative ints, negative ints, strings, ...).
    X_test : array-like of shape (n_test, n_features)
        Rows to classify.
    y_test : array-like of shape (n_test,), optional
        True labels of ``X_test``. They do not influence the prediction; they
        are only echoed in the per-sample printout when ``verbose`` is true.
    k : int, default 3
        Number of neighbours that vote. Must be at least 1. If it exceeds the
        number of training rows, all training rows vote.
    verbose : bool, default True
        Print one line per test row: the row, the true label (when ``y_test``
        is given) and the predicted label.

    Returns
    -------
    numpy.ndarray of shape (n_test,)
        Predicted label for each test row.

    Raises
    ------
    ValueError
        If ``k < 1``, if ``X_train`` and ``y_train`` differ in length, or if
        ``y_test`` is given and its length differs from ``X_test``.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train).ravel()
    X_test = np.asarray(X_test)

    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    if len(X_train) != len(y_train):
        raise ValueError(
            f"X_train and y_train have different lengths: {len(X_train)} != {len(y_train)}"
        )
    if y_test is not None:
        y_test = np.asarray(y_test)
        # Zipping the two would silently drop predictions on a length mismatch.
        if len(y_test) != len(X_test):
            raise ValueError(
                f"X_test and y_test have different lengths: {len(X_test)} != {len(y_test)}"
            )

    y_pred = []
    for i, x_test in enumerate(X_test):
        # Euclidean distance from this sample to every training row
        distances = np.linalg.norm(X_train - x_test, axis=1)

        # Indices of the k smallest distances (ascending sort: nearest first)
        nearest = distances.argsort()[:k]

        # Majority vote. np.unique returns labels in sorted order, so argmax
        # picks the smallest label on a tie, and labels keep their own dtype
        # instead of being round-tripped through a float column.
        labels, counts = np.unique(y_train[nearest], return_counts=True)
        label = labels[counts.argmax()]

        if verbose:
            if y_test is None:
                print(x_test, label)
            else:
                print(x_test, y_test[i], label)

        y_pred.append(label)

    return np.array(y_pred)

In [5]:
# Run the from-scratch classifier with k=7, the same k given to the sklearn model
y_pred = k_nearest_neighbors(X_train, y_train, X_test, y_test, k=7)

# Same report layout as for the sklearn model so the two can be compared directly.
# One print call with sep='\n' emits exactly what four separate prints would.
print(
    classification_report(y_test, y_pred),
    '-'*55,
    'Confusion Matrix\n',
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

[6.1 2.8 4.7 1.2] 1 1
[5.7 3.8 1.7 0.3] 0 0
[7.7 2.6 6.9 2.3] 2 2
[6.  2.9 4.5 1.5] 1 1
[6.8 2.8 4.8 1.4] 1 1
[5.4 3.4 1.5 0.4] 0 0
[5.6 2.9 3.6 1.3] 1 1
[6.9 3.1 5.1 2.3] 2 2
[6.2 2.2 4.5 1.5] 1 2
[5.8 2.7 3.9 1.2] 1 1
[6.5 3.2 5.1 2. ] 2 2
[4.8 3.  1.4 0.1] 0 0
[5.5 3.5 1.3 0.2] 0 0
[4.9 3.1 1.5 0.1] 0 0
[5.1 3.8 1.5 0.3] 0 0
[6.3 3.3 4.7 1.6] 1 1
[6.5 3.  5.8 2.2] 2 2
[5.6 2.5 3.9 1.1] 1 1
[5.7 2.8 4.5 1.3] 1 1
[6.4 2.8 5.6 2.2] 2 2
[4.7 3.2 1.6 0.2] 0 0
[6.1 3.  4.9 1.8] 2 2
[5.  3.4 1.6 0.4] 0 0
[6.4 2.8 5.6 2.1] 2 2
[7.9 3.8 6.4 2. ] 2 2
[6.7 3.  5.2 2.3] 2 2
[6.7 2.5 5.8 1.8] 2 2
[6.8 3.2 5.9 2.3] 2 2
[4.8 3.  1.4 0.3] 0 0
[4.8 3.1 1.6 0.2] 0 0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg  

This matches the sklearn result exactly.

In [7]:
one_sample = X_test[0]

In [8]:
one_sample.shape

(4,)

In [9]:
one_sample = X_test[:3]

In [10]:
one_sample.shape

(3, 4)

In [11]:
one_sample = X_test[0]

In [12]:
one_sample.shape

(4,)

In [13]:
X_train.shape

(120, 4)

In [15]:
# NumPy broadcasting: the (4,) sample is subtracted from every row of the (120, 4)
# training matrix, giving the per-feature differences for each training point.
(X_train - one_sample)

array([[-1.5,  0.8, -3.7, -1. ],
       [-0.4,  1.6, -3.2, -0.8],
       [ 0.6,  0.3, -0.3,  0.2],
       [-1.3,  0.6, -3.1, -1. ],
       [-1.7,  0.4, -3.4, -1. ],
       [ 0.2, -0.3,  0.3,  0.7],
       [ 0.3,  0.4, -0.2,  0.3],
       [-0.9,  0.7, -3.2, -1. ],
       [-1.1,  0.8, -3.3, -1. ],
       [-0.9,  1.3, -3.2, -1.1],
       [-0.3, -0.1,  0.4,  0.7],
       [-0.1,  0.6, -0.2,  0.4],
       [ 0.6,  0.3,  0. ,  0.3],
       [-0.7,  1.1, -3.4, -0.8],
       [-0.7,  0.9, -3.2, -1. ],
       [-0.6, -0.4, -1. , -0.2],
       [ 0.2,  0. ,  0.4,  0.3],
       [ 0.3,  0.3,  0.8,  0.6],
       [ 0.5,  0.2, -0.3,  0.2],
       [ 1.1,  0.8,  1.4,  1.3],
       [-0.4,  0.1, -0.5,  0.1],
       [ 1.5,  0.2,  1.9,  0.9],
       [-0.5,  0.2, -0.2,  0.3],
       [-1. ,  0.7, -3.3, -1. ],
       [ 1.6,  0. ,  2. ,  0.8],
       [-0.3, -0.1, -0.6, -0.2],
       [-0.9,  0.6, -3.3, -1. ],
       [-1.1,  0.7, -3.4, -0.9],
       [-1. ,  1. , -2.8, -0.8],
       [-1.1, -0.8, -1.2, -0.2],
       [ 0

In [27]:
# Euclidean (L2) norm of each difference row (axis=1), i.e. the distance from
# `one_sample` to every training point; reshape(-1, 1) makes it a single column.
np.linalg.norm(X_train - one_sample, axis=1).reshape(-1, 1)

array([[4.19285106],
       [3.68781778],
       [0.76157731],
       [3.55808937],
       [3.95094925],
       [0.84261498],
       [0.6164414 ],
       [3.54118624],
       [3.7067506 ],
       [3.73496988],
       [0.8660254 ],
       [0.75498344],
       [0.73484692],
       [3.72827038],
       [3.54118624],
       [1.2489996 ],
       [0.53851648],
       [1.08627805],
       [0.64807407],
       [2.34520788],
       [0.65574385],
       [2.59036677],
       [0.64807407],
       [3.65786823],
       [2.68328157],
       [0.70710678],
       [3.6138622 ],
       [3.75099987],
       [3.23728281],
       [1.82482876],
       [0.67082039],
       [3.3       ],
       [3.44383507],
       [3.27719392],
       [0.72111026],
       [3.54964787],
       [0.67082039],
       [2.92574777],
       [3.78153408],
       [0.43588989],
       [0.98994949],
       [3.76961536],
       [0.64807407],
       [0.8660254 ],
       [0.9486833 ],
       [0.81240384],
       [1.43874946],
       [1.053

In [29]:
y_train.reshape(-1, 1)

array([[0],
       [0],
       [1],
       [0],
       [0],
       [2],
       [1],
       [0],
       [0],
       [0],
       [2],
       [1],
       [1],
       [0],
       [0],
       [1],
       [2],
       [2],
       [1],
       [2],
       [1],
       [2],
       [1],
       [0],
       [2],
       [1],
       [0],
       [0],
       [0],
       [1],
       [2],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [2],
       [0],
       [1],
       [2],
       [0],
       [2],
       [2],
       [1],
       [1],
       [2],
       [1],
       [0],
       [1],
       [2],
       [0],
       [0],
       [1],
       [1],
       [0],
       [2],
       [0],
       [0],
       [1],
       [1],
       [2],
       [1],
       [2],
       [2],
       [1],
       [0],
       [0],
       [2],
       [2],
       [0],
       [0],
       [0],
       [1],
       [2],
       [0],
       [2],
       [2],
       [0],
       [1],
       [1],
       [2],
       [1],
    

In [31]:
# Put the distances and the training labels side by side as two columns.
# The integer labels show up as floats (0., 1., 2.) because the combined array
# takes the float dtype of the distance column.
np.concatenate([np.linalg.norm(X_train - one_sample, axis=1).reshape(-1, 1),
                y_train.reshape(-1, 1)], axis=1)

array([[4.19285106, 0.        ],
       [3.68781778, 0.        ],
       [0.76157731, 1.        ],
       [3.55808937, 0.        ],
       [3.95094925, 0.        ],
       [0.84261498, 2.        ],
       [0.6164414 , 1.        ],
       [3.54118624, 0.        ],
       [3.7067506 , 0.        ],
       [3.73496988, 0.        ],
       [0.8660254 , 2.        ],
       [0.75498344, 1.        ],
       [0.73484692, 1.        ],
       [3.72827038, 0.        ],
       [3.54118624, 0.        ],
       [1.2489996 , 1.        ],
       [0.53851648, 2.        ],
       [1.08627805, 2.        ],
       [0.64807407, 1.        ],
       [2.34520788, 2.        ],
       [0.65574385, 1.        ],
       [2.59036677, 2.        ],
       [0.64807407, 1.        ],
       [3.65786823, 0.        ],
       [2.68328157, 2.        ],
       [0.70710678, 1.        ],
       [3.6138622 , 0.        ],
       [3.75099987, 0.        ],
       [3.23728281, 0.        ],
       [1.82482876, 1.        ],
       [0.

In [32]:
# Two-column array: column 0 is the distance from `one_sample` to each training
# row, column 1 is that row's label (stored as float alongside the distances).
distance_label = np.column_stack(
    (np.linalg.norm(X_train - one_sample, axis=1), y_train)
)
print(distance_label)

array([[4.19285106, 0.        ],
       [3.68781778, 0.        ],
       [0.76157731, 1.        ],
       [3.55808937, 0.        ],
       [3.95094925, 0.        ],
       [0.84261498, 2.        ],
       [0.6164414 , 1.        ],
       [3.54118624, 0.        ],
       [3.7067506 , 0.        ],
       [3.73496988, 0.        ],
       [0.8660254 , 2.        ],
       [0.75498344, 1.        ],
       [0.73484692, 1.        ],
       [3.72827038, 0.        ],
       [3.54118624, 0.        ],
       [1.2489996 , 1.        ],
       [0.53851648, 2.        ],
       [1.08627805, 2.        ],
       [0.64807407, 1.        ],
       [2.34520788, 2.        ],
       [0.65574385, 1.        ],
       [2.59036677, 2.        ],
       [0.64807407, 1.        ],
       [3.65786823, 0.        ],
       [2.68328157, 2.        ],
       [0.70710678, 1.        ],
       [3.6138622 , 0.        ],
       [3.75099987, 0.        ],
       [3.23728281, 0.        ],
       [1.82482876, 1.        ],
       [0.

In [38]:
# Sort the rows by the distance column (ascending), reverse the order, keep the first 3.
# NOTE(review): because of the `[::-1]` these are the 3 FARTHEST training points
# (largest distances); drop the reversal to get the 3 nearest neighbours.
distance_label[distance_label[:, 0].argsort()][::-1][:3]

array([[4.19285106, 0.        ],
       [4.17731971, 0.        ],
       [3.95094925, 0.        ]])

In [41]:
# NOTE(review): `[::-1]` reverses the ascending sort, so `top_3` holds the 3 rows with
# the LARGEST distances (farthest points), not the nearest ones.
top_3 = distance_label[distance_label[:, 0].argsort()][::-1][:3]
# NOTE(review): ndarray has no `asint` method, so this line raises AttributeError;
# the cast is spelled `astype(int)`.
np.bincount(top_3[:, 1].asint())

AttributeError: 'numpy.ndarray' object has no attribute 'asint'

In [43]:
# Cast the float label column back to int so np.bincount can count how often
# each label occurs among the selected rows.
np.bincount(top_3[:, 1].astype(int))

array([3])

In [44]:
# Toy example: np.bincount returns, at index i, how many times the value i occurs
# (here 0 occurs twice, 1 once, 2 never, 3 once).
a = np.array([0,0,1,3])
np.bincount(a.astype(int))

array([2, 1, 0, 1])

In [46]:
# argmax over the counts gives the most frequent value; on a tie the smallest
# value wins because argmax returns the first maximum.
np.bincount(a.astype(int)).argmax()

0

In [26]:
# Largest distance from `one_sample` to any training point.
(np.linalg.norm(X_train - one_sample, axis=1)).max()

4.192851058647326

In [16]:
X_train

array([[4.6, 3.6, 1. , 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [6.7, 3.1, 4.4, 1.4],
       [4.8, 3.4, 1.6, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [6.3, 2.5, 5. , 1.9],
       [6.4, 3.2, 4.5, 1.5],
       [5.2, 3.5, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.2, 4.1, 1.5, 0.1],
       [5.8, 2.7, 5.1, 1.9],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [5.4, 3.9, 1.3, 0.4],
       [5.4, 3.7, 1.5, 0.2],
       [5.5, 2.4, 3.7, 1. ],
       [6.3, 2.8, 5.1, 1.5],
       [6.4, 3.1, 5.5, 1.8],
       [6.6, 3. , 4.4, 1.4],
       [7.2, 3.6, 6.1, 2.5],
       [5.7, 2.9, 4.2, 1.3],
       [7.6, 3. , 6.6, 2.1],
       [5.6, 3. , 4.5, 1.5],
       [5.1, 3.5, 1.4, 0.2],
       [7.7, 2.8, 6.7, 2. ],
       [5.8, 2.7, 4.1, 1. ],
       [5.2, 3.4, 1.4, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 2. , 3.5, 1. ],
       [6.3, 2.7, 4.9, 1.8],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [5.6, 2

In [17]:
one_sample

array([6.1, 2.8, 4.7, 1.2])

In [18]:
4.6 - 6.1

-1.5