In [1]:
# reference https://www.youtube.com/watch?v=ngLyX54e1LU

In [2]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, classification_report
  
# Loading data 
irisData = load_iris() 
  
# Create feature and target arrays 
X = irisData.data 
y = irisData.target 
  
# Split into training and test set 
X_train, X_test, y_train, y_test = train_test_split( 
             X, y, test_size = 0.2, random_state=42) 
  
knn = KNeighborsClassifier(n_neighbors=7) # k = 7
  
knn.fit(X_train, y_train) 
  
# Calculate the accuracy of the model 
print("Accuracy:", knn.score(X_test, y_test)) 

Accuracy: 0.9666666666666667


In [3]:
y_pred = knn.predict(X_test)

print(classification_report(y_test, y_pred))
print('-'*55)
print('Confusion Matrix\n')
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30

-------------------------------------------------------
Confusion Matrix

[[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]


In [4]:
import numpy as np

In [5]:
def k_nearest_neighbors(X_train, y_train, X_test, y_test, k=3):
    y_pred = []
    for x_test, y_ in zip(X_test, y_test):
        
        # get the euclidean distance from this sample to all the training data
        distances = np.linalg.norm(X_train - x_test, axis=1).reshape(-1, 1)
        
        # keep the distance and the label side by side
        distance_label = np.concatenate([distances, y_train.reshape(-1, 1)], axis=1)
        
        # get the top k labels
        # this is done by sorting based on the first col which is the distance
        # this should be normal sorting as we are trying to get the nearest
        # ones first. ideal situation is if the distance is 0
        top_k = distance_label[distance_label[:, 0].argsort()][:k]
        
        # get the top label. TO do this take the second col which is the label col and then get the counts
        # the bin count does the counts based on the index of the labels so we do the argmax
        label = np.bincount(top_k[:, 1].astype(int)).argmax()
        
        print(x_test, y_, label)
        
        y_pred.append(label)
    
    return np.array(y_pred)

In [6]:
y_pred = k_nearest_neighbors(X_train, y_train, X_test, y_test, k=7)

print(classification_report(y_test, y_pred))
print('-'*55)
print('Confusion Matrix\n')
print(confusion_matrix(y_test, y_pred))

[6.1 2.8 4.7 1.2] 1 1
[5.7 3.8 1.7 0.3] 0 0
[7.7 2.6 6.9 2.3] 2 2
[6.  2.9 4.5 1.5] 1 1
[6.8 2.8 4.8 1.4] 1 1
[5.4 3.4 1.5 0.4] 0 0
[5.6 2.9 3.6 1.3] 1 1
[6.9 3.1 5.1 2.3] 2 2
[6.2 2.2 4.5 1.5] 1 2
[5.8 2.7 3.9 1.2] 1 1
[6.5 3.2 5.1 2. ] 2 2
[4.8 3.  1.4 0.1] 0 0
[5.5 3.5 1.3 0.2] 0 0
[4.9 3.1 1.5 0.1] 0 0
[5.1 3.8 1.5 0.3] 0 0
[6.3 3.3 4.7 1.6] 1 1
[6.5 3.  5.8 2.2] 2 2
[5.6 2.5 3.9 1.1] 1 1
[5.7 2.8 4.5 1.3] 1 1
[6.4 2.8 5.6 2.2] 2 2
[4.7 3.2 1.6 0.2] 0 0
[6.1 3.  4.9 1.8] 2 2
[5.  3.4 1.6 0.4] 0 0
[6.4 2.8 5.6 2.1] 2 2
[7.9 3.8 6.4 2. ] 2 2
[6.7 3.  5.2 2.3] 2 2
[6.7 2.5 5.8 1.8] 2 2
[6.8 3.2 5.9 2.3] 2 2
[4.8 3.  1.4 0.3] 0 0
[4.8 3.1 1.6 0.2] 0 0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94         9
           2       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg  

In [29]:
X_test[:2, :]

array([[6.1, 2.8, 4.7, 1.2],
       [5.7, 3.8, 1.7, 0.3]])

In [33]:
(X_train - X_test[:2, np.newaxis]).shape

(2, 120, 4)

In [37]:
np.linalg.norm(X_train - X_test[:, np.newaxis], axis=2).shape

(30, 120)

In [38]:
np.linalg.norm(X_train - X_test[:, np.newaxis], axis=2)

array([[4.19285106, 3.68781778, 0.76157731, ..., 3.84447656, 0.78740079,
        1.81383571],
       [1.32287566, 0.64031242, 3.16069613, ..., 0.55677644, 2.74772633,
        4.84561658],
       [7.05903676, 6.32534584, 2.88270706, ..., 6.5169011 , 3.6373067 ,
        1.2489996 ],
       ...,
       [5.78100337, 5.06162029, 1.75499288, ..., 5.30471488, 2.48596058,
        0.41231056],
       [0.75498344, 1.67032931, 3.7188708 , ..., 1.43178211, 2.95465734,
        5.36469943],
       [0.80622577, 1.59687194, 3.59026461, ..., 1.40356688, 2.83019434,
        5.23450093]])

In [42]:
np.argsort(np.linalg.norm(X_train - X_test[:, np.newaxis], axis=2))[:, :3]

array([[ 79,  90,  39],
       [ 48,  14,  94],
       [ 24,  21,  64],
       [ 90,  79,  86],
       [ 92,  12, 105],
       [114,   7,  26],
       [ 88,  95, 108],
       [ 85,  76,  87],
       [110,  73,  68],
       [118, 108,  25],
       [ 87,  83,  54],
       [104,  55,  78],
       [ 14,  26,  94],
       [ 55, 104,  32],
       [ 98,  84,  94],
       [  6,  11,  90],
       [103,  97,  76],
       [ 53,  65,  15],
       [ 22,  93,  20],
       [107,  50,  76],
       [ 70,   3,  38],
       [ 42,  81, 113],
       [ 33,  58,  57],
       [107,  50, 109],
       [ 37,  21,  64],
       [ 87,  85,  76],
       [107, 109,  50],
       [ 61,  97, 100],
       [104,  55,  78],
       [ 55,  32,  70]])

In [44]:
top_points = np.argsort(np.linalg.norm(X_train - X_test[:, np.newaxis], axis=2))[:, :3]

In [47]:
top_points

array([[ 79,  90,  39],
       [ 48,  14,  94],
       [ 24,  21,  64],
       [ 90,  79,  86],
       [ 92,  12, 105],
       [114,   7,  26],
       [ 88,  95, 108],
       [ 85,  76,  87],
       [110,  73,  68],
       [118, 108,  25],
       [ 87,  83,  54],
       [104,  55,  78],
       [ 14,  26,  94],
       [ 55, 104,  32],
       [ 98,  84,  94],
       [  6,  11,  90],
       [103,  97,  76],
       [ 53,  65,  15],
       [ 22,  93,  20],
       [107,  50,  76],
       [ 70,   3,  38],
       [ 42,  81, 113],
       [ 33,  58,  57],
       [107,  50, 109],
       [ 37,  21,  64],
       [ 87,  85,  76],
       [107, 109,  50],
       [ 61,  97, 100],
       [104,  55,  78],
       [ 55,  32,  70]])

In [48]:
y_train[[79, 90, 39]]

array([1, 1, 1])

In [46]:
y_train[top_points]

array([[1, 1, 1],
       [0, 0, 0],
       [2, 2, 2],
       [1, 1, 1],
       [1, 1, 1],
       [0, 0, 0],
       [1, 1, 1],
       [2, 2, 2],
       [1, 1, 2],
       [1, 1, 1],
       [2, 2, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [2, 2, 2],
       [1, 1, 1],
       [1, 1, 1],
       [2, 2, 2],
       [0, 0, 0],
       [2, 2, 2],
       [0, 0, 0],
       [2, 2, 2],
       [2, 2, 2],
       [2, 2, 2],
       [2, 2, 2],
       [2, 2, 2],
       [0, 0, 0],
       [0, 0, 0]])

In [54]:
help(np.apply_along_axis)

Help on function apply_along_axis in module numpy:

apply_along_axis(func1d, axis, arr, *args, **kwargs)
    Apply a function to 1-D slices along the given axis.
    
    Execute `func1d(a, *args, **kwargs)` where `func1d` operates on 1-D arrays
    and `a` is a 1-D slice of `arr` along `axis`.
    
    This is equivalent to (but faster than) the following use of `ndindex` and
    `s_`, which sets each of ``ii``, ``jj``, and ``kk`` to a tuple of indices::
    
        Ni, Nk = a.shape[:axis], a.shape[axis+1:]
        for ii in ndindex(Ni):
            for kk in ndindex(Nk):
                f = func1d(arr[ii + s_[:,] + kk])
                Nj = f.shape
                for jj in ndindex(Nj):
                    out[ii + jj + kk] = f[jj]
    
    Equivalently, eliminating the inner loop, this can be expressed as::
    
        Ni, Nk = a.shape[:axis], a.shape[axis+1:]
        for ii in ndindex(Ni):
            for kk in ndindex(Nk):
                out[ii + s_[...,] + kk] = func1d(arr[ii 

In [72]:
arr = y_train[top_points]

In [68]:
u, indices = np.unique(y_train[top_points], return_inverse=True)

In [70]:
u, indices

(array([0, 1, 2]),
 array([1, 1, 1, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 2,
        2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 0, 0, 2, 2, 2,
        0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0,
        0, 0]))

In [71]:
indices.shape

(90,)

In [74]:
axis = 0

In [80]:
np.unique(arr, axis=1, return_counts=True)

(array([[1, 1],
        [0, 0],
        [2, 2],
        [1, 1],
        [1, 1],
        [0, 0],
        [1, 1],
        [2, 2],
        [1, 2],
        [1, 1],
        [2, 1],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [1, 1],
        [2, 2],
        [1, 1],
        [1, 1],
        [2, 2],
        [0, 0],
        [2, 2],
        [0, 0],
        [2, 2],
        [2, 2],
        [2, 2],
        [2, 2],
        [2, 2],
        [0, 0],
        [0, 0]]),
 array([2, 1]))

In [75]:
u[np.argmax(np.apply_along_axis(np.bincount, axis, indices.reshape(arr.shape),
                                None, np.max(indices) + 1), axis=axis)]

array([2, 2, 2])

In [77]:
np.apply_along_axis(np.bincount, axis, indices.reshape(arr.shape), None, np.max(indices) + 1)

array([[10, 10, 10],
       [ 9,  9,  9],
       [11, 11, 11]])

In [67]:
np.apply_along_axis(np.bincount, axis=0, arr=y_train[top_points])

array([[10, 10, 10],
       [ 9,  9,  9],
       [11, 11, 11]])

In [52]:
np.bincount(y_train[top_points])

ValueError: object too deep for desired array

In [40]:
x_test = X_test[0]

np.linalg.norm(X_train - x_test, axis=1)

array([4.19285106, 3.68781778, 0.76157731, 3.55808937, 3.95094925,
       0.84261498, 0.6164414 , 3.54118624, 3.7067506 , 3.73496988,
       0.8660254 , 0.75498344, 0.73484692, 3.72827038, 3.54118624,
       1.2489996 , 0.53851648, 1.08627805, 0.64807407, 2.34520788,
       0.65574385, 2.59036677, 0.64807407, 3.65786823, 2.68328157,
       0.70710678, 3.6138622 , 3.75099987, 3.23728281, 1.82482876,
       0.67082039, 3.3       , 3.44383507, 3.27719392, 0.72111026,
       3.54964787, 0.67082039, 2.92574777, 3.78153408, 0.43588989,
       0.98994949, 3.76961536, 0.64807407, 0.8660254 , 0.9486833 ,
       0.81240384, 1.43874946, 1.05356538, 3.36749165, 1.86010752,
       0.97467943, 3.65376518, 3.82361086, 1.15758369, 0.86023253,
       3.57351368, 1.3       , 3.57910603, 3.41613817, 0.75498344,
       1.99749844, 1.73493516, 0.58309519, 0.94339811, 2.40208243,
       0.97467943, 3.8457769 , 4.17731971, 0.74161985, 1.8493242 ,
       3.68510515, 3.63180396, 3.93573373, 0.50990195, 1.62788

In [34]:
X_train.shape

(120, 4)

THis is matching exactly with the sklearn result

In [7]:
one_sample = X_test[0]

In [8]:
one_sample.shape

(4,)

In [9]:
one_sample = X_test[:3]

In [10]:
one_sample.shape

(3, 4)

In [11]:
one_sample = X_test[0]

In [12]:
one_sample.shape

(4,)

In [13]:
X_train.shape

(120, 4)

In [14]:
(X_train - one_sample)

array([[-1.5,  0.8, -3.7, -1. ],
       [-0.4,  1.6, -3.2, -0.8],
       [ 0.6,  0.3, -0.3,  0.2],
       [-1.3,  0.6, -3.1, -1. ],
       [-1.7,  0.4, -3.4, -1. ],
       [ 0.2, -0.3,  0.3,  0.7],
       [ 0.3,  0.4, -0.2,  0.3],
       [-0.9,  0.7, -3.2, -1. ],
       [-1.1,  0.8, -3.3, -1. ],
       [-0.9,  1.3, -3.2, -1.1],
       [-0.3, -0.1,  0.4,  0.7],
       [-0.1,  0.6, -0.2,  0.4],
       [ 0.6,  0.3,  0. ,  0.3],
       [-0.7,  1.1, -3.4, -0.8],
       [-0.7,  0.9, -3.2, -1. ],
       [-0.6, -0.4, -1. , -0.2],
       [ 0.2,  0. ,  0.4,  0.3],
       [ 0.3,  0.3,  0.8,  0.6],
       [ 0.5,  0.2, -0.3,  0.2],
       [ 1.1,  0.8,  1.4,  1.3],
       [-0.4,  0.1, -0.5,  0.1],
       [ 1.5,  0.2,  1.9,  0.9],
       [-0.5,  0.2, -0.2,  0.3],
       [-1. ,  0.7, -3.3, -1. ],
       [ 1.6,  0. ,  2. ,  0.8],
       [-0.3, -0.1, -0.6, -0.2],
       [-0.9,  0.6, -3.3, -1. ],
       [-1.1,  0.7, -3.4, -0.9],
       [-1. ,  1. , -2.8, -0.8],
       [-1.1, -0.8, -1.2, -0.2],
       [ 0

In [15]:
np.linalg.norm(X_train - one_sample, axis=1).reshape(-1, 1)

array([[4.19285106],
       [3.68781778],
       [0.76157731],
       [3.55808937],
       [3.95094925],
       [0.84261498],
       [0.6164414 ],
       [3.54118624],
       [3.7067506 ],
       [3.73496988],
       [0.8660254 ],
       [0.75498344],
       [0.73484692],
       [3.72827038],
       [3.54118624],
       [1.2489996 ],
       [0.53851648],
       [1.08627805],
       [0.64807407],
       [2.34520788],
       [0.65574385],
       [2.59036677],
       [0.64807407],
       [3.65786823],
       [2.68328157],
       [0.70710678],
       [3.6138622 ],
       [3.75099987],
       [3.23728281],
       [1.82482876],
       [0.67082039],
       [3.3       ],
       [3.44383507],
       [3.27719392],
       [0.72111026],
       [3.54964787],
       [0.67082039],
       [2.92574777],
       [3.78153408],
       [0.43588989],
       [0.98994949],
       [3.76961536],
       [0.64807407],
       [0.8660254 ],
       [0.9486833 ],
       [0.81240384],
       [1.43874946],
       [1.053

In [16]:
y_train.reshape(-1, 1)

array([[0],
       [0],
       [1],
       [0],
       [0],
       [2],
       [1],
       [0],
       [0],
       [0],
       [2],
       [1],
       [1],
       [0],
       [0],
       [1],
       [2],
       [2],
       [1],
       [2],
       [1],
       [2],
       [1],
       [0],
       [2],
       [1],
       [0],
       [0],
       [0],
       [1],
       [2],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [2],
       [0],
       [1],
       [2],
       [0],
       [2],
       [2],
       [1],
       [1],
       [2],
       [1],
       [0],
       [1],
       [2],
       [0],
       [0],
       [1],
       [1],
       [0],
       [2],
       [0],
       [0],
       [1],
       [1],
       [2],
       [1],
       [2],
       [2],
       [1],
       [0],
       [0],
       [2],
       [2],
       [0],
       [0],
       [0],
       [1],
       [2],
       [0],
       [2],
       [2],
       [0],
       [1],
       [1],
       [2],
       [1],
    

In [17]:
np.concatenate([np.linalg.norm(X_train - one_sample, axis=1).reshape(-1, 1),
                y_train.reshape(-1, 1)], axis=1)

array([[4.19285106, 0.        ],
       [3.68781778, 0.        ],
       [0.76157731, 1.        ],
       [3.55808937, 0.        ],
       [3.95094925, 0.        ],
       [0.84261498, 2.        ],
       [0.6164414 , 1.        ],
       [3.54118624, 0.        ],
       [3.7067506 , 0.        ],
       [3.73496988, 0.        ],
       [0.8660254 , 2.        ],
       [0.75498344, 1.        ],
       [0.73484692, 1.        ],
       [3.72827038, 0.        ],
       [3.54118624, 0.        ],
       [1.2489996 , 1.        ],
       [0.53851648, 2.        ],
       [1.08627805, 2.        ],
       [0.64807407, 1.        ],
       [2.34520788, 2.        ],
       [0.65574385, 1.        ],
       [2.59036677, 2.        ],
       [0.64807407, 1.        ],
       [3.65786823, 0.        ],
       [2.68328157, 2.        ],
       [0.70710678, 1.        ],
       [3.6138622 , 0.        ],
       [3.75099987, 0.        ],
       [3.23728281, 0.        ],
       [1.82482876, 1.        ],
       [0.

In [18]:
distance_label = np.concatenate(
    [np.linalg.norm(X_train - one_sample, axis=1).reshape(-1, 1), y_train.reshape(-1, 1)], axis=1)
print(distance_label)

[[4.19285106 0.        ]
 [3.68781778 0.        ]
 [0.76157731 1.        ]
 [3.55808937 0.        ]
 [3.95094925 0.        ]
 [0.84261498 2.        ]
 [0.6164414  1.        ]
 [3.54118624 0.        ]
 [3.7067506  0.        ]
 [3.73496988 0.        ]
 [0.8660254  2.        ]
 [0.75498344 1.        ]
 [0.73484692 1.        ]
 [3.72827038 0.        ]
 [3.54118624 0.        ]
 [1.2489996  1.        ]
 [0.53851648 2.        ]
 [1.08627805 2.        ]
 [0.64807407 1.        ]
 [2.34520788 2.        ]
 [0.65574385 1.        ]
 [2.59036677 2.        ]
 [0.64807407 1.        ]
 [3.65786823 0.        ]
 [2.68328157 2.        ]
 [0.70710678 1.        ]
 [3.6138622  0.        ]
 [3.75099987 0.        ]
 [3.23728281 0.        ]
 [1.82482876 1.        ]
 [0.67082039 2.        ]
 [3.3        0.        ]
 [3.44383507 0.        ]
 [3.27719392 0.        ]
 [0.72111026 1.        ]
 [3.54964787 0.        ]
 [0.67082039 1.        ]
 [2.92574777 2.        ]
 [3.78153408 0.        ]
 [0.43588989 1.        ]


In [19]:
distance_label[distance_label[:, 0].argsort()][::-1][:3]

array([[4.19285106, 0.        ],
       [4.17731971, 0.        ],
       [3.95094925, 0.        ]])

In [20]:
top_3 = distance_label[distance_label[:, 0].argsort()][::-1][:3]
np.bincount(top_3[:, 1].asint())

AttributeError: 'numpy.ndarray' object has no attribute 'asint'

In [21]:
np.bincount(top_3[:, 1].astype(int))

array([3])

In [22]:
a = np.array([0,0,1,3])
np.bincount(a.astype(int))

array([2, 1, 0, 1])

In [23]:
np.bincount(a.astype(int)).argmax()

0

In [24]:
(np.linalg.norm(X_train - one_sample, axis=1)).max()

4.192851058647326

In [25]:
X_train

array([[4.6, 3.6, 1. , 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [6.7, 3.1, 4.4, 1.4],
       [4.8, 3.4, 1.6, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [6.3, 2.5, 5. , 1.9],
       [6.4, 3.2, 4.5, 1.5],
       [5.2, 3.5, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.2, 4.1, 1.5, 0.1],
       [5.8, 2.7, 5.1, 1.9],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [5.4, 3.9, 1.3, 0.4],
       [5.4, 3.7, 1.5, 0.2],
       [5.5, 2.4, 3.7, 1. ],
       [6.3, 2.8, 5.1, 1.5],
       [6.4, 3.1, 5.5, 1.8],
       [6.6, 3. , 4.4, 1.4],
       [7.2, 3.6, 6.1, 2.5],
       [5.7, 2.9, 4.2, 1.3],
       [7.6, 3. , 6.6, 2.1],
       [5.6, 3. , 4.5, 1.5],
       [5.1, 3.5, 1.4, 0.2],
       [7.7, 2.8, 6.7, 2. ],
       [5.8, 2.7, 4.1, 1. ],
       [5.2, 3.4, 1.4, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 2. , 3.5, 1. ],
       [6.3, 2.7, 4.9, 1.8],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [5.6, 2

In [26]:
one_sample

array([6.1, 2.8, 4.7, 1.2])

In [27]:
4.6 - 6.1

-1.5