## Algorithm:

- For each test sample, compute its distance to every training sample.

- Select the k nearest training samples and collect their labels.

- Take a majority vote over those labels to find the most common class.

- Assign the test sample to that class.

In [1]:
import numpy as np
from sklearn import datasets
from collections import Counter
from sklearn.model_selection import train_test_split

## Dataset

In [2]:
# Synthetic 3-class problem: 100 samples, 5 features, 3 of them informative.
# make_classification requires
#     n_classes * n_clusters_per_class <= 2 ** n_informative
# which holds here: 3 * 2 = 6 <= 2 ** 3 = 8.
X, y = datasets.make_classification(
    n_samples=100,
    n_features=5,
    n_informative=3,
    n_classes=3,
    n_clusters_per_class=2,
    random_state=4,
)

In [3]:
# Expect 100 rows with 5 feature columns, and one label per row.
X.shape, y.shape

((100, 5), (100,))

In [4]:
# Count the samples in each class to check how balanced the labels are.
values, counts = np.unique(y, return_counts=True)
values, counts

(array([0, 1, 2]), array([34, 33, 33]))

In [5]:
# Hold out 20% of the samples for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [6]:
# Expect 80 training rows and 20 test rows (20% of 100), each with 5 features.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80, 5), (20, 5), (80,), (20,))

In [7]:
# Peek at the first five training rows.
X_train[:5]

array([[ 1.23757935e-03,  1.98079863e+00,  1.86114132e+00,
         1.04702962e+00,  4.72818528e-02],
       [ 4.84125952e-01,  6.32836447e-01,  1.08093578e+00,
         4.99436332e-02,  5.25206674e-01],
       [-9.51878266e-01, -2.12417596e-01, -1.05877284e+00,
         2.99262154e-01, -8.28961353e-01],
       [-1.44725099e+00, -3.96387876e-01, -2.41069696e+00,
         1.52793397e+00, -2.60911442e+00],
       [ 9.67033364e-01, -9.42721751e-01, -1.14746518e+00,
         8.07500273e-01, -1.27397308e+00]])

## Model Building

In [8]:
# Sanity check of Counter: most_common(1) returns a one-element list holding
# the (value, count) pair of the most frequent item.

test = [1, 2, 3, 4, 1, 2, 3, 1, 1]
tally = Counter(test)
most_common = tally.most_common(1)
print(most_common)
print(most_common[0][0])      # first pair, first field -> the most frequent element

[(1, 4)]
1


In [9]:
def euclidean_distance(x1, x2):                         # global function
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinates of the two points. Anything ``np.asarray`` can turn into
        a numeric array is accepted (ndarray, list, tuple, ...).

    Returns
    -------
    numpy.float64
        ``sqrt(sum((x1 - x2) ** 2))`` computed in floating point.
    """
    # Cast to float before subtracting: with small integer dtypes (e.g. uint8
    # pixel values) the difference and its square wrap around silently and
    # produce a wrong distance.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

In [10]:
# Spot check: distance from the first test sample to each of the
# first five training rows.

query = X_test[0]
for train_row in X_train[:5]:
    print(train_row)
    print(euclidean_distance(query, train_row))
    print()

[1.23757935e-03 1.98079863e+00 1.86114132e+00 1.04702962e+00
 4.72818528e-02]
2.203785741544138

[0.48412595 0.63283645 1.08093578 0.04994363 0.52520667]
1.8904622007419956

[-0.95187827 -0.2124176  -1.05877284  0.29926215 -0.82896135]
4.918474188336305

[-1.44725099 -0.39638788 -2.41069696  1.52793397 -2.60911442]
7.160108410413117

[ 0.96703336 -0.94272175 -1.14746518  0.80750027 -1.27397308]
5.02348221082457



In [11]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and a majority vote.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
        Must be at least 1. If ``k`` exceeds the number of training samples,
        every training sample votes.
    """

    def __init__(self, k=3):
        if not isinstance(k, (int, np.integer)):
            raise TypeError(f"k must be an integer, got {type(k).__name__}")
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k

    def fit(self, X, y):
        """Store the training data; KNN has no separate training step.

        Parameters
        ----------
        X : array-like
            Training samples, one per row.
        y : array-like
            Class label of each training sample.

        Raises
        ------
        ValueError
            If there are no training samples, or ``X`` and ``y`` differ in length.
        """
        # Convert up front so that row iteration and index lookups behave the
        # same for lists, arrays and DataFrames.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a class label for every sample (row) in ``X``.

        Returns
        -------
        numpy.ndarray
            One predicted label per input sample.
        """
        y_pred = [self._predict(x) for x in np.asarray(X)]
        return np.array(y_pred)

    def _predict(self, x):
        """Predict the label of a single sample ``x``."""
        # Distance from x to every stored training sample.
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Indices of the k smallest distances; a stable sort keeps equally
        # distant samples in training order so results are reproducible.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        # Majority vote. Counter lists equal counts in first-seen order, so a
        # tied vote goes to the tied class whose sample is nearest to x.
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [12]:
model = KNN(k=3)
model.fit(X_train, y_train)                                       # KNN has no parameters to learn; fit only stores the training data
y_pred = model.predict(X_test)

## Evaluating test data

In [13]:
# Ground-truth labels of the held-out test samples.
y_test

array([1, 1, 2, 0, 0, 0, 1, 2, 2, 1, 1, 1, 0, 0, 0, 1, 1, 0, 2, 1])

In [14]:
# Labels predicted by the KNN model for X_test.
y_pred

array([1, 1, 1, 2, 0, 0, 2, 2, 2, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1])

In [15]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order and shape as ``y_true``.

    Returns
    -------
    numpy.float64
        Value between 0 and 1.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Use the arguments, not the notebook-level y_test, so the function
    # scores whatever labels it is actually given.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

In [16]:
# Score the predictions against the held-out labels.
acc = accuracy(y_test, y_pred)
print("Accuracy:", acc*100)                                       # shown as a percentage

Accuracy: 65.0
