# kNN from scratch!
This exercise tests your understanding of the kNN algorithm: you will implement kNN from scratch and use it to classify an artificial dataset.
It also tests your understanding of Python programming.
Use the class skeleton to implement the kNN algorithm, and the test function to check your implementation.


In [21]:
import numpy as np
def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance between ``x1`` and ``x2``.

    The distance is the sum of the absolute element-wise differences.
    The operands must support ``-`` (e.g. NumPy arrays of matching or
    broadcastable shape).
    """
    difference = x1 - x2
    return np.abs(difference).sum()

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between vectors ``x1`` and ``x2``.

    Computed as the 2-norm of the difference ``x1 - x2``; intended for
    1-D vectors of equal length.
    """
    difference = x1 - x2
    return np.linalg.norm(difference, ord=2)

class KNN:
    """k-nearest-neighbours classifier with majority voting.

    Each query point is assigned the most frequent label among its ``k``
    closest training points. Labels may be of any type that NumPy can sort
    (non-negative ints, negative ints, floats, strings, ...). When several
    labels are tied for the most votes, the smallest label wins.

    Args:
        k: Number of neighbours that vote; must be a positive integer.
            If ``k`` exceeds the number of training samples, every
            training sample votes.
        distance: Name of the metric, ``'euclidean'`` or ``'manhattan'``.

    Raises:
        ValueError: If ``k`` is not a positive integer or ``distance`` is
            not one of the supported names.
    """

    def __init__(self, k=3, distance='euclidean'):
        # Fail early: k <= 0 would otherwise only surface at predict time
        # as an obscure "argmax of an empty sequence" error.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError('k must be a positive integer')
        if distance not in ['euclidean', 'manhattan']:
            raise ValueError('Distance must be either "euclidean" or "manhattan"')
        self.k = k
        self.distance = {
            'euclidean': euclidean_distance,
            'manhattan': manhattan_distance
        }[distance]

    def fit(self, X, y):
        """Store the training data; kNN does no work at training time.

        Args:
            X: Training samples, one per row (array-like).
            y: Label of each training sample (array-like, same length as X).

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        # Convert once so plain Python lists support the array arithmetic
        # and fancy indexing used during prediction.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError('X and y must contain the same number of samples')
        if len(X) == 0:
            raise ValueError('Cannot fit on an empty training set')
        self.X = X
        self.y = y

    def predict(self, X):
        """Return a list with the predicted label for every sample in ``X``."""
        return [self._predict(x) for x in np.asarray(X)]

    def _predict(self, x: np.ndarray):
        """Return the majority label among the k training points nearest ``x``."""
        distances = [self.distance(x, x_train) for x_train in self.X]
        # A stable sort keeps equidistant neighbours in training order, so
        # the selected neighbourhood is deterministic.
        nearest = np.argsort(distances, kind='stable')[:self.k]
        k_nearest_labels = self.y[nearest]

        # np.unique counts labels of any sortable dtype, whereas np.bincount
        # only accepts non-negative integers. It returns the labels sorted,
        # so argmax resolves ties in favour of the smallest label.
        labels, counts = np.unique(k_nearest_labels, return_counts=True)
        return labels[np.argmax(counts)]



In [22]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [23]:
# Generate 100 two-dimensional points in two clusters (X holds the feature
# vectors, y the matching cluster labels), then hold out 20% for testing.
X, y = make_blobs(
    n_samples=100,
    centers=2,
    n_features=2,
    cluster_std=1.60,
    random_state=0,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [24]:
# Evaluate the classifier for every odd k from 1 to 21 and report the
# accuracy on the held-out test set.
for k in range(1, 22, 2):
    knn = KNN(k=k, distance='euclidean')
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f'k={k}, accuracy={accuracy}')
# knn = KNN(k=8, distance='euclidean')
# knn.fit(X_train, y_train)
# predictions = knn.predict(X_test)

k=1, accuracy=0.8
k=3, accuracy=0.85
k=5, accuracy=0.95
k=7, accuracy=0.9
k=9, accuracy=0.9
k=11, accuracy=0.85
k=13, accuracy=0.9
k=15, accuracy=0.9
k=17, accuracy=0.9
k=19, accuracy=0.9
k=21, accuracy=0.9


In [60]:
# Accuracy of whatever `predictions` currently holds; if the k-sweep loop
# was the last cell to assign it, that is the k=21 model. NOTE(review):
# confirm against the notebook's actual execution order.
accuracy_score(y_test, predictions)

0.9