In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import defaultdict
import operator
import time

In [2]:
# Generate Data
mean1 = [0,0]
cov1 = [[1,0],[0,1]]
dataSize1 = 20
gauss1 = np.random.multivariate_normal(mean1, cov1, dataSize1)
labels1 = np.full(dataSize1, 1)

mean2 = [2,2]
cov2 = [[1,0],[0,1]]
dataSize2 = 20
gauss2 = np.random.multivariate_normal(mean2, cov2, dataSize2)
labels2 = np.full(dataSize1, 2)

dataset = np.concatenate((gauss1, gauss2), axis=0)
labels = np.concatenate((labels1, labels2), axis=0)

In [3]:
# Split into test/train
data_train, data_test, labels_train, labels_test = train_test_split(dataset, labels, test_size=0.20, random_state=42)

In [4]:
class KNNModel:
    
    def __init__(self, k):
        self.k = k
        self.trainData = None
        self.trainLabels = None
    
    def train(self, trainData, trainLabels):
        start_time = time.time()
        self.trainData = trainData
        self.trainLabels = trainLabels
        return time.time() - start_time
        
    def test(self, testData, testLabels):
        if self.trainData is None:
            raise RunTimeError('You must train the model before testing')
            
        start_time = time.time()
        predictions = list(map(lambda testDataPoint: KNN(self.trainData, self.trainLabels, testDataPoint, self.k), testData))
        accuracy = accuracy_score(testLabels.tolist(), predictions)
        timing = time.time() - start_time
        return accuracy, timing

In [5]:
def DistanceMeasure(datapoint1, datapoint2):
    return np.linalg.norm(datapoint1-datapoint2)

In [6]:
def KNNDistancesAndLabels(trainingData, trainingLabels, testDatapoint, k):
    distances = list(map(lambda x: DistanceMeasure(x, testDatapoint), trainingData))
    # Get the sorted indicies of the distances
    indiciesOfSortedDistances = np.argsort(distances)
    kShortestDistances = [distances[ii] for ii in indiciesOfSortedDistances[:k]]
    kShortestLabels = [trainingLabels[ii] for ii in indiciesOfSortedDistances[:k]]
    return list(zip(kShortestLabels, kShortestDistances))

In [7]:
def KNNDecision(labelsAndDistances):
    labelsDictionary = defaultdict(int)
    for (label, distance) in labelsAndDistances:
        labelsDictionary[label] += distance
    sortedLabelsAndDistances = sorted(labelsDictionary.items(), key=operator.itemgetter(1))
    shortestLabelAndDistance = sortedLabelsAndDistances[0]
    return shortestLabelAndDistance[0]

In [8]:
# Gets the incidies of the k nearest neighbors
def KNN(trainingData, trainingLabels, testDatapoint, k):
    labelsAndDistances = KNNDistancesAndLabels(trainingData, trainingLabels, testDatapoint, k)
    return KNNDecision(labelsAndDistances)

In [9]:
model = KNNModel(3)
print(model.train(data_train, labels_train))
print(model.test(data_test, labels_test))

0.0
(0.625, 0.00400090217590332)
