In [1]:
import numpy as np
import pandas as pd
from sortedcontainers import SortedList
from datetime import datetime

In [2]:
def getData(limit = None):
    df = pd.read_csv("train.csv")
    data = df.values
    np.random.shuffle(data) # To make order random at the time of sampling 
    X = data[:, 1:] / 255.0 # Scaled between 0-1
    Y = data[:, 0]
    if limit is not None:
        X, Y = X[:limit], Y[:limit]
    return X, Y

In [3]:
class KNN(object):
    def __init__(self, k):
        self.k = k
    
    def fit(self, x, y):
        self. x = x
        self.y = y
        
    def predict(self, x):
        y = np.zeros(len(x))
        for i, x in enumerate(x):
            sortedL = SortedList()
            for j, xt in enumerate(self.x):
                diff = x - xt
                d = diff.dot(diff)
                if len(sortedL) < self.k:
                    sortedL.add((d, self.y[j]))
                else:
                    if d < sortedL[-1][0]:
                        del sortedL[-1]
                        sortedL.add((d, self.y[j]))
            votes = {}
            for _, v in sortedL:
                votes[v] = votes.get(v, 0) + 1
            maxVotes = 0
            maxVotesClass = -1
            for v, count in votes.items():
                if count > maxVotes:
                    maxVotes = count
                    maxVotesClass = v
            y[i] = maxVotesClass
        return y
    
    def score(self, x, y):
        p = self.predict(x)
        return np.mean(p == y)

In [4]:
x, y = getData(2000)
nTrain = 1000
trainScores = []
testScores = []
xTrain, yTrain = x[:nTrain], y[:nTrain]
xTest, yTest = x[nTrain:], y[nTrain:]
for k in range(1, 5):
    knn = KNN(k)
    t0 = datetime.now()
    knn.fit(xTrain, yTrain)
    print("Training Time: ", (datetime.now() - t0))
    
    t0 = datetime.now()
    trainScore = knn.score(xTrain, yTrain)
    trainScores.append(trainScore)
    print("Train accuracy:", trainScore)
    print("Time to compute train accuracy:", (datetime.now() - t0), "Train size:", len(yTrain))

Training Time:  0:00:00
Train accuracy: 1.0
Time to compute train accuracy: 0:00:11.039639 Train size: 1000
Training Time:  0:00:00
Train accuracy: 1.0
Time to compute train accuracy: 0:00:11.586502 Train size: 1000
Training Time:  0:00:00
Train accuracy: 0.958
Time to compute train accuracy: 0:00:11.638996 Train size: 1000
Training Time:  0:00:00
Train accuracy: 0.956
Time to compute train accuracy: 0:00:10.620457 Train size: 1000
