# k-NN

by Kirill Bykov, Oleg Vlasovets, Ilya

In [4]:
#imports
from collections import Counter
from random import randrange

import numpy as np  #numpy for vectorising
import pandas as pd #pandas for reading csv file
import seaborn as sns
from scipy.spatial import distance_matrix
from tqdm import tqdm
%matplotlib inline

In [5]:
def read_MNIST(file_path):
    """Read a header-less CSV file and split it into features and labels.

    The first column of every row is returned as the label; all remaining
    columns are returned as the feature matrix.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    X : numpy.ndarray
        Every column except the first, one row per CSV line.
    Y : numpy.ndarray
        The first column.
    """
    frame = pd.read_csv(file_path, header=None)
    label_column = frame[0]
    feature_columns = frame.iloc[:, 1:]
    return np.array(feature_columns), np.array(label_column)

def distance(A, B, metric = "euclidean"):
    """Return the distance between the points ``A`` and ``B``.

    Parameters
    ----------
    A, B : array-like
        Coordinates of the two points; plain sequences are converted to
        numpy arrays before subtracting.
    metric : str, default "euclidean"
        Name of the metric. Only ``"euclidean"`` is implemented.

    Returns
    -------
    float
        ``np.linalg.norm(A - B)``.

    Raises
    ------
    ValueError
        If ``metric`` is not a supported name. Failing loudly here avoids
        handing ``None`` back to the caller as if it were a distance.
    """
    if metric != "euclidean":
        raise ValueError(
            "unsupported metric %r; only 'euclidean' is implemented" % (metric,)
        )
    return np.linalg.norm(np.asarray(A) - np.asarray(B))

def first_most_frequent(arr):
    """Return the most frequent element of ``arr``.

    When several elements share the highest count, the one that appears
    first in ``arr`` wins.

    Parameters
    ----------
    arr : iterable of hashable values
        Lists, tuples and 1-D numpy arrays are all accepted.

    Returns
    -------
    The winning element.

    Raises
    ------
    IndexError
        If ``arr`` is empty.
    """
    # One counting pass instead of calling arr.count() for every element.
    counts = Counter(arr)
    if not counts:
        raise IndexError("first_most_frequent() requires a non-empty sequence")
    # Counter keeps keys in first-seen order and max() returns the first
    # maximal key, so ties are resolved in favour of the earliest element.
    return max(counts, key=counts.get)

def vote(NN, Y):
    """Pick one label per row of ``NN`` by majority vote.

    Parameters
    ----------
    NN : sequence of index sequences
        ``NN[i]`` lists the positions in ``Y`` that take part in the vote
        for row ``i``.
    Y : indexable
        Labels, looked up by the indices stored in ``NN``.

    Returns
    -------
    numpy.ndarray of int
        One entry per row of ``NN``: the result of ``first_most_frequent``
        applied to that row's labels.
    """
    votes = np.zeros(len(NN), dtype=int)
    for row, neighbours in enumerate(NN):
        neighbour_labels = [Y[idx] for idx in neighbours]
        votes[row] = first_most_frequent(neighbour_labels)
    return votes
    
        
def kNN(X_train, Y_train, X_test, k, p = 2):
    """Classify each row of ``X_test`` by a vote among its ``k`` nearest training rows.

    Parameters
    ----------
    X_train : array-like
        Training points, one per row.
    Y_train : indexable
        Labels of the training points, looked up by row index of ``X_train``.
    X_test : array-like
        Points to classify, one per row.
    k : int
        Number of neighbours taking part in the vote; ``1 <= k <= len(X_train)``.
    p : float, default 2
        Order of the Minkowski norm, passed to ``scipy.spatial.distance_matrix``.

    Returns
    -------
    numpy.ndarray
        Predicted labels as returned by ``vote``, one per row of ``X_test``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training rows.
    """
    n_train = len(X_train)
    if not 1 <= k <= n_train:
        raise ValueError(
            "k must be between 1 and the number of training rows "
            "(%d), got %r" % (n_train, k)
        )

    # D[i, j] is the distance from test row i to training row j.
    D = distance_matrix(X_test, X_train, p)

    # Sort each row once and keep the k closest training indices. A stable
    # sort makes ties deterministic: equal distances keep ascending index order.
    NN = np.argsort(D, axis=1, kind="stable")[:, :k]
    return vote(NN, Y_train)

def accuracy(Y_pred, Y_true):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    Y_pred : array-like
        Predicted labels.
    Y_true : array-like
        Ground-truth labels, same shape as ``Y_pred``.

    Returns
    -------
    float
        Share of positions where ``Y_pred`` equals ``Y_true``; 1.0 means every
        prediction is correct. (The complement, ``1 - accuracy``, is the
        misclassification rate.)

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    predicted = np.asarray(Y_pred)
    expected = np.asarray(Y_true)
    if predicted.shape != expected.shape:
        # Refuse to broadcast: a (n,) vs (n, 1) mix-up would otherwise be
        # compared as an n-by-n grid and yield a meaningless score.
        raise ValueError(
            "shape mismatch: Y_pred has shape %s, Y_true has shape %s"
            % (predicted.shape, expected.shape)
        )
    return np.mean(predicted == expected)

In [6]:
# Evaluate kNN on the small MNIST split for k = 1..20 and plot the score
# returned by accuracy() against k.

# Read both CSV files once: they do not depend on k, so re-reading them on
# every iteration only adds I/O time to each step.
X_train, Y_train = read_MNIST("data/MNIST_train_small.csv")
X_test, Y_test = read_MNIST("data/MNIST_test_small.csv")

k_values = list(range(1, 21))
results = []
for k in tqdm(k_values):
    Y_pred = kNN(X_train, Y_train, X_test, k)
    results.append(accuracy(Y_pred, Y_test))

ax = sns.lineplot(x=k_values, y=results)
ax.set_xlabel("k (number of neighbours)")
print(results)


  0%|          | 0/20 [00:00<?, ?it/s][A
  5%|▌         | 1/20 [00:13<04:21, 13.74s/it][A

KeyboardInterrupt: 

### Cross-validation

In [33]:
def data_split(dataset, folds=3, seed=None):
    """Randomly partition the rows of ``dataset`` into equally sized folds.

    Rows are shuffled and dealt into ``folds`` disjoint groups of
    ``len(dataset) // folds`` rows each. Rows left over when the length is
    not a multiple of ``folds`` are dropped so that all folds have the same
    size. Each row is kept intact and appears in at most one fold.

    Parameters
    ----------
    dataset : array-like
        Data to split along its first axis.
    folds : int, default 3
        Number of folds to produce; must be at least 1.
    seed : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global ``numpy.random`` state is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(folds, fold_size) + dataset.shape[1:]``;
        ``result[i]`` is the i-th fold.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1.
    """
    if folds < 1:
        raise ValueError("folds must be at least 1, got %r" % (folds,))

    data = np.asarray(dataset)
    fold_size = len(data) // folds

    rng = np.random if seed is None else np.random.RandomState(seed)
    # A permutation of row indices gives sampling without replacement; the
    # slice drops the remainder that does not fill a whole fold.
    order = rng.permutation(len(data))[: folds * fold_size]
    return data[order].reshape((folds, fold_size) + data.shape[1:])

In [25]:
# Load data/MNIST_train_small.csv via read_MNIST: Y is the first CSV column,
# X holds all remaining columns.
X, Y = read_MNIST("data/MNIST_train_small.csv")