In [1]:
import numpy as np
from scipy import stats

In [2]:
def manhattan_dst(row1, row2):
    return sum([abs(r1-r2) for r1,r2 in zip(row1,row2)])

def get_neigh(train,test_row,num_neigh):
    return sorted(train, key = lambda train_r : manhattan_dst(train_r,test_row))[:num_neigh]

In [3]:
def predict_classification(train, test_row, num_neigh):
    return stats.mode(np.array(get_neigh(train, test_row, num_neigh))[:,-1]).mode

In [35]:
!wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz

--2021-02-19 03:56:21--  https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170498071 (163M) [application/x-gzip]
Saving to: ‘cifar-10-python.tar.gz.1’


2021-02-19 03:57:50 (1.86 MB/s) - ‘cifar-10-python.tar.gz.1’ saved [170498071/170498071]



In [38]:
!tar -xf cifar-10-python.tar.gz

In [4]:
import pickle

In [5]:
def unpickle(file):
    with open(file, 'rb') as fo:
        dic = pickle.load(fo, encoding='bytes')
    return dic

In [6]:
import os

In [7]:
def load_dataset():
    X,y =None,[]
    for filename in sorted(os.listdir('/'.join([os.getcwd(),'cifar-10-batches-py']))):
        if filename[:10] == 'data_batch':
            print('loaded',filename)
            a = unpickle('/'.join([os.getcwd(),'cifar-10-batches-py',filename]))
            y.extend(a[b'labels'])
            if X is None:
                X = np.moveaxis(a[b'data'].reshape(-1,3,32,32),1,-1)
            else:
                X = np.append(X, np.moveaxis(a[b'data'].reshape(-1,3,32,32),1,-1), axis=0)
    return X, np.array(y)

In [8]:
X,y = load_dataset()

loaded data_batch_1
loaded data_batch_2
loaded data_batch_3
loaded data_batch_4
loaded data_batch_5


In [13]:
def preprocess(X):
    bgr2gray_weights = [0.1140, 0.5870, 0.2989]
    gray = np.dot(X,bgr2gray_weights)
    gray = (gray-gray.mean(axis=0))/ 255.
    vec = gray.reshape(-1,1024)
    return vec

In [14]:
X_train = preprocess(X)

In [23]:
data = np.append(X_train,y.reshape(-1,1),axis=-1)

In [40]:
train, valid = data[:10000], data[40000:40100]

In [41]:
from tqdm import tqdm

In [44]:
preds = [predict_classification(train, valid_row,5) for valid_row in tqdm(valid)]





  0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A



  1%|          | 1/100 [00:04<08:10,  4.96s/it][A[A[A[A



  2%|▏         | 2/100 [00:10<08:13,  5.04s/it][A[A[A[A



  3%|▎         | 3/100 [00:14<08:00,  4.96s/it][A[A[A[A



  4%|▍         | 4/100 [00:19<07:53,  4.93s/it][A[A[A[A



  5%|▌         | 5/100 [00:24<07:56,  5.01s/it][A[A[A[A



  6%|▌         | 6/100 [00:29<07:49,  4.99s/it][A[A[A[A



  7%|▋         | 7/100 [00:34<07:42,  4.97s/it][A[A[A[A



  8%|▊         | 8/100 [00:39<07:34,  4.94s/it][A[A[A[A



  9%|▉         | 9/100 [00:44<07:32,  4.98s/it][A[A[A[A



 10%|█         | 10/100 [00:49<07:25,  4.95s/it][A[A[A[A



 11%|█         | 11/100 [00:54<07:20,  4.95s/it][A[A[A[A



 12%|█▏        | 12/100 [00:59<07:12,  4.91s/it][A[A[A[A



 13%|█▎        | 13/100 [01:04<07:06,  4.90s/it][A[A[A[A



 14%|█▍        | 14/100 [01:09<07:01,  4.90s/it][A[A[A[A



 15%|█▌        | 15/100 [01:14<06:58,  4.92s/it][A[A

In [54]:
print('Accuracy',100*np.mean(np.array(preds).reshape(-1,1)==valid[:,-1]),' %')

Accuracy 9.82  %


In [53]:
from matplotlib import pyplot as plt

In [None]:
best,bestK= 0,0
ks, accs= [],[]
for k in range(5,25):
    y_hat = np.array([predict_classification(train, valid_row,k) for valid_row in valid]).reshape(-1,1)
    acc = 100*np.mean(preds==valid[:,-1])
    print('Accuracy',acc,' % for',k,'neighbours')
    accs.append(acc)
    ks.append(k)
    if acc>best:
        best = acc
        bestK = k
plt.plot(accs,ks)
print('The best value for k can be seen from the hockey graph is:',bestK) 