# a) KNN

In [1]:
import numpy as np
from scipy.stats import mode
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd

def knn(data, labels, test, k):
    a = np.sum(test ** 2, axis=-1)
    b = np.sum(data ** 2, axis=-1)
    c = -2 * (test @ data.T)
    dists = c + a.reshape(-1,1) + b
    knn = np.argpartition(dists, k)[..., :k]
    knn_labels = labels[knn]
    res = mode(knn_labels, axis=1)[0]
    return res[:,0]
# data = np.array([[0,1,2,3,4], [0,2,0,2,0]]).T
# labels = np.array([0,0,1,1,0])
# test = np.array([[1,3,5,6], [1,1,1,1]]).T
# k = 3
# knn(data,labels, test,k)

# b) Iris data

In [2]:
iris = datasets.load_iris()
data = np.array(iris.data)
labels = iris.target
print(data.shape)

(150, 4)


In [3]:
randomChoice = np.random.permutation(np.concatenate((data.T, labels[np.newaxis])).T)
train_data = randomChoice[:100,:-1]
train_labels = randomChoice[:100,-1]
test_data = randomChoice[100:,:-1]
test_labels = randomChoice[100:,-1]

In [4]:
for k in range(1,20):
    knn_test_labels = knn(train_data, train_labels, test_data, k)
    print(f'k={k}: \t{np.sum(knn_test_labels == test_labels)/len(test_labels)*100}%')

k=1: 	98.0%
k=2: 	98.0%
k=3: 	98.0%
k=4: 	98.0%
k=5: 	98.0%
k=6: 	98.0%
k=7: 	98.0%
k=8: 	98.0%
k=9: 	98.0%
k=10: 	98.0%
k=11: 	98.0%
k=12: 	98.0%
k=13: 	98.0%
k=14: 	98.0%
k=15: 	98.0%
k=16: 	98.0%
k=17: 	98.0%
k=18: 	98.0%
k=19: 	98.0%


# c) Optical Recognition of Handwritten Digits 

In [5]:
train_digits = np.array(pd.read_csv('data/optdigits.tra', header=None))
test_digits = np.array(pd.read_csv('data/optdigits.tes', header=None))
train_digits.shape, test_digits.shape

((3823, 65), (1797, 65))

In [6]:
train_data = train_digits[:,:-1]
train_labels = train_digits[:,-1]
test_data = test_digits[:,:-1]
test_labels = test_digits[:,-1]

In [7]:
for k in range(1,20):
    knn_test_labels = knn(train_data, train_labels, test_data, k)
    print(f'k={k}: \t{np.sum(knn_test_labels == test_labels)/len(test_labels)*100}%')

k=1: 	97.9966611018364%
k=2: 	97.38452977184195%
k=3: 	97.8297161936561%
k=4: 	97.60712298274903%
k=5: 	97.88536449638286%
k=6: 	97.77406789092933%
k=7: 	97.66277128547578%
k=8: 	97.66277128547578%
k=9: 	97.71841958820256%
k=10: 	97.55147468002225%
k=11: 	97.94101279910963%
k=12: 	97.44017807456873%
k=13: 	97.38452977184195%
k=14: 	97.16193656093489%
k=15: 	97.27323316638842%
k=16: 	97.10628825820812%
k=17: 	97.21758486366166%
k=18: 	97.05063995548136%
k=19: 	97.10628825820812%


# d) Cross validation

In [8]:
iris = datasets.load_iris()
irisData = np.array(iris.data)
irisLabels = iris.target

In [9]:
for k in range(1,11,2):
    rands = (np.random.uniform(0,1, len(irisLabels)) * 10).astype(int)
    err = 0
    print(f"\nk={k}:")
    for i in range(10):
        train_inds = rands != i
        test_inds = rands == i
        train_data = irisData[train_inds]
        test_data = irisData[test_inds]
        train_labels = irisLabels[train_inds]
        test_labels = irisLabels[test_inds]

        test_knn_labels = knn(train_data, train_labels, test_data, k)
        err += np.sum(test_knn_labels != test_labels)
        print('\t', np.sum(test_knn_labels == test_labels)/len(test_labels))
    print(err, '\n')


k=1:
	 1.0
	 0.8461538461538461
	 1.0
	 0.9545454545454546
	 1.0
	 1.0
	 0.9444444444444444
	 0.9375
	 1.0
	 0.9
6 


k=3:
	 1.0
	 0.875
	 0.9375
	 1.0
	 1.0
	 1.0
	 1.0
	 0.9
	 1.0
	 0.9545454545454546
6 


k=5:
	 1.0
	 0.9375
	 1.0
	 1.0
	 0.9444444444444444
	 1.0
	 1.0
	 0.8888888888888888
	 1.0
	 1.0
3 


k=7:
	 1.0
	 0.9411764705882353
	 0.9411764705882353
	 1.0
	 1.0
	 1.0
	 1.0
	 1.0
	 0.9411764705882353
	 1.0
3 


k=9:
	 1.0
	 1.0
	 1.0
	 1.0
	 1.0
	 1.0
	 0.9047619047619048
	 0.9333333333333333
	 0.9333333333333333
	 1.0
4 



In [10]:
digitData = np.concatenate((train_digits[:,:-1], test_digits[:,:-1]))
digitLabels = np.concatenate((train_digits[:,-1], test_digits[:,-1]))

In [11]:
for k in range(1,11,2):
    rands = (np.random.uniform(0,1, len(digitLabels)) * 10).astype(int)
    err = 0
    print(f"\nk={k}:")
    for i in range(10):
        train_inds = rands != i
        test_inds = rands == i
        train_data = digitData[train_inds]
        test_data = digitData[test_inds]
        train_labels = digitLabels[train_inds]
        test_labels = digitLabels[test_inds]

        test_knn_labels = knn(train_data, train_labels, test_data, k)
        err += np.sum(test_knn_labels != test_labels)
        print('\t', np.sum(test_knn_labels == test_labels)/len(test_labels))
    print(err, '\n')


k=1:
	 0.9944444444444445
	 0.984375
	 0.9863013698630136
	 0.9897260273972602
	 0.9931389365351629
	 0.9930191972076788
	 0.9849340866290018
	 0.9849624060150376
	 0.9855072463768116
	 0.9858407079646018
66 


k=3:
	 0.9850467289719627
	 0.9895287958115183
	 0.9925650557620818
	 0.9853479853479854
	 0.9914821124361158
	 0.9831932773109243
	 0.9910873440285205
	 0.9871559633027523
	 0.9910394265232975
	 0.9810996563573883
69 


k=5:
	 0.9880952380952381
	 0.9929453262786596
	 0.9852125693160814
	 0.9881956155143339
	 0.9763113367174281
	 0.9785992217898832
	 0.9931623931623932
	 0.9907235621521335
	 0.9910233393177738
	 0.9871559633027523
72 


k=7:
	 0.996415770609319
	 0.9892857142857143
	 0.9891500904159132
	 0.9880341880341881
	 0.9774696707105719
	 0.9795158286778398
	 0.988155668358714
	 0.9867424242424242
	 0.9871794871794872
	 0.9863247863247864
74 


k=9:
	 0.9882747068676717
	 0.9857397504456328
	 0.9854014598540146
	 0.9876106194690265
	 0.9828897338403042
	 0.9896729776247