# k-NN classifier

### Load data, select validation set and apply SVD on training data

In [1]:
import h5py
import numpy as np
import time
# load data: read each HDF5 dataset fully into memory as a NumPy array
def _read_h5_array(path, key):
    """Return an in-memory copy of dataset ``key`` from the HDF5 file at ``path``."""
    with h5py.File(path, 'r') as h5_file:
        # Copy before the file is closed so the array outlives the handle.
        return np.copy(h5_file[key])


data = _read_h5_array('/Users/liutong/Downloads/5318/assignment1/knn/images_training.h5', 'data')
label = _read_h5_array('/Users/liutong/Downloads/5318/assignment1/knn/labels_training.h5', 'label')
test_data = _read_h5_array('/Users/liutong/Downloads/5318/assignment1/knn/images_testing.h5', 'data')
test_label = _read_h5_array('/Users/liutong/Downloads/5318/assignment1/knn/labels_testing_2000.h5', 'label')

In [2]:
def Validation_dataset(X, Y, percent):
    """Randomly split column-wise samples into a training and a validation set.

    Parameters
    ----------
    X : ndarray
        2-D data matrix whose columns are samples (it is indexed as
        ``X[:, i]`` and the sample count is ``X.shape[1]``).
    Y : sequence
        One label per column of ``X``.
    percent : number
        Percentage of the samples to hold out for validation. The number of
        held-out samples is ``int(percent / 100 * n_samples)`` (truncated).
        The value is not range-checked.

    Returns
    -------
    Train_Data_mat, Train_labels, Val_Data_mat, Val_labels : ndarray
        float64 arrays. Within each subset the samples keep their original
        (ascending column index) order.
    """
    n_samples = X.shape[1]
    n_val = int((percent / 100) * n_samples)

    # One random draw decides which columns are held out; sorting keeps the
    # original sample order inside the validation subset.
    val_idx = np.sort(np.random.permutation(n_samples)[0:n_val])

    # Every column not held out goes to training, in ascending index order.
    held_out = np.zeros(n_samples, dtype=bool)
    held_out[val_idx] = True
    train_idx = np.flatnonzero(~held_out)

    labels = np.asarray(Y)
    # Index arrays replace the per-column Python copy loops; outputs stay
    # C-contiguous float64 as when they were pre-allocated with np.zeros.
    Train_Data_mat = np.ascontiguousarray(X[:, train_idx], dtype=np.float64)
    Train_labels = labels[train_idx].astype(np.float64)
    Val_Data_mat = np.ascontiguousarray(X[:, val_idx], dtype=np.float64)
    Val_labels = labels[val_idx].astype(np.float64)

    return Train_Data_mat, Train_labels, Val_Data_mat, Val_labels

In [3]:
def SVD(X, var_desired):
    """Low-rank reconstruction of ``X`` from its leading singular components.

    The rank ``k`` is the smallest number of leading singular values whose
    cumulative share of the total squared singular values strictly exceeds
    ``var_desired``. If no prefix exceeds it (e.g. ``var_desired >= 1``),
    all components are kept.

    Parameters
    ----------
    X : ndarray
        2-D matrix to approximate.
    var_desired : float
        Share of the total squared singular values to exceed, as a fraction.

    Returns
    -------
    X_hat_reconst : ndarray
        Rank-``k`` reconstruction with the same shape as ``X``.
    k : int
        Number of components kept.
    comp_ratio : float
        ``(n_cols*k + k + n_rows*k) / (n_cols*n_rows)``, i.e. the number of
        values stored by the truncated factors relative to the full matrix.
    """
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    energy = np.square(s)
    explained = np.cumsum(energy) / np.sum(energy)

    exceeds = explained > var_desired
    if exceeds.any():
        # argmax of a boolean mask is the index of its first True entry.
        k = int(np.argmax(exceeds)) + 1
    else:
        # argmax would return 0 for an all-False mask and silently keep a
        # single component; keep every component instead.
        k = len(s)

    # Scaling the columns of U by s equals multiplying by diag(s), without
    # materialising the dense diagonal matrix.
    X_hat_reconst = (U[:, 0:k] * s[0:k]).dot(Vt[0:k, :])
    comp_ratio = (X.shape[1] * k + k + X.shape[0] * k) / (X.shape[1] * X.shape[0])
    return X_hat_reconst, k, comp_ratio

### Main k-NN algorithm

In [21]:
def KNN(data, label, test_data, k, *, n_classes=10):
    """Classify each test sample by majority vote among its k nearest neighbours.

    Distances are squared Euclidean distances between columns.

    Parameters
    ----------
    data : ndarray
        Training matrix whose columns are samples.
    label : sequence
        One class label per training column. Only labels equal to one of
        ``0 .. n_classes-1`` contribute votes; other values are ignored.
    test_data : ndarray
        Query matrix whose columns are samples, with the same number of rows
        as ``data``.
    k : int
        Number of neighbours that vote. If it exceeds the number of training
        samples, all training samples vote.
    n_classes : int, keyword-only
        Number of classes; defaults to 10.

    Returns
    -------
    list
        Predicted class index for every test column. Ties go to the lowest
        class index because ``np.argmax`` returns the first maximum.
    """
    label = np.asarray(label)
    class_ids = np.arange(n_classes)
    pre_label = []
    for i in range(test_data.shape[1]):
        X_q = test_data[:, i][:, np.newaxis]  # i-th query as a column vector
        dis = ((data - X_q) ** 2).sum(axis=0)  # squared distance to each training column
        nearest_labels = label[np.argsort(dis)[:k]]
        # Compare the k neighbour labels against every class id at once
        # instead of scanning the classes for each neighbour.
        votes = (nearest_labels[:, np.newaxis] == class_ids).sum(axis=0)
        pre_label.append(np.argmax(votes))
    return pre_label

### Confusion Matrix and Performance Metrics functions

In [5]:
def Confusn_mat(labels_pred, Y):
    """Count binary confusion outcomes for paired predictions and truths.

    Returns a NumPy array ``[true_pos, false_posit, false_negat, true_neg]``.
    A pair that is not (1, 1), (1, 0) or (0, 1) is counted as a true
    negative, which includes (0, 0) and any non-binary values.
    """
    TP, FP, FN, TN = 0, 1, 2, 3  # slots of the returned array
    tallies = [0, 0, 0, 0]
    for idx, predicted in enumerate(labels_pred):
        actual = Y[idx]
        if predicted == 1 and actual == 1:
            slot = TP
        elif predicted == 1 and actual == 0:
            slot = FP
        elif predicted == 0 and actual == 1:
            slot = FN
        else:
            slot = TN
        tallies[slot] += 1
    return np.asarray(tallies)

In [6]:
def FindConfMat(pred_labels, Y_test):
    """Compute one-vs-rest confusion counts and metrics for every class in ``Y_test``.

    Parameters
    ----------
    pred_labels : sequence
        Predicted label per sample.
    Y_test : sequence
        True label per sample, aligned with ``pred_labels``.

    Returns
    -------
    Conf_mat4All : ndarray, shape (n_classes, 4)
        Row ``r`` holds the ``Confusn_mat`` counts for the r-th distinct
        value of ``Y_test`` (in sorted order) treated as the positive class.
    Param_mat4All : ndarray, shape (n_classes, 6)
        Row ``r`` holds the ``Parameters`` metrics derived from those counts.
    """
    predicted = np.asarray(pred_labels)
    actual = np.asarray(Y_test)
    class_values = np.unique(actual)

    Conf_mat4All = np.zeros((len(class_values), 4))
    Param_mat4All = np.zeros((len(class_values), 6))
    for row, class_value in enumerate(class_values):
        # Binarise against the class VALUE, not the row index, so labels
        # that are not exactly 0..n-1 are still matched correctly.
        pred_labels_bin = (predicted == class_value).astype(int)
        Y_test_bin = (actual == class_value).astype(int)
        Conf_mat4All[row] = Confusn_mat(pred_labels_bin, Y_test_bin)
        Param_mat4All[row] = Parameters(Conf_mat4All[row][0], Conf_mat4All[row][1],
                                        Conf_mat4All[row][2], Conf_mat4All[row][3])
    return Conf_mat4All, Param_mat4All

In [35]:
# Performance Metrics
def Parameters(true_pos, false_posit, false_negat, true_neg):
    """Derive performance metrics from binary confusion counts.

    Parameters
    ----------
    true_pos, false_posit, false_negat, true_neg : number
        Confusion counts of a binary (one-vs-rest) problem.

    Returns
    -------
    ndarray of float, shape (6,)
        ``[Accuracy, Precision, Recall, F_measure, TPR, FPR]``. Accuracy is
        a percentage rounded to two decimals; the others are fractions.
        Recall and TPR use the same formula. A metric whose denominator is
        zero is reported as ``nan``.
    """
    def _ratio(numerator, denominator):
        # nan marks an undefined metric instead of raising ZeroDivisionError.
        return numerator / denominator if denominator != 0 else float('nan')

    total = true_pos + true_neg + false_posit + false_negat
    # Accuracy stays numeric: a str element would make np.asarray produce a
    # string array for the whole result.
    Accuracy = round(_ratio(true_pos + true_neg, total) * 100, 2)
    Precision = _ratio(true_pos, true_pos + false_posit)
    Recall = _ratio(true_pos, true_pos + false_negat)
    F_measure = _ratio(2 * true_pos, 2 * true_pos + false_negat + false_posit)
    TPR = _ratio(true_pos, true_pos + false_negat)
    FPR = _ratio(false_posit, false_posit + true_neg)
    return np.asarray([Accuracy, Precision, Recall, F_measure, TPR, FPR], dtype=float)

### Data preprocessing and normalization

In [8]:
# transform to 2D matrix and normalization
# Flatten every sample to a vector, transpose so samples become columns, and
# divide by 255 (presumably 8-bit pixel values; verify against the data files).
# The sample count is read from the leading axis instead of being hard-coded,
# so a data set of a different size is not silently mis-reshaped.
# NOTE(review): assumes axis 0 of both arrays indexes samples.
data = np.reshape(data, (data.shape[0], -1)).T / 255
test_data = np.reshape(test_data, (test_data.shape[0], -1)).T / 255

### Calculate Accuracy, Confusion Matrix and Performance Metrics for validation data, train data, and test data

In [33]:
# validation data
# Hold out 20% of the training samples, replace the remaining training matrix
# by its low-rank SVD reconstruction, and score k-NN (k=6) on the held-out set.
pre_label = []
var_desired = 0.96
Train_Data_mat, Train_labels, Val_Data_mat, Val_labels = Validation_dataset(data, label, 20)
X_hat_reconst, n_components, comp_ratio = SVD(Train_Data_mat, var_desired)
# KNN takes (data, label, test_data, k); the true validation labels are not
# an argument, and passing them as a fifth positional value raises TypeError.
val_pre_label = KNN(X_hat_reconst, Train_labels, Val_Data_mat, 6)
Conf_mat4All_Val, Param_mat4All_Val = FindConfMat(val_pre_label, Val_labels)
correct = [1 if i == j else 0 for (i, j) in zip(val_pre_label, Val_labels)]
accuracy_Val = (sum(correct) / float(len(Val_labels)))
print('n_components: ', n_components)
print('Accuracy: {}%'.format(np.round(accuracy_Val*100, 2)))
np.set_printoptions(suppress=True)
print(Conf_mat4All_Val)
print(np.round(Param_mat4All_Val, 2))

n_components:  90
Accuracy: 85.45%
[[ 558.  143.   88. 5211.]
 [ 562.    5.   21. 5412.]
 [ 480.  184.  113. 5223.]
 [ 500.   71.   76. 5353.]
 [ 456.  131.  153. 5260.]
 [ 536.   16.   80. 5368.]
 [ 326.  164.  248. 5262.]
 [ 529.   88.   36. 5347.]
 [ 590.   21.   18. 5371.]
 [ 590.   50.   40. 5320.]]
[[96.15  0.8   0.86  0.83  0.86  0.03]
 [99.57  0.99  0.96  0.98  0.96  0.  ]
 [95.05  0.72  0.81  0.76  0.81  0.03]
 [97.55  0.88  0.87  0.87  0.87  0.01]
 [95.27  0.78  0.75  0.76  0.75  0.02]
 [98.4   0.97  0.87  0.92  0.87  0.  ]
 [93.13  0.67  0.57  0.61  0.57  0.03]
 [97.93  0.86  0.94  0.9   0.94  0.02]
 [99.35  0.97  0.97  0.97  0.97  0.  ]
 [98.5   0.92  0.94  0.93  0.94  0.01]]


In [36]:
# train data
# Score k-NN (k=6) on the reconstructed training samples themselves. Every
# query is also a training column, so it is its own nearest neighbour.
pre_label = []
var_desired = 0.96
# KNN takes (data, label, test_data, k); the true labels are not an argument,
# and passing them as a fifth positional value raises TypeError.
train_pre_label = KNN(X_hat_reconst, Train_labels, X_hat_reconst, 6)
Conf_mat4All_train, Param_mat4All_train = FindConfMat(train_pre_label, Train_labels)
correct = [1 if i == j else 0 for (i, j) in zip(train_pre_label, Train_labels)]
accuracy_train = (sum(correct) / float(len(Train_labels)))
print('Accuracy: {}%'.format(np.round(accuracy_train*100, 2)))
np.set_printoptions(suppress=True)
print(Conf_mat4All_train)
print(np.round(Param_mat4All_train, 2))

Accuracy: 89.13%
[[ 2171.   534.   194. 21101.]
 [ 2319.    18.    54. 21609.]
 [ 2076.   527.   351. 21046.]
 [ 2204.   210.   222. 21364.]
 [ 1998.   442.   422. 21138.]
 [ 2224.    44.   188. 21544.]
 [ 1497.   421.   896. 21186.]
 [ 2239.   186.    91. 21484.]
 [ 2298.    72.    96. 21534.]
 [ 2366.   154.    94. 21386.]]
[[96.97  0.8   0.92  0.86  0.92  0.02]
 [99.7   0.99  0.98  0.98  0.98  0.  ]
 [96.34  0.8   0.86  0.83  0.86  0.02]
 [98.2   0.91  0.91  0.91  0.91  0.01]
 [96.4   0.82  0.83  0.82  0.83  0.02]
 [99.03  0.98  0.92  0.95  0.92  0.  ]
 [94.51  0.78  0.63  0.69  0.63  0.02]
 [98.85  0.92  0.96  0.94  0.96  0.01]
 [99.3   0.97  0.96  0.96  0.96  0.  ]
 [98.97  0.94  0.96  0.95  0.96  0.01]]


In [38]:
# test data
# Only the first 2000 test columns are scored here; they are paired with
# test_label for the accuracy and confusion statistics.
var_desired = 0.96
test_labeled_data = test_data[:, :2000]
test_pre_label = KNN(data=X_hat_reconst, label=Train_labels,
                     test_data=test_labeled_data, k=6)

# 1 for every prediction that matches its true label, 0 otherwise.
correct = [
    1 if predicted == actual else 0
    for predicted, actual in zip(test_pre_label, test_label)
]
accuracy_test = sum(correct) / float(len(test_label))
print('Accuracy: {}%'.format(np.round(accuracy_test*100, 2)))

Conf_mat4All_test, Param_mat4All_test = FindConfMat(pred_labels=test_pre_label,
                                                    Y_test=test_label)
np.set_printoptions(suppress=True)
print(Conf_mat4All_test)
print(np.round(Param_mat4All_test, decimals=2))

Accuracy: 85.15%
[[ 152.   51.   26. 1771.]
 [ 185.    0.    6. 1809.]
 [ 174.   68.   36. 1722.]
 [ 170.   21.   21. 1788.]
 [ 158.   47.   54. 1741.]
 [ 188.    4.   26. 1782.]
 [ 100.   60.  100. 1740.]
 [ 188.   19.   10. 1783.]
 [ 212.    5.    7. 1776.]
 [ 176.   22.   11. 1791.]]
[[96.15  0.75  0.85  0.8   0.85  0.03]
 [99.7   1.    0.97  0.98  0.97  0.  ]
 [94.8   0.72  0.83  0.77  0.83  0.04]
 [97.9   0.89  0.89  0.89  0.89  0.01]
 [94.95  0.77  0.75  0.76  0.75  0.03]
 [98.5   0.98  0.88  0.93  0.88  0.  ]
 [92.    0.62  0.5   0.56  0.5   0.03]
 [98.55  0.91  0.95  0.93  0.95  0.01]
 [99.4   0.98  0.97  0.97  0.97  0.  ]
 [98.35  0.89  0.94  0.91  0.94  0.01]]


In [22]:
# remaining test data (columns 2000-4999)
# NOTE(review): this draws a fresh random 80/20 split and recomputes the SVD,
# so these predictions come from a different training subset than any split
# drawn earlier, and the module-level Train_*/Val_* variables are overwritten.
pre_label_remain = []
var_desired = 0.96
test_remain_data = test_data[:, 2000:5000]
Train_Data_mat, Train_labels, Val_Data_mat, Val_labels = Validation_dataset(
    X=data, Y=label, percent=20)
X_hat_reconst, n_components, comp_ratio = SVD(X=Train_Data_mat, var_desired=var_desired)
test_remain_pre_label = KNN(data=X_hat_reconst, label=Train_labels,
                            test_data=test_remain_data, k=6)

In [47]:
# Join the predictions for the first 2000 test columns with those for the
# remaining columns into one list, in test-set order.
output = [*test_pre_label, *test_remain_pre_label]
print(len(output))

5000


In [48]:
# Write every predicted label to dataset 'label' of predicted_labels.h5
# (mode 'w' overwrites an existing file).
with h5py.File('predicted_labels.h5', 'w') as out_file:
    out_file.create_dataset('label', data=output)