In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import numpy as np

In [12]:
def calc_l2_distance(X_test, X_train):
    """Compute the Euclidean (L2) distance between every test/train pair.

    Uses the expansion ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b`` so that
    the whole distance matrix comes from one matrix product.

    Parameters
    ----------
    X_test : array-like of shape (test_num, n_features)
        Query samples, one per row.
    X_train : array-like of shape (train_num, n_features)
        Reference samples, one per row.

    Returns
    -------
    numpy.ndarray of shape (test_num, train_num), dtype float64
        ``dist[i, j]`` is the L2 distance between ``X_test[i]`` and
        ``X_train[j]``.
    """
    # Promote to float64 so narrow integer inputs (e.g. uint8 pixels) cannot
    # overflow when squared.
    X_test = np.asarray(X_test, dtype=np.float64)
    X_train = np.asarray(X_train, dtype=np.float64)

    # Broadcasting a column vector against a row vector yields the
    # (test_num, train_num) grid without materialising tiled copies.
    test_sq = np.sum(np.square(X_test), axis=1)[:, np.newaxis]
    train_sq = np.sum(np.square(X_train), axis=1)[np.newaxis, :]
    sq_dist = test_sq + train_sq - 2.0 * np.dot(X_test, X_train.T)

    # Floating-point cancellation can leave tiny negative values for
    # (near-)identical points; clamp them so sqrt never produces NaN.
    np.maximum(sq_dist, 0.0, out=sq_dist)
    return np.sqrt(sq_dist)

In [13]:
def knn(dist, k, Y_train):
    """Predict labels by majority vote among the k nearest training samples.

    Parameters
    ----------
    dist : array-like of shape (test_num, train_num)
        ``dist[i, j]`` is the distance from test sample ``i`` to training
        sample ``j``.
    k : int
        Number of neighbours that vote; must satisfy ``1 <= k <= train_num``.
    Y_train : array-like of shape (train_num,)
        Non-negative integer class labels of the training samples. Any
        number of classes is supported.

    Returns
    -------
    numpy.ndarray of shape (test_num,), dtype float64
        Predicted label for each test sample. When several classes receive
        the same number of votes, the smallest label wins.

    Raises
    ------
    ValueError
        If ``dist`` is not 2-D, its column count does not match ``Y_train``,
        or ``k`` is outside ``[1, train_num]``.
    """
    dist = np.asarray(dist)
    if dist.ndim != 2:
        raise ValueError('dist must be a 2-D array of shape (test_num, train_num)')

    test_num = dist.shape[0]
    Y_predict = np.zeros(test_num)
    if test_num == 0:
        return Y_predict

    # Flatten and cast once, outside the loop.
    labels = np.asarray(Y_train).reshape(-1).astype(int)
    train_num = labels.shape[0]
    if dist.shape[1] != train_num:
        raise ValueError('dist has %d columns but Y_train has %d labels'
                         % (dist.shape[1], train_num))
    if not 1 <= k <= train_num:
        raise ValueError('k must be between 1 and %d, got %r' % (train_num, k))

    # Size the vote histogram from the data instead of assuming 10 classes.
    n_classes = int(labels.max()) + 1

    for i in range(test_num):
        # Stable sort keeps equidistant neighbours in training order, which
        # makes the selection deterministic.
        nearest = np.argsort(dist[i], kind='stable')[:k]
        votes = np.bincount(labels[nearest], minlength=n_classes)
        # argmax returns the first maximum, i.e. the smallest tied label.
        Y_predict[i] = np.argmax(votes)

    return Y_predict

In [19]:
# Load the 'mnist_784' dataset through scikit-learn's OpenML fetcher.
mnist_data = fetch_openml(name='mnist_784')

In [20]:
# Keep only the first 10,000 samples so the dense distance matrix stays small.
# np.asarray makes the subset a plain ndarray: depending on the scikit-learn
# version, fetch_openml may hand back pandas objects, while the code below
# relies on ndarray methods (.reshape) and positional indexing.
X_data = np.asarray(mnist_data.data[:10000])
Y_data = np.asarray(mnist_data.target[:10000]).astype('int32')

# Fixed random_state makes the 80/20 split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, random_state=777, test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)


(8000, 784)
(2000, 784)
(8000,)
(2000,)


In [21]:
# Evaluate a 3-nearest-neighbour classifier on the held-out test split.
dist = calc_l2_distance(X_test, X_train)
y_pred = knn(dist, 3, Y_train)

# Count the test samples whose predicted label equals the true label.
count = int(np.count_nonzero(y_pred == Y_test))

# Accuracy expressed as a percentage of the test set.
accuracy = count / Y_test.shape[0] * 100

print('kNN accuracy : %f' % accuracy)

kNN accuracy : 93.850000


In [22]:
# cross validation ------------------------------------------------
# Contiguous k-fold cross validation on the training split: each fold is
# held out once for validation while the remaining folds act as the
# reference set, and every candidate k is scored on it.

num_fold = 5
k_candidate = [3,5,7,9,11]
# Maps each candidate k to its list of per-fold accuracies (in percent).
accuracy_dict = {k: [] for k in k_candidate}

train_length = X_train.shape[0]
# Fold i covers rows fold_range[i]:fold_range[i+1]. Integer arithmetic keeps
# the boundaries exact; float division can round a boundary down by one for
# some length/fold combinations.
fold_range = [train_length * i // num_fold for i in range(num_fold + 1)]

for fold in range(num_fold):
    start, stop = fold_range[fold], fold_range[fold + 1]

    # Held-out slice for validation; everything before and after it trains.
    X_validation = X_train[start:stop]
    Y_validation = Y_train[start:stop]
    X_train_folds = np.concatenate((X_train[:start], X_train[stop:]), axis=0)
    Y_train_folds = np.concatenate((Y_train[:start], Y_train[stop:]), axis=0)

    # Distances do not depend on k, so compute them once per fold.
    dist = calc_l2_distance(X_validation, X_train_folds)
    for k in k_candidate:
        y_pred = knn(dist, k, Y_train_folds)
        count = int(np.count_nonzero(y_pred == Y_validation))
        accuracy = count / Y_validation.shape[0] * 100
        accuracy_dict[k].append(accuracy)

# Mean accuracy per candidate, ordered by ascending k.
accuracy_mean = np.array([np.mean(v) for k, v in sorted(accuracy_dict.items())])

print(accuracy_dict)
print(accuracy_mean)

{3: [94.9375, 93.3125, 95.5625, 93.9375, 93.1875], 5: [94.3125, 92.875, 94.9375, 93.625, 93.6875], 7: [94.3125, 92.6875, 95.3125, 93.8125, 93.4375], 9: [94.0625, 92.875, 94.8125, 93.625, 93.0], 11: [93.375, 92.5, 94.6875, 93.375, 92.1875]}
[94.1875 93.8875 93.9125 93.675  93.225 ]
