In [5]:
'''
Question 2.1 Skeleton Code

Here you should implement and evaluate the k-NN classifier.
'''

import data
import numpy as np
# Import pyplot - plt.imshow is useful!
import matplotlib.pyplot as plt

class KNearestNeighbor(object):
    '''
    K Nearest Neighbor classifier

    Keeps the training set and labels a query point by majority vote among
    its k nearest training points. When several labels receive the same
    number of votes, the one that occurs most often in the training set wins.
    '''

    def __init__(self, train_data, train_labels):
        '''
        Store the training set and precompute what the queries reuse.

        Input: train_data is a 2d numpy array with one training point per row
               train_labels is a numpy array with one label per training point
        '''
        self.train_data = train_data
        # Squared norm of each training row, shaped as a column so that it
        # broadcasts against the row of test norms in l2_distance.
        self.train_norm = (self.train_data**2).sum(axis=1).reshape(-1,1)
        self.train_labels = train_labels
        # Number of training points carrying each label; query_knn uses these
        # counts to break voting ties.
        unique, counts = np.unique(train_labels, return_counts=True)
        self.prior_count = dict(zip(unique, counts))

    def l2_distance(self, test_point):
        '''
        Compute the squared L2 distance between test point and each training point

        The square root is not taken: it does not change which training
        points are nearest.

        Input: test_point is a 1d numpy array
        Output: dist is a numpy array containing the squared distances between
                the test point and each training point
        '''
        # Process test point shape: anything below 2d (including the 0-d
        # array that squeeze produces for a single-feature point) becomes
        # one row.
        test_point = np.squeeze(test_point)
        if test_point.ndim < 2:
            test_point = test_point.reshape(1, -1)
        assert test_point.shape[1] == self.train_data.shape[1]

        # Compute squared distance with ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
        # NOTE: floating-point rounding in this expansion can leave values that
        # are very slightly negative for (near-)identical points.
        test_norm = (test_point**2).sum(axis=1).reshape(1,-1)
        dist = self.train_norm + test_norm - 2*self.train_data.dot(test_point.transpose())
        return np.squeeze(dist)

    def query_knn(self, test_point, k):
        '''
        Query a single test point using the k-NN algorithm

        Input: test_point is a 1d numpy array
               k is the number of neighbours that vote, 1 <= k <= number of
               training points
        Output: the digit label chosen by majority vote; ties between labels
                are broken with the training-set label counts
        Raises: ValueError if k is outside the valid range
        '''
        n_train = len(self.train_labels)
        if not 1 <= k <= n_train:
            raise ValueError(
                "k must be between 1 and the number of training points "
                "({}), got {}".format(n_train, k))

        dist = np.atleast_1d(self.l2_distance(test_point))
        # Partitioning around position k-1 puts the k smallest distances in
        # the first k slots and, unlike kth=k, is valid when k == n_train.
        nearest = np.argpartition(dist, k - 1)[:k]

        # Count the votes and keep every label that reaches the top count.
        labels, votes = np.unique(self.train_labels[nearest], return_counts=True)
        tied = labels[votes == votes.max()]
        if len(tied) > 1:
            # break the tie with prior probability
            return max(tied, key = lambda label: self.prior_count[label])
        return tied[0]

def cross_validation(train_data, train_labels, k_range=np.arange(1,16), random_state=None):
    '''
    Perform 10-fold cross validation to find the best value for k

    The folds are drawn once and shared by every candidate k, so all values
    of k are scored on exactly the same train/validation partitions, and the
    classifier for a fold is built once and reused for every k.

    Input: train_data, train_labels are the training set to split into folds
           k_range is an iterable of candidate values for k
           random_state is passed to KFold to seed the shuffle; the default
           None gives a different shuffle on every call
    Output: (K, accuracy) where K is the candidate with the highest mean
            accuracy across folds (the first such candidate on ties) and
            accuracy is that mean

    Note: Previously this function took knn as an argument instead of train_data,train_labels.
    The intention was for students to take the training data from the knn object - this should be clearer
    from the new function signature.
    '''
    from sklearn.model_selection import KFold
    # Materialise the candidates: they are iterated once per fold and again
    # when picking the winner.
    k_values = list(k_range)
    kf = KFold(n_splits = 10, shuffle=True, random_state=random_state)

    fold_acc = {k: [] for k in k_values}
    # Loop over folds
    for train_index, test_index in kf.split(train_labels):
        X_train, X_test = train_data[train_index], train_data[test_index]
        y_train, y_test = train_labels[train_index], train_labels[test_index]
        knn = KNearestNeighbor(X_train, y_train)
        # Evaluate k-NN for every candidate k on this fold
        for k in k_values:
            fold_acc[k].append(classification_accuracy(knn, k, X_test, y_test))

    acc = {k: np.mean(scores) for k, scores in fold_acc.items()}
    K = max(k_values, key = lambda k: acc[k])
    return K, acc[K]
            
def classification_accuracy(knn, k, eval_data, eval_labels):
    '''
    Fraction of points in 'eval_data' whose k-NN prediction matches the
    corresponding entry of 'eval_labels'.

    Each point is classified with knn.query_knn(point, k).
    '''
    total = eval_labels.shape[0]
    # One unit per correctly classified point.
    correct = sum(
        1
        for idx in range(total)
        if knn.query_knn(eval_data[idx], k) == eval_labels[idx]
    )
    return correct / total

In [8]:
def main():
    '''
    Run the k-NN experiments and print their accuracies.

    Reports train/test accuracy for k=1 and k=15, then selects k with
    cross_validation and reports train, cross-validation and test accuracy
    for the selected value.
    '''
    train_data, train_labels, test_data, test_labels = data.load_all_data('data')
    classifier = KNearestNeighbor(train_data, train_labels)

    def score(k, points, labels):
        # Accuracy of the full-training-set classifier on the given split.
        return classification_accuracy(classifier, k, points, labels)

    # Q1
    # compute accuracy for knn-1
    knn1_train = score(1, train_data, train_labels)
    print("KNN-1 train accuracy: {}".format(knn1_train))

    knn1_test = score(1, test_data, test_labels)
    print("KNN-1 test accuracy: {}".format(knn1_test))

    # compute accuracy for knn-15
    knn15_train = score(15, train_data, train_labels)
    print("KNN-15 train accuracy: {}".format(knn15_train))

    knn15_test = score(15, test_data, test_labels)
    print("KNN-15 test accuracy: {}".format(knn15_test))

    # Q2 as seen in class KNearestNeighbor

    # Q3
    # Pick k by cross validation on the training set, then score that k on
    # the full train and test sets.
    best_k, folds_acc = cross_validation(train_data, train_labels)
    train_acc = score(best_k, train_data, train_labels)
    test_acc = score(best_k, test_data, test_labels)
    print("K: {}, trainset accuracy: {}, \naverage accuracy across folds: {}, test accuracy: {}".format(best_k, train_acc, folds_acc, test_acc))

# Only run the experiments when executed directly, not on import.
if __name__ == '__main__':
    main()

KNN-1 train accuracy: 1.0
KNN-1 test accuracy: 0.96875
KNN-15 train accuracy: 0.9594285714285714
KNN-15 test accuracy: 0.9585
K: 3, trainset accuracy: 0.9834285714285714, 
average accuracy across folds: 0.9637142857142857, test accuracy: 0.96975
