In [2]:
import scipy
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

## Q3 $k$ Nearest Neighbor

In [3]:
# 1) Load data.

X_and_Y = np.load('./ionosphere.npy')   # Full data matrix: feature columns followed by one label column.
np.random.seed(0)                       # Fixed seed so the shuffle below is reproducible.
np.random.shuffle(X_and_Y)              # Shuffle the rows in place.
# Every column but the last holds the numerical features; the last column holds the labels (0 or 1).
X, Y = X_and_Y[:, :-1], X_and_Y[:, -1]
print(X.shape, Y.shape)                 # Sanity-check the shapes.

(351, 34) (351,)


In [5]:
# 2) Split the (already shuffled) dataset into 2 parts:
#    (a) Training set + Validation set  (First 80% of all data points)
#    (b) Test set                       (Last 20% of all data points)

eighty_percent = round(.8 * len(X))   # Row index where the test set begins.
X_train_val, X_test = X[:eighty_percent], X[eighty_percent:]   # Features: train + val | test.
Y_train_val, Y_test = Y[:eighty_percent], Y[eighty_percent:]   # Labels:   train + val | test.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(281, 34) (70, 34) (281,) (70,)


In [4]:
# 3) Implement the k-NN.
class simple_KNeighborsClassifier(object):
    """
    Minimal brute-force k-nearest-neighbor classifier.

    A query point is labelled by majority vote among the k training points
    that are closest to it in Euclidean distance.
    """

    def __init__(self, k):
        """
        k-NN initialization.
            k: Number of nearest neighbors.
        """
        self.k = k

    def fit(self, X_train, Y_train):
        """
        k-NN fitting function.
            X_train: Feature vectors in training set.
                     Shape: (num of data points, num of features)
            Y_train: Labels in training set.
                     Shape: (num of data points,)
        """
        # k-NN is a lazy learner: fitting only memorizes the training data.
        # np.asarray lets plain lists work with the broadcasting and index
        # arrays used in predict().
        self.X_train = np.asarray(X_train)
        self.Y_train = np.asarray(Y_train)

    def predict(self, X_pred):
        """
        k-NN prediction function.
            X_pred: Feature vectors to classify.
                    Shape: (num of query points, num of features)
        Return the predicted labels for X_pred. Shape: (len(X_pred), ).

        Ties are broken deterministically: equidistant training points are
        ranked by their position in the training set, and when several labels
        receive the same number of votes the smallest label wins.
        """
        Y_pred = []
        for x in X_pred:
            # Euclidean distance from the query point to every training point.
            distances = np.linalg.norm(self.X_train - x, axis=1)
            # A stable sort keeps the training order among equal distances.
            nearest = np.argsort(distances, kind='stable')[:self.k]
            # np.unique returns sorted labels, so argmax picks the smallest
            # label among those with the highest vote count.
            labels, votes = np.unique(self.Y_train[nearest], return_counts=True)
            Y_pred.append(labels[np.argmax(votes)])
        return np.array(Y_pred)

In [5]:
# 4) Implement the cross-validation.

def simple_cross_validation(X_train_val, Y_train_val, k, fold, classifier_factory=None):
    """
    A simple cross-validation function for k-NN.

    The data is cut into `fold` contiguous chunks (no shuffling is done here).
    Each chunk serves once as the validation set while the remaining chunks
    form the training set.

    X_train_val: Features for train and val set.
                 Shape: (num of data points, num of features)
    Y_train_val: Labels for train and val set.
                 Shape: (num of data points,)
    k:           Parameter k for k-NN.
    fold:        The number of folds to do the cross-validation.
                 Must satisfy 2 <= fold <= num of data points.
    classifier_factory:
                 Optional callable taking k and returning an object with
                 fit(X, Y) and predict(X). Defaults to
                 simple_KNeighborsClassifier. Pass e.g.
                 lambda k: KNeighborsClassifier(algorithm='brute', n_neighbors=k)
                 (from sklearn.neighbors) to cross-check the implementation.

    Return a tuple (average validation accuracy, average training accuracy).
    Raise ValueError if fold is outside the valid range.
    """
    X_train_val = np.asarray(X_train_val)
    Y_train_val = np.asarray(Y_train_val)
    num_points = len(X_train_val)
    # fold == 1 leaves no training data; fold > num_points creates empty
    # validation chunks whose accuracy would be undefined.
    if not 2 <= fold <= num_points:
        raise ValueError(
            'fold must be between 2 and the number of data points ({}), got {}'
            .format(num_points, fold))
    if classifier_factory is None:
        classifier_factory = simple_KNeighborsClassifier

    # Contiguous index chunks; array_split tolerates sizes not divisible by fold.
    fold_indices = np.array_split(np.arange(num_points), fold)

    val_acc_list = []
    train_acc_list = []
    for i in range(fold):
        val_idx = fold_indices[i]
        train_idx = np.concatenate(fold_indices[:i] + fold_indices[i + 1:])
        X_val, Y_val = X_train_val[val_idx], Y_train_val[val_idx]
        X_train, Y_train = X_train_val[train_idx], Y_train_val[train_idx]

        classifier = classifier_factory(k)
        classifier.fit(X_train, Y_train)
        # Accuracy = fraction of predictions that match the true labels.
        val_acc_list.append(np.mean(classifier.predict(X_val) == Y_val))
        train_acc_list.append(np.mean(classifier.predict(X_train) == Y_train))

    return sum(val_acc_list) / len(val_acc_list), \
           sum(train_acc_list) / len(train_acc_list)

In [6]:
# 5) Implement the grid search function.

def simple_GridSearchCV_fit(X_train_val, Y_train_val, k_list, fold):
    """
    A simple grid search function for k with cross-validation in k-NN.

    X_train_val: Features for train and val set.
                 Shape: (num of data points, num of features)
    Y_train_val: Labels for train and val set.
                 Shape: (num of data points,)
    k_list:      The list of k values to try.
    fold:        The number of folds to do the cross-validation.

    Return the val and train accuracy arrays of cross-validation, in that
    order. Entry i of each array is the average accuracy for k_list[i].
    Shape of each: (len(k_list), )
    """
    val_acc_array = np.zeros(len(k_list))
    train_acc_array = np.zeros(len(k_list))
    for i, k in enumerate(k_list):
        # simple_cross_validation returns (average val acc, average train acc).
        val_acc_array[i], train_acc_array[i] = \
            simple_cross_validation(X_train_val, Y_train_val, k, fold)
    return val_acc_array, train_acc_array

In [7]:
# 6) Perform grid search.

k_list = [*range(1, 7)]   # Candidate values of k: 1 through 6.
# 3-fold cross-validation for every candidate k.
val_acc_array, train_acc_array = simple_GridSearchCV_fit(
    X_train_val, Y_train_val, k_list, 3)

In [8]:
# 7) Draw heatmaps for result of grid search and find 
#    best k on validation set.

def draw_heatmap_knn(acc, acc_desc, k_list):
    """
    Draw an annotated heatmap of accuracy with respect to k and show it.

    acc:      Accuracy values passed straight to sns.heatmap; the callers in
              this notebook pass a column of shape (len(k_list), 1).
    acc_desc: Description used as the title prefix, e.g. 'val accuracy'.
    k_list:   The k values, used as y-axis tick labels (one per row of acc).

    Side effect: sets the global seaborn style to "whitegrid" without grid lines.
    """
    # The style has to be set before the figure is created; setting it after
    # plotting (as before) left the first heatmap in the default style while
    # later ones used whitegrid.
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.figure(figsize = (2,4))
    ax = sns.heatmap(acc, annot=True, fmt='.3f', yticklabels=k_list, xticklabels=[])
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(ylabel='$k$')
    plt.title(acc_desc + ' w.r.t $k$')
    plt.show()
    
#
# You can use the draw_heatmap_knn() to draw a heatmap to visualize 
# the accuracy w.r.t. k. Some demo code is given below as hint:
#
# demo_acc        = np.array([[0.8],
#                             [0.7]])
# demo_k_list     = [1, 2]
# draw_heatmap_knn(demo_acc, 'demo accuracy', demo_k_list)
#

In [None]:
# sns.heatmap needs 2-D input, so each 1-D accuracy array is turned into a single column.
train_acc_column = np.reshape(train_acc_array, (-1, 1))
val_acc_column = np.reshape(val_acc_array, (-1, 1))
draw_heatmap_knn(train_acc_column, 'train accuracy', k_list)
draw_heatmap_knn(val_acc_column, 'val accuracy', k_list)

In [None]:
# 8) Use the best k to calculate the test accuracy.

# Best k = the one with the highest average validation accuracy.
# np.argmax returns the first maximum, so ties go to the earliest entry of k_list.
best_k = k_list[int(np.argmax(val_acc_array))]
classifier = simple_KNeighborsClassifier(k=best_k)
# Refit on the full train + val set before scoring on the held-out test set.
classifier.fit(X_train_val, Y_train_val)
test_acc = np.mean(classifier.predict(X_test) == Y_test)
print('Best k: {}, test accuracy: {:.3f}'.format(best_k, test_acc))