In [None]:
%matplotlib notebook
import cifar10
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Fetch the CIFAR-10 data only when the expected file is not already on disk;
# otherwise just report where the existing copy lives.
if cifar10.get_path().is_file():
    print("cifar10 is already downloaded at:\n{}".format(cifar10.get_path()))
else:
    cifar10.download()

In [None]:
# Cast every array returned by the loader to float32 (this includes the labels).
x_train, y_train, x_test, y_test = [arr.astype("float32") for arr in cifar10.load()]

# Move axis 1 to the end of both image arrays: (N, a, b, c) -> (N, b, c, a).
# Presumably this converts channels-first images to 32x32x3 channels-last.
x_train = np.transpose(x_train, (0, 2, 3, 1))
x_test = np.transpose(x_test, (0, 2, 3, 1))

In [None]:
# Report the shape of each split, then the element types of data and labels.
for caption, arr in (('Training data shape: ', x_train),
                     ('Training labels shape: ', y_train),
                     ('Test data shape: ', x_test),
                     ('Test labels shape: ', y_test)):
    print(caption, arr.shape)
print('\n')
print('data type: {}'.format(x_train.dtype))
print('label type: {}'.format(y_train.dtype))

In [None]:
# Each image is represented by a 32x32x3 array; the bare expression below
# displays the shape of the first training image as the cell's output.
x_train[0].shape

In [None]:
# Accessing the class-label for image-0. The labels were cast to float32 when
# the data was loaded, so this displays a float rather than an integer ID.
y_train[0]

In [None]:
# Display the first training image. The pixels are stored as float32, so they
# are cast to uint8 before being handed to imshow.
fig, ax = plt.subplots()
ax.imshow(x_train[0].astype("uint8"))

In [None]:
# Visualize examples: one column per class, each showing `samples_per_class`
# training images drawn at random (without replacement) from that class.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7
fig, axes = plt.subplots(nrows=samples_per_class, ncols=num_classes)

for label_ind, cls in enumerate(classes):
    # indices of all training images carrying this class ID
    class_idxs = np.where(y_train == label_ind)[0]
    chosen = np.random.choice(class_idxs, samples_per_class, replace=False)
    for row, (panel, idx) in enumerate(zip(axes[:, label_ind], chosen)):
        panel.imshow(x_train[idx].astype('uint8'))
        # hide the tick marks on both axes
        panel.xaxis.set_major_locator(plt.NullLocator())
        panel.yaxis.set_major_locator(plt.NullLocator())
        # only the top panel of each column carries the class name
        if row == 0:
            panel.set_title(cls)
            

In [None]:
# Work with a subset of the data: the first 5000 training examples and the
# first 500 test examples.
num_train, num_test = 5000, 500
x_train, y_train = x_train[:num_train], y_train[:num_train]
x_test, y_test = x_test[:num_test], y_test[:num_test]
for caption, arr in (('Training data shape: ', x_train),
                     ('Training labels shape: ', y_train),
                     ('Test data shape: ', x_test),
                     ('Test labels shape: ', y_test)):
    print(caption, arr.shape)

In [None]:
# Flatten every image into one row vector, keeping axis 0 (the image index).
# train: shape-(5000, 32, 32, 3) -> shape-(5000, 3072)
x_train = x_train.reshape(len(x_train), -1)

# test: shape-(500, 32, 32, 3) -> shape-(500, 3072)
x_test = x_test.reshape(len(x_test), -1)

print("new train-shape:", x_train.shape)
print("new test-shape:", x_test.shape)

In [None]:
def compute_distances(x, y):
    """ Computes the L2 distance between each row in `x` and each row in `y`.

        The computation is fully vectorized using the identity
        ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b``, so the pairwise term is a single
        matrix product instead of an M*N Python-level loop.

        Parameters
        ----------
        x : numpy.ndarray
            x.shape must be (M, D)
            Each row of `x` is a flattened vector representing the pixel
            values of a single image. Thus `x` represents
            M images, each one described by a length-D vector.

        y : numpy.ndarray
            y.shape must be (N, D)
            Each row of `y` is a flattened vector representing the pixel
            values of a single image. Thus `y` represents
            N images, each one described by a length-D vector.

        Returns
        -------
        distances : numpy.ndarray
            distances.shape = (M, N), dtype float64
            distances[i, j] = the L2 distance between x[i] and y[j]

        Notes
        -----
        Because the squared distance is obtained by subtraction, floating-point
        round-off can leave a tiny non-zero distance between two identical
        non-integer rows. Values that round-off pushes below zero are clamped
        to zero before the square root is taken.
    """
    # Work in float64: it limits cancellation error in the expansion below and
    # matches the float64 result array the looped implementation returned.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    x_sq = np.sum(x ** 2, axis=1)[:, np.newaxis]  # shape-(M, 1)
    y_sq = np.sum(y ** 2, axis=1)[np.newaxis, :]  # shape-(1, N)

    # Broadcasting (M, 1) + (1, N) against the (M, N) matrix of dot products.
    sq_dists = x_sq + y_sq - 2.0 * (x @ y.T)

    # Round-off may yield slightly negative values; sqrt would turn them into NaN.
    np.maximum(sq_dists, 0.0, out=sq_dists)
    return np.sqrt(sq_dists)
    

In [None]:
# Hand `compute_distances` to the grader imported from `bwsi_grader`.
# NOTE(review): what the grader checks is not visible in this notebook.
from bwsi_grader.cogworks.nearest_neighbors import grade_distances
grade_distances(compute_distances)

In [None]:
# dists[i, j] is the L2 distance between test image i and training image j.
dists = compute_distances(x_test, x_train)
# Render the distance matrix as a grayscale image, without interpolation.
fig, ax = plt.subplots()
ax.imshow(dists, interpolation='none', cmap="gray")

In [None]:
def predict(dists, labels, k=1):
    """ With a shape-(M, N) array of distances between M-unlabeled
        and N-labeled images, and N labels, we predict a label for each
        of the M images based on its k-nearest neighbors.

        Parameters
        ----------
        dists : numpy.ndarray
            `dists.shape` must be (M, N) where M is the number of
            examples you wish to predict labels for, and N is
            the number of labeled images used in the prediction

        labels : numpy.ndarray
            A shape-(N,) array of class-IDs, of labels for the N images.

        k : int, optional (default=1)
            The number of nearest neighbors that vote on each prediction.
            If fewer than `k` labeled images can be selected (e.g. k > N),
            only the neighbors actually found cast a vote.

        Returns
        -------
        y_pred : numpy.ndarray
            A shape-(M,) integer array of class-IDs, as predicted by the
            k-nearest neighbors. Vote ties go to the smallest class-ID.

        Raises
        ------
        ValueError
            If `k` is smaller than 1, or if no neighbor at all can be
            selected for some row of `dists`.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))

    labels = np.asarray(labels)
    m, n = dists.shape
    y_pred = np.zeros((m,), dtype=int)

    for i in range(m):
        row = dists[i]

        # Stream through the row keeping the k smallest distances seen so far.
        # `knn` holds those distances, `knn_index` the matching column indices
        # (-1 marks a slot that has not been filled yet).
        knn = np.full(k, float('inf'))
        knn_index = np.full(k, -1)
        worst = np.max(knn)
        for j in range(n):
            # strict '<': a distance equal to the current worst is not inserted
            if row[j] < worst:
                slot = np.argmax(knn)  # evict the current worst entry
                knn[slot] = row[j]
                knn_index[slot] = j
                worst = np.max(knn)

        # Boundary ties: images whose distance equals the k-th smallest one but
        # which were not selected above. Each may take over the first slot that
        # holds the worst distance if its class-ID is smaller than that slot's.
        slot = np.argmax(knn)
        if knn_index[slot] >= 0:
            tied = [l for l in np.flatnonzero(row == worst) if l not in knn_index]
            for l in tied:
                if labels[l] < labels[knn_index[slot]]:
                    knn_index[slot] = l

        # Unfilled slots must not vote: index -1 would otherwise wrap around
        # and silently count the last label once per empty slot.
        neighbors = knn_index[knn_index >= 0]
        if neighbors.size == 0:
            raise ValueError("no neighbors available to predict row {}".format(i))

        # Majority vote. np.unique returns the class-IDs sorted ascending and
        # argmax picks the first maximum, so ties go to the smallest class-ID.
        candidates, votes = np.unique(labels[neighbors], return_counts=True)
        y_pred[i] = candidates[np.argmax(votes)]
    return y_pred

In [None]:
# Hand `predict` to the grader imported from `bwsi_grader`.
# NOTE(review): what the grader checks is not visible in this notebook.
from bwsi_grader.cogworks.nearest_neighbors import grade_predict
grade_predict(predict)

In [None]:
# Four labeled 2D points, shape (4, 2), one per class.
toy_x_train = np.array([
    [0.7, -0.7],
    [-0.7, -0.7],
    [0, 0.7],
    [0, 0],
])
# Class-labels of shape (4,): point i belongs to class i.
toy_y_train = np.array([0, 1, 2, 3])

# Plot color used for each class-ID.
toy_label_colors = {
    0: 'b',  # class 0 is blue
    1: 'y',  # class 1 is yellow
    2: 'g',  # class 2 is green
    3: 'r',  # class 3 is red
}

# Create a set of densely sampled points covering [-1, 1] x [-1, 1]:
# a 100 x 100 grid, stored as one (x, y) pair per row.
grid_axis = np.linspace(-1, 1, 100)
xv, yv = np.meshgrid(grid_axis, grid_axis)
toy_x_test = np.column_stack((xv.ravel(), yv.ravel()))

In [None]:
def plot_2d_data(x_train, y_train, x_test, test_predictions=np.empty(0)):
    """ Scatter-plot 2D training and test points, color coded by class.

    Class colors are looked up in the module-level `toy_label_colors` dict.
    Test points are drawn first and nearly transparent; training points are
    drawn on top of them in a new figure.

    Parameters
    ----------
    x_train : numpy.ndarray
        Training data of shape (N, 2).

    y_train : numpy.ndarray
        Training labels of shape (N,).

    x_test : numpy.ndarray
        Test data of shape (M, 2).

    test_predictions : numpy.ndarray, optional (default=np.empty(0))
        Predicted class-ID for each test point. If empty, every point in
        `x_test` is drawn in the default color 'C0'.
    """
    # An empty prediction array means "not classified": use one default color.
    if len(test_predictions) != 0:
        test_pt_colors = [toy_label_colors[pred] for pred in test_predictions]
    else:
        test_pt_colors = 'C0'

    fig, ax = plt.subplots()
    # columns of the (M, 2) array are unpacked as the x and y coordinates
    ax.scatter(*x_test.T, c=test_pt_colors, alpha=0.1)
    train_pt_colors = [toy_label_colors[label] for label in y_train]
    ax.scatter(*x_train.T, c=train_pt_colors)

In [None]:
# Plot the toy training points over the test grid. No predictions are passed,
# so the grid points are all drawn in the default color.
plot_2d_data(toy_x_train, toy_y_train, toy_x_test)

In [None]:
# Classify every grid point from the four toy training points (k defaults
# to 1, i.e. single nearest neighbor) and color the grid by predicted class.
dists = compute_distances(toy_x_test, toy_x_train)
predictions = predict(dists, toy_y_train)
plot_2d_data(toy_x_train, toy_y_train, toy_x_test, predictions)

In [None]:
def generate_noisy_clusters(x, y, std=0.2, n_cluster_points=100):
    """ Generate clusters around data points by adding random noise.

    Each point in `x` is repeated `n_cluster_points` times, perturbed with
    zero-mean Gaussian noise drawn from NumPy's global random state, and
    clipped to the interval [-1, 1].

    Parameters
    ----------
    x : numpy.ndarray
        Data points of shape (N, D). The notebook uses D = 2, but any
        dimensionality is accepted.

    y : numpy.ndarray
        Labels of data points `x` of shape (N,).

    std : float, optional (default=0.2)
        Standard deviation of noise used to generate clusters.

    n_cluster_points : int, optional (default=100)
        Number of data points to generate around each point in `x`.

    Returns
    -------
    Tuple[numpy.ndarray, numpy.ndarray]
        Arrays of shapes (`n_cluster_points`*N, D) and (`n_cluster_points`*N,)
        Containing the data clusters and their labels.
    """
    # Consecutive copies of each center; labels are repeated the same way so
    # row r of the result keeps the label of the point it was generated from.
    centers = np.repeat(x, n_cluster_points, axis=0)
    # Noise takes the shape of `centers` rather than a hard-coded 2 columns.
    noise = std * np.random.randn(*centers.shape)
    toy_clusters_x = (centers + noise).clip(-1, 1)
    toy_clusters_y = np.repeat(y, n_cluster_points, axis=0)

    return toy_clusters_x, toy_clusters_y

In [None]:
# Surround each of the toy training points with a cloud of noisy copies and
# plot the resulting clusters over the (still unclassified) test grid.
n_cluster_points = 100
noise_std = 0.2  # standard deviation of perturbations
toy_clusters_x, toy_clusters_y = generate_noisy_clusters(toy_x_train, toy_y_train, noise_std,
                                                         n_cluster_points)
plot_2d_data(toy_clusters_x, toy_clusters_y, toy_x_test)

In [None]:
# Classify the grid again, now against the noisy clusters, still with the
# default k=1, and color the grid by predicted class.
dists = compute_distances(toy_x_test, toy_clusters_x)
predictions = predict(dists, toy_clusters_y)
plot_2d_data(toy_clusters_x, toy_clusters_y, toy_x_test, predictions)

In [None]:
# Build a tiny example from the training data: images 0-10 act as the labeled
# set and image 11 as the single "test" image.
mini_train_set_idxs = range(11)
x_train_mini = x_train[mini_train_set_idxs]
y_train_mini = y_train[mini_train_set_idxs]

mini_test_set_idx = [11]
x_test_mini = x_train[mini_test_set_idx]
y_test_mini = y_train[mini_test_set_idx]

fig, ax = plt.subplots()
ax.imshow(x_test_mini.reshape(32, 32,  3).astype("uint8"))  # we flattened the images earlier
# y_test_mini has shape (1,): index its single element explicitly, because
# calling int() on a whole 1-element array is deprecated in recent NumPy.
ax.set_title(f'class: {classes[int(y_test_mini[0])]}')

In [None]:
# Shape-(11, 1) result: dists[i, 0] is the L2 distance between mini training
# image i and the single test image.
dists = compute_distances(x_train_mini, x_test_mini)

In [None]:
# One panel per mini training image, titled with its class name and its
# (rounded) L2 distance to the test image.
fig, axes = plt.subplots(nrows=1, ncols=len(dists), figsize=(8, 3))

fig.suptitle('Class and L2 distance to test image')
for panel, image, label, dist in zip(axes, x_train_mini, y_train_mini, dists[:, 0]):
    # the images were flattened earlier, so restore 32x32x3 for display
    panel.imshow(image.astype('uint8').reshape(32, 32, 3))
    panel.axis('off')
    panel.set_title(f'{classes[int(label)]}\n{dist:1.0f}')

In [None]:
# `dists` was overwritten by the mini-dataset example, so recompute the
# grid-to-cluster distances, then predict with the 5 nearest neighbors.
dists = compute_distances(toy_x_test, toy_clusters_x)
predictions = predict(dists, toy_clusters_y, k=5)

In [None]:
# Color the grid by the k=5 predictions computed in the previous cell.
plot_2d_data(toy_clusters_x, toy_clusters_y, toy_x_test, predictions)

In [None]:
def make_folds(x, num_folds):
    """ Splits `x` along axis-0 into `num_folds` consecutive, equal-sized
    sub-arrays.

    Every fold holds ``N // num_folds`` entries. When N is not a multiple of
    `num_folds`, the trailing ``N % num_folds`` entries are left out of every
    fold.

    Parameters
    ----------
    x : numpy.ndarray, shape=(N, ...)
        An array of one or more dimensions, to be split along axis-0

    num_folds : int
        The number of equal-sized folds to split `x` into.
        Assume that: 0 < num_folds <= N.

    Returns
    -------
    List[numpy.ndarray]
        A list of the sub-divided arrays (slices of `x`)"""
    fold_len = x.shape[0] // num_folds
    return [x[fold * fold_len:(fold + 1) * fold_len] for fold in range(num_folds)]

In [None]:
# Hand `make_folds` to the grader imported from `bwsi_grader`.
# NOTE(review): what the grader checks is not visible in this notebook.
from bwsi_grader.cogworks.nearest_neighbors import grade_make_folds
grade_make_folds(make_folds)

In [None]:
# using 5 fold in cross-validation
num_folds = 5
# Images and labels are split by the same function with the same fold count,
# so fold i of the images lines up with fold i of the labels.
x_train_folds = make_folds(x_train, num_folds=num_folds)
y_train_folds = make_folds(y_train, num_folds=num_folds)

# evaluate classifier's accuracy for the following values of k
k_values = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

In [None]:
# accuracies: a dictionary that maps each k-value to the resulting list of
# classifier accuracies, one accuracy value for each validation fold.
accuracies = {k_value: [] for k_value in k_values}
print(accuracies)

for fold_i in range(num_folds):
    print("for", fold_i, "in range num_folds")
    # Fold `fold_i` is held out for validation; all other folds together form
    # the labeled data that the neighbors are drawn from.
    validation_data = x_train_folds[fold_i]
    validation_labels = y_train_folds[fold_i]
    labeled_data = np.vstack([fold for i, fold in enumerate(x_train_folds) if i != fold_i])
    labels = np.concatenate([fold for i, fold in enumerate(y_train_folds) if i != fold_i])

    # The distances do not depend on k, so compute them once per fold rather
    # than once per (fold, k) pair - this is by far the most expensive step.
    dists = compute_distances(validation_data, labeled_data)

    for j in k_values:
        print("for", j,"in k_values")
        pred = predict(dists, labels, k=j)
        # fraction of validation images whose predicted class-ID is correct
        accuracy = float(np.mean(pred == validation_labels))
        accuracies[j].append(accuracy)
        print("Completed for", j ,"in k_values")
print(accuracies)
print("Completed for ALL")

In [None]:
# check if recorded the appropriate number of fold-accuracies, for each k-value
# sorted() over a dict yields its keys, so this verifies that exactly the
# k-values listed in `k_values` were recorded.
assert sorted(accuracies) == k_values
print(accuracies)
for list_of_acc in accuracies.values():
    print(list_of_acc)
    # one accuracy per validation fold
    assert len(list_of_acc) == num_folds
print("Completed the current cell")

In [None]:
# Plot the per-fold accuracies for each k-value, along with the mean accuracy.

fig, ax = plt.subplots()
for k in k_values:
    print(k,"in k_values")
    fold_accs = accuracies[k]
    # one "x" marker per validation fold, all at the same k on the x-axis
    ax.scatter([k] * len(fold_accs), fold_accs, marker="x")

# Trend line through the mean accuracy at each k, with error bars that
# correspond to the standard deviation across folds.
sorted_ks = sorted(accuracies)
accuracies_mean = np.array([np.mean(accuracies[key]) for key in sorted_ks])
accuracies_std = np.array([np.std(accuracies[key]) for key in sorted_ks])
ax.errorbar(k_values, accuracies_mean, yerr=accuracies_std, label="mean accuracy")

ax.set_title('Cross-validation on k')
ax.set_xlabel('k')
ax.set_ylabel('Cross-validation accuracy')
ax.grid(True)
ax.legend()
print("Completed the current cell")

In [None]:
# Pool the accuracies of every (k, fold) pair and report the best and worst.
all_acc = [fold_acc for fold_accs in accuracies.values() for fold_acc in fold_accs]
print(max(all_acc))
print(min(all_acc))

In [None]:
# Final evaluation on the held-out test images, using the subsampled training
# set as the labeled data and the chosen number of neighbors.
k = 10
dists = compute_distances(x_test, x_train)
labels = predict(dists, y_train, k)  # predicted class-IDs for the test images
# fraction of test images whose predicted class-ID matches the true label
acc = np.mean(labels == y_test)
# Actually report the accuracy: previously the rounded value was discarded
# and a bare `print` only displayed the function object.
print(round(float(acc), 3))