# Machine Learning 2 Assignment 1

## Junyoung Jung 

In [1]:
import numpy as np
import torch
from torchvision import datasets
import torch.nn.functional as F
import time

In [2]:
trainset = datasets.MNIST(root='./data ', train=True, download=True)
testset = datasets.MNIST(root='./data ', train=False, download=True)

In [3]:
# Indices for train /val splits : train_idx , valid_idx
np.random.seed(0)
val_ratio = 0.1
train_size = len(trainset)
indices = list(range(train_size))
split_idx = int(np.floor(val_ratio * train_size))
np.random.shuffle(indices)
train_idx, val_idx = indices[split_idx:], indices[: split_idx]
train_data = trainset.data[train_idx].float()/255.
train_labels = trainset.targets[train_idx]
val_data = trainset.data[val_idx].float() / 255.
val_labels = trainset.targets[val_idx]
test_data = testset.data.float() / 255.
test_labels = testset.targets

train_size = len(train_idx)
val_size = len(val_idx)
test_size = len(testset)

In [4]:
ans_idx = 70

ans_label, ans_data = train_labels[ans_idx], train_data[ans_idx]

### (a) Implement an iterative method (using for loop) to classify a single new example. Write down your observations.

In [5]:
def loop_classification(datas, labels, size, ansdata):
    distance_list = []
    distance_idx = []
    ansdata = torch.flatten(ansdata)
    for i in range(size):
        subtraction, sum_sub = 0.0, 0.0
        data = torch.flatten(datas[i])
        for j in range(28 * 28):
            subtraction = data[j] - ansdata[j]
            sum_sub += torch.abs(subtraction)
        distance_list.append(sum_sub)
        distance_idx.append(labels[i])

    location = distance_list.index(min(distance_list))
    result = distance_idx[location]

    return result

In [6]:
print("-------------------------")
print("Loop start")
start_time = time.time()
loop_result = loop_classification(train_data, train_labels, train_size, ans_data)
print("Answer data's label: ", ans_label)
print("Loop classification classifies as: ", loop_result)
print("Time took for Loop is: ", time.time() - start_time)
print("-------------------------")


-------------------------
Loop start
Answer data's label:  tensor(8)
Loop classification classifies as:  tensor(8)
Time took for Loop is:  556.4878840446472
-------------------------


### (b) Use the broadcasting concept you learned in the laboratory session to classify a single new example. Compare against the result from (a).

In [7]:
def broad_classification(datas, labels, size, ansdata):
    distance_list = []
    distance_idx = []
    result_idx, temp = 0, 0
    for i in range(size):
        result = 0.0
        result = torch.sum((datas[i] - ansdata)**2)
        distance_list.append(result.item())
        distance_idx.append(labels[i])
    temp = distance_list.index(min(distance_list))
    result_idx = distance_idx[temp]

    return result_idx.item()

In [8]:
print("-------------------------")
print("Broadcasting start")
start_time = time.time()
broad_result = broad_classification(train_data, train_labels, train_size, ans_data)
print("Broadcasting classification classifies as: ", broad_result)
print("Answer label is: ", ans_label.item())
print("Time took for Broadcasting is: ", time.time() - start_time)
print("-------------------------")

-------------------------
Broadcasting start
Broadcasting classification classifies as:  8
Answer label is:  8
Time took for Broadcasting is:  0.8361051082611084
-------------------------


### (c) Now, implement a k-NN algorithm (starting with k=5) and its training/validation/evaluation code to perform multiclass classification over all digits, using the implementation from (b). Write down your observations.

In [9]:
def knn(datas, labels, size, anydigit, k):
    distances_tensor = torch.empty(1)
    distances_list = []
    distances_idx = []
    results = []
    for i in range(size):
        result = torch.sum((datas[i] - anydigit) ** 2)
        distances_list.append(result)
        distances_idx.append(labels[i])
    distances_tensor = torch.tensor(distances_list)
    temp, index = torch.sort(distances_tensor)
    index = index[:k]
    for j in index:
        results.append(distances_idx[j].item())
    end, _ = torch.mode(torch.tensor(results))

    return end.item()

In [14]:
print("-------------------------")
print("KNN start")
start_time = time.time()
knn_result = knn(train_data, train_labels, train_size, ans_data, k=1000)
print("KNN classification classifies as: ", knn_result)
print("Answer label is: ", ans_label.item())
print("Time took for KNN is: ", time.time() - start_time)
print("-------------------------")


-------------------------
KNN start
KNN classification classifies as:  3
Answer label is:  8
Time took for KNN is:  0.9432189464569092
-------------------------


### (d) Improve the algorithm from (c) [Hint: Try to find the desirable distance function, which can be found by googling or going through PyTorch document].

In [26]:
def knn_improved(datas, labels, size, anydigit, k):
    distances_list = []
    distances_idx = []
    results = []
    for i in range(size):
        distance = F.pairwise_distance(datas[i], anydigit, p=1)
        distances_list.append(torch.sum(distance))
        distances_idx.append(labels[i])
    distances_tensor = torch.tensor(distances_list)
    sorted_tensor, sorted_indices = torch.topk(distances_tensor, k, largest=False)
    for j in sorted_indices:
        results.append(distances_idx[j])
    result, _ = torch.mode(torch.tensor(results))

    return result.item()

In [27]:
print("-------------------------")
print("KNN improved start")
start_time = time.time()
knn_result = knn_improved(train_data, train_labels, train_size, ans_data, k=15)
knn_result2 = knn_improved(test_data, test_labels, test_size, ans_data, k=15)
print("KNN classification classifies as: ", knn_result)
print("For test data: ", knn_result2)
print("Answer label is: ", ans_label.item())
print("Time took for KNN is: ", time.time() - start_time)
print("-------------------------")

-------------------------
KNN improved start
KNN classification classifies as:  8
For test data:  8
Answer label is:  8
Time took for KNN is:  1.6508538722991943
-------------------------
