# Import libraries, import dataset


Implemented using resources made available in the CM4107 Advanced Artificial Intelligence course.

In [None]:
import numpy as np
import pandas as pd
import scipy.special
import operator
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
from google.colab import files

In [None]:
# Prompt for a file upload through the Colab `files` helper; the returned
# value is kept in `uploaded`.
uploaded = files.upload()
# Name of the dataset CSV file.
# NOTE(review): presumably this must match the name of the uploaded file - confirm.
data_file = "HPDataset - Copy - Copy.csv"
# Column position of the target value; used as the default `class_idx`
# argument of load_dataset.
class_index = 10

In [None]:
def load_dataset(filename, class_idx=class_index, split=0.8):
    """Read a CSV file and randomly partition its rows into train/test sets.

    Args:
        filename: Path of the CSV file handed to ``pandas.read_csv``.
        class_idx: Column position of the target value. The columns before
            it are used as the features.
        split: Probability that any given row is placed in the training set.

    Returns:
        A tuple ``(X_train, Y_train, X_test, Y_test)`` of Python lists, where
        the ``X`` lists hold feature rows and the ``Y`` lists hold targets.
    """
    rows = pd.read_csv(filename).values

    print("Class Index: " + str(class_idx))
    features = rows[:, :class_idx]
    targets = rows[:, class_idx]

    X_train, Y_train, X_test, Y_test = [], [], [], []

    # One random draw per row decides its bucket, so the realised train/test
    # ratio only approximates `split`.
    for feature_row, target in zip(features, targets):
        if random.random() < split:
            x_bucket, y_bucket = X_train, Y_train
        else:
            x_bucket, y_bucket = X_test, Y_test
        x_bucket.append(feature_row)
        y_bucket.append(target)

    return X_train, Y_train, X_test, Y_test

# Declare similarity measures (each computed as 1 / (1 + distance))


In [None]:
def euclidean(instance1, instance2):
    """Return a similarity score derived from the Euclidean distance.

    The distance between the two instances is mapped to ``1 / (1 + distance)``,
    so identical instances score 1 and the score shrinks as they move apart.
    Values are paired positionally with ``zip``; any surplus values in the
    longer instance are ignored.

    Args:
        instance1: First sequence of numeric feature values.
        instance2: Second sequence of numeric feature values.

    Returns:
        The similarity score as a float.
    """
    squared_total = sum((a - b) ** 2 for a, b in zip(instance1, instance2))
    return 1 / (1 + squared_total ** (1 / 2))
    

def manhattan(instance1, instance2):
    """Return a similarity score derived from the Manhattan distance.

    The sum of absolute per-feature differences is mapped to
    ``1 / (1 + distance)``, so identical instances score 1 and the score
    shrinks as they move apart. Values are paired positionally with ``zip``;
    any surplus values in the longer instance are ignored.

    Args:
        instance1: First sequence of numeric feature values.
        instance2: Second sequence of numeric feature values.

    Returns:
        The similarity score as a float.
    """
    absolute_total = sum(abs(a - b) for a, b in zip(instance1, instance2))
    return 1 / (1 + absolute_total)
    

# Define k-NN model

In [None]:
def accuracy(prediction, true):
    """Score predictions against the true values using mean absolute error.

    Despite the name, the value returned is an error measure, so lower is
    better.

    Args:
        prediction: Predicted values.
        true: Ground-truth values, in the same order as ``prediction``.

    Returns:
        The result of ``mean_absolute_error(true, prediction)``.
    """
    return mean_absolute_error(true, prediction)

class kNN:
    """k-nearest-neighbour predictor driven by a pluggable similarity function.

    Each prediction is the label that occurs most often among the ``k``
    training instances most similar to the query. Ties on the vote count are
    broken in favour of the larger label.

    Attributes are plain and public:
        X_train, Y_train: Training instances and their labels.
        k: Number of neighbours consulted per prediction.
        similarity: Callable ``(instance_a, instance_b) -> score`` where a
            higher score means "more similar".
        results: ``[predicted, target]`` pairs from the last ``test`` call.
        prediction: Predictions from the last ``test`` call, each wrapped in
            a single-element list.
        true: Targets from the last ``test`` call, each wrapped in a
            single-element list.
    """

    def __init__(self, X_train, Y_train, k=3, sim=manhattan):
        """Store the training data and hyperparameters.

        Args:
            X_train: Sequence of training instances.
            Y_train: Sequence of labels aligned with ``X_train``.
            k: Number of neighbours to consult.
            sim: Similarity function; higher return values mean closer.
        """
        self.X_train = X_train
        self.Y_train = Y_train
        self.k = k
        self.similarity = sim

        # Filled in by test(); created here so they can be read safely
        # before the first test() call.
        self.results = []
        self.prediction = []
        self.true = []

    def get_neighbours(self, test_instance):
        """Return the ``k`` most similar training instances.

        Args:
            test_instance: The query instance.

        Returns:
            A list of ``(label, similarity)`` tuples ordered from most to
            least similar. It holds fewer than ``k`` entries when the
            training set itself has fewer than ``k`` instances.
        """
        similarities = [
            (y, self.similarity(test_instance, train_instance))
            for train_instance, y in zip(self.X_train, self.Y_train)
        ]
        similarities.sort(key=operator.itemgetter(1), reverse=True)

        # Slice instead of indexing 0..k-1 so that a k larger than the
        # training set yields every instance rather than an IndexError.
        # max() keeps a negative k returning an empty list, as range() did.
        return similarities[:max(self.k, 0)]

    def predict(self, neighbours):
        """Pick the winning label from a list of neighbours.

        Args:
            neighbours: Sequence of ``(label, similarity)`` entries; only the
                label at position 0 of each entry is used.

        Returns:
            The label with the most votes; on equal counts the larger label.

        Raises:
            IndexError: If ``neighbours`` is empty.
        """
        class_votes = {}
        for neighbour in neighbours:
            response = neighbour[0]
            class_votes[response] = class_votes.get(response, 0) + 1

        # Sort by (vote count, label) descending so ties are deterministic.
        sorted_votes = sorted(
            class_votes,
            key=lambda label: (class_votes[label], label),
            reverse=True,
        )
        return sorted_votes[0]

    def test(self, X_test, Y_test):
        """Predict every test instance and record the outcomes.

        Overwrites ``self.results``, ``self.prediction`` and ``self.true``.

        Args:
            X_test: Sequence of test instances.
            Y_test: Sequence of target labels aligned with ``X_test``.
        """
        self.results = []
        self.prediction = []
        self.true = []

        for test_instance, target_label in zip(X_test, Y_test):
            neighbours = self.get_neighbours(test_instance)
            predict_label = self.predict(neighbours)
            self.results.append([predict_label, target_label])
            self.prediction.append([predict_label])
            self.true.append([target_label])
    

# Test hyperparameters


In [None]:
# Build the train/test partitions used by the experiments. Nothing else in
# this file calls load_dataset, so without this step the names below are
# undefined when the cells run top to bottom.
X_train, Y_train, X_test, Y_test = load_dataset(data_file)

# Candidate neighbour counts to compare.
k_values = [1, 3, 5, 8, 10, 15, 30, 60, 120]

# One MAE score per entry of k_values, in the same order.
results = []
for k in k_values:
    # The similarity function is left at the kNN default.
    knn = kNN(X_train, Y_train, k=k)
    knn.test(X_test, Y_test)
    results.append(accuracy(knn.prediction, Y_test))

In [None]:
# Bar chart of the MAE obtained for each tested k value.
objects = k_values
# One bar position per k value.
y_pos = np.arange(len(objects))
performance = results
 
plt.bar(y_pos, performance, align='center', alpha=0.5)
# Label each bar with the k value it represents.
plt.xticks(y_pos, objects)
# NOTE(review): the upper limit equals the tallest bar, whereas the
# similarity-function chart below adds 1 of headroom - confirm which is intended.
plt.ylim(0, max(performance))
plt.ylabel('MAE')
plt.xlabel('K-Value')
plt.title('K-Value Test')
 
plt.show()

In [None]:
# Similarity functions to compare.
sims = [euclidean, manhattan]

# One MAE score per entry of sims, in the same order (replaces the k-value results).
results = []
for sim in sims:
    # k is held at 15 so only the similarity function varies between runs.
    knn = kNN(X_train, Y_train, sim=sim, k=15)
    knn.test(X_test, Y_test)
    results.append(accuracy(knn.prediction, Y_test))

In [None]:
# Bar chart of the MAE obtained with each similarity function.
# The labels are listed in the same order as `sims` so they line up with `results`.
objects = ['euclidean', 'manhattan']
# One bar position per similarity function.
y_pos = np.arange(len(objects))
performance = results
 
plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
# Leave 1 unit of headroom above the tallest bar.
plt.ylim(0, max(performance)+1)
plt.ylabel('MAE')
plt.title('Distance Calculator Test')
 
plt.show()