# Part 1: Nearest Neighbour
### By: Gokhan Arkan & Juan Diaz

<br><br>
This cell imports the required libraries and loads both the training and the testing data sets used throughout the notebook.

In [277]:
import pandas as pd
import numpy as np

data_train = pd.read_csv('sonar_train.csv', header='infer')
data_test = pd.read_csv('sonar_test.csv', header='infer')

The following cell contains the two distance functions (Euclidean & Manhattan)<br><br>
They both require:
* Two vectors to be evaluated
* The number of attributes to consider; it is 60 for this data set (the value is passed in at the bottom of this notebook)

In [278]:
# Function to return (float) euclidean distance
def euclideanDistance(vector_1, vector_2, attr_length):
    """Return the Euclidean distance over the first ``attr_length`` positions.

    Parameters:
        vector_1, vector_2: sequences indexable by integer position.
        attr_length: how many leading positions (0 .. attr_length - 1) to compare;
            anything stored after those positions is ignored.

    Returns:
        The square root of the summed squared differences, as produced by ``np.sqrt``.
    """
    squared_total = 0.0

    # Accumulate the squared difference of each compared attribute, in index order
    for position in range(attr_length):
        difference = vector_1[position] - vector_2[position]
        squared_total = squared_total + difference ** 2

    # The distance is the square root of the accumulated sum of squares
    return np.sqrt(squared_total)


# Function to return (float) manhattan distance
def manhattanDistance(vector_1, vector_2, attr_length):
    """Return the Manhattan distance over the first ``attr_length`` positions.

    Parameters:
        vector_1, vector_2: sequences indexable by integer position.
        attr_length: how many leading positions (0 .. attr_length - 1) to compare;
            anything stored after those positions is ignored.

    Returns:
        The sum of the absolute differences of the compared positions.
    """
    absolute_total = 0.0

    # Accumulate the absolute difference of each compared attribute, in index order
    for position in range(attr_length):
        difference = vector_1[position] - vector_2[position]
        absolute_total = absolute_total + np.absolute(difference)

    return absolute_total

This next cell contains the main function, which measures the distance from every vector in the Training Set to a single vector from the Testing Set.<br>
It receives:<br>
* Training Set
* Testing vector
* Algorithm type (Euclidean or Manhattan)
* Attributes length (By default 60)

It compares all the distances and finds the smallest one.
The return value is an array with: the closest vector, its distance and its index within the Training Set

In [279]:
# Function to return an array in this form [(nearest neighbour vector), distance, index of the vector]
def getNearestNeighbour(trainingSet, testingVector, algorithm, attr_length):
    """Find the training vector closest to ``testingVector``.

    Parameters:
        trainingSet: sequence of training vectors, indexable by integer position.
        testingVector: the vector every training vector is compared against.
        algorithm: 'euclidean' selects euclideanDistance; any other value
            selects manhattanDistance.
        attr_length: number of leading attributes passed to the distance function.

    Returns:
        A 3-element object array ``[nearest vector, distance, index]`` where
        ``index`` is the position of the nearest vector inside ``trainingSet``.
        When several vectors share the smallest distance, the first one wins.

    Raises:
        ValueError: if ``trainingSet`` is empty.
    """
    if len(trainingSet) == 0:
        raise ValueError("trainingSet must contain at least one vector")

    # What algorithm to use (euclidean or manhattan); resolved once, outside the loop
    if algorithm == 'euclidean':
        distanceFunction = euclideanDistance
    else:
        distanceFunction = manhattanDistance

    bestIndex = None
    bestDistance = None

    # Track the minimum in a single pass instead of collecting and sorting every
    # distance: only the closest vector is ever returned.
    for i in range(len(trainingSet)):
        dist = distanceFunction(testingVector, trainingSet[i], attr_length)

        if bestDistance is None or dist < bestDistance:
            bestDistance = dist
            bestIndex = i

    # Build the [vector, distance, index] triple as an explicit object array.
    # Passing such ragged rows to np.array() raises ValueError on NumPy >= 1.24.
    nearest = np.empty(3, dtype=object)
    nearest[0] = trainingSet[bestIndex]
    nearest[1] = bestDistance
    nearest[2] = bestIndex

    return nearest

In [280]:
# Function to return a vector of all predictions
def myNN(trainSet, testSet, algorithm, attr_length):
    """Predict a class for every vector in ``testSet`` using its nearest neighbour.

    Parameters:
        trainSet: training vectors; the last element of each one is used as its class.
        testSet: testing vectors, indexable by integer position.
        algorithm: distance selector forwarded to getNearestNeighbour.
        attr_length: number of leading attributes forwarded to getNearestNeighbour.

    Returns:
        A numpy array with one predicted class per testing vector, in order.
    """
    # getNearestNeighbour returns [vector, distance, index]; the predicted class
    # is the last element of the returned neighbour vector.
    predictedClasses = [
        getNearestNeighbour(trainSet, testSet[row], algorithm, attr_length)[0][-1]
        for row in range(len(testSet))
    ]

    return np.array(predictedClasses)

In [281]:
def getAccuracy(testset, predictions):
    """Return the percentage of test records whose class was predicted correctly.

    Parameters:
        testset: test records, indexable by integer position; the last element
            of each record is compared as its class.
        predictions: predicted classes, one per test record, in the same order.

    Returns:
        The accuracy as a float between 0 and 100. An empty ``testset`` yields
        0.0 instead of raising ZeroDivisionError.
    """
    total = len(testset)

    # Nothing to evaluate: avoid dividing by zero below
    if total == 0:
        return 0.0

    # Count the records whose class matches the prediction at the same position
    totalright = sum(1 for i in range(total) if testset[i][-1] == predictions[i])

    # Return the accuracy as float
    return (totalright / float(total)) * 100

In [282]:
# Convert the data loaded from CSV into numpy arrays
nparrtrain = np.array(data_train)
nparrtest = np.array(data_test)

# Number of leading attributes compared by the distance functions.
# Named once here instead of repeating the literal 60 in every call.
ATTR_LENGTH = 60

# Get the predictions (arrays) for each algorithm
predictEucl = myNN(nparrtrain, nparrtest, 'euclidean', ATTR_LENGTH)
predictManh = myNN(nparrtrain, nparrtest, 'manhattan', ATTR_LENGTH)

# Evaluate the accuracy of each prediction (by algorithm)
accuEucl = getAccuracy(nparrtest, predictEucl)
accuManh = getAccuracy(nparrtest, predictManh)

# Print the accuracies
print("Accuracy for Euclidean algorithm: ", accuEucl)
print("Accuracy for Manhattan algorithm: ", accuManh)

# Print the prediction vectors
print("\nClass predictions for Euclidean:\n", predictEucl)
print("\nClass predictions for Manhattan:\n", predictManh)

# Print the real classes of the Testing Set, for comparison with the predictions
realClasses = np.array(data_test['Class'])
print("\nReal classes from Testing Set:\n", realClasses)


Accuracy for Euclidean algorithm:  89.85507246376811
Accuracy for Manhattan algorithm:  88.40579710144928

Class predictions for Euclidean:
 ['R' 'M' 'M' 'R' 'R' 'M' 'M' 'M' 'M' 'M' 'R' 'R' 'R' 'R' 'M' 'M' 'M' 'R'
 'M' 'M' 'M' 'R' 'R' 'R' 'R' 'M' 'R' 'R' 'M' 'M' 'M' 'M' 'M' 'R' 'M' 'M'
 'M' 'M' 'M' 'R' 'R' 'M' 'M' 'M' 'M' 'R' 'R' 'M' 'R' 'R' 'M' 'R' 'R' 'M'
 'M' 'R' 'M' 'R' 'M' 'M' 'R' 'M' 'M' 'R' 'M' 'M' 'M' 'M' 'M']

Class predictions for Manhattan:
 ['R' 'M' 'M' 'R' 'R' 'M' 'M' 'M' 'M' 'M' 'R' 'R' 'R' 'R' 'M' 'M' 'M' 'R'
 'M' 'M' 'M' 'R' 'R' 'R' 'R' 'M' 'R' 'R' 'M' 'M' 'M' 'M' 'M' 'R' 'M' 'M'
 'M' 'M' 'M' 'R' 'R' 'M' 'M' 'M' 'R' 'R' 'R' 'M' 'R' 'R' 'M' 'R' 'M' 'M'
 'M' 'R' 'M' 'R' 'M' 'R' 'M' 'M' 'M' 'R' 'M' 'M' 'M' 'M' 'R']

Real classes from Testing Set:
 ['R' 'M' 'M' 'R' 'R' 'R' 'M' 'M' 'M' 'R' 'R' 'R' 'R' 'R' 'R' 'M' 'M' 'M'
 'M' 'M' 'M' 'R' 'R' 'R' 'R' 'M' 'R' 'R' 'M' 'M' 'M' 'M' 'M' 'R' 'M' 'M'
 'M' 'M' 'M' 'R' 'R' 'M' 'M' 'M' 'R' 'R' 'R' 'M' 'R' 'R' 'M' 'R' 'R' 'M'
 'M' 'R' '