# 9. K-Nearest Neighbours

In [1]:
from random import seed
from math import sqrt

from functions import *

### Define functions
* K-Nearest Neighbours for classification or regression

In [2]:
# Calculate the Euclidian distance between two vectors
# Input: two rows of data, where the last vector element is assumed to be the output
# Output: Euclidian distance, i.e. squared differences element-wise (per input variable)
# Dependencies: NA
def euclidian_distance(row1, row2):
    distance = 0.0
    # Assuming last row value is output (class or regression value)
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

# Find the K nearest neighbours to a given row from a given (training) dataset
# Input: training dataset, row for which to find neighbours, number of neighbours to find
# Output: list of num_neighbours observations from train that are most similar to test_row
# Dependencies: euclidian_distance
def get_neighbours(train, test_row, num_neighbours):
    # List of tuples, each tuple corresponds to an observation
    # Each tuple element thus contains the full row (vector) and its distance to test_row
    distances = list()
    for train_row in train:
        dist = euclidian_distance(test_row, train_row)
        # Add tuple of observation vector and distance to test_row to list of tuples
        distances.append((train_row, dist))
    # Sort the list of neighbours by distance, by default in ascending order
    distances.sort(key=lambda tup: tup[1])
    # Add the K closest rows to a list of complete data observations
    neighbours = list()
    for i in range(num_neighbours):
        neighbours.append(distances[i][0])
    return neighbours

# Get the predicted class for a given row (vector), based on k neighbours from train set
# Input: test_row for which to predict class, based on num_neighbours from train set
# Output: class prediction for given row
# Dependencies: get_neighbours, euclidian_distance
def predict_classification(train, test_row, num_neighbours):
    # Get num_neighbours nearest neighbours to test_row
    neighbours = get_neighbours(train, test_row, num_neighbours)
    # Add output classes of K nearest neighbours to a list
    output_values = [row[-1] for row in neighbours]
    # Find distinct classes (using set), and find class with maximum count (occurrences)
    prediction = max(set(output_values), key = output_values.count)
    return prediction

# Get the predicted regression value for a given row, based on k neighbours from train set
# Input: row for which to predict regression value, based on num_neighbours from train set
# Output: predicted regression value; the average value of that of its K neighbours
# Dependencies: get_neighbours, euclidian_distance
def predict_regression(train, test_row, num_neighbours):
    neighbours = get_neighbours(train, test_row, num_neighbours)
    output_values = [row[-1] for row in neighbours]
    prediction = sum(output_values) / float(len(output_values))
    return prediction

# For a test set, get predictions using "training" data and problem type
# Input: train (neighbours), test, number of neighbours (K), classification or regression
# Output: predicted class or regression value for provided test observations
def k_nearest_neighbours(train, test, num_neighbours, type = "classification"):
    # List of predictions, equal length as rows in test set
    predictions = list()
    # There is strictly speaking no algorithm training step in KNN;
        # the K neighbours are found from among the training set
    # Get prediction based on K neighbours (from train) for each row in test set
    if(type == "regression"):
        for row in test:
            output = predict_regression(train, row, num_neighbours)
            predictions.append(output)
    elif(type == "classification"):
        for row in test:
            output = predict_classification(train, row, num_neighbours)
            predictions.append(output)
    else:
        print("Error: Invalid problem type.")
        return None

    return predictions

### Testing KNN on contrived dataset

In [3]:
# Contrived dataset for testing
dataset =   [[2.7810836,2.550537003,0],
            [1.465489372,2.362125076,0],
            [3.396561688,4.400293529,0],
            [1.38807019,1.850220317,0],
            [3.06407232,3.005305973,0],
            [7.627531214,2.759262235,1],
            [5.332441248,2.088626775,1],
            [6.922596716,1.77106367,1],
            [8.675418651,-0.242068655,1],
            [7.673756466,3.508563011,1]]

# Calculate Euclidian distance between first row compared to all others (including itself)
print(" Contrived dataset distances to first row:")
row0 = dataset[0]
for row in dataset:
    distance = euclidian_distance(row0, row)
    print(distance)

# Find nearest neighbours to first row in contrived dataset
    # Note: the closest neighbour will always be itself (the same row)
k = 3
print("\n", k, " closest neighbours:")
neighbours = get_neighbours(dataset, dataset[0], k)
for neighbour in neighbours:
    print(neighbour)

# Get predicted class for first row
prediction = predict_classification(dataset, dataset[0], k)
print("\nFirst row: expected class %d, got class %d." % (dataset[0][-1], prediction))

 Contrived dataset distances to first row:
0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795

 3  closest neighbours:
[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]

First row: expected class 0, got class 0.


### Testing KNN on Abalone case study

In [4]:
# Abalone Case Study
seed(1)
# Load data
dataset = load_csv('data/abalone.csv')
# Convert string numbers to floats
for i in range(1, len(dataset[0])):
    str_column_to_float(dataset, i)
# Convert float gender (first column) to int
str_column_to_int(dataset, 0)

# Evaluate algorithm using k-fold cross-validation with 5 folds
    # 4177 / 5 = 835.4, i.e. 835 records per fold

print("\nAbalone Case Study as Classification")
n_folds = 5
num_neighbours = 5
# Run KNN algorithm and check classification accuracy with 5-fold cross-validation
scores = evaluate_algorithm(dataset, k_nearest_neighbours, n_folds, accuracy_metric, num_neighbours)
print("Scores: %s" % scores)
print("Mean Accuracy: %.3f%%" % (sum(scores)/float(len(scores))))
# 23% beats the 16% baseline with K = 5 neighbours
# The large number of classes makes accuracy a poor judge of skill on this problem
# Many classes also only have a few examples, which gives many misclassifications

print("\nAbalone Case Study as Regression")
n_folds = 5
# Running regression algorithm instead, treating the classes (since so many) as values
scores = evaluate_algorithm(dataset, k_nearest_neighbours, n_folds, rmse_metric, num_neighbours, "regression")
print("Scores: %s" % scores)
print("Mean RMSE: %.3f" % (sum(scores)/float(len(scores))))
# RMSE of 2.24 rings beats the 3.222 rings baseline
# Also have a model more useful in the domain with a performance easier to understand

Loaded data file data/abalone.csv with 4177 rows and 9 columns.

Abalone Case Study as Classification
Scores: [24.790419161676645, 21.79640718562874, 23.592814371257482, 21.676646706586826, 23.353293413173652]
Mean Accuracy: 23.042%

Abalone Case Study as Regression
Scores: [2.2449837629275575, 2.2882883282169404, 2.3248720266113048, 2.148766531010387, 2.22366154319461]
Mean RMSE: 2.246
