# Implementation

### Helper functions

In [1]:
import pandas as pd
import numpy as np

def get_dataset(data_path):
    """ Read a .csv data set from disk and hand it back as a numpy array.

    Arguments:
    data_path: The path to the file containing the data. The file must be a .csv of the
               form described in the assignment.

    Returns:
    The data set as numpy array, one row per line of the file. The first line of the
    file is consumed as the column header by pandas and is not part of the array.

    """
    frame = pd.read_csv(data_path)
    return np.array(frame)

def euclidean(a, b):
    """ Calculates the euclidean distance between the two given points.

    Arguments:
    a: The first point, as a numpy array or any plain sequence of numbers.
    b: The second point, in the same format and with a shape compatible with a.

    Returns:
    The euclidean distance between a and b.

    """
    # Convert first so that lists and tuples can be subtracted element-wise too;
    # numpy arrays pass through np.asarray unchanged.
    return np.linalg.norm(np.asarray(a) - np.asarray(b))


### Basic k-NN search

In [2]:
def kNN_indices(query_point, k, candidates, dim):
    """
    Returns the indices of the k nearest neighbours of the query point.

    Distances are euclidean and are computed over the first `dim` columns of each
    candidate row. Candidates at exactly the same distance keep their original
    relative order, so the result is deterministic.

    Arguments:
    query_point: The query point.
    k: The number of nearest neighbours to calculate. Must not be negative. If k is
       larger than the number of candidates, all candidate indices are returned.
    candidates: The dataset containing the known points to compare with.
    dim: The dimension of each example point. E.g. The entry (x1, x2, y) has dimension 2.

    Returns:
    A numpy array of indices into the candidate set, ordered from nearest to farthest.

    Raises:
    ValueError: If k is negative.

    """

    # A negative k would otherwise be interpreted by the slice below as
    # "all but the last |k|" and silently return the wrong neighbours.
    if k < 0:
        raise ValueError("k must not be negative, got {}".format(k))

    if len(candidates) == 0:
        return np.empty(0, dtype=np.intp)

    # Only the first `dim` columns are features; the cast to float also makes
    # this work for object arrays (e.g. when another column holds string labels).
    features = np.asarray(candidates)[:, :dim].astype(float)
    query = np.asarray(query_point, dtype=float)

    # Same euclidean distance as the per-pair helper, computed for all rows at once.
    distances = np.linalg.norm(features - query, axis=1)

    # A stable sort breaks distance ties by original row order.
    ordered_indices = np.argsort(distances, kind="stable")
    return ordered_indices[:k]
    

def kNN_results(query_point, k, candidates, dim):
    """
    Looks up the target value y of each of the k nearest neighbours of the query point.

    Arguments:
    query_point: The query point.
    k: The number of nearest neighbours to calculate.
    candidates: The dataset containing the known points to compare with.
    dim: The dimension of each example point. E.g. The entry (x1, x2, y) has dimension 2.
         The target value is read from column index `dim` of each candidate row.

    Returns:
    A list with one target value per neighbour, in the order the neighbour indices
    are returned by kNN_indices.

    """

    neighbour_targets = []
    for idx in kNN_indices(query_point, k, candidates, dim):
        neighbour_targets.append(candidates[idx][dim])

    return neighbour_targets

### Regression and Classification using k-NN

As suggested in the assignment sheet, the regression prediction is the simple mean of the k nearest neighbours' target values, and the classification prediction is the majority vote among their class labels.

In [3]:
from collections import Counter

def kNN_regression(query_point, k, candidates, dim):
    """
    Estimates the target function value for the query_point with k-NN regression,
    i.e. as the arithmetic mean of the target values of the k nearest neighbours.

    Arguments:
    query_point: The query point.
    k: The number of nearest neighbours to calculate.
    candidates: The dataset containing the known points to compare with.
    dim: The dimension of each example point. E.g. The entry (x1, x2, y) has dimension 2.

    Returns:
    The predicted value.

    """

    neighbour_targets = kNN_results(query_point, k, candidates, dim)
    return np.mean(neighbour_targets)

def kNN_classification(query_point, k, candidates, dim):
    """
    Estimates the target function value for the query_point with k-NN classification,
    i.e. as the most frequent target value among the k nearest neighbours.

    Arguments:
    query_point: The query point.
    k: The number of nearest neighbours to calculate.
    candidates: The dataset containing the known points to compare with.
    dim: The dimension of each example point. E.g. The entry (x1, x2, y) has dimension 2.

    Returns:
    The predicted value.

    """

    # Tally how often each target value occurs among the neighbours.
    votes = Counter(kNN_results(query_point, k, candidates, dim))
    winner, _count = votes.most_common(1)[0]
    return winner

# Tests

Loading test data:

In [4]:
# Load both example data sets as numpy arrays.
reg_set = get_dataset('../../data/knn_regression.csv')
clas_set = get_dataset('../../data/knn_classification.csv')

# Row 123 of each set serves as the query: the leading columns are the features
# (3 for regression, 4 for classification) and the column after them is the
# expected target value. Note that the query row itself stays in the data set.
reg_query, reg_exp = reg_set[123][:3], reg_set[123][3]
clas_query, clas_exp = clas_set[123][:4], clas_set[123][4]

## k-NN Regression

### Test result:

In [5]:
# Show the known target of the query row next to the 10-NN regression estimate.
print("target y: ", reg_exp)
print("prediction: ", kNN_regression(query_point=reg_query, k=10, candidates=reg_set, dim=3))

target y:  1.8
prediction:  1.6


### Nearest neighbours:

In [6]:
# Tabulate the full rows of the 10 nearest neighbours, nearest first.
pd.DataFrame(
    data=[reg_set[idx] for idx in kNN_indices(query_point=reg_query, k=10, candidates=reg_set, dim=3)],
    columns=['x1', 'x2', 'x3', 'y'],
)

Unnamed: 0,x1,x2,x3,y
0,6.3,2.7,4.9,1.8
1,6.2,2.8,4.8,1.8
2,6.3,2.5,4.9,1.5
3,6.3,2.8,5.1,1.5
4,6.3,2.5,5.0,1.9
5,6.1,2.8,4.7,1.2
6,6.1,2.9,4.7,1.4
7,6.0,2.7,5.1,1.6
8,6.1,3.0,4.9,1.8
9,6.5,2.8,4.6,1.5


## k-NN Classification

### Test result:

In [7]:
# Show the known class of the query row next to the 10-NN classification result.
print("target y: ", clas_exp)
print("prediction: ", kNN_classification(query_point=clas_query, k=10, candidates=clas_set, dim=4))

target y:  2.0
prediction:  2.0


### Nearest neighbours:

In [8]:
# Tabulate the full rows of the 10 nearest neighbours, nearest first.
pd.DataFrame(
    data=[clas_set[idx] for idx in kNN_indices(query_point=clas_query, k=10, candidates=clas_set, dim=4)],
    columns=['x1', 'x2', 'x3', 'x4', 'y'],
)

Unnamed: 0,x1,x2,x3,x4,y
0,6.3,2.7,4.9,1.8,2.0
1,6.2,2.8,4.8,1.8,2.0
2,6.3,2.5,5.0,1.9,2.0
3,6.1,3.0,4.9,1.8,2.0
4,6.3,2.5,4.9,1.5,1.0
5,6.3,2.8,5.1,1.5,2.0
6,6.0,2.7,5.1,1.6,1.0
7,6.4,2.7,5.3,1.9,2.0
8,6.0,3.0,4.8,1.8,2.0
9,6.5,2.8,4.6,1.5,1.0
