In [1]:
import csv
import random
import math
import operator
import numpy as np
from typing import Callable,List

# Handle Data

In [2]:
def loadDataset(filename : str, split: float=0.80, trainingSet : list =None , testSet: list =None, readHeader: bool=False, shuffle: bool=True):
    '''
        Open the dataset from CSV and split it into train/test datasets.

        Every column except the last one is converted to float; the last
        column is kept as-is (it is treated as the class label by the rest
        of this notebook).

        Parameters:
            filename: name of a file containing the dataset to load
            split: proportion of the rows that go to the Training Set
            trainingSet: list object that will receive the Training data set
                         (a fresh list is created when omitted)
            testSet: list object that will receive the Testing data set
                     (a fresh list is created when omitted)
            readHeader: when True, the first non-empty line is treated as a
                        header and is not loaded as data
            shuffle: the rows are shuffled before splitting if True
        Returns:
            The tuple (trainingSet, testSet). The same list objects that were
            passed in are returned, so callers relying on in-place filling
            keep working.
        Raises:
            ValueError: if a feature column cannot be converted to float
    '''
    # Never use a mutable object as a default value: it is created once and
    # shared by every call, so rows would pile up from one call to the next.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is what the csv module expects for files it reads.
    with open(filename, 'r', newline='') as csvfile:
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # drop them so they are neither counted nor indexed.
        dataset = [row for row in csv.reader(csvfile) if row]

    # The header must be removed before shuffling, while it is still first.
    if readHeader and dataset:
        dataset = dataset[1:]

    if shuffle:
        random.shuffle(dataset)

    # The split point is relative to the rows of this file only.
    splitnb = len(dataset) * split
    for index, row in enumerate(dataset):
        # Convert every feature column; the last column (label) is untouched.
        row[:-1] = [float(value) for value in row[:-1]]

        if index < splitnb:
            trainingSet.append(row)
        else:
            testSet.append(row)

    return trainingSet, testSet

# Similarity

In [3]:
def euclideanDistance(instance1: list, instance2: list, length: int=0):
    '''
        Compute the Euclidean distance between two data points.

        Parameters:
            instance1: list of coordinates of the first data point
            instance2: list of coordinates of the second data point
            length: number of leading fields to include in the distance
                    calculation; 0 means "as many as the shorter point has"
        Returns:
            Euclidean distance between the two data points
    '''
    span = length
    if span == 0:
        span = min(len(instance1), len(instance2))

    # Only the first `span` fields of each point take part in the distance.
    head_a = instance1[:span]
    head_b = instance2[:span]

    squared_sum = sum(math.pow(a - b, 2) for a, b in zip(head_a, head_b))
    return math.sqrt(squared_sum)

# Neighbors

In [4]:
def getNeighbors(trainingSet: list, testInstance: list, k: int =1, fdistance: Callable=euclideanDistance):
    '''
         Return the k training points closest to a given test instance.

         Parameters:
             trainingSet: list of training points
             testInstance: the point whose neighbors are looked up in trainingSet
             k: number of neighbors to return
             fdistance: distance function, called as
                        fdistance(testInstance, trainingPoint, length)

         Returns:
            the k points of trainingSet with the smallest distance to
            testInstance, closest first
         Raises:
            IndexError: if k is larger than the number of training points
    '''
    # The last field of an instance is not part of the distance computation.
    length = len(testInstance) - 1

    scored = [(candidate, fdistance(testInstance, candidate, length))
              for candidate in trainingSet]
    scored.sort(key=lambda pair: pair[1])

    return [scored[rank][0] for rank in range(k)]

# Response

In [5]:
def getResponse(neighbors: list):
    '''
        Return the majority-voted class among a list of neighbors.
        The class of a neighbor is taken from its last attribute.

        Parameters:
            neighbors: a list of neighbors
        Returns:
            the class with the most votes; on a tie, the class that was
            encountered first among the tied ones
        Raises:
            IndexError: if neighbors is empty
    '''
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1

    # The sort is stable even with reverse=True, so tied classes keep
    # their first-seen order.
    ranking = sorted(tally.items(), key=lambda vote: vote[1], reverse=True)
    winner = ranking[0]
    return winner[0]

# Accuracy

In [6]:
def getAccuracy(testSet : list, predictions :list):
    '''
        Compute the classification accuracy: the percentage of test points
        whose last attribute equals the predicted label.

        Parameters:
            testSet : list of test points used to evaluate a model
            predictions: list of predicted labels, in the same order as testSet

        Returns:
             accuracy of the predictions, as a percentage
        Raises:
            ZeroDivisionError: if testSet is empty
    '''
    hits = 0
    # zip stops at the shorter of the two lists; the denominator below is
    # always the size of testSet.
    for actual, guess in zip(testSet, predictions):
        if actual[-1] == guess:
            hits += 1

    total = float(len(testSet))
    return (hits / total) * 100.0

# Main

In [7]:
def predict(filename: str, split: float=0.80, k: int=1, fdistance: Callable=euclideanDistance):
    '''
        Load the dataset, classify every test point with k-nearest-neighbors
        and return the accuracy of the model.

        Parameters:
            filename: path to the file containing the dataset
            split: proportion of the Training Set
            k: number of neighbors to use for the vote
            fdistance: distance function used to rank the neighbors

        Returns:
            the model accuracy, as a percentage
    '''
    trainingSet = []
    testSet = []

    loadDataset(filename, split, trainingSet, testSet)

    # Forward the caller's distance function; hard-coding euclideanDistance
    # here would silently ignore the fdistance argument.
    predictions = [
        getResponse(getNeighbors(trainingSet, testInstance, k, fdistance))
        for testInstance in testSet
    ]

    return getAccuracy(testSet, predictions)

In [8]:
# Repeat the random split + evaluation 100 times and average the accuracies.
matacc = [
    predict(filename='iris.data.txt', split=0.70, k=5, fdistance=euclideanDistance)
    for _ in range(100)
]
acc = sum(matacc) / 100
print(f'Mean Accuracy of the Model = {acc:.2f} %')

Mean Accuracy of the Model = 96.09 %


# Another distance metric

In [9]:
def manhattanDistance(instance1: list, instance2: list, length: int=0):
    '''
        Compute the Manhattan distance between two data points.

        Parameters:
            instance1: list of coordinates of the first data point
            instance2: list of coordinates of the second data point
            length: number of leading fields to include in the distance
                    calculation; 0 means "as many as the shorter point has"

        Returns:
            Manhattan distance between the two data points
    '''
    span = length
    if span == 0:
        span = min(len(instance1), len(instance2))

    # Only the first `span` fields of each point take part in the distance.
    head_a = instance1[:span]
    head_b = instance2[:span]

    return sum(abs(a - b) for a, b in zip(head_a, head_b))

In [10]:
# Same 100-run evaluation as before, this time requesting the Manhattan distance.
matacc = [
    predict(filename='iris.data.txt', split=0.70, k=5, fdistance=manhattanDistance)
    for _ in range(100)
]
acc = sum(matacc) / 100
print(f'Mean Accuracy of the Model = {acc:.2f} %')


Mean Accuracy of the Model = 96.13 %
