In [8]:
import numpy as np
import pandas as pd
from math import sqrt
from random import seed
from random import randrange

class KNN(object):
    def __init__(self,k= 5):
        self.k = k
        
    #load csv file
    def load_csv(self,filename):
        df = pd.read_csv(filename, header = None)
        return df
    
    #convert string column to integer (class values to interger)
    def stringColumnsToInts(self,dataset,column):
        classValues = [row[column] for row in dataset]
        uniqueValues = set(classValues)
        intValues = dict()
        for i, value in enumerate(uniqueValues):
            intValues[value] = i
        for row in dataset:
            row[column] =  intValues[row[column]]
        return intValues
    
    #Split data into k-folds
    def k_Folds(self,dataset, n_folds):
        datasetCopy = list(dataset)
        datasetSplit = list()
        foldSize = int(len(dataset)/n_folds)
        for i in range (n_folds):
            fold = list()
            while len(fold) < foldSize:
                index = randrange(len(datasetCopy))
                fold.append(datasetCopy.pop(index))
            datasetSplit.append(fold)
        return datasetSplit
    
    # Find the min and max values for each column
    def dataset_minmax(self,dataset):
        minmax = list()
        for i in range(len(dataset[0])):
            col_values = [row[i] for row in dataset]
            value_min = min(col_values)
            value_max = max(col_values)
            minmax.append([value_min, value_max])
        return minmax

    # Rescale dataset columns to the range 0-1
    def normalize_dataset(self,dataset,minmax):
        normalizedList = list(dataset)
        for row in normalizedList:
            for i in range(len(row)):
                row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
        return normalizedList
    
    #calculate accuracy percentage
    def accuracyMetric(self,actual, predicted):
        correct = 0.0
        for i in range (len(actual)):
            if actual[i] == predicted[i]:
                correct += 1
        percentage = correct / float(len(actual)) *100.0
        return percentage
    
    #Calculate Euclidean Distance between 2 vectors/rows.
    def calculateEuclideanDistance(self,row1, row2):
        for i in range(len(row1)-1):
            sum_sq = np.sum(np.square(row1[i] - row2[i]))
        return (np.sqrt(sum_sq)) 
    
    #Calculate the nearest neighbors.
    def calculateNearestNeighbors(self,train, test_row, num_neighbors):
        distances = list()
        for train_row in train:
            dist = self.calculateEuclideanDistance(train_row, test_row)
            distances.append((train_row, dist))
        distances.sort(key = lambda tup : tup[1])
        neighbors = list()
        for i in range(num_neighbors):
            neighbors.append(distances[i][0])
        return neighbors
    
    #make predictions.
    def makePredictions(self,train, test_row, num_neighbors):
        neighbors = self.calculateNearestNeighbors(train, test_row, num_neighbors)
        outputValues = [row[-1] for row in neighbors]
        prediction = max(set(outputValues), key = outputValues.count )
        return prediction
    
    #evaluate algorithm using a cross validation split to check score
    def evaluateAlgorithm(self,dataset, algorithm, n_folds, *args):
        folds = self.k_Folds(dataset, n_folds)
        score = list()
        for fold in folds:
            trainSet = list(folds)
            trainSet.remove(fold)
            trainSet = sum(trainSet, [])
            testSet = list()
            for row in fold:
                rowCopy = list(row)
                testSet.append(rowCopy)
                rowCopy[-1] = None
            actual = [row[-1] for row in fold]
            predicted = algorithm(trainSet, testSet, *args)
            accuracy = self.accuracyMetric(actual, predicted)
            score.append(accuracy)
        return score
    
    #KNN Algorithm
    def kNN_Algorithm(self,train, test, num_neighbors):
        predictions = list()
        for row in test:
            output = self.makePredictions(train, row, num_neighbors)
            predictions.append(output)
        return predictions

IRIS DATASET

In [9]:
iris_knn = KNN()

seed(12)
filename = 'Datasets/iris.csv'
iris_df = iris_knn.load_csv(filename)
iris_df.columns = ["sepal length", "sepal width", "petal length", "petal width", "Class"]
iris_df.Class.replace(('Iris-setosa','Iris-versicolor','Iris-virginica'),(1,2,3), inplace=True)
iris_dataset = iris_df.values.tolist()
iris_minmax = iris_knn.dataset_minmax(iris_dataset)
normalized_iris_dataset = iris_knn.normalize_dataset(iris_dataset,iris_minmax)
n_folds = int(input("Please provide fold number:" ))
num_neighbors = int(input("Please provide number of neighbors:" ))
options = input("If normal values to be viewed press 'a' or if you need normalized values press 'b' : ")
if options == 'a':
    scores = iris_knn.evaluateAlgorithm(iris_dataset, iris_knn.kNN_Algorithm, n_folds, num_neighbors)
elif options == 'b':
    scores = iris_knn.evaluateAlgorithm(normalized_iris_dataset, iris_knn.kNN_Algorithm, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Please provide fold number:10
Please provide number of neighbors:5
If normal values to be viewed press 'a' or if you need normalized values press 'b' : a
Scores: [100.0, 100.0, 93.33333333333333, 93.33333333333333, 93.33333333333333, 93.33333333333333, 100.0, 80.0, 100.0, 100.0]
Mean Accuracy: 95.333%


HAYES-ROTH DATASET

In [10]:
hayes_roth_knn = KNN()

seed(1)
filename = 'Datasets/hayes-roth.data'
hayes_roth_df = hayes_roth_knn.load_csv(filename)
hayes_roth_df.columns = ["name", "hobby", "age", "educational level", "marital status","class"]
hayes_roth_dataset = hayes_roth_df.values.tolist()
hayes_roth_minmax = hayes_roth_knn.dataset_minmax(hayes_roth_dataset)
normalized_hayes_roth_dataset = hayes_roth_knn.normalize_dataset(hayes_roth_dataset,hayes_roth_minmax)
n_folds = int(input("Please provide fold number:" ))
num_neighbors = int(input("Please provide number of neighbors:" ))
options = input("If normal values to be viewed press 'a' or if you need normalized values press 'b' : ")
if options == 'a':
    scores = hayes_roth_knn.evaluateAlgorithm(hayes_roth_dataset, hayes_roth_knn.kNN_Algorithm, n_folds, num_neighbors)
elif options == 'b':
    scores = hayes_roth_knn.evaluateAlgorithm(normalized_hayes_roth_dataset, hayes_roth_knn.kNN_Algorithm, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Please provide fold number:10
Please provide number of neighbors:5
If normal values to be viewed press 'a' or if you need normalized values press 'b' : a
Scores: [38.46153846153847, 30.76923076923077, 76.92307692307693, 61.53846153846154, 30.76923076923077, 69.23076923076923, 46.15384615384615, 46.15384615384615, 53.84615384615385, 69.23076923076923]
Mean Accuracy: 52.308%


CARS DATASET

In [11]:
car_knn = KNN()

seed(10)
filename = 'Datasets/car.data'
car_df = car_knn.load_csv(filename)
car_df.columns = ["buying","maint","doors","persons","lug_boot","safety","Class"]
car_df.Class.replace(('unacc','acc','good','vgood'),(0,1,2,3), inplace=True)
car_df.buying.replace(('vhigh','high','med','low'),(0,1,2,3), inplace=True)
car_df.maint.replace(('vhigh','high','med','low'),(0,1,2,3), inplace=True)
car_df.lug_boot.replace(('small','med','big'),(1,2,3), inplace=True)
car_df.safety.replace(('low','med','high'),(1,2,3), inplace=True)
car_df.persons.replace(('2','4','more'),(2,4,5), inplace=True)
car_df.doors.replace(('2','3','4','5more'),(2,3,4,5), inplace=True)
car_df.to_csv(r'C:\Users\jonat\Projects\Python\KNN-Algorithm(Final)\NewCar.csv', index = False)
car_dataset = car_df.values.tolist()
car_minmax = car_knn.dataset_minmax(car_dataset)
normalized_car_dataset = car_knn.normalize_dataset(car_dataset,car_minmax)
n_folds = int(input("Please provide fold number:" ))
num_neighbors = int(input("Please provide number of neighbors:" ))
options = input("If normal values to be viewed press 'a' or if you need normalized values press 'b' : ")
if options == 'a':
    scores = car_knn.evaluateAlgorithm(car_dataset, car_knn.kNN_Algorithm, n_folds, num_neighbors)
elif options == 'b':
    scores = car_knn.evaluateAlgorithm(normalized_car_dataset, car_knn.kNN_Algorithm, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Please provide fold number:10
Please provide number of neighbors:5
If normal values to be viewed press 'a' or if you need normalized values press 'b' : a
Scores: [69.18604651162791, 69.18604651162791, 63.372093023255815, 65.11627906976744, 63.95348837209303, 59.883720930232556, 66.86046511627907, 67.44186046511628, 65.11627906976744, 67.44186046511628]
Mean Accuracy: 65.756%


BREAST-CANCER DATASET

In [12]:
cancer_knn = KNN()

seed(6)
filename = 'Datasets/breast-cancer.data'
cancer_df = cancer_knn.load_csv(filename)
cancer_df.columns = ["Class","age","menopause","tumor","inv","node","degMalig","breast","breastQuad","irradiat"]
cols = list(cancer_df.columns.values)
cancer_df = cancer_df[cols[1:] + [cols[0]]]
cancer_df.Class.replace(('no-recurrence-events','recurrence-events'),(1,2), inplace=True)
cancer_df.age.replace(('10-19','20-29','30-39','40-49','50-59','60-69','70-79','80-89','90-99'),(1,2,3,4,5,6,7,8,9), inplace=True)
cancer_df.menopause.replace(('lt40','ge40','premeno'),(1,2,3), inplace=True)
cancer_df.tumor.replace(('0-4','5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59'),(1,2,3,4,5,6,7,8,9,10,11,12), inplace=True)
cancer_df.inv.replace(('0-2','3-5','6-8','9-11','12-14','15-17','18-20','21-23','24-26','27-29','30-32','33-35','36-39'),(1,2,3,4,5,6,7,8,9,10,11,12,13), inplace=True)
cancer_df.node.replace(('yes','no','?'),(1,2,3), inplace=True)
cancer_df.breast.replace(('left','right'),(1,2), inplace=True)
cancer_df.breastQuad.replace(('left_up','left_low','right_up','right_low','central','?'),(1,2,3,4,5,6), inplace=True)
cancer_df.irradiat.replace(('yes','no','?'),(1,2,3), inplace=True)
cancer_dataset = cancer_df.values.tolist()
cancer_minmax = cancer_knn.dataset_minmax(cancer_dataset)
normalized_cancer_dataset = cancer_knn.normalize_dataset(cancer_dataset,cancer_minmax)
n_folds = int(input("Please provide fold number:" ))
num_neighbors = int(input("Please provide number of neighbors:" ))
options = input("If normal values to be viewed press 'a' or if you need normalized values press 'b' : ")
if options == 'a':
    scores = cancer_knn.evaluateAlgorithm(cancer_dataset, cancer_knn.kNN_Algorithm, n_folds, num_neighbors)
elif options == 'b':
    scores = cancer_knn.evaluateAlgorithm(normalized_cancer_dataset, cancer_knn.kNN_Algorithm, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Please provide fold number:10
Please provide number of neighbors:5
If normal values to be viewed press 'a' or if you need normalized values press 'b' : a
Scores: [75.0, 75.0, 57.14285714285714, 39.285714285714285, 67.85714285714286, 64.28571428571429, 75.0, 71.42857142857143, 71.42857142857143, 82.14285714285714]
Mean Accuracy: 67.857%
