In [None]:
import pandas as pd
import numpy as np
from math import sqrt
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Path is relative to the notebook's working directory.
DATA_PATH = 'data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first five rows.
df.head()

In [None]:
# Display the whole DataFrame as this cell's output.
df

In [None]:
#Set values to numerical categorical

# Integer code -> inclusive (.loc) row range that receives it.
# Assignments run in the order listed here.
species_row_ranges = {
    1: (22, 33),  # Iris-Setosa
    2: (0, 21),   # Iris-Versicolor
    3: (34, 48),  # Iris-Virginica
}

for species_code, (first_row, last_row) in species_row_ranges.items():
    df.loc[first_row:last_row, ['Class']] = [species_code]

In [None]:
#Rename Columns

# Existing column name -> short name, applied in a single in-place rename.
column_names = {
    'Sepal Length (cm)': 'sepal_len',
    'Sepal Width (cm)': 'sepal_wid',
    'Petal Length (cm)': 'petal_len',
    'Petal Width (cm)': 'petal_wid',
    'Class': '_species_type',
}
df.rename(columns=column_names, inplace=True)


In [None]:
# First five rows, to check the new column names.
df.head()

In [None]:
#Re-arrange the columns:

# Label first, then the four measurements; any column not listed is dropped.
column_order = [
    '_species_type',
    'sepal_len',
    'sepal_wid',
    'petal_len',
    'petal_wid',
]
df = df[column_order]

In [None]:
# Display the DataFrame after the column re-ordering.
df

In [None]:
#Data Splitting
# The previous slices (rows 0:30 for train, 20:30 for test) put every test row
# inside the training set, and because the labels are assigned by contiguous
# row ranges a contiguous slice also misses whole classes. Shuffle once with a
# fixed seed and hold out a disjoint test set instead.
TEST_SIZE = 10    # number of held-out rows (same size as the previous test slice)
SPLIT_SEED = 42   # fixed so Restart & Run All reproduces the same split

shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
test = shuffled.iloc[:TEST_SIZE]    # first TEST_SIZE shuffled rows
train = shuffled.iloc[TEST_SIZE:]   # everything else; shares no row with `test`

In [None]:
def euclidean_distance(row1, row2, columns, label):
    """Return the Euclidean distance between two rows over the feature columns.

    Args:
        row1, row2: Mappings (e.g. pandas Series) indexable by column name.
        columns: Iterable of column names to consider.
        label: Name of the target column; it is excluded from the distance.

    Returns:
        Square root of the summed squared differences (0.0 if no features).
    """
    squared_total = 0.0

    for name in columns:
        # The label is not a feature, so it never contributes.
        if name == label:
            continue
        delta = row1[name] - row2[name]
        squared_total += delta ** 2

    return sqrt(squared_total)

def manhattan_distance(row1, row2, columns, label):
    """Return the Manhattan (L1) distance between two rows over the feature columns.

    Args:
        row1, row2: Mappings (e.g. pandas Series) indexable by column name.
        columns: Iterable of column names to consider.
        label: Name of the target column; it is excluded from the distance.

    Returns:
        Sum of absolute per-column differences (0.0 if no features).
    """
    total = 0.0

    for name in columns:
        # The label is not a feature, so it never contributes.
        if name == label:
            continue
        total += abs(row1[name] - row2[name])

    return total

def minkowski_distance(point1, point2, p):
    """Return the Minkowski distance of order ``p`` between two points.

    Args:
        point1, point2: Equal-length sequences of numbers, indexable by
            position 0..n-1.
        p: Order of the distance; must be greater than 0. ``p=1`` gives the
            Manhattan distance, ``p=2`` the Euclidean distance, and
            ``p=float('inf')`` the Chebyshev (maximum-coordinate) distance.

    Returns:
        The distance as a number (0 for two empty points).

    Raises:
        ValueError: If the points differ in length, or ``p`` is not > 0
            (including NaN).
    """
    if len(point1) != len(point2):
        raise ValueError("Points must have the same number of elements/dimensions")

    # `not p > 0` rather than `p <= 0`: NaN compares False both ways and would
    # otherwise slip through and turn the result into NaN.
    if not p > 0:
        raise ValueError("Parameter p must be greater than 0")

    gaps = [abs(point1[i] - point2[i]) for i in range(len(point1))]

    if p == float('inf'):
        # The general formula breaks down here (1 / p == 0.0, and x ** 0.0 is
        # always 1.0); the limit of the Minkowski distance is the largest gap.
        return float(max(gaps, default=0.0))

    distance = 0
    for gap in gaps:
        distance += gap ** p

    return distance ** (1 / p)


In [None]:
def KNN(train, test_row, k, label, distance=None):
    """Predict the label of ``test_row`` by majority vote of its k nearest training rows.

    Args:
        train: DataFrame holding the feature columns plus the ``label`` column.
        test_row: Row (e.g. pandas Series) indexable by the same column names.
        k: Number of neighbours that vote; must be at least 1.
        label: Name of the target column.
        distance: Optional callable ``distance(row1, row2, columns, label)``
            returning a number. Defaults to ``manhattan_distance``; pass
            ``euclidean_distance`` to switch metric.

    Returns:
        The most common label among the k nearest rows. A tie in the vote is
        won by the label whose first voter is closest to ``test_row``.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``train`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(train) == 0:
        raise ValueError("train must contain at least one row")
    if distance is None:
        # Looked up at call time so this definition does not depend on the
        # order in which the notebook cells were executed.
        distance = manhattan_distance

    temp = train.copy()

    #Calculate distance for each instance in train to single test instance
    temp['dist'] = temp.apply(lambda row: distance(row, test_row, train.columns, label), axis=1)

    #Getting the k neighbors having minimum distances
    # Sort the rows themselves (not just their index labels) so that
    #  - the neighbours stay ordered nearest-first, and
    #  - duplicate index labels cannot pull in more than k rows.
    # A stable sort keeps equal distances in their original frame order.
    k_neighbors = temp.sort_values('dist', kind='stable').head(k)

    #Getting the majority label from the k neighbors
    # Counter.most_common() orders equal counts by first insertion, so with the
    # labels listed nearest-first a tied vote goes to the closest neighbour.
    k_neighbors_labels = list(k_neighbors[label])
    count_labels = Counter(k_neighbors_labels)
    predicted_label = count_labels.most_common()[0][0]
    return predicted_label

In [None]:
#Varying K

K_VALUES = range(1, 3)      # neighbour counts to evaluate (k = 1, 2)
LABEL = '_species_type'

scores = []         # one accuracy per k, in K_VALUES order

#For Checking Values
pred_values = []    # every prediction, grouped by k, then by test row
actual_values = []  # the matching true labels

for k in K_VALUES:
    # Fit on `train` only: passing the full `df` (as before) always includes
    # the rows being predicted, so each one finds itself at distance 0.
    # KNN is called once per test row and the result reused for both lists.
    pred = [
        KNN(train=train, test_row=test.iloc[i], k=k, label=LABEL)
        for i in range(test.shape[0])
    ]
    actual = list(test[LABEL])

    # Score the whole test set at once; scoring after every single prediction
    # only ever yields 0.0 or 1.0 and says nothing about a given k.
    scores.append(accuracy_score(actual, pred))

    pred_values.extend(pred)
    actual_values.extend(actual)

In [None]:
# Print the raw list of accuracy values collected by the evaluation loop.
print(scores)

In [None]:
# accuracy_score yields a fraction between 0 and 1; scale by 100 so the number
# agrees with the '%' sign in the message (0.95 previously printed as "0.95%").
print('Mean Accuracy: %.2f%%' % (100 * sum(scores) / float(len(scores))))

In [None]:
# Line plot of the collected accuracy values (x axis is the position in `scores`).
fig, ax = plt.subplots()
ax.plot(scores)
ax.set_title('Accuracy score of different k neighbors')
ax.set_xlabel('k neighbors')
ax.set_ylabel('accuracy score')