In [None]:
import pandas as pd
import numpy as np
from math import sqrt
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:

# Load the dataset from the first file that exists among the supported formats.
filename = 'data'
file_extensions = {'xlsx': pd.read_excel, 'csv': pd.read_csv, 'xls': pd.read_excel}
df = None

# Extensions are tried in the dict's insertion order; the first hit wins.
for ext, func in file_extensions.items():
    try:
        df = func(f'{filename}.{ext}')
        break
    except FileNotFoundError:
        continue
else:
    # No candidate file exists: fail here with a clear message instead of
    # leaving df as None and crashing later on the first df[...] access.
    tried = ', '.join(f'{filename}.{candidate}' for candidate in file_extensions)
    raise FileNotFoundError(f'No data file found; tried: {tried}')

# Original : df = pd.read_csv ('data.csv')

In [None]:
# Preview the first five rows of the loaded data.
df.head(5)

In [None]:
# Display the DataFrame as loaded.
df

In [None]:
# Set values to numerical categories.
# Row label -> code written into 'Name':
#   0 -> 1 (NY Garden), 1 -> 2 (Brooklyn Garden),
#   2 -> 3 (Queens Garden), 3 -> 4 (Snug Garden)
for row_label, garden_code in ((0, 1), (1, 2), (2, 3), (3, 4)):
    df.loc[row_label, ['Name']] = [garden_code]

"""
# Original 
#Iris-Setosa = 1
df.loc [22:33,['Class']] = [1]

#Iris-Versicolor = 2
df.loc [0:21,['Class']] = [2]

#Iris-Virginica = 3
df.loc [34:48,['Class']] = [3]
"""

df

In [None]:
# Rename columns to the short names used in the rest of the notebook.
# All five renames are applied in one in-place call; names that are not
# present in the frame are ignored by rename's default behaviour.
df.rename(
    columns={
        'LATITUDE': 'LT',
        'LONGITUDE': 'LO',
        'My Latitude': 'm_LT',
        'My Longitude': 'm_LO',
        'NAME': 'Garden Name',
    },
    inplace=True,
)


In [None]:
# Show the first 72 rows after renaming.
df.iloc[0:72]

In [None]:
# Re-arrange the columns: the 'Garden Name' column first, then the four
# renamed coordinate columns.
df = df[['Garden Name'] + ['LT', 'LO', 'm_LT', 'm_LO']]

In [None]:
# Show the first 72 rows after re-ordering the columns.
df.head(72)

In [None]:
#Data Splitting
# iloc slices are half-open, so each stop is one past the last wanted row.
# Together the two slices cover rows 0..71 with no row left out.
train = df.iloc[0:37]  # rows 0..36: NY and BRK for training
test = df.iloc[37:72]  # rows 37..71: Q and SNG for test

# Inclusive row ranges for data1:
# train = 0:36
# test = 37:71

In [None]:
def euclidean_distance(row1, row2, columns, label):
    """Return the Euclidean (L2) distance between two rows.

    Parameters
    ----------
    row1, row2 : objects indexable by column name
        The two instances to compare.
    columns : iterable
        Column names to consider.
    label : hashable
        Name of the label column; it is skipped so only features contribute.

    Returns
    -------
    float
        Square root of the summed squared feature differences.
    """
    squared_diffs = (
        (row1[name] - row2[name]) ** 2 for name in columns if name != label
    )
    # Start from 0.0 so the accumulator is a float even with no features.
    return sqrt(sum(squared_diffs, 0.0))


In [None]:

def manhattan_distance(row1, row2, columns, label):
    """Return the Manhattan (L1) distance between two rows.

    Parameters
    ----------
    row1, row2 : objects indexable by column name
        The two instances to compare.
    columns : iterable
        Column names to consider.
    label : hashable
        Name of the label column; it is excluded from the sum.

    Returns
    -------
    Sum of the absolute feature differences, accumulated from 0.0.
    """
    absolute_diffs = (
        abs(row1[name] - row2[name]) for name in columns if name != label
    )
    return sum(absolute_diffs, 0.0)


In [None]:

def minkowski_distance(row1, row2, columns, label, p):
    """Return the Minkowski distance of order ``p`` between two rows.

    Parameters
    ----------
    row1, row2 : objects indexable by column name
        The two instances to compare.
    columns : iterable
        Column names to consider.
    label : hashable
        Name of the label column; it is excluded from the sum.
    p : number
        Order of the distance; the result is the sum of ``|diff| ** p``
        raised to ``1 / p``.

    Returns
    -------
    The p-th root of the summed p-th powers of the absolute differences.
    """
    powered_total = sum(
        abs(row1[name] - row2[name]) ** p for name in columns if name != label
    )
    return powered_total ** (1 / p)


In [None]:
def KNN(train, test_row, k, label, distance=None):
    """Predict the label of ``test_row`` by majority vote of its k nearest rows.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain the ``label`` column. All column names are
        handed to the distance function, which is expected to skip ``label``.
    test_row : pandas.Series or other object indexable by column name
        The instance to classify.
    k : int
        Number of neighbors that vote; must be at least 1. Values larger than
        ``len(train)`` use every training row.
    label : hashable
        Name of the label column.
    distance : callable, optional
        ``distance(row1, row2, columns, label) -> number``. Defaults to
        ``euclidean_distance``. Pass ``manhattan_distance`` for Manhattan, or
        ``lambda a, b, cols, lab: minkowski_distance(a, b, cols, lab, p)``
        for Minkowski of order ``p``.

    Returns
    -------
    The most frequent label among the k nearest rows. A tie in the vote goes
    to the tied label whose first occurrence is closest to ``test_row``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``train`` has no rows.
    """
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')
    if len(train) == 0:
        raise ValueError('train must contain at least one row')
    if distance is None:
        distance = euclidean_distance

    # Distance from every training row to the single test instance.
    feature_columns = train.columns
    distances = train.apply(
        lambda row: distance(row, test_row, feature_columns, label), axis=1
    )

    # Rank by row *position* rather than index label: this keeps exactly k
    # rows even when index labels repeat, and keeps them in nearest-first
    # order. mergesort is stable, so equal distances keep their row order.
    ranked_positions = (
        distances.reset_index(drop=True)
        .sort_values(kind='mergesort')
        .index[:k]
        .to_numpy()
    )

    # Counter preserves first-seen order for equal counts, so with labels
    # listed nearest-first a tied vote resolves toward the closest neighbor.
    neighbor_labels = train[label].iloc[ranked_positions].tolist()
    return Counter(neighbor_labels).most_common(1)[0][0]

In [None]:
#Varying K

pred = []
actual = []
scores = []  # one accuracy value per k, in the order the k values are tried

#For Checking Values
pred_values = []
actual_values = []

# NOTE(review): train=df means the rows being classified are also part of the
# training data (test is a slice of df), so each test row matches itself at
# distance 0 and accuracy is inflated. Confirm whether the `train` split was
# meant to be passed here instead.
for k in range(1, 3):
    for i in range(test.shape[0]):
        test_row = test.iloc[i]  # .iloc selects the row by position
        # Classify once and reuse the result for both bookkeeping lists.
        predicted = KNN(train=df, test_row=test_row, k=k, label='Garden Name')
        observed = test.iloc[i, 0]  # column 0 is 'Garden Name' after the re-arrange step

        pred.append(predicted)
        actual.append(observed)
        pred_values.append(predicted)
        actual_values.append(observed)

    # Score each k once over the whole test set (skipped if the test set is empty).
    if actual:
        scores.append(accuracy_score(actual, pred))

    # Start the next k with empty per-k lists.
    pred = []
    actual = []

In [None]:
# Print the raw list of accuracy values collected by the evaluation loop.
print(scores)

In [None]:
# accuracy_score returns a fraction in [0, 1]; scale by 100 so the printed
# number agrees with the '%' sign that follows it.
print('Mean Accuracy: %.2f%%' % (100 * sum(scores) / float(len(scores))))

In [None]:
# Draw the collected scores on the current axes and label the figure.
ax = plt.gca()
ax.plot(scores)
ax.set_title('Accuracy score of different k neighbors')
ax.set_xlabel('k neighbors')
ax.set_ylabel('accuracy score')