# **Importing Libraries**

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# **Loading Data**

In [0]:
df = pd.read_csv("irisdata.csv")

## Enter the fraction of dataset to be used for testing (evaluation) 

In [0]:
test_frac = 0.3

In [0]:
p = 0.35
train, test = train_test_split(df.to_numpy(), test_size = test_frac)
train_data = train[:,0:4]
train_class = train[:,4]
test_data = test[:,0:4]
test_class = test[:,4]

# **Functions Definitions**

In [0]:
def euclideanDistance(x,y):
  distance = 0
  for i in range(x.size):
    distance += (x[i] - y[i])**2
  return np.sqrt(distance)

def first_comes_first(arr,x):
    arr[1:] = arr[0:-1]
    arr[0] = x
    return arr

def kNN(test_data,train_data,train_class,k):
  test_pred = np.zeros(test_data.shape[0])
  for i in range(test_data.shape[0]):
    dist_arr = np.asarray(np.zeros(k))
    nearest = np.asarray(np.zeros(k))
    ret_arr = np.array([])
    for j in range(k):
      dist_arr[j] = euclideanDistance(train_data[j],test_data[i])
      nearest[j]=j

    for j in range(train_data.shape[0]):
      temp_dist = euclideanDistance(test_data[i],train_data[j])
      if(temp_dist <= np.amin(dist_arr)):
        dist_arr = first_comes_first(dist_arr,temp_dist)
        nearest = first_comes_first(nearest,j)

    for j in (nearest):
      ret_arr = np.append(ret_arr,train_class[int(j)])
    ret_arr = np.array([int(q) for q in ret_arr])
    test_pred[i] = np.argmax(np.bincount(ret_arr))

  return test_pred

def getAccuracy(testSet, predictions):
  correct = 0
  testSet = np.array(testSet)
  for x in range(testSet.shape[0]):
    if (testSet[x] == predictions[x]):
      correct += 1
  return (correct/float(testSet.shape[0])) * 100.0

## Enter k value to be used in k-NN algorithm

In [0]:
k = 3

In [11]:
test_pred = kNN(test_data, train_data, train_class, 3)
acc = getAccuracy(test_class, test_pred)
print(acc)

95.55555555555556


## **Evaluating Results for k = 3,5**

In [12]:
k = np.array([3,5])
acc_arr = np.zeros(k.size)
for i in range(k.size):
  test_pred = kNN(test_data, train_data, train_class, k[i])
  acc_arr[i] = getAccuracy(test_pred, test_class)
  print("Accuracy for k={} = {}".format(k[i],acc_arr[i]))


Accuracy for k=3 = 95.55555555555556
Accuracy for k=5 = 95.55555555555556


# **Inferences**

1. Knn classifier can use k number of neighbors. Higher value of k leads to inclination towards most probable case (we can call prior classifier) which can sometimes lead to poor performance. Lower value of k causes unstable decision boundary for which classifier accuracy can drastically change.So it's best to sweep k for a significant range and see what is the optimal accuracy we can get.

2. Here we tried with K=3 and k=5; the accuracy is comparable and not significantly different in case of k = 3 and k = 5. 

3. It is observed that the initial seeding does have influence on the final accuracy. Lesser number of test data points on boundary better is the accuracy and vice-versa.

4. Thus, by k-NN method we can separate classes which are not linearly separable.