In [1]:
# import required dependencies
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score

from collections import Counter

In [2]:
# load the iris dataset through sklearn.datasets
iris = datasets.load_iris()
# np.c_ joins its operands column-wise, so the target becomes the last column
iris_values = np.c_[iris['data'], iris['target']]
iris_columns = iris['feature_names'] + ['target']
iris_df = pd.DataFrame(data=iris_values, columns=iris_columns)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [3]:
# separate the feature columns (x) from the label column (y);
# 'target' is the last column of iris_df
x = iris_df.drop(columns=['target'])
y = iris_df['target']

In [4]:
# hold out 30% of the samples for testing (7:3 train/test ratio, according to
# the homework instruction) and turn every resulting split into a NumPy array
x_train, x_test, y_train, y_test = (
    np.asarray(split)
    for split in train_test_split(x, y,
                                  test_size=0.3,
                                  shuffle=True,
                                  random_state=0)
)

In [5]:
# report how many samples ended up in each split
n_train_rows, n_test_rows = x_train.shape[0], x_test.shape[0]
print(f'training size: {n_train_rows} \ntest size: {n_test_rows} ')

training size: 105 
test size: 45 


In [6]:
# normalize both splits with a Normalizer that is fitted on the training data only
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm
# rather than scaling each feature column -- confirm this is the intended
# preprocessing for the distance computation below
scaler = Normalizer()
scaler.fit(x_train)
normalized_x_train, normalized_x_test = (
    scaler.transform(split) for split in (x_train, x_test)
)

In [7]:
# implement the KNN algorithm
# step one: compute the Euclidean distances
def distance_ecu(x_train, x_test_point):
    """Compute the Euclidean distance from one test point to every training row.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Training points, one per row.
    x_test_point : array-like of shape (n_features,)
        The single query point.

    Returns
    -------
    pandas.DataFrame
        A frame with one column named 'dist'. Row ``i`` holds the distance
        between ``x_train[i]`` and ``x_test_point``, so the index is the row
        position inside ``x_train``.

    Raises
    ------
    ValueError
        If ``x_train`` is not 2-D, ``x_test_point`` is not 1-D, or their
        feature counts differ.
    """
    train = np.asarray(x_train, dtype=float)
    point = np.asarray(x_test_point, dtype=float)

    # An empty training set has no rows to measure against.
    if train.ndim >= 1 and train.shape[0] == 0:
        return pd.DataFrame({'dist': np.empty(0, dtype=float)})

    if train.ndim != 2 or point.ndim != 1 or train.shape[1] != point.shape[0]:
        raise ValueError(
            'x_train must be 2-D and x_test_point 1-D with the same number '
            f'of features; got shapes {train.shape} and {point.shape}'
        )

    # Broadcasting subtracts the query point from every training row at once,
    # replacing the nested Python loops over rows and columns.
    squared_diff = (train - point) ** 2
    distances = np.sqrt(squared_diff.sum(axis=1))

    return pd.DataFrame(data=distances, columns=['dist'])

In [8]:
# implement the KNN algorithm
# step two: find the nearest neighbors
def nearest_neighbors(distance_point, K):
    """Return the K rows of ``distance_point`` with the smallest 'dist' values.

    The rows come back in ascending distance order and keep their original
    index labels, so the caller can use the index to look up the matching
    training samples.
    """
    ranked = distance_point.sort_values(by='dist', axis=0)
    return ranked[:K]

In [9]:
# implement the KNN algorithm
# step three: majority vote
def voting(df_nearest, y_train):
    """Pick the majority label among the nearest neighbours.

    Parameters
    ----------
    df_nearest : pandas.DataFrame
        Nearest-neighbour rows. Its index holds the row positions of the
        neighbours inside the training data.
    y_train : array-like
        Training labels; position ``i`` is the label of training row ``i``.

    Returns
    -------
    The label that occurs most often among the neighbours. When several
    labels share the top count, the one encountered first in ``df_nearest``
    order wins.

    Raises
    ------
    IndexError
        If ``df_nearest`` contains no rows.
    """
    # The index of df_nearest stores *positions*. Converting y_train to a
    # plain array forces positional lookup; a pandas Series with a
    # non-default index would otherwise be looked up by label and return the
    # wrong rows or raise KeyError.
    neighbour_labels = np.asarray(y_train)[df_nearest.index.to_numpy()]
    counter_vote = Counter(neighbour_labels)
    return counter_vote.most_common(1)[0][0]

In [10]:
def KNN(x_train, y_train, x_test, K):
    """Classify every point in ``x_test`` with a K-nearest-neighbours vote.

    For each test point the distances to all training rows are computed, the
    K closest rows are selected, and the majority label among them becomes
    the prediction.

    Returns a list with one predicted label per test point, in the order of
    ``x_test``.
    """
    def classify(query_point):
        dists = distance_ecu(x_train, query_point)
        neighbours = nearest_neighbors(dists, K)
        return voting(neighbours, y_train)

    return [classify(query_point) for query_point in x_test]

In [11]:
# classify the normalized test split with K nearest neighbours and show the
# predicted labels
K = 7
y_pred_scratch = KNN(x_train=normalized_x_train,
                     y_train=y_train,
                     x_test=normalized_x_test,
                     K=K)
print(y_pred_scratch)

[2.0, 1.0, 0.0, 2.0, 0.0, 2.0, 0.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 2.0, 2.0, 0.0, 2.0, 2.0, 1.0, 0.0, 2.0, 1.0, 1.0, 2.0, 0.0, 2.0, 0.0, 0.0]


In [12]:
# repeat the prediction 10 times with k=7 and report the average accuracy
# NOTE(review): KNN() has no random component and the train/test split is
# fixed (random_state=0), so every repetition sees the same inputs; re-split
# the data inside the loop if the average is meant to cover different
# train/test partitions.
k = 7
n_runs = 10
total_acc = 0
for i in range(n_runs):
    # use the k defined in this cell rather than K left over from an earlier cell
    y_pred_scratch = KNN(normalized_x_train, y_train, normalized_x_test, k)
    acc = accuracy_score(y_test, y_pred_scratch)
    total_acc += acc

avg_acc = total_acc / n_runs

print('avg_acc:', avg_acc)

avg_acc: 0.9333333333333333
