In [1]:
# import required dependency
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score

from collections import Counter

In [82]:
# Load the iris dataset
iris = datasets.load_iris()
# np.c_ joins its operands column-wise, so the target vector is appended
# to the feature matrix as the last column
iris_df = pd.DataFrame(
    data=np.c_[iris['data'], iris['target']],
    columns=iris['feature_names'] + ['target'],
)

# Features are every column but the last; the label is the last column
x = iris_df.iloc[:, :-1]
y = iris_df.iloc[:, -1]

# Target ratio is 6:2:2 (train : validation : test)
# Step 1: hold out 20% of all rows as the test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Step 2: take 25% of the remaining 80% (20% of the whole) as the validation split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=1)

# Convert every split to a plain numpy array
x_train, y_train = np.asarray(x_train), np.asarray(y_train)
x_test, y_test = np.asarray(x_test), np.asarray(y_test)
x_val, y_val = np.asarray(x_val), np.asarray(y_val)

# check the size ratio
print(f'training size: {x_train.shape[0]} \nvalidation size:{x_val.shape[0]} \ntest size: {x_test.shape[0]} ')

training size: 90 
validation size:30 
test size: 30 


In [62]:
# Fit the scaler on the training split only, then apply it to every split.
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm
# rather than standardising each feature (column) -- confirm this is intended.
scaler = Normalizer().fit(x_train)
normalized_x_train, normalized_x_test, normalized_x_val = (
    scaler.transform(split) for split in (x_train, x_test, x_val)
)

def distance_ecu(x_train, x_test_point):
    """Compute the Euclidean distance from one query point to every training row.

    Parameters
    ----------
    x_train : array-like of shape (n_rows, n_features)
        Training samples, one per row.
    x_test_point : array-like of shape (>= n_features,)
        The query point. Only its first ``n_features`` entries are used.

    Returns
    -------
    pandas.DataFrame
        A single column named ``'dist'``; row ``i`` holds the distance between
        ``x_train[i]`` and the query point. The default integer index is kept,
        so each index value is the position of the row in ``x_train``.

    Raises
    ------
    ValueError
        If ``x_train`` is non-empty but not 2-D, or the query point is not 1-D.
    IndexError
        If the query point has fewer entries than ``x_train`` has features.
    """
    # Nothing to measure against: return an empty frame with the expected column.
    if len(x_train) == 0:
        return pd.DataFrame(data=[], columns=['dist'])

    # Work in float so integer (especially unsigned) inputs cannot wrap around
    # when the difference is negative.
    train = np.asarray(x_train, dtype=float)
    point = np.asarray(x_test_point, dtype=float)

    if train.ndim != 2:
        raise ValueError('x_train must be 2-D (n_rows, n_features)')
    if point.ndim != 1:
        raise ValueError('x_test_point must be 1-D')

    n_features = train.shape[1]
    # Checked explicitly: a length-1 point would otherwise broadcast silently.
    if point.shape[0] < n_features:
        raise IndexError('x_test_point has fewer entries than x_train has features')

    # Broadcast the point across all rows and reduce along the feature axis.
    diff = train - point[:n_features]
    dist = np.sqrt(np.sum(diff ** 2, axis=1))

    return pd.DataFrame(data=dist, columns=['dist'])

def nearest_neighbors(distance_point, K):
    """Return the K rows with the smallest distance.

    Parameters
    ----------
    distance_point : pandas.DataFrame
        Frame with a ``'dist'`` column, as produced by ``distance_ecu``.
    K : int
        Number of neighbours to keep.

    Returns
    -------
    pandas.DataFrame
        The first ``K`` rows of ``distance_point`` after sorting by ``'dist'``
        in ascending order. The original index labels are preserved, so they
        still identify the corresponding training rows.
    """
    # A stable sort keeps rows with equal distance in their original order,
    # so which tied neighbours survive the cut does not depend on the
    # internals of the default (unstable) sort algorithm.
    ranked = distance_point.sort_values(by=['dist'], axis=0, kind='mergesort')
    # Explicit positional slice of the K closest rows.
    return ranked.iloc[:K]

def voting(df_nearest, y_train):
    """Pick the majority label among the selected neighbours.

    Parameters
    ----------
    df_nearest : pandas.DataFrame
        Rows of the nearest neighbours; its index values are used as
        positions into ``y_train``.
    y_train : array-like
        Training labels, ordered like the training rows.

    Returns
    -------
    The most frequent label among the neighbours. On a tie, the label that
    was encountered first (i.e. the one belonging to the earliest row of
    ``df_nearest``) wins.
    """
    # Coerce to ndarray so the lookup is always positional; indexing a pandas
    # Series with these values would be label-based and could pick the wrong
    # rows (or raise KeyError) when the Series index is not 0..n-1.
    labels = np.asarray(y_train)[df_nearest.index.to_numpy()]
    counter_vote = Counter(labels)
    y_pred = counter_vote.most_common()[0][0]
    return y_pred

def KNN(x_train, y_train, x_test, K):
    """Predict a label for every query point using K-nearest-neighbours.

    For each point in ``x_test`` this computes the distances to all rows of
    ``x_train``, keeps the ``K`` closest rows, and takes a majority vote over
    their labels in ``y_train``.

    Returns a list with one predicted label per query point, in input order.
    """
    return [
        voting(
            nearest_neighbors(distance_ecu(x_train, query_point), K),
            y_train,
        )
        for query_point in x_test
    ]

In [87]:
# Try every K from 3 through 11, scoring predictions on the test split
for k in range(3, 12):
    y_pred_scratch = KNN(normalized_x_train, y_train, normalized_x_test, k)
    # The predictions are fixed for a given k, so the score is computed once
    acc = accuracy_score(y_test, y_pred_scratch)

    # accumulate the score over 10 iterations and report the average
    total_acc = 0
    for j in range(10):
        total_acc = total_acc + acc
    avg_acc = total_acc / 10
    print(f'avg_acc: {k}', avg_acc)

30
avg_acc: 3 0.9333333333333333
30
avg_acc: 4 0.9333333333333333
30
avg_acc: 5 0.9666666666666666
30
avg_acc: 6 0.9333333333333333
30
avg_acc: 7 0.9666666666666666
30
avg_acc: 8 1.0
30
avg_acc: 9 0.9666666666666666
30
avg_acc: 10 0.9666666666666666
30
avg_acc: 11 0.9666666666666666


In [83]:
# Display the raw (un-normalised) validation features for inspection
x_val

array([[5.5, 2.4, 3.7, 1. ],
       [4.4, 3. , 1.3, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [6.3, 2.5, 4.9, 1.5],
       [5. , 3.5, 1.6, 0.6],
       [6.5, 3. , 5.5, 1.8],
       [5.7, 2.5, 5. , 2. ],
       [6.8, 3.2, 5.9, 2.3],
       [6.3, 3.3, 6. , 2.5],
       [4.9, 3. , 1.4, 0.2],
       [5.1, 3.5, 1.4, 0.2],
       [4.9, 2.4, 3.3, 1. ],
       [5.6, 2.5, 3.9, 1.1],
       [4.3, 3. , 1.1, 0.1],
       [7.2, 3. , 5.8, 1.6],
       [6.4, 2.7, 5.3, 1.9],
       [6.2, 2.2, 4.5, 1.5],
       [5. , 3.4, 1.5, 0.2],
       [6.2, 3.4, 5.4, 2.3],
       [7.4, 2.8, 6.1, 1.9],
       [6.1, 2.8, 4. , 1.3],
       [5. , 3.6, 1.4, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [5.8, 2.7, 5.1, 1.9],
       [5.5, 2.4, 3.8, 1.1],
       [5. , 2. , 3.5, 1. ],
       [6.5, 3. , 5.8, 2.2],
       [6. , 2.2, 4. , 1. ],
       [6.4, 3.2, 5.3, 2.3],
       [5.7, 2.6, 3.5, 1. ]])

In [84]:
# Display the raw (un-normalised) test features for inspection
x_test

array([[5.8, 4. , 1.2, 0.2],
       [5.1, 2.5, 3. , 1.1],
       [6.6, 3. , 4.4, 1.4],
       [5.4, 3.9, 1.3, 0.4],
       [7.9, 3.8, 6.4, 2. ],
       [6.3, 3.3, 4.7, 1.6],
       [6.9, 3.1, 5.1, 2.3],
       [5.1, 3.8, 1.9, 0.4],
       [4.7, 3.2, 1.6, 0.2],
       [6.9, 3.2, 5.7, 2.3],
       [5.6, 2.7, 4.2, 1.3],
       [5.4, 3.9, 1.7, 0.4],
       [7.1, 3. , 5.9, 2.1],
       [6.4, 3.2, 4.5, 1.5],
       [6. , 2.9, 4.5, 1.5],
       [4.4, 3.2, 1.3, 0.2],
       [5.8, 2.6, 4. , 1.2],
       [5.6, 3. , 4.5, 1.5],
       [5.4, 3.4, 1.5, 0.4],
       [5. , 3.2, 1.2, 0.2],
       [5.5, 2.6, 4.4, 1.2],
       [5.4, 3. , 4.5, 1.5],
       [6.7, 3. , 5. , 1.7],
       [5. , 3.5, 1.3, 0.3],
       [7.2, 3.2, 6. , 1.8],
       [5.7, 2.8, 4.1, 1.3],
       [5.5, 4.2, 1.4, 0.2],
       [5.1, 3.8, 1.5, 0.3],
       [6.1, 2.8, 4.7, 1.2],
       [6.3, 2.5, 5. , 1.9]])

In [100]:
# select k = 8
# iterate 10 times and calculate the average accuracy

# Fit on the training split and predict the validation split. The predictions
# must be scored against y_val: the previous version fitted on the test split
# and compared validation predictions with y_test, which measures nothing.
# KNN is deterministic here, so the prediction is computed once outside the loop.
y_pred_scratch = KNN(normalized_x_train, y_train, normalized_x_val, 8)

acc_total = 0
for i in range(10):
    acc = accuracy_score(y_val, y_pred_scratch)
    acc_total += acc

avg_acc = acc_total / 10

print(avg_acc)

0.9666666666666666
