In [21]:
from collections import Counter
import math
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

In [3]:
# Read the iris measurements; the 'species' column is the label to predict.
df = pd.read_csv("iris.csv")

y = df['species']
x = df.drop(columns=['species'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# KNN from scratch
class KNN:
    """k-nearest-neighbours classifier that uses the chi-square distance.

    The distance between two feature vectors ``a`` and ``b`` is
    ``sum((a - b)**2 / (a + b))``. It is intended for non-negative features
    (counts, lengths, histogram bins).
    """

    def __init__(self, k):
        """Store the number of neighbours that vote on each prediction.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, x_train, y_train):
        """Memorise the training samples and their labels.

        Args:
            x_train: 2-D array-like (or DataFrame) of numeric features, one
                row per sample.
            y_train: 1-D array-like of labels aligned with ``x_train`` rows.
        """
        self.x_train = np.array(x_train, dtype=float)
        self.y_train = np.array(y_train)

    def chi_square_distance(self, x1, x2):
        """Return the chi-square distance between two feature vectors.

        Features that are zero in both vectors contribute 0 rather than the
        NaN produced by a literal 0/0, so one shared zero feature no longer
        poisons the whole distance.
        """
        x1 = np.asarray(x1, dtype=float)
        x2 = np.asarray(x2, dtype=float)
        squared_diff = (x1 - x2) ** 2
        total = x1 + x2
        # Only the 0/0 case is patched: swapping the denominator for 1 makes
        # the term 0/1 == 0. Every other term is computed exactly as before.
        both_zero = (x1 == 0) & (x2 == 0)
        safe_total = np.where(both_zero, 1.0, total)
        return np.sum(squared_diff / safe_total)

    def predict(self, x_test):
        """Predict a label for every row of ``x_test``.

        Args:
            x_test: 2-D array-like (or DataFrame) of numeric features.

        Returns:
            1-D numpy array holding one predicted label per row.
        """
        # Convert first: iterating a DataFrame yields column labels, not rows.
        x_test = np.asarray(x_test, dtype=float)
        predictions = []
        for x in x_test:
            # Distance from this query to every stored training sample.
            distances = np.array([self.chi_square_distance(x, xt) for xt in self.x_train])

            # Stable sort keeps equidistant neighbours in training order, so
            # the result is deterministic.
            nnIndices = distances.argsort(kind="stable")[:self.k]
            nnLabels = self.y_train[nnIndices]

            # Majority vote; on a tie Counter keeps the label seen first,
            # i.e. the one belonging to the nearest neighbour.
            mostCommon = Counter(nnLabels).most_common(1)[0][0]
            predictions.append(mostCommon)
        return np.array(predictions)

    def score(self, x_test, y_test):
        """Return the fraction of rows in ``x_test`` whose prediction equals ``y_test``."""
        y_test = np.array(y_test)
        y_pred = self.predict(x_test)
        return np.mean(y_pred == y_test)

In [24]:
# Train a 3-nearest-neighbour classifier and measure accuracy on the held-out rows.
knn = KNN(k=3)
knn.fit(x_train, y_train)
accuracy = knn.score(x_test, y_test)

# Per-class breakdown of the same test predictions; features are passed as a plain array.
matrix = confusion_matrix(y_test, knn.predict(np.array(x_test)))

print("Test Accuracy with Chi-Square KNN:", accuracy)
print("Confusion Matrix:\n", matrix)

Test Accuracy with Chi-Square KNN: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
