In [None]:
import pandas as pd
import numpy as np
import random

In [2]:
class ProcessingData:
    @staticmethod
    def shuffleDF(df: pd.DataFrame) -> pd.DataFrame:
        for i in range(len(df)):
            rand= (random.randint(i,len(df)-1))
            temp = df.iloc[i].copy()
            df.iloc[i] = df.iloc[rand]
            df.iloc[rand] = temp
        return df
        # return df.sample(frac=1)
    @staticmethod
    def normalizeDF(df: pd.DataFrame, columnNames: list) -> pd.DataFrame:
        for columnName in columnNames:
            df[columnName] = (df[columnName]-df[columnName].min())/(df[columnName].max()-df[columnName].min())
        return df
    @staticmethod
    def splitDF(df: pd.DataFrame) -> pd.DataFrame:
        trainIndexes = random.sample(range(1, len(df)), len(df)//3)
        train_X = pd.DataFrame(columns=df.columns)
        test_X = pd.DataFrame(columns=df.columns)
        for i in range(len(df)):
            if i in trainIndexes:
                train_X = train_X.append(df.iloc[i])
            else:
                test_X = test_X.append(df.iloc[i])

        return test_X, train_X
        #return train_test_split(df)

In [3]:
class KNN:
    def __init__(self, m: int, k: int):
        self.m = m
        self.k = k

    @staticmethod
    def dst(x: np.array, y: np.array, m: int) -> float:
        return sum([(abs(xi-yi)**m)
                    for xi,yi in zip(x,y)])**(1/m)

    def fit(self, df: pd.DataFrame):
        self.df = df
    
    def predict(self, point: pd.DataFrame) -> str:
        types = {}
        for v in pd.unique(df["variety"]):
            types[v] = 0
        result = []
        for sample in self.df.values:
            result.append([KNN.dst(sample[:-1], point, self.m), sample[-1]])
        result.sort(key=lambda x:x[0])
        for i in range(self.k):
            types[result[i][1]] += 1
        return max(types, key=types.get)
    
    def score(self, test_X: pd.DataFrame) -> float:
        good = 0
        bad = 0
        for sample in test_X.values:
            if (x:=self.predict(sample)) == sample[-1]:
                good += 1
            else:
                bad +=1
        return good/(bad+good)*100

In [11]:
df = pd.read_csv("iris.csv")
print()
df = ProcessingData.normalizeDF(df, ['sepal.width','sepal.length','petal.width','petal.length'])
df = ProcessingData.shuffleDF(df)
test_X, train_X = ProcessingData.splitDF(df)
print(df.head(5))
print(f"len(train_X):{len(train_X)}")
print(f"len(test_X):{len(test_X)}")
for k in range(2,5):
    knn = KNN(1,k) # m, k
    knn.fit(train_X)
    score = knn.score(test_X[:-1])
    print(f"for k={k} score: {score}")


   sepal.length  sepal.width  petal.length  petal.width     variety
0      0.583333     0.333333      0.779661     0.875000   Virginica
1      0.944444     0.416667      0.864407     0.916667   Virginica
2      0.305556     0.583333      0.118644     0.041667      Setosa
3      0.027778     0.375000      0.067797     0.041667      Setosa
4      0.444444     0.500000      0.644068     0.708333  Versicolor
len(train_X):50
len(test_X):100
for k=2 score: 94.94949494949495
for k=3 score: 95.95959595959596
for k=4 score: 95.95959595959596
