In [11]:
import pandas as pd
import numpy as np
import random
import sklearn
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

In [4]:
class ProcessingData:
    @staticmethod
    def shuffleDF(df: pd.DataFrame) -> pd.DataFrame:
        for i in range(len(df)):
            rand= (random.randint(i,len(df)-1))
            temp = df.iloc[i].copy()
            df.iloc[i] = df.iloc[rand]
            df.iloc[rand] = temp
        return df
        # return df.sample(frac=1)
    @staticmethod
    def normalizeDF(df: pd.DataFrame, columnNames: list) -> pd.DataFrame:
        for columnName in columnNames:
            df[columnName] = (df[columnName]-df[columnName].min())/(df[columnName].max()-df[columnName].min())
        return df
    @staticmethod
    def splitDF(df: pd.DataFrame) -> pd.DataFrame:
        trainIndexes = random.sample(range(1, len(df)), len(df)//3)
        train_X = pd.DataFrame(columns=df.columns)
        test_X = pd.DataFrame(columns=df.columns)
        for i in range(len(df)):
            if i in trainIndexes:
                train_X = train_X.append(df.iloc[i])
            else:
                test_X = test_X.append(df.iloc[i])

        return test_X, train_X
        #return train_test_split(df)

In [19]:
class KNN:
    def __init__(self, m: int, k: int):
        self.m = m
        self.k = k

    @staticmethod
    def dst(x: np.array, y: np.array, m: int) -> float:
        return sum([(abs(xi-yi)**m)
                    for xi,yi in zip(x,y)])**(1/m)

    def fit(self, df: pd.DataFrame):
        self.df = df
    
    def predict(self, point: pd.DataFrame) -> str:
        types = {}
        for v in pd.unique(df[:-1]):
            types[v] = 0
        result = []
        for sample in self.df.values:
            result.append([KNN.dst(sample[:-1], point, self.m), sample[-1]])
        result.sort(key=lambda x:x[0])
        for i in range(self.k):
            types[result[i][1]] += 1
        return max(types, key=types.get)
    
    def score(self, test_X: pd.DataFrame) -> float:
        good = 0
        bad = 0
        for sample in test_X.values:
            if (x:=self.predict(sample)) == sample[-1]:
                good += 1
            else:
                bad +=1
        return good/(bad+good)*100

In [14]:
df = sklearn.datasets.load_digits()
print(df.data.shape)
n_samples = len(df.images)
clf = KNN(28,3)
data = df.images.reshape((n_samples, -1))

(1797, 64)


In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    data, df.target, test_size=0.5, shuffle=False
)

[array([[ 0.,  0.,  8., ..., 12.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  4., ...,  0.,  0.,  0.],
        ...,
        [ 0.,  0.,  6., ...,  0.,  0.,  0.],
        [ 0.,  0.,  1., ...,  3.,  0.,  0.],
        [ 0.,  0.,  5., ...,  0.,  0.,  0.]]),
 array([[ 0.,  0.,  1., ..., 15., 11.,  1.],
        [ 0.,  0.,  8., ..., 16., 11.,  1.],
        [ 0.,  0.,  9., ...,  5.,  0.,  0.],
        ...,
        [ 0.,  3., 15., ...,  0.,  0.,  0.],
        [ 0.,  0.,  7., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 12.,  8.,  0.]]),
 array([3, 7, 7, ..., 7, 1, 7]),
 array([3, 9, 5, 2, 5, 2, 2, 8, 2, 5, 7, 1, 5, 4, 4, 4, 5, 6, 5, 4, 5, 6,
        1, 1, 5, 0, 4, 4, 8, 3, 0, 1, 8, 9, 8, 9, 1, 7, 8, 3, 1, 9, 4, 3,
        3, 6, 2, 8, 7, 0, 0, 8, 5, 3, 3, 4, 6, 6, 1, 1, 5, 3, 8, 7, 9, 2,
        1, 6, 7, 5, 7, 8, 8, 7, 3, 7, 5, 1, 9, 8, 0, 4, 0, 1, 7, 1, 2, 0,
        9, 0, 6, 2, 0, 5, 9, 6, 3, 5, 0, 1, 7, 0, 8, 2, 1, 4, 0, 3, 1, 4,
        3, 7, 1, 1, 9, 8, 9

In [20]:
clf.fit(pd.DataFrame(X_train, y_train))

In [21]:
clf.score(pd.DataFrame(X_test, y_test))

KeyError: 'variety'