In [20]:
import numpy as np
# 这里直接引入sklearn里的数据集，iris鸢尾花
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split  # 切分数据集为训练集和测试集
from sklearn.metrics import accuracy_score  # 计算分类预测的准确率

In [21]:
from pandas import DataFrame

# Load the iris bunch and assemble a labelled table: one column per feature
# plus a "class" column holding the readable target name of each row.
iris = load_iris()
source = DataFrame(iris.data, columns=iris.feature_names)
# Attach the integer target codes first, then translate every code into the
# entry of iris.target_names found at that position.
source["class"] = iris.target
source["class"] = source["class"].map(dict(enumerate(iris.target_names)))
print(source)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8   

         class  
0       se

In [35]:
# Hold out 30% of the samples for testing. The labels are reshaped into a
# single column, and passing them as `stratify` keeps the class proportions
# the same in both parts; the fixed random_state makes the split repeatable.
x = iris.data
y = iris.target.reshape(-1, 1)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=35,
    stratify=y,
)

(150, 4) (150, 1)


In [92]:
def distance(A, b, method="l1"):
    """Row-wise distance between every row of ``A`` and the vector ``b``.

    Args:
        A: 2-D array whose rows are the points to measure from.
        b: Point broadcast against each row of ``A``.
        method: ``"l1"`` gives the Manhattan distance (sum of absolute
            differences). Any other value gives the Euclidean distance.

    Returns:
        One distance per row of ``A`` (the reduction runs over axis 1).
    """
    diff = A - b
    if method == "l1":
        return np.sum(np.abs(diff), axis=1)
    # Every non-"l1" method name falls through to the Euclidean norm.
    squared_total = np.sum(diff ** 2, axis=1)
    return np.sqrt(squared_total)


class KNN:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    For every query row the distance to each stored training row is computed
    with the module-level ``distance`` function, the ``k`` closest rows are
    selected, and the most frequent label among them becomes the prediction.
    """

    def __init__(self, k, method="l2"):
        """Configure the classifier.

        Args:
            k: Number of neighbours that vote; must be an integer >= 1.
            method: Metric name forwarded unchanged to the distance function.

        Raises:
            ValueError: If ``k`` is not a positive integer.
        """
        # k == 0 leaves no voters, and a negative k would silently slice
        # "all but the last |k|" neighbours, so both are rejected up front.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer, got {!r}".format(k))
        self.k, self.method, self.func = k, method, distance

    def fit(self, train_x, train_y):
        """Store the training set; the actual work happens in ``predict``.

        Args:
            train_x: Array-like of training points, one row per sample.
            train_y: Array-like of labels, one per sample (a flat vector or a
                single column both work because labels are flattened when
                the vote is taken).

        Raises:
            ValueError: If the training set is empty or the two inputs do not
                contain the same number of samples.
        """
        # Normalise to ndarrays so that the positional indexing in predict()
        # also behaves for lists and for pandas objects, whose [] operator
        # is label-based.
        train_x, train_y = np.asarray(train_x), np.asarray(train_y)
        if len(train_x) == 0 or len(train_x) != len(train_y):
            raise ValueError(
                "train_x and train_y must be non-empty and of equal length, "
                "got {} and {} samples".format(len(train_x), len(train_y))
            )
        self.train_x, self.train_y = train_x, train_y

    def predict(self, data_X):
        """Predict a label for every row of ``data_X``.

        Args:
            data_X: Array-like of query points, one row per sample.

        Returns:
            ndarray of shape (number of queries, 1) whose dtype matches the
            stored training labels.
        """
        # Iterating an ndarray yields rows; iterating a DataFrame would yield
        # column names instead, hence the conversion.
        data_X = np.asarray(data_X)
        y_pred = np.zeros((data_X.shape[0], 1), dtype=self.train_y.dtype)
        for i, x in enumerate(data_X):
            distances = self.func(self.train_x, x, self.method)
            # When k exceeds the number of training rows the slice simply
            # keeps all of them.
            dis_indices = np.argsort(distances)[:self.k]
            class_labels = self.train_y[dis_indices].ravel()
            # np.unique counts labels of any sortable dtype (negative ints,
            # strings, ...), which np.bincount cannot. It returns the labels
            # sorted, so argmax still resolves ties toward the smallest one.
            labels, counts = np.unique(class_labels, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]
        return y_pred

In [93]:
# Evaluate every (metric, k) combination on the held-out split, collect the
# accuracy of each run, and print the runs ordered from best to worst.
result = []
for method in ("l1", "l2"):
    for k in range(1, 10):
        model = KNN(k=k, method=method)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        result.append([method, k, accuracy_score(y_test, y_pred)])
res = DataFrame(result, columns=["method", "k", "accuracy"])
res = res.sort_values(by="accuracy", ascending=False)
print(res)

   method  k  accuracy
17     l2  9  0.977778
16     l2  8  0.977778
4      l1  5  0.977778
15     l2  7  0.977778
14     l2  6  0.977778
13     l2  5  0.977778
12     l2  4  0.955556
3      l1  4  0.955556
6      l1  7  0.955556
7      l1  8  0.955556
8      l1  9  0.955556
0      l1  1  0.933333
11     l2  3  0.933333
10     l2  2  0.933333
1      l1  2  0.933333
5      l1  6  0.933333
2      l1  3  0.933333
9      l2  1  0.933333
