### 加载数据

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

In [2]:
# Load scikit-learn's bundled Iris dataset.
iris = load_iris()

In [3]:
# Unpack the feature matrix and the class labels in a single assignment.
x, y = iris.data, iris.target

In [4]:
# Show the dimensions of the features and the labels as the cell output.
x.shape, y.shape

((150, 4), (150,))

In [5]:
# 70/30 hold-out split. The fixed seed makes the split reproducible, and
# stratify=y keeps the class proportions of y in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=233,
    stratify=y,
)

In [6]:
# Shapes of the four arrays produced by the split, in assignment order.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((105, 4), (45, 4), (105,), (45,))

### 超参数

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [21]:
# k-NN classifier configuration:
#   n_neighbors=3      -> the 3 nearest training samples decide each prediction.
#   weights='distance' -> each neighbor's vote is weighted by 1 / distance, so closer
#                         samples count more ('uniform' would weight all neighbors equally).
#   p=2                -> power of the distance metric: 1 = Manhattan, 2 = Euclidean,
#                         larger values = general Minkowski distance.
neigh = KNeighborsClassifier(n_neighbors=3, weights='distance', p=2)

In [22]:
# Fit the classifier on the training split.
neigh.fit(x_train, y_train)

In [None]:
# Accuracy on the held-out test split (x_test / y_test), not on the training data.
neigh.score(x_test, y_test)

0.9777777777777777

In [None]:
# Hand-written exhaustive search for the best hyperparameter combination.
# NOTE(review): every candidate is scored on x_test / y_test, so the test split is
# what selects the model and best_score is an optimistic estimate.
best_score, best_n, best_weight, best_p = -1, -1, '', -1

# Flattened form of the nested loops; combinations are visited in the order
# n_neighbors -> weights -> p.
for n, weight, p in (
    (k, mode, power)
    for k in range(1, 20)
    for mode in ['uniform', 'distance']
    for power in range(1, 7)
):
    neigh = KNeighborsClassifier(n_neighbors=n, weights=weight, p=p)
    neigh.fit(x_train, y_train)
    score = neigh.score(x_test, y_test)

    # Strict '>' keeps the first combination that reaches the top score.
    if score > best_score:
        best_score, best_n, best_weight, best_p = score, n, weight, p

print("n_neighbors:", best_n)
print("weights:", best_weight)
print("p:", best_p)
print("score:", best_score)

n_neighbors: 5
weights: uniform
p: 2
score: 1.0


### sklearn 超参数搜索

In [12]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the grid search (19 * 2 * 6 = 228 combinations):
#   n_neighbors: K values 1..19
#   weights:     'uniform' or 'distance' neighbor weighting
#   p:           distance power 1..6 (1 = Manhattan, 2 = Euclidean, >2 = Minkowski)
params = dict(
    n_neighbors=list(range(1, 20)),
    weights=['uniform', 'distance'],
    p=list(range(1, 7)),
)

In [None]:
# Exhaustive search over `params`, using a default-constructed k-NN classifier as
# the base estimator. n_jobs=-1 asks for all available CPU cores so candidates
# are evaluated in parallel.
grid = GridSearchCV(KNeighborsClassifier(), params, n_jobs=-1)

In [None]:
# Run the grid search to find the best hyperparameter combination.
# `grid` is the GridSearchCV object built earlier; it already holds the parameter grid to search.
# x_train and y_train are the training samples and their labels.
# fit() goes through every parameter combination, training and evaluating a model for each one.
# GridSearchCV.fit scores candidates with cross-validation and uses those scores to pick the best hyperparameters.
# How the training and validation data are chosen:
# the validation data is carved out of x_train / y_train by cross-validation:
#     1. x_train and y_train are split into k folds (cv is left at its default of 5, i.e. 5 folds).
#     2. Each round uses 1 of the k folds for validation and the other k-1 folds for training.
#     3. For every hyperparameter combination this is repeated k times, giving one validation score per round.
grid.fit(x_train, y_train)

In [16]:
# Hyperparameter combination selected by the grid search.
grid.best_params_

{'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}

In [17]:
# Mean cross-validated score achieved by the selected combination.
grid.best_score_

np.float64(0.961904761904762)

In [18]:
# The estimator configured with the selected hyperparameters.
grid.best_estimator_

In [19]:
# Predict test-set labels with the tuned model. `grid` was created with the default
# refit behaviour, so its predict() is served by best_estimator_.
grid.predict(x_test)

array([2, 2, 0, 1, 1, 1, 2, 0, 2, 0, 0, 1, 0, 2, 1, 1, 0, 2, 2, 1, 0, 1,
       1, 2, 2, 0, 0, 1, 1, 0, 2, 2, 0, 1, 1, 2, 1, 1, 0, 0, 0, 2, 0, 1,
       1])

In [20]:
# Test-set accuracy of the tuned model. No custom `scoring` was given to GridSearchCV,
# so grid.score() uses best_estimator_.score().
grid.score(x_test, y_test)

0.9555555555555556