# 实验一：KNN——基于KNN的手写数字识别

## 实验条件：给定semeion手写数字数据集，给定kNN分类算法

1. 初级要求：编程实现kNN算法；给出在不同k值（5，9，13）情况下，kNN算法对手写数字的识别精度（要求采用留一法）
2. 中级要求：与机器学习包或平台(如weka)中的kNN分类器结果进行对比，性能指标为精度ACC，其他指标如归一化互信息NMI、混淆熵CEN任选其一（或两者）
3. 高级要求：采用旋转等手段对原始数据进行处理，进行至少两个方向（左上，左下）旋转，采用CNN或其他深度学习方法实现手写体识别

## 1.初级要求：
###     编程实现kNN算法；给出在不同k值（5，9，13）情况下，kNN算法对手写数字的识别精度（要求采用留一法）

In [2]:
import numpy as np
from collections import Counter


# 读取 Semeion 数据集
def load_semeion_data(file_path):
    data = np.loadtxt(file_path)
    features = data[:, :256]  # 前256列为特征
    labels = np.argmax(data[:, 256:], axis=1)  # 后10列为one-hot编码的标签
    return features, labels


# 计算欧氏距离
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))


# kNN算法实现
def knn(train_data, train_labels, test_data, k):
    distances = []
    for i in range(len(train_data)):
        dist = euclidean_distance(train_data[i], test_data)
        distances.append((dist, train_labels[i]))

    distances.sort(key=lambda x: x[0])
    k_nearest_neighbors = [label for _, label in distances[:k]]

    most_common_label = Counter(k_nearest_neighbors).most_common(1)[0][0]
    return most_common_label


# 留一法实现
def loocv_knn(data, labels, k):
    correct_predictions = 0

    for i in range(len(data)):
        test_data = data[i]
        test_label = labels[i]

        train_data = np.delete(data, i, axis=0)
        train_labels = np.delete(labels, i)

        predicted_label = knn(train_data, train_labels, test_data, k)

        if predicted_label == test_label:
            correct_predictions += 1

    accuracy = correct_predictions / len(data)
    return accuracy


# 加载数据
file_path = 'semeion.data'
features, labels = load_semeion_data(file_path)

# 计算不同k值下的识别精度
k_values = [5, 9, 13]
for k in k_values:
    accuracy = loocv_knn(features, labels, k)
    print(f'k={k}, Accuracy={accuracy:.4f}')


k=5, Accuracy=0.9140
k=9, Accuracy=0.9240
k=13, Accuracy=0.9153


## 2.中级要求：
### 与机器学习包或平台(如weka)中的kNN分类器结果进行对比，性能指标为精度ACC，其他指标如归一化互信息NMI、混淆熵CEN任选其一（或两者）

In [4]:
import numpy as np
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, normalized_mutual_info_score, confusion_matrix
import scipy.stats


# 加载 Semeion 数据集
def load_semeion_data(file_path):
    data = np.loadtxt(file_path)
    features = data[:, :256]  # 前256列为特征
    labels = np.argmax(data[:, 256:], axis=1)  # 后10列为one-hot编码的标签
    return features, labels


# 自实现 kNN 算法
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))


def knn(train_data, train_labels, test_data, k):
    distances = []
    for i in range(len(train_data)):
        dist = euclidean_distance(train_data[i], test_data)
        distances.append((dist, train_labels[i]))

    distances.sort(key=lambda x: x[0])
    k_nearest_neighbors = [label for _, label in distances[:k]]

    most_common_label = Counter(k_nearest_neighbors).most_common(1)[0][0]
    return most_common_label


def loocv_knn(data, labels, k):
    correct_predictions = 0
    predicted_labels = []

    for i in range(len(data)):
        test_data = data[i]
        test_label = labels[i]

        train_data = np.delete(data, i, axis=0)
        train_labels = np.delete(labels, i)

        predicted_label = knn(train_data, train_labels, test_data, k)
        predicted_labels.append(predicted_label)

        if predicted_label == test_label:
            correct_predictions += 1

    accuracy = correct_predictions / len(data)
    return accuracy, predicted_labels


# 加载数据
file_path = 'semeion.data'
features, labels = load_semeion_data(file_path)

# 使用自实现 kNN 算法，计算识别精度和预测标签
k = 5  # 示例中选一个k值，如5
accuracy, self_predicted_labels = loocv_knn(features, labels, k)
print(f'Self-implemented kNN (k={k}), Accuracy={accuracy:.4f}')

# 使用 scikit-learn 的 kNN 分类器进行对比
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(features, labels)
sklearn_predicted_labels = clf.predict(features)

# 计算 scikit-learn 的性能指标
sklearn_accuracy = accuracy_score(labels, sklearn_predicted_labels)
print(f'Scikit-learn kNN (k={k}), Accuracy={sklearn_accuracy:.4f}')

# 计算归一化互信息（NMI）
self_nmi = normalized_mutual_info_score(labels, self_predicted_labels)
sklearn_nmi = normalized_mutual_info_score(labels, sklearn_predicted_labels)
print(f'Self-implemented kNN (k={k}), NMI={self_nmi:.4f}')
print(f'Scikit-learn kNN (k={k}), NMI={sklearn_nmi:.4f}')

# 计算混淆熵（CEN），假设用归一化的互信息作为近似
self_conf_matrix = confusion_matrix(labels, self_predicted_labels)
self_cen = scipy.stats.entropy(self_conf_matrix.flatten())
sklearn_conf_matrix = confusion_matrix(labels, sklearn_predicted_labels)
sklearn_cen = scipy.stats.entropy(sklearn_conf_matrix.flatten())
print(f'Self-implemented kNN (k={k}), CEN={self_cen:.4f}')
print(f'Scikit-learn kNN (k={k}), CEN={sklearn_cen:.4f}')


Self-implemented kNN (k=5), Accuracy=0.9140
Scikit-learn kNN (k=5), Accuracy=0.9341
Self-implemented kNN (k=5), NMI=0.8372
Scikit-learn kNN (k=5), NMI=0.8722
Self-implemented kNN (k=5), CEN=2.6725
Scikit-learn kNN (k=5), CEN=2.5929


## 3.高级要求：
### 采用旋转等手段对原始数据进行处理，进行至少两个方向（左上，左下）旋转，采用CNN或其他深度学习方法实现手写体识别

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from skimage.transform import rotate


# 加载 Semeion 数据集
def load_semeion_data(file_path):
    data = np.loadtxt(file_path)
    features = data[:, :256].reshape(-1, 16, 16, 1)  # 前256列为特征，转换为16x16图像
    labels = np.argmax(data[:, 256:], axis=1)  # 后10列为one-hot编码的标签
    return features, labels


# 数据增强 - 旋转图像
def augment_data(features, labels):
    augmented_features = []
    augmented_labels = []

    for feature, label in zip(features, labels):
        augmented_features.append(feature)  # 原始图像
        augmented_labels.append(label)

        # 旋转图像（左上和左下）
        augmented_features.append(rotate(feature, angle=15, mode='wrap'))
        augmented_labels.append(label)

        augmented_features.append(rotate(feature, angle=-15, mode='wrap'))
        augmented_labels.append(label)

    return np.array(augmented_features), np.array(augmented_labels)


# 构建 CNN 模型
def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(10, activation='softmax'))  # 10 类输出
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# 加载和增强数据
file_path = 'semeion.data'  
features, labels = load_semeion_data(file_path)
augmented_features, augmented_labels = augment_data(features, labels)

# 将标签转换为 one-hot 编码
augmented_labels = to_categorical(augmented_labels, num_classes=10)

# 拆分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(augmented_features, augmented_labels, test_size=0.2,
                                                    random_state=42)

# 构建和训练 CNN 模型
input_shape = (16, 16, 1)
cnn_model = create_cnn_model(input_shape)
cnn_model.summary()
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.01)

# 在测试集上评估模型
test_loss, test_accuracy = cnn_model.evaluate(X_test, y_test)
print(f'CNN Model Test Accuracy: {test_accuracy:.4f}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4003 - loss: 1.7910 - val_accuracy: 0.7949 - val_loss: 0.6540
Epoch 2/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8020 - loss: 0.6189 - val_accuracy: 0.8462 - val_loss: 0.4317
Epoch 3/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8775 - loss: 0.4102 - val_accuracy: 0.9231 - val_loss: 0.2988
Epoch 4/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9040 - loss: 0.3155 - val_accuracy: 0.9487 - val_loss: 0.2308
Epoch 5/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9352 - loss: 0.2240 - val_accuracy: 0.9487 - val_loss: 0.1795
Epoch 6/10
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9509 - loss: 0.1758 - val_accuracy: 0.9487 - val_loss: 0.1441
Epoch 7/10
[1m119/119[0m 