In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
import tensorflow_datasets as tfds
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# 加载数据集
def load_emnist():
    # 使用TensorFlow Datasets加载EMNIST
    (ds_train, ds_test), ds_info = tfds.load(
        'emnist/balanced',
        split=['train', 'test'],
        shuffle_files=True,
        as_supervised=True,
        with_info=True
    )
    
    # 转换为NumPy数组
    images, labels = [], []
    for image, label in tfds.as_numpy(ds_train):
        images.append(image)
        labels.append(label)
    
    return np.array(images), np.array(labels)

# 加载CIFAR-10
def load_cifar10():
    (x_train, y_train), (_, _) = cifar10.load_data()
    return x_train, y_train

# 数据预处理
def preprocess(data, dataset):
    data = data.astype('float32') / 255.
    if dataset == 'emnist':
        # EMNIST图像是转置的
        data = np.rot90(data, axes=(1, 2))
        data = np.transpose(data, (0, 2, 1, 3))
        data = data.reshape(-1, 28 * 28)
    elif dataset == 'cifar10':
        data = data.reshape(-1, 32 * 32 * 3)
    return data

# 限制数据量以便快速实验
SAMPLE_SIZE = 5000
emnist_images, emnist_labels = load_emnist()
emnist_data = preprocess(emnist_images[:SAMPLE_SIZE], 'emnist')
emnist_labels = emnist_labels[:SAMPLE_SIZE]

cifar_images, cifar_labels = load_cifar10()
cifar_data = preprocess(cifar_images[:SAMPLE_SIZE], 'cifar10')
cifar_labels = cifar_labels[:SAMPLE_SIZE].flatten()

Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/emnist/balanced/3.1.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

I0000 00:00:1749626710.507153      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Shuffling /root/tensorflow_datasets/emnist/balanced/incomplete.SD1RGY_3.1.0/emnist-train.tfrecord*...:   0%|  …

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/emnist/balanced/incomplete.SD1RGY_3.1.0/emnist-test.tfrecord*...:   0%|   …

Dataset emnist downloaded and prepared to /root/tensorflow_datasets/emnist/balanced/3.1.0. Subsequent calls will reuse this data.
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step


In [3]:
# 构建自编码器
def build_ae(input_dim, encoding_dim=64):
    input_layer = tf.keras.Input(shape=(input_dim,))
    
    # 编码器
    encoded = tf.keras.layers.Dense(256, activation='relu')(input_layer)
    encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
    encoded = tf.keras.layers.Dense(encoding_dim, activation='relu')(encoded)
    
    # 解码器
    decoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
    decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
    decoded = tf.keras.layers.Dense(input_dim, activation='sigmoid')(decoded)
    
    autoencoder = tf.keras.Model(input_layer, decoded)
    encoder = tf.keras.Model(input_layer, encoded)
    
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder, encoder

In [4]:
def evaluate_clustering(data, labels, n_clusters, use_ae=True, encoding_dim=64, verbose=0):
    # 处理不同数据集
    dataset = 'emnist' if data.shape[1] == 28 * 28 else 'cifar10'
    
    if use_ae:
        # 训练自编码器
        autoencoder, encoder = build_ae(data.shape[1], encoding_dim)
        autoencoder.fit(
            data, data,
            epochs=20,
            batch_size=256,
            verbose=verbose,
            callbacks=[
                tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
            ]
        )
        features = encoder.predict(data, verbose=verbose)
    else:
        # 原始特征PCA降维
        pca = PCA(n_components=encoding_dim)
        features = pca.fit_transform(data)
    
    # K-means聚类
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(features)
    
    # 评估指标
    sil_score = silhouette_score(features, clusters)
    ari_score = adjusted_rand_score(labels, clusters)
    
    return sil_score, ari_score, features, clusters, kmeans

In [9]:
def find_optimal_dim(data, labels, n_clusters, dataset_name):
    dims = [16, 32, 64, 128, 256]
    results = {'ae': {'sil': [], 'ari': []}, 'raw': {'sil': [], 'ari': []}}
    
    for dim in dims:
        # 使用AE
        sil_ae, ari_ae, _, _, _ = evaluate_clustering(
            data, labels, n_clusters, True, dim, verbose=0
        )
        # 原始特征
        sil_raw, ari_raw, _, _, _ = evaluate_clustering(
            data, labels, n_clusters, False, dim, verbose=0
        )
        
        results['ae']['sil'].append(sil_ae)
        results['ae']['ari'].append(ari_ae)
        results['raw']['sil'].append(sil_raw)
        results['raw']['ari'].append(ari_raw)
        
        print(f"{dataset_name} Dim={dim}: AE(sil={sil_ae:.4f}, ari={ari_ae:.4f}) | Raw(sil={sil_raw:.4f}, ari={ari_raw:.4f})")
    
    # 可视化结果
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(dims, results['ae']['sil'], 'o-', label='AE')
    plt.plot(dims, results['raw']['sil'], 'o-', label='Raw')
    plt.title(f'{dataset_name} Silhouette Score')
    plt.xlabel('Dimension')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(1, 2, 2)
    plt.plot(dims, results['ae']['ari'], 'o-', label='AE')
    plt.plot(dims, results['raw']['ari'], 'o-', label='Raw')
    plt.title(f'{dataset_name} ARI Score')
    plt.xlabel('Dimension')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'{dataset_name}_dim_optim.png')
    plt.close()
    
    return results


In [7]:
# 首先需要重新运行维度优化并保存结果

# 执行优化
print("\n==== EMNIST Dimension Optimization ====")
dims = [16, 32, 64, 128, 256]
results_emnist = {'ae': {'sil': [], 'ari': []}, 'raw': {'sil': [], 'ari': []}}

for dim in dims:
    sil_ae, ari_ae, _, _, _ = evaluate_clustering(
        emnist_data, emnist_labels, 47, True, dim, verbose=0
    )
    sil_raw, ari_raw, _, _, _ = evaluate_clustering(
        emnist_data, emnist_labels, 47, False, dim, verbose=0
    )
    
    results_emnist['ae']['sil'].append(sil_ae)
    results_emnist['ae']['ari'].append(ari_ae)
    results_emnist['raw']['sil'].append(sil_raw)
    results_emnist['raw']['ari'].append(ari_raw)
    
    print(f"EMNIST Dim={dim}: AE(sil={sil_ae:.4f}, ari={ari_ae:.4f}) | Raw(sil={sil_raw:.4f}, ari={ari_raw:.4f})")

# CIFAR10维度优化
print("\n==== CIFAR10 Dimension Optimization ====")
results_cifar = {'ae': {'sil': [], 'ari': []}, 'raw': {'sil': [], 'ari': []}}

for dim in dims:
    sil_ae, ari_ae, _, _, _ = evaluate_clustering(
        cifar_data, cifar_labels, 10, True, dim, verbose=0
    )
    sil_raw, ari_raw, _, _, _ = evaluate_clustering(
        cifar_data, cifar_labels, 10, False, dim, verbose=0
    )
    
    results_cifar['ae']['sil'].append(sil_ae)
    results_cifar['ae']['ari'].append(ari_ae)
    results_cifar['raw']['sil'].append(sil_raw)
    results_cifar['raw']['ari'].append(ari_raw)
    
    print(f"CIFAR10 Dim={dim}: AE(sil={sil_ae:.4f}, ari={ari_ae:.4f}) | Raw(sil={sil_raw:.4f}, ari={ari_raw:.4f})")

# 找出最佳维度
best_dim_emnist = dims[np.argmax(results_emnist['ae']['ari'])]
best_dim_cifar = dims[np.argmax(results_cifar['ae']['ari'])]

# 使用最优维度进行聚类和可视化
print(f"\n==== EMNIST Analysis with AE (Dim={best_dim_emnist}) ====")
sil_ae, ari_ae, feats_emnist, clusters_emnist, _ = evaluate_clustering(
    emnist_data, emnist_labels, 47, True, best_dim_emnist, verbose=1
)
emnist_errors = visualize_comparison(
    emnist_data, feats_emnist, emnist_labels, clusters_emnist, 'EMNIST'
)

print(f"\n==== CIFAR10 Analysis with AE (Dim={best_dim_cifar}) ====")
sil_ae_cifar, ari_ae_cifar, feats_cifar, clusters_cifar, _ = evaluate_clustering(
    cifar_data, cifar_labels, 10, True, best_dim_cifar, verbose=1
)
cifar_errors = visualize_comparison(
    cifar_data, feats_cifar, cifar_labels, clusters_cifar, 'CIFAR10'
)

# 打印结论
print("\n==== Analysis Conclusion ====")
print("EMNIST最易混淆的字符对:")
for true_label, false_label in emnist_errors.items():
    print(f"  类别 {true_label} 经常被误分类为 {false_label}")

print("\nCIFAR10最易混淆的类别对:")
for true_label, false_label in cifar_errors.items():
    print(f"  类别 {true_label} 经常被误分类为 {false_label}")

# 性能比较 - 使用保存的结果进行公平比较
best_idx_emnist = dims.index(best_dim_emnist)
best_idx_cifar = dims.index(best_dim_cifar)

print("\n==== Performance Summary ====")
print("EMNIST:")
print(f"  AE特征轮廓系数: {results_emnist['ae']['sil'][best_idx_emnist]:.4f}, 调整兰德指数: {results_emnist['ae']['ari'][best_idx_emnist]:.4f}")
print(f"  原始特征轮廓系数: {results_emnist['raw']['sil'][best_idx_emnist]:.4f}, 调整兰德指数: {results_emnist['raw']['ari'][best_idx_emnist]:.4f}")

print("\nCIFAR10:")
print(f"  AE特征轮廓系数: {results_cifar['ae']['sil'][best_idx_cifar]:.4f}, 调整兰德指数: {results_cifar['ae']['ari'][best_idx_cifar]:.4f}")
print(f"  原始特征轮廓系数: {results_cifar['raw']['sil'][best_idx_cifar]:.4f}, 调整兰德指数: {results_cifar['raw']['ari'][best_idx_cifar]:.4f}")


==== EMNIST Dimension Optimization ====


  current = self.get_monitor_value(logs)


EMNIST Dim=16: AE(sil=0.1043, ari=0.0986) | Raw(sil=0.0913, ari=0.1616)


  current = self.get_monitor_value(logs)


EMNIST Dim=32: AE(sil=0.0852, ari=0.1058) | Raw(sil=0.0709, ari=0.1702)


  current = self.get_monitor_value(logs)


EMNIST Dim=64: AE(sil=0.0769, ari=0.1017) | Raw(sil=0.0572, ari=0.1591)


  current = self.get_monitor_value(logs)


EMNIST Dim=128: AE(sil=0.0726, ari=0.1175) | Raw(sil=0.0452, ari=0.1583)


  current = self.get_monitor_value(logs)


EMNIST Dim=256: AE(sil=0.0759, ari=0.1310) | Raw(sil=0.0458, ari=0.1638)

==== CIFAR10 Dimension Optimization ====


  current = self.get_monitor_value(logs)


CIFAR10 Dim=16: AE(sil=0.1171, ari=0.0323) | Raw(sil=0.0991, ari=0.0474)


  current = self.get_monitor_value(logs)


CIFAR10 Dim=32: AE(sil=0.1030, ari=0.0362) | Raw(sil=0.0739, ari=0.0451)


  current = self.get_monitor_value(logs)


CIFAR10 Dim=64: AE(sil=0.1018, ari=0.0441) | Raw(sil=0.0718, ari=0.0470)


  current = self.get_monitor_value(logs)


CIFAR10 Dim=128: AE(sil=0.0861, ari=0.0438) | Raw(sil=0.0658, ari=0.0482)


  current = self.get_monitor_value(logs)


CIFAR10 Dim=256: AE(sil=0.0900, ari=0.0489) | Raw(sil=0.0616, ari=0.0474)

==== EMNIST Analysis with AE (Dim=256) ====
Epoch 1/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - loss: 0.1701
Epoch 2/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0903
Epoch 3/20
[1m18/20[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 3ms/step - loss: 0.0821 

  current = self.get_monitor_value(logs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0817
Epoch 4/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0697
Epoch 5/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0568
Epoch 6/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0473
Epoch 7/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0415
Epoch 8/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0380
Epoch 9/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0339
Epoch 10/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0318
Epoch 11/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0297
Epoch 12/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0279
Epoch 13/20
[1m20

  current = self.get_monitor_value(logs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0431
Epoch 4/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0413
Epoch 5/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0383
Epoch 6/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0365
Epoch 7/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0351
Epoch 8/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0305
Epoch 9/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0338
Epoch 10/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0280
Epoch 11/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0263
Epoch 12/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0258
Epoch 13/20
[1m20