In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import time
from sklearn.preprocessing import StandardScaler

# Data loading and preprocessing
def load_data(train_path="MNIST/train_resized.csv", test_path="MNIST/test_resized.csv"):
    """Load the train and test CSV files and split each into features and labels.

    Each file is read with ``pandas.read_csv`` (so its first row is consumed as
    the header). Column 0 of every remaining row is taken as the label and all
    other columns as the feature vector.

    Args:
        train_path: Path of the training CSV. Defaults to the location this
            script originally hard-coded, so ``load_data()`` keeps working.
        test_path: Path of the test CSV. Same default convention.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays, where
        the ``X_*`` arrays are 2-D (samples x features) and the ``y_*`` arrays
        are 1-D.
    """

    def _split_features_labels(path):
        # One-line purpose: read a CSV and separate column 0 (label) from the rest.
        table = pd.read_csv(path).values
        return table[:, 1:], table[:, 0]

    X_train, y_train = _split_features_labels(train_path)
    X_test, y_test = _split_features_labels(test_path)

    return X_train, y_train, X_test, y_test

# Read the train/test splits from disk.
X_train, y_train, X_test, y_test = load_data()

# Standardize the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits, so no
# information from the test split enters the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Binary classification task (linear kernel)
def svm_linear_binary(X_train, y_train, X_test, y_test):
    """Tune and evaluate a linear-kernel SVM that separates digit 3 from digit 6.

    Rows labelled 3 or 6 are selected from both splits and relabelled
    (3 -> 0, 6 -> 1). ``C`` is chosen from ``[0.001, 0.01, 0.1]`` by 5-fold
    cross-validated grid search on the training subset, and the resulting
    model is scored on the test subset. A short report is printed to stdout.

    Args:
        X_train: 2-D feature array of the training split.
        y_train: 1-D label array aligned with ``X_train``.
        X_test: 2-D feature array of the test split.
        y_test: 1-D label array aligned with ``X_test``.

    Returns:
        A dict with keys ``"C"`` (selected value), ``"error"`` (test
        misclassification rate) and ``"time"`` (seconds spent in the grid
        search ``fit`` call).
    """
    print("\n--- Binary Classification (3 vs 6, Linear Kernel) ---")
    # Keep only the samples whose label is 3 or 6.
    train_idx = np.isin(y_train, [3, 6])
    test_idx = np.isin(y_test, [3, 6])
    X_train_bin, y_train_bin = X_train[train_idx], y_train[train_idx]
    X_test_bin, y_test_bin = X_test[test_idx], y_test[test_idx]
    # Relabel to {0, 1}: digit 6 becomes the positive class.
    y_train_bin = (y_train_bin == 6).astype(int)
    y_test_bin = (y_test_bin == 6).astype(int)

    print(f"Training Samples: {X_train_bin.shape[0]}, Test Samples: {X_test_bin.shape[0]}")

    param_grid = {'C': [0.001, 0.01, 0.1]}
    clf = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5)
    # perf_counter() is a monotonic clock, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    # NOTE: the interval covers the entire grid-search fit, not one model fit.
    start_time = time.perf_counter()
    clf.fit(X_train_bin, y_train_bin)
    train_time = time.perf_counter() - start_time

    best_C = clf.best_params_['C']
    y_pred = clf.predict(X_test_bin)
    misclassification_error = 1 - accuracy_score(y_test_bin, y_pred)

    print(f"Best C: {best_C}")
    print(f"Misclassification Error: {misclassification_error:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test_bin, y_pred))
    print(f"Training Time: {train_time:.2f} seconds")

    return {"C": best_C, "error": misclassification_error, "time": train_time}

# Binary classification task (RBF kernel)
def svm_rbf_binary(X_train, y_train, X_test, y_test):
    """Tune and evaluate an RBF-kernel SVM that separates digit 3 from digit 6.

    Rows labelled 3 or 6 are selected from both splits and relabelled
    (3 -> 0, 6 -> 1). ``C`` and ``gamma`` are chosen by 5-fold cross-validated
    grid search over a 3 x 3 grid on the training subset, and the resulting
    model is scored on the test subset. A short report is printed to stdout.

    Args:
        X_train: 2-D feature array of the training split.
        y_train: 1-D label array aligned with ``X_train``.
        X_test: 2-D feature array of the test split.
        y_test: 1-D label array aligned with ``X_test``.

    Returns:
        A dict with keys ``"C"`` and ``"gamma"`` (selected values), ``"error"``
        (test misclassification rate) and ``"time"`` (seconds spent in the
        grid search ``fit`` call).
    """
    print("\n--- Binary Classification (3 vs 6, RBF Kernel) ---")
    # Keep only the samples whose label is 3 or 6.
    train_idx = np.isin(y_train, [3, 6])
    test_idx = np.isin(y_test, [3, 6])
    X_train_bin, y_train_bin = X_train[train_idx], y_train[train_idx]
    X_test_bin, y_test_bin = X_test[test_idx], y_test[test_idx]
    # Relabel to {0, 1}: digit 6 becomes the positive class.
    y_train_bin = (y_train_bin == 6).astype(int)
    y_test_bin = (y_test_bin == 6).astype(int)

    print(f"Training Samples: {X_train_bin.shape[0]}, Test Samples: {X_test_bin.shape[0]}")

    param_grid = {'C': [100, 1000, 10000], 'gamma': [0.0001, 0.001, 0.01]}
    clf = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
    # perf_counter() is a monotonic clock, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    # NOTE: the interval covers the entire grid-search fit, not one model fit.
    start_time = time.perf_counter()
    clf.fit(X_train_bin, y_train_bin)
    train_time = time.perf_counter() - start_time

    best_params = clf.best_params_
    y_pred = clf.predict(X_test_bin)
    misclassification_error = 1 - accuracy_score(y_test_bin, y_pred)

    print(f"Best Parameters: {best_params}")
    print(f"Misclassification Error: {misclassification_error:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test_bin, y_pred))
    print(f"Training Time: {train_time:.2f} seconds")

    return {"C": best_params["C"], "gamma": best_params["gamma"], "error": misclassification_error, "time": train_time}

# Multi-class classification task (digits 1, 2, 5, 8; linear kernel)
def svm_linear_multi(X_train, y_train, X_test, y_test):
    """Tune and evaluate a linear-kernel SVM on the digits 1, 2, 5 and 8.

    Rows labelled 1, 2, 5 or 8 are selected from both splits; the original
    labels are kept. ``C`` is chosen from ``[0.01, 0.1, 1]`` by 5-fold
    cross-validated grid search on the training subset, and the resulting
    model is scored on the test subset. A short report is printed to stdout.

    Args:
        X_train: 2-D feature array of the training split.
        y_train: 1-D label array aligned with ``X_train``.
        X_test: 2-D feature array of the test split.
        y_test: 1-D label array aligned with ``X_test``.

    Returns:
        A dict with keys ``"C"`` (selected value), ``"error"`` (test
        misclassification rate) and ``"time"`` (seconds spent in the grid
        search ``fit`` call).
    """
    print("\n--- Multi-Class Classification (1, 2, 5, 8, Linear Kernel) ---")
    # Keep only the samples whose label is one of the four target digits.
    train_idx = np.isin(y_train, [1, 2, 5, 8])
    test_idx = np.isin(y_test, [1, 2, 5, 8])
    X_train_multi, y_train_multi = X_train[train_idx], y_train[train_idx]
    X_test_multi, y_test_multi = X_test[test_idx], y_test[test_idx]

    print(f"Training Samples: {X_train_multi.shape[0]}, Test Samples: {X_test_multi.shape[0]}")

    param_grid = {'C': [0.01, 0.1, 1]}
    clf = GridSearchCV(SVC(kernel='linear'), param_grid, cv=5)
    # perf_counter() is a monotonic clock, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    # NOTE: the interval covers the entire grid-search fit, not one model fit.
    start_time = time.perf_counter()
    clf.fit(X_train_multi, y_train_multi)
    train_time = time.perf_counter() - start_time

    best_C = clf.best_params_['C']
    y_pred = clf.predict(X_test_multi)
    misclassification_error = 1 - accuracy_score(y_test_multi, y_pred)

    print(f"Best C: {best_C}")
    print(f"Misclassification Error: {misclassification_error:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test_multi, y_pred))
    print(f"Training Time: {train_time:.2f} seconds")

    return {"C": best_C, "error": misclassification_error, "time": train_time}

# Multi-class classification task (all 10 digit classes, RBF kernel)
def svm_full(X_train, y_train, X_test, y_test):
    """Tune and evaluate an RBF-kernel SVM on the full data set.

    No class filtering is applied: every supplied sample is used. ``C`` and
    ``gamma`` are chosen by 5-fold cross-validated grid search over a 3 x 3
    grid on the training split, and the resulting model is scored on the test
    split. A report, including a per-class classification report, is printed
    to stdout.

    Args:
        X_train: 2-D feature array of the training split.
        y_train: 1-D label array aligned with ``X_train``.
        X_test: 2-D feature array of the test split.
        y_test: 1-D label array aligned with ``X_test``.

    Returns:
        A dict with keys ``"C"`` and ``"gamma"`` (selected values), ``"error"``
        (test misclassification rate) and ``"time"`` (seconds spent in the
        grid search ``fit`` call).
    """
    print("\n--- Multi-Class Classification (All 10 Digits, RBF Kernel) ---")
    param_grid = {'C': [10, 100, 1000], 'gamma': [0.0001, 0.001, 0.01]}  # updated range for C
    clf = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
    # perf_counter() is a monotonic clock, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    # NOTE: the interval covers the entire grid-search fit, not one model fit.
    start_time = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    best_params = clf.best_params_
    y_pred = clf.predict(X_test)
    misclassification_error = 1 - accuracy_score(y_test, y_pred)

    print(f"Best Parameters: {best_params}")
    print(f"Misclassification Error: {misclassification_error:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print(f"Training Time: {train_time:.2f} seconds")

    return {"C": best_params["C"], "gamma": best_params["gamma"], "error": misclassification_error, "time": train_time}

# Model performance comparison
def compare_binary_models(linear_results, rbf_results):
    """Print a side-by-side summary of the two binary (3 vs 6) SVM runs.

    Args:
        linear_results: Mapping with keys ``"C"``, ``"error"`` and ``"time"``.
        rbf_results: Mapping with keys ``"C"``, ``"gamma"``, ``"error"`` and
            ``"time"``.
    """
    print("\n--- Binary Model Comparison (3 vs 6) ---")

    # Each line is assembled immediately before it is printed, so a missing
    # key surfaces at the same point in the output as it always did.
    linear_line = (
        f"Linear Kernel: Best C={linear_results['C']}, "
        f"Error={linear_results['error']:.4f}, "
        f"Time={linear_results['time']:.2f}s"
    )
    print(linear_line)

    rbf_line = (
        f"RBF Kernel: Best C={rbf_results['C']}, "
        f"Gamma={rbf_results['gamma']}, "
        f"Error={rbf_results['error']:.4f}, "
        f"Time={rbf_results['time']:.2f}s"
    )
    print(rbf_line)

# Summarize the results of all models
def summarize_results(results):
    """Print every collected result as one table.

    Args:
        results: Data accepted by ``pandas.DataFrame`` — in this script a list
            of per-task dicts. Keys absent from a row show up as NaN.
    """
    print("\n--- Model Performance Summary ---")
    summary_table = pd.DataFrame(data=results)
    print(summary_table)

# Run every task in turn, appending one summary row as soon as each finishes
# so that partial results survive if a later (slower) task is interrupted.
results = []
_splits = (X_train, y_train, X_test, y_test)

# Binary task (3 vs 6): linear kernel, then RBF kernel, then compare the two.
linear_binary = svm_linear_binary(*_splits)
results.append({"Task": "Binary (3 vs 6, Linear)", **linear_binary})

rbf_binary = svm_rbf_binary(*_splits)
results.append({"Task": "Binary (3 vs 6, RBF)", **rbf_binary})

compare_binary_models(linear_binary, rbf_binary)

# Four-class task with a linear kernel.
linear_multi = svm_linear_multi(*_splits)
results.append({"Task": "Multi-Class (1, 2, 5, 8, Linear)", **linear_multi})

# Full ten-class task with an RBF kernel.
full_class = svm_full(*_splits)
results.append({"Task": "Full 10-Class (RBF)", **full_class})

summarize_results(results)


--- Binary Classification (3 vs 6, Linear Kernel) ---
Training Samples: 6026, Test Samples: 2462
Best C: 0.01
Misclassification Error: 0.0061
Confusion Matrix:
 [[1251   11]
 [   4 1196]]
Training Time: 1.43 seconds

--- Binary Classification (3 vs 6, RBF Kernel) ---
Training Samples: 6026, Test Samples: 2462
Best Parameters: {'C': 1000, 'gamma': 0.001}
Misclassification Error: 0.0049
Confusion Matrix:
 [[1255    7]
 [   5 1195]]
Training Time: 4.90 seconds

--- Binary Model Comparison (3 vs 6) ---
Linear Kernel: Best C=0.01, Error=0.0061, Time=1.43s
RBF Kernel: Best C=1000, Gamma=0.001, Error=0.0049, Time=4.90s

--- Multi-Class Classification (1, 2, 5, 8, Linear Kernel) ---
Training Samples: 11913, Test Samples: 4806
Best C: 0.1
Misclassification Error: 0.0460
Confusion Matrix:
 [[1346    9    1    7]
 [   7 1134   18   26]
 [  18   14 1061   33]
 [  26   17   45 1044]]
Training Time: 11.82 seconds

--- Multi-Class Classification (All 10 Digits, RBF Kernel) ---
Best Parameters: {'C':