# 1. PCA (1, 2151) 的准确率对比

In [1]:
import time
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib_inline import backend_inline
backend_inline.set_matplotlib_formats('svg') # 展示高清图，在 Jupyter Notebook 中设置 matplotlib 图形的输出格式为 SVG 格式

import torch
import torch.nn as nn
from scipy.signal import savgol_filter
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as TTS
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report

import random
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
pcp_values = [1] + [29]+ [int(p) for p in np.arange(0.05, 1.05, 0.05)*2151]
Train_Accuracy = []
Test_Accuracy = []
results = []

for pcp in pcp_values:
    # 导入数据，降维
    data = pd.read_csv(r"F:\Code_Data\2_Rocks_Spectrum_Reflectance_essay_30_origin.csv",encoding='utf-8') #encoding='GBK',防止中文乱码
    X = data.iloc[:,1:-1]
    y = data.iloc[:,-1]
    X_SG = savgol_filter(X, 5, 2)
    X_SG_mms = MinMaxScaler().fit_transform(X_SG)
    Label = LabelEncoder().fit_transform(y)
    data.iloc[:,1:-1] = X_SG_mms
    data.iloc[:, -1] = Label
    X = data.iloc[:,1:-1]
    y = data.iloc[:,-1]
    pca = PCA(n_components=pcp)
    X_dr = pca.fit_transform(X)

    # 划分数据集
    X_tensor = torch.tensor(X_dr, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.long)
    X_tensor = X_tensor.to(device)
    y_tensor = y_tensor.to(device)
    Xtrain, Xtest, Ytrain, Ytest = TTS(X_dr,y,test_size=0.3,random_state=0)
    X_train_tensor = torch.tensor(Xtrain, dtype=torch.float32)
    X_test_tensor = torch.tensor(Xtest, dtype=torch.float32)
    y_train_tensor = torch.tensor(Ytrain, dtype=torch.long)
    y_test_array = Ytest.values
    y_test_tensor = torch.tensor(y_test_array, dtype=torch.long)
    X_train_tensor = X_train_tensor.to(device)
    X_test_tensor = X_test_tensor.to(device)
    y_train_tensor = y_train_tensor.to(device)
    y_test_tensor = y_test_tensor.to(device)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)
    train_size = len(train_dataset)
    test_size = len(test_dataset)

    # 搭建网络
    class DNN(nn.Module):
        def __init__(self, input_size, num_classes, hidden_layer_sizes, dropout_prob):
            ''' 搭建神经网络各层 '''
            super(DNN, self).__init__()
            self.hidden_layer_sizes = hidden_layer_sizes  # 调整隐藏层尺寸
            self.dropout_prob = dropout_prob  # 调整 dropout 概率
            layers = []
            prev_layer_size = input_size
            for layer_size in self.hidden_layer_sizes:
                layers.append(nn.Linear(prev_layer_size, layer_size))
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(self.dropout_prob))
                prev_layer_size = layer_size
            layers.append(nn.Linear(prev_layer_size, num_classes))
            self.net = nn.Sequential(*layers)   
        def forward(self, x):
            ''' 前向传播 '''
            y = self.net(x) # x 即输入数据
            return y # y 即输出数据

    # 超参数
    num_classes = 33
    input_size = X_tensor.shape[1]
    hidden_layer_sizes_2 = [170, 241, 97, 177, 164]
    dropout_prob_2 = 0.16

    # 创建实例
    model_2 = DNN(input_size, num_classes, hidden_layer_sizes_2, dropout_prob_2)
    model_2.to(device)
    
    # 训练
    loss_fn = nn.CrossEntropyLoss(reduction='mean') # 对所有样本的损失求平均，得到一个标量
    learning_rate_2 = 0.0028690154931344
    optimizer_2 = torch.optim.Adam(model_2.parameters(), lr=learning_rate_2)
    epochs = 200
    accuracies_2 = [] # 记录准确率变化的列表
    losses_2 = [] # 记录损失函数变化的列表
    model_2.train()
    for epoch in range(epochs):
        total_correct_2 = 0
        total_samples_2 = 0
        batch_loss_2 = 0
        for (x, y) in train_dataloader: # 获取小批次的 x 与 y
            x, y = x.to(device), y.to(device)
            Pred = model_2(x) # 一次前向传播（小批量）
            _, Pred_classes = torch.max(Pred, dim=1)
            total_correct_2 += torch.sum(Pred_classes == y)
            total_samples_2 += y.size(0)
            loss = loss_fn(Pred, y) # 计算损失函数
            batch_loss_2 += loss.item()  # 累加当前 batch 的损失
            #losses.append(loss.item()) # 记录损失函数的变化
            optimizer_2.zero_grad() # 清理上一轮滞留的梯度
            loss.backward() # 一次反向传播
            optimizer_2.step() # 优化内部参数
        accuracy_2 = total_correct_2 / total_samples_2
        accuracies_2.append(accuracy_2.item())

    # 测试
    correct_2 = 0
    total_2 = 0
    all_preds_2 = []
    all_labels_2 = []
    model_2.eval()
    with torch.no_grad(): # 该局部关闭梯度计算功能
        for (x, y) in test_dataloader: # 获取小批次的 x 与 y
            x, y = x.to(device), y.to(device)
            Pred = model_2(x) # 一次前向传播（小批量）
            _, Pred_classes = torch.max(Pred, dim=1)  # 找到最大概率对应的类别索引
            correct_2 += torch.sum(Pred_classes == y)  # 计算正确的个数
            all_preds_2.extend(Pred_classes.cpu().numpy())  # 将预测转换为NumPy数组并存储
            all_labels_2.extend(y.cpu().numpy())  # 将真实标签转换为NumPy数组并存储
            total_2 += y.size(0)
    accuracy_2 = correct_2/total_2

    accuracy_score_train = max(accuracies_2)
    accuracy_score_test = accuracy_2
    Train_Accuracy.append(max(accuracies_2))
    Test_Accuracy.append(accuracy_2)
    
    print(f'PCP={pcp}: Train Accuracy = {accuracy_score_train}, Test Accuracy = {accuracy_score_test}')
    results.append({'PCP': pcp, 'Train Accuracy': accuracy_score_train, 'Test Accuracy': accuracy_score_test})
    
results_df = pd.DataFrame(results)
results_df

PCP=1: Train Accuracy = 0.21306920051574707, Test Accuracy = 0.19413919746875763
PCP=29: Train Accuracy = 0.998671293258667, Test Accuracy = 0.9988729357719421
PCP=107: Train Accuracy = 0.999275267124176, Test Accuracy = 0.9980276226997375
PCP=215: Train Accuracy = 0.9996376037597656, Test Accuracy = 0.9985911846160889
PCP=322: Train Accuracy = 0.999396026134491, Test Accuracy = 0.9985911846160889
PCP=430: Train Accuracy = 0.999275267124176, Test Accuracy = 0.9980276226997375
PCP=537: Train Accuracy = 0.999275267124176, Test Accuracy = 0.9988729357719421
PCP=645: Train Accuracy = 0.999396026134491, Test Accuracy = 0.997464120388031
PCP=752: Train Accuracy = 0.9996376037597656, Test Accuracy = 0.9980276226997375
PCP=860: Train Accuracy = 0.9998791813850403, Test Accuracy = 0.9983094334602356
PCP=967: Train Accuracy = 0.9995168447494507, Test Accuracy = 0.9966188073158264
PCP=1075: Train Accuracy = 0.9995168447494507, Test Accuracy = 0.9980276226997375
PCP=1183: Train Accuracy = 0.999516

Unnamed: 0,PCP,Train Accuracy,Test Accuracy
0,1,0.213069,"tensor(0.1941, device='cuda:0')"
1,29,0.998671,"tensor(0.9989, device='cuda:0')"
2,107,0.999275,"tensor(0.9980, device='cuda:0')"
3,215,0.999638,"tensor(0.9986, device='cuda:0')"
4,322,0.999396,"tensor(0.9986, device='cuda:0')"
5,430,0.999275,"tensor(0.9980, device='cuda:0')"
6,537,0.999275,"tensor(0.9989, device='cuda:0')"
7,645,0.999396,"tensor(0.9975, device='cuda:0')"
8,752,0.999638,"tensor(0.9980, device='cuda:0')"
9,860,0.999879,"tensor(0.9983, device='cuda:0')"


In [6]:
import pandas as pd

data = {
    'PCP': [1, 29, 107, 215, 322, 430, 537, 645, 752, 860, 967, 1075, 1183, 1290, 1398, 1505, 1613, 1720, 1828, 1935, 2043, 2151],
    'Train Accuracy': [0.21306920051574707, 0.998671293258667, 0.999275267124176, 0.9996376037597656, 0.999396026134491, 0.999275267124176, 0.999275267124176, 0.999396026134491, 0.9996376037597656, 0.9998791813850403, 0.9995168447494507, 0.9995168447494507, 0.9995168447494507, 0.999396026134491, 0.9995168447494507, 0.9997584223747253, 0.999275267124176, 0.9990336894989014, 0.999275267124176, 0.999396026134491, 0.9998791813850403, 0.9991544485092163],
    'Test Accuracy': [0.19413919746875763, 0.9988729357719421, 0.9980276226997375, 0.9985911846160889, 0.9985911846160889, 0.9980276226997375, 0.9988729357719421, 0.997464120388031, 0.9980276226997375, 0.9983094334602356, 0.9966188073158264, 0.9980276226997375, 0.9983094334602356, 0.997182309627533, 0.9994364976882935, 0.997464120388031, 0.9985911846160889, 0.9977458715438843, 0.9983094334602356, 0.9985911846160889, 0.9949281811714172, 0.9952099323272705]
}

df = pd.DataFrame(data)
df

Unnamed: 0,PCP,Train Accuracy,Test Accuracy
0,1,0.213069,0.194139
1,29,0.998671,0.998873
2,107,0.999275,0.998028
3,215,0.999638,0.998591
4,322,0.999396,0.998591
5,430,0.999275,0.998028
6,537,0.999275,0.998873
7,645,0.999396,0.997464
8,752,0.999638,0.998028
9,860,0.999879,0.998309
