In [1]:
import time
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib_inline import backend_inline
backend_inline.set_matplotlib_formats('svg') # 展示高清图，在 Jupyter Notebook 中设置 matplotlib 图形的输出格式为 SVG 格式

import torch
import torch.nn as nn
from scipy.signal import savgol_filter
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as TTS
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report

import random
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 检测系统是否支持 CUDA，即是否有 NVIDIA GPU 可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# 加载数据
data = pd.read_csv(r"D:\向航\Jupyter_project\02_HSNI data classification\Final_Model\data_cz_test_ANN\data_33_lithology_train_33+1(85).csv",encoding='utf-8') #encoding='GBK',防止中文乱码

# 数据预处理
X_origin = data.iloc[:,1:-1]
y_origin = data.iloc[:,-1]
X_SG = savgol_filter(X_origin, 5, 2)
Label = LabelEncoder().fit_transform(y_origin)
data.iloc[:,1:-1] = X_SG
data.iloc[:, -1] = Label
X = data.iloc[:,1:-1]
y = data.iloc[:,-1]
X.shape, y.shape

# 降维
X_dr = PCA(29).fit_transform(X)

# 转换为 torch 中的张量格式
X_tensor = torch.tensor(X_dr, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# 将数据移动到GPU
X_tensor = X_tensor.to(device)
y_tensor = y_tensor.to(device)

Xtrain, Xtest, Ytrain, Ytest = TTS(X_dr,y,test_size=0.4,random_state=0)

# 转换为 torch 中的张量格式
X_train_tensor = torch.tensor(Xtrain, dtype=torch.float32)
X_test_tensor = torch.tensor(Xtest, dtype=torch.float32)
y_train_tensor = torch.tensor(Ytrain, dtype=torch.long)
y_test_array = Ytest.values
y_test_tensor = torch.tensor(y_test_array, dtype=torch.long)

# 将数据移动到GPU
X_train_tensor = X_train_tensor.to(device)
X_test_tensor = X_test_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)

# 创建训练集和测试集的 TensorDataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# 创建 DataLoader 对象
# train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

train_size = len(train_dataset)
test_size = len(test_dataset)

train_size, test_size

cuda


(7147, 4766)

In [3]:
# 初始化 LabelEncoder
label_encoder = LabelEncoder()

# 对原始标签进行拟合
label_encoder.fit(y_origin)

# 创建一个 DataFrame 来展示原始标签与转换后标签的对应关系
label_mapping_df = pd.DataFrame({
    'Original_Label': label_encoder.classes_,
    'Encoded_Label': label_encoder.transform(label_encoder.classes_)
})

# 显示表格
print(label_mapping_df)

   Original_Label  Encoded_Label
0          01_辉绿岩              0
1          02_斜长岩              1
2          03_正长岩              2
3        05_辉石闪长岩              3
4         07_花岗斑岩              4
5          09_橄榄岩              5
6         10_闪长玢岩              6
7        11_粗粒花岗岩              7
8        12_斑状花岗岩              8
9        13_斜长花岗岩              9
10       17_角砾凝灰岩             10
11        26_紫色页岩             11
12        28_炭质页岩             12
13        31_泥质灰岩             13
14       33_泥晶石灰岩             14
15        37_石英砾岩             15
16       38_复成份砾岩             16
17        41_石英砂岩             17
18         42_细砂岩             18
19      43_高岭石粘土岩             19
20      44_蒙脱石粘土岩             20
21      45_伊利石粘土岩             21
22        49_石英岩②             22
23         51_云英岩             23
24       55_粗晶大理岩             24
25       56_雪白大理岩             25
26       58_花岗片麻岩             26
27       59_绿泥石片岩             27
28     60_含榴白云母片岩             28
29      63

In [4]:
# 搭建网络
class DNN(nn.Module):
    def __init__(self, input_size, num_classes, hidden_layer_sizes, dropout_prob):
        ''' 搭建神经网络各层 '''
        super(DNN, self).__init__()

        self.hidden_layer_sizes = hidden_layer_sizes  # 调整隐藏层尺寸
        self.dropout_prob = dropout_prob  # 调整 dropout 概率

        layers = []
        prev_layer_size = input_size

        for layer_size in self.hidden_layer_sizes:
            layers.append(nn.Linear(prev_layer_size, layer_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(self.dropout_prob))
            prev_layer_size = layer_size

        layers.append(nn.Linear(prev_layer_size, num_classes))

        self.net = nn.Sequential(*layers)
        
    def forward(self, x):
        ''' 前向传播 '''
        y = self.net(x) # x 即输入数据
        return y # y 即输出数据

# 贝叶斯优化超参数
num_classes = 34
input_size = X_tensor.shape[1]
hidden_layer_sizes = [170, 241, 97, 177, 164]
dropout_prob = 0.16

# 创建模型实例
model = DNN(input_size, num_classes, hidden_layer_sizes, dropout_prob)
model.to(device)
model

DNN(
  (net): Sequential(
    (0): Linear(in_features=29, out_features=170, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.16, inplace=False)
    (3): Linear(in_features=170, out_features=241, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.16, inplace=False)
    (6): Linear(in_features=241, out_features=97, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.16, inplace=False)
    (9): Linear(in_features=97, out_features=177, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.16, inplace=False)
    (12): Linear(in_features=177, out_features=164, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.16, inplace=False)
    (15): Linear(in_features=164, out_features=34, bias=True)
  )
)

In [5]:
# 训练
loss_fn = nn.CrossEntropyLoss(reduction='mean') # 对所有样本的损失求平均，得到一个标量
learning_rate = 0.0028690154931344
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

epochs = 100
accuracies = [] # 记录准确率变化的列表
losses = [] # 记录损失函数变化的列表
model.train()

for epoch in range(epochs):
    total_correct = 0
    total_samples = 0
    batch_loss = 0
    
    for (x, y) in train_dataloader:
        x, y = x.to(device), y.to(device)
        Pred = model(x)
        _, Pred_classes = torch.max(Pred, dim=1)
        total_correct += torch.sum(Pred_classes == y)
        total_samples += y.size(0)
        loss = loss_fn(Pred, y)
        batch_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    epoch_loss = batch_loss / len(train_dataloader)
    epoch_accuracy = total_correct / total_samples
    accuracies.append(epoch_accuracy.item())
    losses.append(epoch_loss)

    # 打印当前 epoch 的损失和累计平均损失
    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {epoch_loss:.4f}, Cumulative Avg Loss: {sum(losses)/(epoch+1):.4f}, Accuracy: {epoch_accuracy:.4f}")

Epoch [1/100] - Loss: 2.0747, Cumulative Avg Loss: 2.0747, Accuracy: 0.3189
Epoch [2/100] - Loss: 0.7316, Cumulative Avg Loss: 1.4031, Accuracy: 0.7353
Epoch [3/100] - Loss: 0.4180, Cumulative Avg Loss: 1.0747, Accuracy: 0.8515
Epoch [4/100] - Loss: 0.2828, Cumulative Avg Loss: 0.8768, Accuracy: 0.9032
Epoch [5/100] - Loss: 0.1807, Cumulative Avg Loss: 0.7375, Accuracy: 0.9386
Epoch [6/100] - Loss: 0.1822, Cumulative Avg Loss: 0.6450, Accuracy: 0.9439
Epoch [7/100] - Loss: 0.1489, Cumulative Avg Loss: 0.5741, Accuracy: 0.9521
Epoch [8/100] - Loss: 0.1222, Cumulative Avg Loss: 0.5176, Accuracy: 0.9639
Epoch [9/100] - Loss: 0.0951, Cumulative Avg Loss: 0.4707, Accuracy: 0.9709
Epoch [10/100] - Loss: 0.0858, Cumulative Avg Loss: 0.4322, Accuracy: 0.9769
Epoch [11/100] - Loss: 0.1023, Cumulative Avg Loss: 0.4022, Accuracy: 0.9696
Epoch [12/100] - Loss: 0.0870, Cumulative Avg Loss: 0.3759, Accuracy: 0.9761
Epoch [13/100] - Loss: 0.0627, Cumulative Avg Loss: 0.3518, Accuracy: 0.9800
Epoch [1

In [6]:
# 测试
correct = 0
total = 0
all_preds = []
all_labels = []
model.eval()

with torch.no_grad(): # 该局部关闭梯度计算功能
    for (x, y) in test_dataloader: # 获取小批次的 x 与 y
        x, y = x.to(device), y.to(device)
        Pred = model(x) # 一次前向传播（小批量）
        _, Pred_classes = torch.max(Pred, dim=1)  # 找到最大概率对应的类别索引
        correct += torch.sum(Pred_classes == y)  # 计算正确的个数
        all_preds.extend(Pred_classes.cpu().numpy())  # 将预测转换为NumPy数组并存储
        all_labels.extend(y.cpu().numpy())  # 将真实标签转换为NumPy数组并存储
        total += y.size(0)
accuracy = correct/total
accuracy

tensor(0.9990, device='cuda:0')

In [7]:
from sklearn.metrics import classification_report
import pandas as pd

# 假设 all_labels_2 和 all_preds_2 分别是真实标签和预测标签的列表
report = classification_report(all_labels, all_preds, output_dict=True)

# 将报告转换为 Pandas DataFrame
report_df = pd.DataFrame(report).transpose()

# 删除总结行（如果不需要的话）
report_df = report_df.drop(['accuracy', 'macro avg', 'weighted avg'])

print(report_df)

    precision    recall  f1-score  support
0    1.000000  1.000000  1.000000    159.0
1    1.000000  1.000000  1.000000    154.0
2    0.993631  1.000000  0.996805    156.0
3    1.000000  1.000000  1.000000    116.0
4    1.000000  1.000000  1.000000    156.0
5    1.000000  1.000000  1.000000    169.0
6    1.000000  1.000000  1.000000    156.0
7    1.000000  1.000000  1.000000    160.0
8    0.986577  1.000000  0.993243    147.0
9    1.000000  1.000000  1.000000    159.0
10   1.000000  0.988571  0.994253    175.0
11   0.994012  1.000000  0.996997    166.0
12   1.000000  1.000000  1.000000    115.0
13   1.000000  1.000000  1.000000    154.0
14   1.000000  1.000000  1.000000    156.0
15   1.000000  1.000000  1.000000     71.0
16   1.000000  1.000000  1.000000     76.0
17   1.000000  1.000000  1.000000    110.0
18   1.000000  1.000000  1.000000    123.0
19   1.000000  1.000000  1.000000    177.0
20   1.000000  1.000000  1.000000    174.0
21   1.000000  1.000000  1.000000    161.0
22   1.0000

In [8]:
from sklearn.metrics import confusion_matrix

# 生成混淆矩阵
cm = confusion_matrix(all_labels, all_preds)

# 提取标签 33 的混淆矩阵行
label_33_row = cm[33]

# 输出标签 33 的所有预测情况
for label, count in enumerate(label_33_row):
    if label == 33:
        print(f"标签 33 正确预测的数量: {count}")
    elif count > 0:
        print(f"标签 33 被错误预测为标签 {label} 的数量: {count}")



标签 33 被错误预测为标签 2 的数量: 1
标签 33 正确预测的数量: 35
