In [7]:
import time
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib_inline import backend_inline
backend_inline.set_matplotlib_formats('svg') # 展示高清图，在 Jupyter Notebook 中设置 matplotlib 图形的输出格式为 SVG 格式

import torch
import torch.nn as nn
from scipy.signal import savgol_filter
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as TTS
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report

import random
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [8]:
# 检测系统是否支持 CUDA，即是否有 NVIDIA GPU 可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# 加载数据
data = pd.read_csv(r"D:\向航\Jupyter_project\02_HSNI data classification\Final_Model\data_cz_test_ANN\data_33_lithology_train.csv",encoding='utf-8') #encoding='GBK',防止中文乱码

# 数据预处理
X_origin = data.iloc[:,1:-1]
y_origin = data.iloc[:,-1]
X_SG = savgol_filter(X_origin, 5, 2)
Label = LabelEncoder().fit_transform(y_origin)
data.iloc[:,1:-1] = X_SG
data.iloc[:, -1] = Label
X = data.iloc[:,1:-1]
y = data.iloc[:,-1]
X.shape, y.shape

# 降维
X_dr = PCA(29).fit_transform(X)

# 转换为 torch 中的张量格式
X_tensor = torch.tensor(X_dr, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# 假设 device 已经设置为 "cuda" 或 "cpu"
X_tensor = X_tensor.to(device)
y_tensor = y_tensor.to(device)

# 创建整个数据集的 TensorDataset
dataset = TensorDataset(X_tensor, y_tensor)

# 创建 DataLoader 对象
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)  # 可以调整 batch_size 和 shuffle

# 获取数据集的大小
dataset_size = len(dataset)

dataset_size

cuda


11913

In [3]:
# 初始化 LabelEncoder
label_encoder = LabelEncoder()

# 对原始标签进行拟合
label_encoder.fit(y_origin)

# 创建一个 DataFrame 来展示原始标签与转换后标签的对应关系
label_mapping_df = pd.DataFrame({
    'Original_Label': label_encoder.classes_,
    'Encoded_Label': label_encoder.transform(label_encoder.classes_)
})

# 显示表格
print(label_mapping_df)

   Original_Label  Encoded_Label
0          01_辉绿岩              0
1          02_斜长岩              1
2          03_正长岩              2
3        05_辉石闪长岩              3
4         07_花岗斑岩              4
5          09_橄榄岩              5
6         10_闪长玢岩              6
7        11_粗粒花岗岩              7
8        12_斑状花岗岩              8
9        13_斜长花岗岩              9
10       17_角砾凝灰岩             10
11        26_紫色页岩             11
12        28_炭质页岩             12
13        31_泥质灰岩             13
14       33_泥晶石灰岩             14
15        37_石英砾岩             15
16       38_复成份砾岩             16
17        41_石英砂岩             17
18         42_细砂岩             18
19      43_高岭石粘土岩             19
20      44_蒙脱石粘土岩             20
21      45_伊利石粘土岩             21
22        49_石英岩②             22
23         51_云英岩             23
24       55_粗晶大理岩             24
25       56_雪白大理岩             25
26       58_花岗片麻岩             26
27       59_绿泥石片岩             27
28     60_含榴白云母片岩             28
29      63

In [4]:
# 搭建网络
class DNN(nn.Module):
    def __init__(self, input_size, num_classes, hidden_layer_sizes, dropout_prob):
        ''' 搭建神经网络各层 '''
        super(DNN, self).__init__()

        self.hidden_layer_sizes = hidden_layer_sizes  # 调整隐藏层尺寸
        self.dropout_prob = dropout_prob  # 调整 dropout 概率

        layers = []
        prev_layer_size = input_size

        for layer_size in self.hidden_layer_sizes:
            layers.append(nn.Linear(prev_layer_size, layer_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(self.dropout_prob))
            prev_layer_size = layer_size

        layers.append(nn.Linear(prev_layer_size, num_classes))

        self.net = nn.Sequential(*layers)
        
    def forward(self, x):
        ''' 前向传播 '''
        y = self.net(x) # x 即输入数据
        return y # y 即输出数据

# 贝叶斯优化超参数
num_classes = 33
input_size = X_tensor.shape[1]
hidden_layer_sizes = [170, 241, 97, 177, 164]
dropout_prob = 0.16

# 创建模型实例
model = DNN(input_size, num_classes, hidden_layer_sizes, dropout_prob)
model.to(device)
model

DNN(
  (net): Sequential(
    (0): Linear(in_features=29, out_features=170, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.16, inplace=False)
    (3): Linear(in_features=170, out_features=241, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.16, inplace=False)
    (6): Linear(in_features=241, out_features=97, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.16, inplace=False)
    (9): Linear(in_features=97, out_features=177, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.16, inplace=False)
    (12): Linear(in_features=177, out_features=164, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.16, inplace=False)
    (15): Linear(in_features=164, out_features=34, bias=True)
  )
)

In [5]:
# 训练
loss_fn = nn.CrossEntropyLoss(reduction='mean') # 对所有样本的损失求平均，得到一个标量
learning_rate = 0.0028690154931344
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

epochs = 15
accuracies = [] # 记录准确率变化的列表
losses = [] # 记录损失函数变化的列表
model.train()

for epoch in range(epochs):
    total_correct = 0
    total_samples = 0
    batch_loss = 0
    
    for (x, y) in data_loader:
        x, y = x.to(device), y.to(device)
        Pred = model(x)
        _, Pred_classes = torch.max(Pred, dim=1)
        total_correct += torch.sum(Pred_classes == y)
        total_samples += y.size(0)
        loss = loss_fn(Pred, y)
        batch_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    epoch_loss = batch_loss / len(data_loader)
    epoch_accuracy = total_correct / total_samples
    accuracies.append(epoch_accuracy.item())
    losses.append(epoch_loss)

    # 打印当前 epoch 的损失和累计平均损失
    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {epoch_loss:.4f}, Cumulative Avg Loss: {sum(losses)/(epoch+1):.4f}, Accuracy: {epoch_accuracy:.4f}")

Epoch [1/15] - Loss: 1.6022, Cumulative Avg Loss: 1.6022, Accuracy: 0.4656
Epoch [2/15] - Loss: 0.4246, Cumulative Avg Loss: 1.0134, Accuracy: 0.8498
Epoch [3/15] - Loss: 0.2327, Cumulative Avg Loss: 0.7532, Accuracy: 0.9216
Epoch [4/15] - Loss: 0.1410, Cumulative Avg Loss: 0.6001, Accuracy: 0.9559
Epoch [5/15] - Loss: 0.1124, Cumulative Avg Loss: 0.5026, Accuracy: 0.9655
Epoch [6/15] - Loss: 0.1156, Cumulative Avg Loss: 0.4381, Accuracy: 0.9644
Epoch [7/15] - Loss: 0.0951, Cumulative Avg Loss: 0.3891, Accuracy: 0.9752
Epoch [8/15] - Loss: 0.0727, Cumulative Avg Loss: 0.3496, Accuracy: 0.9798
Epoch [9/15] - Loss: 0.0617, Cumulative Avg Loss: 0.3176, Accuracy: 0.9820
Epoch [10/15] - Loss: 0.0534, Cumulative Avg Loss: 0.2912, Accuracy: 0.9846
Epoch [11/15] - Loss: 0.0596, Cumulative Avg Loss: 0.2701, Accuracy: 0.9834
Epoch [12/15] - Loss: 0.0876, Cumulative Avg Loss: 0.2549, Accuracy: 0.9793
Epoch [13/15] - Loss: 0.0368, Cumulative Avg Loss: 0.2381, Accuracy: 0.9905
Epoch [14/15] - Loss:

In [6]:
# 加载新数据
test_data = pd.read_csv(r"D:\向航\Jupyter_project\02_HSNI data classification\Final_Model\data_cz_test_ANN\data_cz_test_2.csv", encoding='utf-8')

# 应用相同的数据预处理
X_test_origin = test_data.iloc[:, 1:]  # 假设第一列是索引或非特征列
X_test_SG = savgol_filter(X_test_origin, 5, 2)

# 降维
X_test_dr = PCA(29).fit_transform(X_test_SG)

# 转换为 torch 中的张量格式
X_test_tensor = torch.tensor(X_test_dr, dtype=torch.float32)
X_test_tensor = X_test_tensor.to(device)

# 使用模型进行预测
model.eval()  # 将模型设置为评估模式
with torch.no_grad():
    test_pred = model(X_test_tensor)
    _, test_pred_classes = torch.max(test_pred, dim=1)

# 输出预测结果
predicted_labels = test_pred_classes.cpu().numpy()
print(predicted_labels)

# 创建一个 DataFrame 来保存预测标签
predicted_df = pd.DataFrame(predicted_labels, columns=['Predicted_Label'])

# 计算每个预测标签的出现次数
label_counts = predicted_df['Predicted_Label'].value_counts()

# 将结果转换为 DataFrame 并重置索引
label_counts_df = label_counts.reset_index()
label_counts_df.columns = ['Predicted_Label', 'Count']

# 对 DataFrame 按标签进行排序（从小到大）
sorted_label_counts_df = label_counts_df.sort_values(by='Predicted_Label')

# 显示排序后的结果
print(sorted_label_counts_df)

[ 8 11 32 15 16  8 29 15 22 29 32  0 29  8 15 23 26 20 29 28 29 16 15  7
 28 16 29 29 15 29 16 23 30 29 32 28 17 17 17 30 17 28 15 28 30 17 28 30
  2 24 17 28 28 28 17 30 28 17 30 17 28 17 28 29 29 29 26 17 30  7 17 29
 17 22 28 22 13  2 17 17 17 30 17 30  0]
    Predicted_Label  Count
11                0      2
12                2      2
13                7      2
7                 8      3
16               11      1
15               13      1
4                15      6
5                16      4
0                17     17
17               20      1
8                22      3
9                23      2
14               24      1
10               26      2
2                28     13
1                29     13
3                30      9
6                32      3
