In [14]:
import time
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import chardet
import os

from joblib import dump, load
from scipy.signal import savgol_filter
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [21]:
# 加载数据
data = pd.read_csv(r"D:\向航\Jupyter_project\02_HSNI data classification\Final_Model\data_cz_test_RF\data_33_lithology_train_33+1(240).csv",encoding='utf-8') #encoding='GBK',防止中文乱码

# 数据预处理
X = data.iloc[:,1:-1]
y_origin = data.iloc[:,-1]
X_SG = savgol_filter(X, 5, 2)
# X_SG_mms = MinMaxScaler().fit_transform(X_SG)
Label = LabelEncoder().fit_transform(y_origin)
# data.iloc[:,1:-1] = X_SG_mms
data.iloc[:,1:-1] = X_SG
data.iloc[:, -1] = Label
X = data.iloc[:,1:-1]
y = data.iloc[:,-1]
X.shape, y.shape

# 降维
X_dr = PCA(29).fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_dr, y, test_size=0.3, random_state=0)

# 贝叶斯优化最佳参数
rfc = RFC(n_estimators=10
            ,max_depth=6
            ,max_features=9
            ,min_samples_leaf=1
            ,min_samples_split=5
            ,random_state=0
           )
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)

# 计算精确率、召回率和F1分数
precision, recall, f1, _ = precision_recall_fscore_support(Y_test, Y_pred, labels=[33])

# 打印结果
print(f"Precision for label 33: {precision[0]}")
print(f"Recall for label 33: {recall[0]}")
print(f"F1 Score for label 33: {f1[0]}")

Precision for label 33: 0.8805970149253731
Recall for label 33: 0.9076923076923077
F1 Score for label 33: 0.8939393939393939


In [18]:
# 初始化 LabelEncoder
label_encoder = LabelEncoder()

# 对原始标签进行拟合
label_encoder.fit(y_origin)

# 创建一个 DataFrame 来展示原始标签与转换后标签的对应关系
label_mapping_df = pd.DataFrame({
    'Original_Label': label_encoder.classes_,
    'Encoded_Label': label_encoder.transform(label_encoder.classes_)
})

# 显示表格
print(label_mapping_df)

   Original_Label  Encoded_Label
0          01_辉绿岩              0
1          02_斜长岩              1
2          03_正长岩              2
3        05_辉石闪长岩              3
4         07_花岗斑岩              4
5          09_橄榄岩              5
6         10_闪长玢岩              6
7        11_粗粒花岗岩              7
8        12_斑状花岗岩              8
9        13_斜长花岗岩              9
10       17_角砾凝灰岩             10
11        26_紫色页岩             11
12        28_炭质页岩             12
13        31_泥质灰岩             13
14       33_泥晶石灰岩             14
15        37_石英砾岩             15
16       38_复成份砾岩             16
17        41_石英砂岩             17
18         42_细砂岩             18
19      43_高岭石粘土岩             19
20      44_蒙脱石粘土岩             20
21      45_伊利石粘土岩             21
22        49_石英岩②             22
23         51_云英岩             23
24       55_粗晶大理岩             24
25       56_雪白大理岩             25
26       58_花岗片麻岩             26
27       59_绿泥石片岩             27
28     60_含榴白云母片岩             28
29      63

In [5]:
# 保存模型
dump(rfc, 'rfc_model.joblib')

# 加载模型
rfc_model = load('rfc_model.joblib')

# 原始测试数据的文件路径
original_file_path = r"D:\向航\Jupyter_project\02_HSNI data classification\Final_Model\data_cz_test_RF\data_cz_test_240.csv"

# 加载测试数据（假设没有标签列）
data_test = pd.read_csv(original_file_path, encoding='GBK')

# 应用预处理
X_test = data_test.iloc[:, 1:]  # 假设第一列是索引或其他非特征列
X_test_SG = savgol_filter(X_test, 5, 2)
X_test_dr = PCA(29).fit_transform(X_test_SG)  # 应用 PCA 降维

# 进行预测
predictions = rfc_model.predict(X_test_dr)

# 创建新的数据框，只包含原始数据的第一列和预测标签
result_df = pd.DataFrame({
    'Index': data_test.iloc[:, 0],  # 原始数据的第一列
    'Predictions': predictions
})

# 构建新的文件路径，添加 "_predictions" 后缀
file_dir, file_name = os.path.split(original_file_path)
file_name_without_ext = os.path.splitext(file_name)[0]
new_file_name = file_name_without_ext + "_predictions.csv"
new_file_path = os.path.join(file_dir, new_file_name)

# 使用新文件路径保存结果
result_df.to_csv(new_file_path, index=False, encoding='GBK')

print("仅包含索引和预测标签的结果已保存到文件中：" + new_file_path)

仅包含索引和预测标签的结果已保存到文件中：D:\向航\Jupyter_project\02_HSNI data classification\Final_Model\data_cz_test_RF\data_cz_test_240_predictions.csv


In [6]:
# 将 predictions 数列转换为 Pandas Series
predictions_series = pd.Series(predictions)

# 计算每个预测标签的出现次数
label_counts = predictions_series.value_counts()

# 计算每个标签的占比
label_proportions = label_counts / len(predictions_series)

# 创建一个 DataFrame 来存储结果
results_df = pd.DataFrame({
    'Label': label_counts.index,
    'Count': label_counts.values,
    'Proportion': label_proportions.values
})

# 按照 Label 列进行升序排序
sorted_results_df = results_df.sort_values(by='Label')

# 显示排序后的结果
print(sorted_results_df)

# 筛选出特定标签 7, 8, 9 的计数
specific_label_counts = label_counts.loc[[7, 8, 9]]

# 计算这些标签的总计数
specific_total_count = specific_label_counts.sum()

# 计算这些标签的总占比
specific_total_proportion = specific_total_count / len(predictions_series)

print(f"标签 7, 8, 9 的总计数: {specific_total_count}")
print(f"标签 7, 8, 9 的总占比: {specific_total_proportion:.4f}")


    Label  Count  Proportion
23      0      1    0.004167
2       1     20    0.083333
1       2     20    0.083333
6       3     11    0.045833
17      4      4    0.016667
15      6      4    0.016667
19      7      3    0.012500
0       8     71    0.295833
5       9     12    0.050000
13     10      4    0.016667
4      11     13    0.054167
3      13     17    0.070833
10     14      7    0.029167
21     16      1    0.004167
20     19      2    0.008333
14     20      4    0.016667
22     22      1    0.004167
9      23      7    0.029167
11     26      5    0.020833
12     28      5    0.020833
16     29      4    0.016667
8      30     10    0.041667
18     31      4    0.016667
7      32     10    0.041667
标签 7, 8, 9 的总计数: 86
标签 7, 8, 9 的总占比: 0.3583
