# 1. 导入模块和库

In [None]:
import time
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.signal import savgol_filter
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report

# 2. 导入数据+预处理+PCA

In [None]:
# Load the spectral reflectance table, smooth and rescale the feature columns,
# and integer-encode the class labels.
# Column layout used below: column 0 is skipped, columns 1..-2 are the
# features, and the last column is the class label.
DATA_PATH = r"F:\Code_Data\2_Rocks_Spectrum_Reflectance_essay_30_origin_modify_2.csv"

# The original note suggests encoding='GBK' as the fallback when Chinese text
# comes out garbled.
data = pd.read_csv(DATA_PATH, encoding='utf-8')
X = data.iloc[:, 1:-1]
y = data.iloc[:, -1]
X.shape, y.shape  # only rendered when this is the last expression of a cell

# Savitzky-Golay smoothing along each row (window of 5 points, 2nd-order
# polynomial); axis=-1 is the scipy default, spelled out here for clarity.
X_SG = savgol_filter(X, window_length=5, polyorder=2, axis=-1)

# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split happens; fit it on the training rows only if a leakage-free estimate
# is required.
scaler = MinMaxScaler()
X_SG_mms = scaler.fit_transform(X_SG)

# Keep the fitted encoder so the integer codes can be mapped back to the
# original class names (label_encoder.classes_ / inverse_transform).
label_encoder = LabelEncoder()
Label = label_encoder.fit_transform(y)

# Build X and y straight from the processed arrays instead of writing them
# into `data` with .iloc and reading them back. Positional assignment can keep
# the existing column dtype, which would leave the integer codes in an
# object-dtype column that scikit-learn classifiers refuse as a target.
feature_columns = data.columns[1:-1]
label_column = data.columns[-1]
X = pd.DataFrame(X_SG_mms, index=data.index, columns=feature_columns)
y = pd.Series(Label, index=data.index, name=label_column)

# Keep `data` in sync with the processed values, in the same column layout.
data = pd.concat([data.iloc[:, :1], X, y], axis=1)

In [None]:
# Hold-out split for the tuned forest, defined here so that this cell runs when
# the notebook is executed top to bottom (the names are otherwise only assigned
# inside the component sweep further down). The settings match that sweep:
# test_size=0.3, random_state=0.
# NOTE(review): the split is taken on the full preprocessed feature matrix; if
# the hyperparameters below were tuned on PCA-reduced features, apply the same
# projection here -- confirm.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Best hyperparameters found by Bayesian optimisation.
rfc_2 = RFC(
    n_estimators=129,
    max_depth=33,
    max_features=9,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=0,
)
rfc_2 = rfc_2.fit(X_train, Y_train)
pred_2 = rfc_2.predict(X_test)

# Evaluation metrics for the tuned model: train/test accuracy, Cohen's kappa,
# the confusion matrix and the per-class report (as a dict) on the test rows.
accuracy_score_2_train = rfc_2.score(X_train, Y_train)
accuracy_score_2_test = rfc_2.score(X_test, Y_test)
kappa_score_2 = cohen_kappa_score(Y_test, pred_2)
cm_2 = confusion_matrix(Y_test, pred_2)
report = classification_report(Y_test, pred_2, output_dict=True)

In [None]:
# Show the train and test accuracy of the tuned forest (rfc_2) as a tuple.
accuracy_score_2_train, accuracy_score_2_test

# 3. 不同主成分占比下的准确率

In [None]:
# Sweep the number of principal components and record the train/test accuracy
# of the tuned random forest for each setting.

# Split once up front. With a fixed random_state and no stratification the row
# partition depends only on the number of samples, so it is loop-invariant.
# Holding on to the un-projected split also lets PCA be fitted on the training
# rows alone, so the held-out rows never influence the projection they are
# scored on.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Candidate component counts: 1, then 5 %, 10 %, ..., 100 % of the feature
# count. Integer arithmetic yields exact floors without float rounding. Values
# below 1 are dropped, values above what PCA can extract from the training
# matrix (min(n_train_samples, n_features)) are clamped, and duplicates are
# removed while keeping the ascending order.
n_features = X.shape[1]
max_components = min(X_train_raw.shape)
candidates = [1] + [n_features * k // 20 for k in range(1, 21)]
pcp_values = list(dict.fromkeys(
    min(count, max_components) for count in candidates if count >= 1
))

# Tuned hyperparameters shared by every fit (max_features is set per run).
rfc_params = dict(
    n_estimators=129,
    max_depth=33,
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=0,
)

Train_Accuracy = []
Test_Accuracy = []
results = []

for pcp in pcp_values:
    # random_state pins the result when PCA selects a randomised SVD solver.
    pca = PCA(n_components=pcp, random_state=0)
    X_train = pca.fit_transform(X_train_raw)
    X_test = pca.transform(X_test_raw)

    # The tuned value is max_features=9, but fewer than 9 components may be
    # available. Capping at pcp cannot exclude any feature from a split, and it
    # avoids the out-of-range error some scikit-learn releases raise.
    rfc_2 = RFC(max_features=min(9, pcp), **rfc_params)
    rfc_2 = rfc_2.fit(X_train, Y_train)

    accuracy_score_train = rfc_2.score(X_train, Y_train)
    accuracy_score_test = rfc_2.score(X_test, Y_test)

    Train_Accuracy.append(accuracy_score_train)
    Test_Accuracy.append(accuracy_score_test)

    print(f'PCP={pcp}: Train Accuracy = {accuracy_score_train}, Test Accuracy = {accuracy_score_test}')
    results.append({'PCP': pcp, 'Train Accuracy': accuracy_score_train, 'Test Accuracy': accuracy_score_test})

# One row per component count, in sweep order.
results_df = pd.DataFrame(results)
results_df