In [41]:
import numpy as np
import random as rd
import shap
from math import log
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import svm
from sklearn.metrics import roc_curve,auc
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import validation_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from mlxtend.plotting import plot_decision_regions
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as pl

In [42]:
def plot_matrix(y_true, y_pred, labels_name, title=None, thresh=0.8, axis_labels=None,fontsize=20,title_fontsize=22, axis_title_fontsize=18, save_path="KNN_confusion_matrix.svg"):
    """Plot a row-normalized confusion matrix as an annotated heat map.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels, passed to
        ``metrics.confusion_matrix``.
    labels_name : sequence
        Class labels, in the order they should appear on both axes.
    title : str, optional
        Figure title; no title is drawn when ``None``.
    thresh : float
        Cells whose normalized value exceeds this are annotated in white,
        the others in black.
    axis_labels : sequence, optional
        Tick labels for both axes; defaults to ``labels_name``.
    fontsize, title_fontsize, axis_title_fontsize : int
        Font sizes for the cell annotations, the title, and the
        ticks / axis labels / colour-bar ticks respectively.
    save_path : str or None
        Where the figure is written as SVG. The default keeps the historical
        file name; pass ``None`` to skip saving.

    The raw and the normalized matrices are printed before plotting.
    """
    # Raw counts: rows are true classes, columns are predicted classes.
    counts = metrics.confusion_matrix(y_true, y_pred, labels=labels_name, sample_weight=None)
    print(counts, "\n")

    # Row-normalize; epsilon keeps rows without samples at 0 instead of NaN.
    epsilon = 1e-7
    cm = counts.astype('float') / (counts.sum(axis=1)[:, np.newaxis] + epsilon)
    print(cm, "\n")

    fig, ax = plt.subplots(figsize=(20, 10))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.get_cmap('Blues'), vmin=0.0)
    cbar = fig.colorbar(im, ax=ax)
    cbar.ax.tick_params(labelsize=axis_title_fontsize)

    if title is not None:
        ax.set_title(title, fontsize=title_fontsize)

    # One tick per class on both axes; x labels are tilted by 45 degrees.
    num_local = np.arange(len(labels_name))
    if axis_labels is None:
        axis_labels = labels_name
    ax.set_xticks(num_local)
    ax.set_xticklabels(axis_labels, rotation=45, fontsize=axis_title_fontsize)
    ax.set_yticks(num_local)
    ax.set_yticklabels(axis_labels, fontsize=axis_title_fontsize)
    ax.set_ylabel('True label', fontsize=axis_title_fontsize)
    ax.set_xlabel('Predicted label', fontsize=axis_title_fontsize)

    # Let matplotlib fit the rotated tick labels inside the figure.
    fig.tight_layout()

    # Annotate every cell with its row percentage (0% cells included).
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, "{:.0f}%".format(cm[i, j] * 100),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black",
                    fontsize=fontsize)

    # Save as a vector graphic before showing the figure.
    if save_path is not None:
        fig.savefig(save_path, format="svg")
    plt.show()


In [43]:
def count_nber(nb,arr):
    """Return how many elements of ``arr`` compare equal to ``nb``."""
    total = 0
    for element in arr:
        # Each comparison result is added to the running total.
        total = total + (element == nb)
    return total

In [44]:
def cross_val_class_score(clf, X, y, cv=2):
    """Compute per-class accuracy for every fold of a stratified k-fold split.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted on
        each training fold.
    X : array-like
        Feature matrix; converted to a NumPy array so the fold indices are
        always applied positionally (also for pandas inputs).
    y : array-like
        Class labels; converted to a NumPy array for the same reason.
    cv : int
        Number of stratified folds.

    Returns
    -------
    numpy.ndarray
        One row per fold and one column per class (sorted unique labels of
        ``y``). Each entry is the fraction of that class' test samples that
        were predicted correctly; NaN if the class has no sample in the fold.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Fixed label set so every fold yields a matrix of the same shape.
    labels = np.unique(y)

    class_accuracy = []
    for k, (train, test) in enumerate(StratifiedKFold(n_splits=cv).split(X, y)):
        clf.fit(X[train], y[train])  # fit on the training part of this fold
        y_test = y[test]
        y_pred = clf.predict(X[test])
        # Confusion matrix of this fold: diagonal / row total is the
        # per-class accuracy.
        cmat = confusion_matrix(y_test, y_pred, labels=labels)
        row_totals = cmat.sum(axis=1)
        class_acc = np.divide(cmat.diagonal(), row_totals,
                              out=np.full(len(labels), np.nan),
                              where=row_totals > 0)
        class_accuracy.append(class_acc)
        print('fold: {:d} accuracy {:s}'.format(k + 1, str(class_acc)))
    return np.array(class_accuracy)

In [45]:
def plot_feature_importances(feature_importances, title, feature_names):
    """Draw a bar chart of feature importances in descending order.

    Parameters
    ----------
    feature_importances : array
        Importance values; must support indexing by an integer index array.
    title : str
        Title of the figure.
    feature_names : pandas object
        Container of feature names; it is indexed with the sorted order and
        must expose ``.values``.
        NOTE(review): only element ``[0]` of the resulting list is used as
        tick labels, which suggests a single-row, 2-D container - confirm
        against the caller.

    The sorted index array and the reordered names are printed as a side
    effect.
    """
    # Indices of the features from most to least important.
    order = np.flipud(np.argsort(feature_importances))
    print(order, type(order))

    # Bars are centred at 0.5, 1.5, 2.5, ...
    bar_centers = np.arange(order.shape[0]) + 0.5

    plt.figure(figsize=(25, 8), dpi=80)
    plt.bar(bar_centers, feature_importances[order], align='center')

    ordered_names = feature_names[order].values.tolist()
    print(ordered_names)
    plt.xticks(bar_centers, ordered_names[0])

    plt.ylabel('Relative Importance')
    plt.title(title)
    plt.show()

In [None]:
# Load the data set; the last column is used as the class label and all
# other columns as features.
data = pd.read_excel('T-VMS.xlsx')
display(data)

# Oversample with SMOTE, then hold out 30 % as a stratified test set.
# NOTE(review): oversampling before the split lets synthetic samples derived
# from training rows end up in the test set, which can inflate test scores.
# Consider splitting first and resampling only the training part.
sm = SMOTE(random_state=40)
X_res, y_res = sm.fit_resample(data.iloc[:,:-1],data.iloc[:,-1])
display(X_res,y_res)
X_train, X_test, Y_train, Y_test = train_test_split(X_res,  y_res,
                                                    test_size=0.3, random_state=30, stratify = y_res)

mm = preprocessing.MinMaxScaler()  # not used in this cell
ss = preprocessing.StandardScaler()
# Standardize: the scaler statistics are learned on the training set only and
# then applied unchanged to the test set, so both share one scale.
X_train = pd.DataFrame(ss.fit_transform(X_train))
X_test = pd.DataFrame(ss.transform(X_test))

display(X_train, X_test, Y_train, Y_test)
np.random.seed(1)
clf = RandomForestClassifier(n_estimators=400)

# Candidate tree depths for the validation curve (also used by later cells).
max_depths = [3, 4, 5, 6, 7, 9, 11, 13, 15, 17, 19]
train_scores, test_scores = validation_curve(
    estimator=clf,
    X=X_train,
    y=Y_train,
    param_name='max_depth',
    param_range=max_depths,
    cv=5
)
# Flatten the per-fold scores and put them side by side:
# column 0 = training score, column 1 = validation score.
train_scores1 = train_scores.flatten()
test_scores1 = test_scores.flatten()
scores = np.vstack((train_scores1, test_scores1)).T
scores


In [47]:
def plot_validation_curve(train_scores, test_scores,
                          param_range, xlabel='', log=False):
    """Plot mean training and validation scores with a +/- 1 std band.

    Parameters
    ----------
    train_scores, test_scores : 2-D array
        Scores with one row per parameter value; mean and standard deviation
        are taken along axis 1.
    param_range : sequence
        Parameter values used as x coordinates.
    xlabel : str
        Label of the x axis; omitted when empty.
    log : bool
        If true, the x axis uses a logarithmic scale.

    A new figure is created; axis limits are left to the caller.
    """
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    palette = sns.color_palette('Set1')
    train_color, test_color = palette[1], palette[0]

    plt.figure()
    plt.plot(param_range, train_mean,
             color=train_color, marker='o',
             markersize=5, label='training accuracy')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, alpha=0.15,
                     color=train_color)

    plt.plot(param_range, test_mean,
             color=test_color, linestyle='--',
             marker='s', markersize=5,
             label='validation accuracy')
    plt.fill_between(param_range,
                     test_mean + test_std,
                     test_mean - test_std,
                     alpha=0.15, color=test_color)

    # Honour the optional arguments and show the line labels.
    if log:
        plt.xscale('log')
    plt.legend(loc='lower right')
    if xlabel:
        plt.xlabel(xlabel)
    plt.ylabel('Accuracy')

In [None]:
# Draw the max_depth validation curve, then fix the visible range on the
# axes the curve was drawn on: x from 3 to 19, y from 0 to 1.
plot_validation_curve(train_scores, test_scores, max_depths, xlabel='max_depth')
curve_ax = plt.gca()
curve_ax.set_xlim(3, 19)
curve_ax.set_ylim(0, 1)

In [None]:
# plt.savefig(validation_curve_path, bbox_inches='tight', dpi=300)
np.random.seed(1)

# Fit the classifier defined earlier on the full training set.
clf.fit(X_train, Y_train)

# Predict once per data set and reuse the results below and in later cells.
train_pred = clf.predict(X_train)
pred = clf.predict(X_test)
y_pred = pred

# Training-set score
print(clf.score(X=X_train, y=Y_train))
print(metrics.accuracy_score(y_true=Y_train, y_pred=train_pred))
# Test-set score
print(clf.score(X_test, Y_test))
print(metrics.accuracy_score(Y_test, pred))

cmat = confusion_matrix(Y_test, pred)
# Display names of the classes.
# NOTE(review): assumed to follow the sorted label order used by the
# confusion matrix - confirm. "Biomdal" looks like a typo for "Bimodal".
columns=["Biomdal-Felsic",
"Felsic-Siliciclastic",
"Mafic-Siliciclastic",
"Mafic",
"Biomdal-Mafic"
]
# Optional: draw the normalized confusion matrix with
#   plot_matrix(Y_test, pred, [0, 1, 2, 3, 4], title='KNN_Confusion_matrix',
#               axis_labels=columns)

# Labelled summary of the same scores.
print('训练集模型分数:', clf.score(X_train,Y_train))
print('测试集模型分数:', clf.score(X_test,Y_test))
print("训练集准确率: %.3f" % metrics.accuracy_score(Y_train, train_pred))
print("测试集准确率: %.3f" % metrics.accuracy_score(Y_test, y_pred))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Macro-averaged metrics on the test predictions: every class contributes
# with the same weight.

# Precision
precision = precision_score(y_true=Y_test, y_pred=pred, average='macro')
print(f'Precision: {precision}')

# Recall
recall = recall_score(y_true=Y_test, y_pred=pred, average='macro')
print(f'Recall: {recall}')

# F1 score
f1 = f1_score(y_true=Y_test, y_pred=pred, average='macro')
print(f'F1 Score: {f1}')


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Per-class precision / recall / F1 derived from the test confusion matrix
# (rows = true classes, columns = predicted classes).
cm = confusion_matrix(Y_test, pred)
# Use the matrix size itself so the loop always matches its rows and columns,
# even if the predictions contain a label missing from Y_test.
num_classes = cm.shape[0]

for i in range(num_classes):
    true_positive = cm[i, i]
    predicted_total = cm[:, i].sum()  # samples predicted as class i
    actual_total = cm[i, :].sum()     # samples that really are class i

    # Undefined ratios (zero denominator) are reported as 0 instead of NaN.
    precision_i = true_positive / predicted_total if predicted_total else 0.0
    recall_i = true_positive / actual_total if actual_total else 0.0
    pr_sum = precision_i + recall_i
    f1_i = 2 * (precision_i * recall_i) / pr_sum if pr_sum else 0.0

    # Fall back to the class index when no display name is available.
    class_name = columns[i] if i < len(columns) else str(i)
    print(f"类别 {class_name} 的Precision: {precision_i:.4f}")
    print(f"类别 {class_name} 的Recall: {recall_i:.4f}")
    print(f"类别 {class_name} 的F1-score: {f1_i:.4f}")