In [1]:
import numpy as np 
import pandas as pd
import sklearn
import time
import os
import shap
import re, pip, conda
import seaborn as sns # 用于特征重要性柱状图 Seaborn默认的图形样式和调色板使得绘图更美观，无需手动调整样式
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve # 使用 roc_curve 函数计算 ROC 曲线的 FPR 和 TPR，并将这些数据存储为 pandas DataFrame
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, KFold
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from ngboost import NGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, make_scorer
from deap import base, creator, tools, algorithms
import random



Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# 设置全局随机种子
np.random.seed(42)
random.seed(42)

In [3]:
# 读取数据
data1 = pd.read_excel(r'D:\Users\刘洋\Desktop\测试负2.xlsx')
X = data1.iloc[:, 1:12].values  # 取除最后一列之外的所有列作为特征
y = data1.iloc[:, 12].values   # 最后一列为标签

In [4]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# 数据标准化
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# 自定义评分标准
scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

In [7]:
# %% 定义遗传算法的适应度和个体
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# 注册生成 0 到 1 之间的随机浮点数，确保在合理范围内
toolbox = base.Toolbox()
toolbox.register("attr_float", random.uniform, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, n=8)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


In [8]:
# 自定义变异函数，确保不生成 complex 类型的值
def custom_mutate(individual, indpb=0.2):
    for i in range(len(individual)):
        if random.random() < indpb:
            mutation = random.uniform(-0.1, 0.1)  # 设置一个小范围的随机变异
            individual[i] += mutation
            individual[i] = max(0, min(individual[i], 1))  # 确保值在 [0, 1] 范围内
    return individual,

In [9]:
# %% 遗传算法参数设置
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", custom_mutate)  # 使用自定义变异函数
toolbox.register("select", tools.selTournament, tournsize=3)

In [10]:
# %% 评估函数定义（针对各个模型）
def evaluate_xgboost(individual):
    model = XGBClassifier(
        max_depth=int(max(min(individual[0] * 7 + 3, 10), 3)),
        learning_rate=max(min(individual[1] * 0.29 + 0.01, 0.3), 0.01),
        n_estimators = int(max(min(individual[2] * 250 + 50, 300), 50)),
        subsample=max(min(individual[3] * 0.5 + 0.5, 1.0), 0.5),
        reg_lambda=max(individual[4] * 10, 0),
        gamma = max(individual[5] * 5, 0),
        min_child_weight=int(max(min(individual[6] * 9 + 1, 10), 1)),
        colsample_bytree=max(min(individual[7] * 0.5 + 0.5, 1.0), 0.5),
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42  # 保持一致性
    )
    scores = cross_validate(model, X_train, y_train, cv=3, scoring='roc_auc')['test_score']
    return (scores.mean(),)

In [11]:
def evaluate_lightgbm(individual):
   
    model = LGBMClassifier(
        max_depth = int(max(min(individual[0] * 5 + 3, 8), 3)),
       
        learning_rate=max(min(individual[1] * 0.29 + 0.01, 0.3), 0.01),
        n_estimators=int(max(min(individual[2] * 250 + 50, 300), 50)),
        subsample=max(min(individual[3] * 0.5 + 0.5, 1.0), 0.5),
        
        reg_lambda=max(individual[4] * 10, 0),
        min_child_samples=int(max(min(individual[6] * 45 + 5, 50), 5)),
        colsample_bytree=max(min(individual[7] * 0.5 + 0.5, 1.0), 0.5),
        reg_alpha=max(individual[5] * 10, 0),
        random_state=42  # 保持一致性
    )
    scores = cross_validate(model, X_train, y_train, cv=4, scoring='roc_auc')['test_score']
    return (scores.mean(),)

In [12]:
def evaluate_catboost(individual):
    model = CatBoostClassifier(
        depth=int(max(min(individual[0] * 14 + 1, 15), 1)),
        learning_rate=max(min(individual[1] * 0.09 + 0.01, 0.1), 0.01),
        iterations=int(max(min(individual[2] * 450 + 50, 500), 50)),
        od_wait=int(max(min(individual[3] * 90 + 10, 100), 10)),
        l2_leaf_reg=(max(min(individual[5] * 9 + 1, 10),1)),
        bagging_temperature=max(individual[6] * 1, 0),
        colsample_bylevel=max(min(individual[7] * 0.5 + 0.5, 1.0), 0.5),
        verbose=0,
        random_state=42  # 保持一致性
    )
    scores = cross_validate(model, X_train, y_train, cv=2, scoring='roc_auc')['test_score']
    return (scores.mean(),)

In [13]:
def evaluate_ngboost(individual):
    model = NGBClassifier(
        n_estimators=int(max(min(individual[2] * 250 + 50, 300), 50)),
        learning_rate=max(min(individual[1] * 0.09 + 0.01, 0.1), 0.01),
        minibatch_frac=max(min(individual[3] * 0.3 + 0.7, 1.0), 0.7),
        natural_gradient=bool(round(individual[6])),
        random_state=42  # 保持一致性
    )
    scores = cross_validate(model, X_train, y_train, cv=3, scoring='roc_auc')['test_score']
    return (scores.mean(),)

In [14]:
# 使用遗传算法优化模型超参数
def optimize_with_ga(evaluate_func, model_name):
    toolbox.register("evaluate", evaluate_func)
    population = toolbox.population(n=20)    #种群规模
    ngen = 40  # 迭代次数
    cxpb = 0.5  # 交叉概率
    mutpb = 0.2  # 变异概率

    algorithms.eaSimple(population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                         stats=None, halloffame=None, verbose=False
                         )
    best_individual = tools.selBest(population, k=1)[0]
     # 将最佳个体转换为模型的实际参数字典
    if model_name == "XGBoost":
        optimal_params = {
            'max_depth': int(max(min(best_individual[0] * 7 + 3, 10), 3)),
            'learning_rate': max(min(best_individual[1] * 0.29 + 0.01, 0.3), 0.01),
            'n_estimators': int(max(min(best_individual[2] * 250 + 50, 300), 50)),
            'subsample': max(min(best_individual[3] * 0.5 + 0.5, 1.0), 0.5),
            'reg_lambda': max(best_individual[4] * 10, 0),
            'gamma': max(best_individual[5] * 5, 0),
            'min_child_weight': int(max(min(best_individual[6] * 9 + 1, 10), 1)),
            'colsample_bytree': max(min(best_individual[7] * 0.5 + 0.5, 1.0), 0.5)
        }
    elif model_name == "LGBM":
        optimal_params = {
            'max_depth': int(max(min(best_individual[0] * 5 + 3, 8), 3)),
            'learning_rate': max(min(best_individual[1] * 0.29 + 0.01, 0.3), 0.01),
            'n_estimators': int(max(min(best_individual[2] * 250 + 50, 300), 50)),
            'subsample': max(min(best_individual[3] * 0.5 + 0.5, 1.0), 0.5),
            'reg_lambda': max(best_individual[4] * 10, 0),
            'min_child_samples': int(max(min(best_individual[6] * 45 + 5, 50), 5)),
            'colsample_bytree': max(min(best_individual[7] * 0.5 + 0.5, 1.0), 0.5),
            'reg_alpha': max(best_individual[5] * 10, 0)
        }
    elif model_name == "CatBoost":
        optimal_params = {
            'depth': int(max(min(best_individual[0] * 14 + 1, 15), 1)),
            'learning_rate': max(min(best_individual[1] * 0.09 + 0.01, 0.1), 0.01),
            'iterations': int(max(min(best_individual[2] * 450 + 50, 500), 50)),
            'od_wait': int(max(min(best_individual[3] * 90 + 10, 100), 10)),
            'l2_leaf_reg': max(min(best_individual[4] * 9 + 1, 10), 1),
            'bagging_temperature': max(best_individual[5] * 1, 0),
            'colsample_bylevel': max(min(best_individual[6] * 0.5 + 0.5, 1.0), 0.5)
        }
    elif model_name == "NGBoost":
        optimal_params = {
            'n_estimators': int(max(min(best_individual[2] * 250 + 50, 300), 50)),
            'learning_rate': max(min(best_individual[1] * 0.09 + 0.01, 0.1), 0.01),
            'minibatch_frac': max(min(best_individual[3] * 0.3 + 0.7, 1.0), 0.7),
            'natural_gradient': bool(round(best_individual[6]))
        }
    
    print(f"Optimal Parameters for {model_name}: {optimal_params}")
    return optimal_params

In [None]:
# %% 获取并打印每个模型的最优参数
optimal_params = {
    "XGBoost": optimize_with_ga(evaluate_xgboost, "XGBoost"),
    "LGBM": optimize_with_ga(evaluate_lightgbm, "LGBM"),
    "CatBoost": optimize_with_ga(evaluate_catboost, "CatBoost"),
    "NGBoost": optimize_with_ga(evaluate_ngboost, "NGBoost")
}


In [16]:
# %% 将最优参数保存到 Excel 文件
output_file_path = r"D:\Users\刘洋\Desktop\GA最优参数4.xlsx"
data = {'Model': [], 'Parameter': [], 'Value': []}

# 遍历 optimal_params 字典，将参数数据添加到 data 字典中
for model_name, params in optimal_params.items():
    for param_name, param_value in params.items():
        data['Model'].append(model_name)
        data['Parameter'].append(param_name)
        data['Value'].append(round(param_value, 3) if isinstance(param_value, float) else param_value)

# 转换为 DataFrame
df = pd.DataFrame(data)

# 保存到 Excel 文件
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    df.to_excel(writer, index=False, sheet_name='Optimal Hyperparameters')

print("Optimal hyperparameters have been saved to", output_file_path)

Optimal hyperparameters have been saved to D:\Users\刘洋\Desktop\GA最优参数4.xlsx


In [17]:
# %% 使用最优参数初始化模型
models = {
    "XGBoost": XGBClassifier(**optimal_params["XGBoost"], use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LGBM": LGBMClassifier(**optimal_params["LGBM"], random_state=42),
    "CatBoost": CatBoostClassifier(**optimal_params["CatBoost"], random_seed=42, verbose=0),
    "NGBoost": NGBClassifier(**optimal_params["NGBoost"], random_state=42)
}

In [18]:
# 评估函数
def evaluate_model(model, X_train, y_train, X_test, y_test, cv):
    result = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring
                            , return_train_score=True
                            , verbose=False
                            )
    metrics = {
        "train": {
            "RCO-AUC": result['train_roc_auc'].mean(),
            "ACC": result['train_accuracy'].mean(),
            "F1": result['train_f1'].mean(),
            "Precision": result['train_precision'].mean(),
            "Recall": result['train_recall'].mean()
        },
        "test": {
            "RCO-AUC": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
            "ACC": accuracy_score(y_test, model.predict(X_test)),
            "F1": f1_score(y_test, model.predict(X_test)),
            "Precision": precision_score(y_test, model.predict(X_test)),
            "Recall": recall_score(y_test, model.predict(X_test))
        }
    }
    return metrics

In [None]:
# 交叉验证并评估优化后的模型
results = {}
cv = KFold(n_splits=10, shuffle=True, random_state=1412)
for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    metrics = evaluate_model(model, X_train, y_train, X_test, y_test, cv)
    end = time.time() - start
    metrics["time"] = end
    results[name] = metrics

In [20]:
# 打印结果
for name, metrics in results.items():
    print(f"Model: {name}")
    print(f"Training time: {metrics['time']:.3f} s")
    for phase in ["train", "test"]:
        print(f"{phase.capitalize()} scores:")
        for metric, score in metrics[phase].items():
            print(f"  {metric}: {score:.3f}")
    print()

Model: XGBoost
Training time: 0.425 s
Train scores:
  RCO-AUC: 0.894
  ACC: 0.803
  F1: 0.815
  Precision: 0.751
  Recall: 0.890
Test scores:
  RCO-AUC: 0.872
  ACC: 0.837
  F1: 0.863
  Precision: 0.815
  Recall: 0.917

Model: LGBM
Training time: 0.249 s
Train scores:
  RCO-AUC: 0.870
  ACC: 0.799
  F1: 0.811
  Precision: 0.745
  Recall: 0.890
Test scores:
  RCO-AUC: 0.854
  ACC: 0.837
  F1: 0.863
  Precision: 0.815
  Recall: 0.917

Model: CatBoost
Training time: 45.652 s
Train scores:
  RCO-AUC: 1.000
  ACC: 0.995
  F1: 0.995
  Precision: 1.000
  Recall: 0.989
Test scores:
  RCO-AUC: 0.910
  ACC: 0.860
  F1: 0.880
  Precision: 0.846
  Recall: 0.917

Model: NGBoost
Training time: 1.861 s
Train scores:
  RCO-AUC: nan
  ACC: 1.000
  F1: 1.000
  Precision: 1.000
  Recall: 1.000
Test scores:
  RCO-AUC: 0.884
  ACC: 0.837
  F1: 0.857
  Precision: 0.840
  Recall: 0.875



In [21]:
# 将结果保存到指定位置的表格中
rows = []
for name, metrics in results.items():
    for phase in ["train", "test"]:
        row = {
            'Model': name,
            'Phase': phase,
            'Training Time (s)': metrics['time'] if phase == 'train' else None
        }
        for metric, score in metrics[phase].items():
            row[metric] = score
        rows.append(row)

# 创建 DataFrame
df = pd.DataFrame(rows)

# 保存到 Excel 文件
output_path = r'D:\Users\刘洋\Desktop\GA精度4.xlsx'
df.to_excel(output_path, index=False)

print(f"Results have been saved to {output_path}")

Results have been saved to D:\Users\刘洋\Desktop\GA精度4.xlsx


In [22]:
# 绘制所有模型的ROC曲线
plt.figure(figsize=(10, 8))

for name, model in models.items():
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'GA-{name} (AUC = {roc_auc_score(y_test, y_pred_proba):.3f})')

# 添加图例和图形的标签
plt.plot([0, 1], [0, 1], 'k--',linewidth=1)  # 参考线：随机分类器的ROC曲线
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Models')
plt.legend(loc='lower right')
plt.grid(True, linestyle='--', alpha=0.6)
plt.savefig(r'D:\Users\刘洋\Desktop\GA_ROC4.png')
plt.close()  # 关闭图像以确保保存后不会影响后续图形的显示



In [None]:
# 读取研究区数据
all_data = pd.read_excel(r'D:\Users\刘洋\Desktop\无坐标 - 副本.xlsx')
X_all = all_data.iloc[:, 1:12].values  # 第2到12列为特征值，共计11列

# 数据标准化
scaler_all = StandardScaler()
X_all = scaler_all.fit_transform(X_all)

# 初始化用于保存所有模型预测概率的 DataFrame
probabilities_df = pd.DataFrame(index=all_data.index)

# 对研究区数据进行概率预测
for name, model in models.items():
    # 使用之前训练好的模型对研究区数据进行预测
    try:
        y_prob_all = model.predict_proba(X_all)[:, 1]  # 获取研究区数据属于“正例”（滑坡易发）的概率
    except AttributeError:
        # 如果模型未训练，进行训练（使用训练集数据）
        data1 = pd.read_excel(r'D:\Users\刘洋\Desktop\测试负2.xlsx', index_col=0)
        X = data1.iloc[:, 0:-1].values  # 取除最后一列之外的所有列作为特征
        y = data1.iloc[:, -1].values  # 最后一列为标签
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        model.fit(X_train, y_train)
        y_prob_all = model.predict_proba(X_all)[:, 1]
    
    # 将预测概率添加到 DataFrame 中
    probabilities_df[name] = y_prob_all
    
    # 绘制研究区数据的概率分布
    plt.figure(figsize=(10, 6))
    sns.histplot(y_prob_all, kde=True, bins=30)
    plt.title(f'Probability Distribution for Landslide Susceptibility - {name}')
    plt.xlabel('Predicted Probability of Landslide Susceptibility')
    plt.ylabel('Frequency')
    plt.show()

    # 输出概率预测的基本统计信息
    print(f"Probability Statistics for {name} Model:")
    print(f"  Mean: {np.mean(y_prob_all):.3f}")
    print(f"  Standard Deviation: {np.std(y_prob_all):.3f}")
    print(f"  Min: {np.min(y_prob_all):.3f}")
    print(f"  Max: {np.max(y_prob_all):.3f}")
    print()

# 将所有模型的概率预测结果输出到 Excel 文件
output_path = r'D:\Users\刘洋\Desktop\GA预测4.csv'
probabilities_df.to_csv(output_path)
print(f"Combined probability predictions have been saved to {output_path}")

In [None]:
# %% SHAP 解释   英文文章的
# 确保特征名称正确
feature_names = all_data.columns[1:12]  # 第2到12列为特征名称

# 对每个模型进行 SHAP 解释
for name, model in models.items():
    print(f"Generating SHAP explanations for model: {name}")
    
    # 使用 TreeExplainer 对树模型（XGBoost、LGBM、CatBoost）进行解释
    if name in ["XGBoost", "LGBM", "CatBoost"]:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_all)

        # 绘制 SHAP 总结图（柱状图）
        shap.summary_plot(shap_values, X_all, feature_names=feature_names, plot_type="bar", show=False)
        plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Summary_{name}_AllData.png')
        plt.close()

        # 绘制 SHAP 散点图（按特征显示的散点图）
        if isinstance(shap_values, list):  # 检查是否有多个类别
            for i in range(len(shap_values)):
                shap.summary_plot(shap_values[i], X_all, feature_names=feature_names, plot_type="dot", show=False)
                plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Scatter_{name}_AllData_class_{i}.png')
                plt.close()
        else:
            shap.summary_plot(shap_values, X_all, feature_names=feature_names, plot_type="dot", show=False)
            plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Scatter_{name}_AllData.png')
            plt.close()

    # 使用通用的 SHAP Explainer 进行解释 (NGBoost)
    elif name == "NGBoost":
        explainer = shap.Explainer(model.predict, X_all)
        shap_values = explainer(X_all)

        # 绘制 SHAP 总结图
        shap.summary_plot(shap_values, X_all, feature_names=feature_names, plot_type="bar", show=False)
        plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Summary_{name}_AllData.png')
        plt.close()

        # 绘制 SHAP 散点图
        shap.summary_plot(shap_values, X_all, feature_names=feature_names, plot_type="dot", show=False)
        plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Scatter_{name}_AllData.png')
        plt.close()

print("SHAP explanations have been generated and saved for each model.")


In [None]:
import shap
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# 加载 SimHei 字体
font_path = r"C:\Windows\Fonts\SimHei.ttf"  # 请根据你的系统调整路径
prop = fm.FontProperties(fname=font_path)

# 全局设置 Matplotlib 使用 SimHei 字体
plt.rcParams['font.sans-serif'] = ['SimHei']  # 全局设置字体为 SimHei
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

# 确保特征名称正确
feature_names = all_data.columns[1:12]  # 第2到12列为特征名称

# 对每个模型进行 SHAP 解释
for name, model in models.items():
    print(f"Generating SHAP explanations for model: {name}")

    # 使用 TreeExplainer 对树模型（XGBoost、LGBM、CatBoost）进行解释
    if name in ["XGBoost", "LGBM", "CatBoost"]:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_all)

        # 绘制 SHAP 总结图（柱状图）
        shap.summary_plot(shap_values, X_all, feature_names=feature_names, plot_type="bar", show=False)
        plt.title(f"SHAP 总结图 - {name}", fontproperties=prop)
        plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Summary_{name}_AllData.png', bbox_inches='tight')
        plt.close()

        # 绘制 SHAP 散点图（按特征显示的散点图）
        if isinstance(shap_values, list):  # 检查是否有多个类别
            for i in range(len(shap_values)):
                shap.summary_plot(shap_values[i], X_all, feature_names=feature_names, plot_type="dot", show=False)
                plt.title(f"SHAP 散点图 - {name} - 类别 {i}", fontproperties=prop)
                plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Scatter_{name}_AllData_class_{i}.png', bbox_inches='tight')
                plt.close()
        else:
            shap.summary_plot(shap_values, X_all, feature_names=feature_names, plot_type="dot", show=False)
            plt.title(f"SHAP 散点图 - {name}", fontproperties=prop)
            plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Scatter_{name}_AllData.png', bbox_inches='tight')
            plt.close()

    # 使用通用的 SHAP Explainer 进行解释 (NGBoost)
    elif name == "NGBoost":
        explainer = shap.Explainer(model.predict, X_all)
        shap_values = explainer(X_all)

        # 绘制 SHAP 总结图
        shap.summary_plot(shap_values, X_all, feature_names=feature_names, plot_type="bar", show=False)
        plt.title(f"SHAP 总结图 - {name}", fontproperties=prop)
        plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Summary_{name}_AllData.png', bbox_inches='tight')
        plt.close()

        # 绘制 SHAP 散点图
        shap.summary_plot(shap_values, X_all, feature_names=feature_names, plot_type="dot", show=False)
        plt.title(f"SHAP 散点图 - {name}", fontproperties=prop)
        plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Scatter_{name}_AllData.png', bbox_inches='tight')
        plt.close()

print("SHAP explanations have been generated and saved for each model.")


In [None]:
# 确保特征名称正确   这个是两种因素之间的依赖图
if "CatBoost" in models:
    print("Generating custom SHAP scatter plots for CatBoost model...")

    # 提取 CatBoost 模型和 SHAP 解释器
    model = models["CatBoost"]
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_all)

    # 如果 shap_values 是列表（分类任务），选择第一个类别的 SHAP 值
    if isinstance(shap_values, list):
        shap_values = shap_values[0]

    # 确保 feature_names 是一个列表
    feature_names = list(feature_names)

    # 确保 X_all 是 Pandas 数据框
    import pandas as pd
    if not isinstance(X_all, pd.DataFrame):
        X_all = pd.DataFrame(X_all, columns=feature_names)

    # 遍历每个特征，绘制单特征 SHAP 散点图
    for feature in feature_names:
        print(f"Plotting SHAP scatter plot for feature: {feature}")

        # 获取当前特征值和对应的 SHAP 值
        feature_values = X_all[feature].values
        feature_shap_values = shap_values[:, feature_names.index(feature)]

        # 绘制单特征的散点图
        plt.figure(figsize=(6, 4))
        plt.scatter(feature_values, feature_shap_values, alpha=0.7, s=10, c='blue')
        plt.axhline(y=0, color='gray', linestyle='--', linewidth=0.8)
        plt.xlabel(feature)
        plt.ylabel("SHAP value")
        plt.title(f"SHAP Scatter Plot for {feature}")
        plt.grid(alpha=0.3)

        # 保存图像
        plt.savefig(fr'D:\Users\刘洋\Desktop\SHAP_Custom_CatBoost_{feature}.png')
        plt.close()

    print("Custom SHAP scatter plots for CatBoost model have been generated and saved.")


In [None]:
# 确保特征名称正确   单因素的依赖图 英文论文的
if "CatBoost" in models:
    print("Generating custom SHAP scatter plots for CatBoost model...")

    # 提取 CatBoost 模型和 SHAP 解释器
    model = models["CatBoost"]
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_all)

    # 如果 shap_values 是列表（分类任务），选择第一个类别的 SHAP 值
    if isinstance(shap_values, list):
        shap_values = shap_values[0]

    # 确保 feature_names 是一个列表
    feature_names = list(feature_names)

    # 遍历每个特征，绘制单特征 SHAP 散点图
    for feature in feature_names:
        print(f"Plotting SHAP scatter plot for feature: {feature}")

        # 提取原始特征值（确保原始数据与标准化数据对齐）
        # original_data 应该是未处理过的原始数据
        original_data = pd.read_excel(r'D:\Users\刘洋\Desktop\original_data - 副本.xlsx')  # 替换为原始数据文件路径

       

        original_data = original_data.reset_index(drop=True)
        X_all = X_all.reset_index(drop=True)
        feature_values = original_data[feature].values  # 使用原始数据的特征值

        # 提取对应的 SHAP 值
        feature_shap_values = shap_values[:, feature_names.index(feature)]

        # 绘制单特征的散点图
        plt.figure(figsize=(6, 6))
        plt.scatter(feature_values, feature_shap_values, alpha=0.7, s=10, c='blue')
        plt.axhline(y=0, color='gray', linestyle='--', linewidth=0.8)

        # 增大横纵坐标的标度值字体大小
        plt.xticks(fontsize=12)  # 设置横坐标标度值字体大小
        plt.yticks(fontsize=12)  # 设置纵坐标标度值字体大小


        plt.xlabel(f"{feature} (Original Feature Values)", fontsize=12)  # 横坐标为原始特征值
        plt.ylabel("SHAP value ", fontsize=12)
        #plt.title(f"SHAP Scatter Plot for {feature}")
        plt.grid(alpha=0.3)
        
        # 去除多余空白
        plt.tight_layout()

        # 保存图像
        plt.savefig(fr'D:\Users\刘洋\Desktop\{feature}.png')
        plt.close()

    print("Custom SHAP scatter plots for CatBoost model have been generated and saved.")


In [None]:
# 确保特征名称正确 单因素的依赖图 专利
if "CatBoost" in models:
    print("Generating custom SHAP scatter plots for CatBoost model...")

    # 提取 CatBoost 模型和 SHAP 解释器
    model = models["CatBoost"]
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_all)

    # 如果 shap_values 是列表（分类任务），选择第一个类别的 SHAP 值
    if isinstance(shap_values, list):
        shap_values = shap_values[0]

    # 确保 feature_names 是一个列表
    feature_names = list(feature_names)

    # 将 X_all 转换为 DataFrame 以便使用 reset_index()
    X_all_df = pd.DataFrame(X_all, columns=feature_names)
    X_all_df = X_all_df.reset_index(drop=True)

    # 遍历每个特征，绘制单特征 SHAP 散点图
    for feature in feature_names:
        print(f"Plotting SHAP scatter plot for feature: {feature}")

        # 提取原始特征值（确保原始数据与标准化数据对齐）
        # original_data 应该是未处理过的原始数据
        original_data = pd.read_excel(r'D:\Users\刘洋\Desktop\original_data - 副本.xlsx')  # 替换为原始数据文件路径

        original_data = original_data.reset_index(drop=True)
        feature_values = original_data[feature].values  # 使用原始数据的特征值

        # 提取对应的 SHAP 值
        feature_shap_values = shap_values[:, feature_names.index(feature)]

        # 绘制单特征的散点图
        plt.figure(figsize=(6, 6))
        plt.scatter(feature_values, feature_shap_values, alpha=0.7, s=10, c='blue')
        plt.axhline(y=0, color='gray', linestyle='--', linewidth=0.8)

        # 增大横纵坐标的标度值字体大小
        plt.xticks(fontsize=16)  # 设置横坐标标度值字体大小
        plt.yticks(fontsize=16)  # 设置纵坐标标度值字体大小

        plt.xlabel(f"{feature} ", fontsize=16, family='SimHei')  # 横坐标为原始特征值
        plt.ylabel("SHAP 值 ", fontsize=16, family='SimHei')
        # plt.title(f"SHAP Scatter Plot for {feature}")
        plt.grid(alpha=0.3)

        # 去除多余空白
        plt.tight_layout()

        # 保存图像
        plt.savefig(fr'D:\Users\刘洋\Desktop\{feature}.png')
        plt.close()

    print("Custom SHAP scatter plots for CatBoost model have been generated and saved.")