# 财务舞弊识别集成学习框架

本Notebook实现了基于XGBoost、LightGBM和CatBoost的集成学习模型，用于财务舞弊检测。
包含以下主要功能：
1. 数据加载与预处理
2. 混合采样方法处理不平衡数据
3. 使用Optuna进行贝叶斯超参数优化
4. 加权软投票集成方法
5. 堆叠集成(Stacking)方法
6. 模型性能比较与可视化

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import optuna
import pickle
import json
import warnings

warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

## 1. 数据加载与预处理

In [2]:
# 假设数据已经预处理并保存为CSV文件
try:
    data = pd.read_csv('reduced_data.csv')
    print('数据加载成功，共有 {} 行和 {} 列'.format(data.shape[0], data.shape[1]))
except Exception as e:
    print('数据加载失败: {}'.format(e))


# 分离特征和标签
X = data.drop(['Stkcd', 'Accper', 'Typrep ', 'isviolation'], axis=1)
y = data['isviolation']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('训练集大小: {}，测试集大小: {}'.format(X_train.shape[0], X_test.shape[0]))

# 查看类别分布
print('原始训练集类别分布：', Counter(y_train))
print('原始测试集类别分布：', Counter(y_test))

# 计算不平衡比例
neg_count = sum(y_train == 0)
pos_count = sum(y_train == 1)
imbalance_ratio = neg_count / pos_count
print('训练集不平衡比例（负样本/正样本）：{:.2f}'.format(imbalance_ratio))

数据加载成功，共有 119060 行和 31 列
训练集大小: 95248，测试集大小: 23812
原始训练集类别分布： Counter({0: 81458, 1: 13790})
原始测试集类别分布： Counter({0: 20365, 1: 3447})
训练集不平衡比例（负样本/正样本）：5.91


## 2. 混合采样方法处理数据不平衡

In [3]:
# 混合采样策略：先过采样，再欠采样
over_strategy = {1: min(int(pos_count * 2), neg_count - 100)}  # 过采样正样本到负样本的一半
under_strategy = {0: int(over_strategy[1] * 1.5)}  # 欠采样负样本到正样本的1.5倍

# 创建采样管道
over_sampler = SMOTE(random_state=42, sampling_strategy=over_strategy)
under_sampler = RandomUnderSampler(random_state=42, sampling_strategy=under_strategy)

# 先过采样，再过欠采样
X_train_resampled, y_train_resampled = over_sampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train_resampled, y_train_resampled)

print('采样后训练集类别分布：', Counter(y_train_resampled))
print('采样后不平衡比例：{:.2f}'.format(sum(y_train_resampled == 0) / sum(y_train_resampled == 1)))

采样后训练集类别分布： Counter({0: 41370, 1: 27580})
采样后不平衡比例：1.50


## 3. XGBoost模型优化

In [8]:
# -------------------------- 预划分CV索引 --------------------------
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 确保数据集已定义（请替换为你的实际数据）
try:
    cv_splits = list(skf.split(X_train_resampled, y_train_resampled))
except NameError:
    raise NameError("请先定义 X_train_resampled、y_train_resampled、X_test、y_test 数据集！")

# -------------------------- 目标函数（无早停+缩小参数范围） --------------------------
def objective_xgb(trial):
    param = {
        # 核心：基于最优分布缩小搜索范围
        'n_estimators': trial.suggest_int('n_estimators', 500, 530),  # 原500-600 → 500-530
        'max_depth': trial.suggest_int('max_depth', 13, 14),          # 原12-15 → 13-14
        'learning_rate': trial.suggest_float('learning_rate', 0.23, 0.25),  # 原0.18-0.25 → 0.23-0.25
        'subsample': trial.suggest_float('subsample', 0.88, 0.90),    # 原0.85-0.95 → 0.88-0.90
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.73, 0.75),  # 原0.65-0.75 → 0.73-0.75
        'gamma': trial.suggest_float('gamma', 0.05, 0.10),             # 原0.05-0.3 → 0.05-0.10
        'min_child_weight': trial.suggest_int('min_child_weight', 8, 9),  # 原8-10 → 8-9
        'reg_alpha': trial.suggest_float('reg_alpha', 4.7, 4.9),      # 原4.5-5.5 → 4.7-4.9
        'reg_lambda': trial.suggest_float('reg_lambda', 3.3, 3.5),    # 原3.0-4.0 → 3.3-3.5
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'auc',
        'n_jobs': 4  # 限制线程数
    }

    auc_scores = []
    for train_idx, val_idx in cv_splits:
        # 兼容DataFrame和numpy数组索引
        if hasattr(X_train_resampled, 'iloc'):
            X_train_fold, X_val_fold = X_train_resampled.iloc[train_idx], X_train_resampled.iloc[val_idx]
            y_train_fold, y_val_fold = y_train_resampled.iloc[train_idx], y_train_resampled.iloc[val_idx]
        else:
            X_train_fold, X_val_fold = X_train_resampled[train_idx], X_train_resampled[val_idx]
            y_train_fold, y_val_fold = y_train_resampled[train_idx], y_train_resampled[val_idx]

        model = xgb.XGBClassifier(**param)
        
        # 核心：仅保留基础训练参数，无任何早停逻辑
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],  # 保留验证集计算AUC，不影响训练
            verbose=False  # 关闭冗余的训练日志输出
        )

        # 计算验证集AUC（调优的核心指标）
        y_val_pred_proba = model.predict_proba(X_val_fold)[:, 1]
        auc_score = roc_auc_score(y_val_fold, y_val_pred_proba)
        auc_scores.append(auc_score)

    return np.mean(auc_scores)

# -------------------------- 启动Optuna调优 --------------------------
print('开始XGBoost参数优化（缩小范围+无早停）...')
study_xgb = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),  # 保留Optuna剪枝，提前终止差的试验
    sampler=optuna.samplers.TPESampler(seed=42)
)

# 范围缩小后，20次trial足够找到更优参数
study_xgb.optimize(objective_xgb, n_trials=20, n_jobs=2)

# -------------------------- 输出最优结果 --------------------------
print(f"\n【最优交叉验证AUC】: {study_xgb.best_value:.6f}")
print("【最优参数】:")
for key, value in study_xgb.best_params.items():
    print(f"  {key}: {value}")

# -------------------------- 训练最终模型（无早停+关闭日志） --------------------------
best_params_xgb = study_xgb.best_params
best_params_xgb.update({
    'use_label_encoder': False,
    'eval_metric': 'auc',
    'random_state': 42,
    'n_jobs': -1  # 最终模型用全线程加速
})

xgb_model = xgb.XGBClassifier(** best_params_xgb)

# 最终模型训练也无早停逻辑
xgb_model.fit(
    X_train_resampled, y_train_resampled,
    eval_set=[(X_test, y_test)],  # 保留测试集评估指标
    verbose=False
)

# -------------------------- 评估测试集 --------------------------
y_test_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
xgb_auc = roc_auc_score(y_test, y_test_pred_proba_xgb)
print(f"\n【测试集最终AUC】: {xgb_auc:.6f}")

[I 2025-11-20 13:28:29,082] A new study created in memory with name: no-name-99bba749-0e9b-437d-9ad9-984d596a1556


开始XGBoost参数优化（缩小范围+无早停）...


[I 2025-11-20 13:29:00,991] Trial 1 finished with value: 0.8484106740460179 and parameters: {'n_estimators': 516, 'max_depth': 13, 'learning_rate': 0.23808831490597251, 'subsample': 0.8905149147302976, 'colsample_bytree': 0.7390922645435068, 'gamma': 0.07086812708644882, 'min_child_weight': 9, 'reg_alpha': 4.821403174849755, 'reg_lambda': 3.4503765320212927}. Best is trial 1 with value: 0.8484106740460179.
[I 2025-11-20 13:29:02,611] Trial 0 finished with value: 0.8515065107014298 and parameters: {'n_estimators': 519, 'max_depth': 13, 'learning_rate': 0.23638437235537466, 'subsample': 0.8901637561172621, 'colsample_bytree': 0.7446828419788466, 'gamma': 0.05714831910327781, 'min_child_weight': 8, 'reg_alpha': 4.7754252419124406, 'reg_lambda': 3.368687440544859}. Best is trial 0 with value: 0.8515065107014298.
[I 2025-11-20 13:29:29,891] Trial 2 finished with value: 0.8505954512023916 and parameters: {'n_estimators': 502, 'max_depth': 14, 'learning_rate': 0.23078869247899225, 'subsample'


【最优交叉验证AUC】: 0.852961
【最优参数】:
  n_estimators: 530
  max_depth: 14
  learning_rate: 0.2352126104186026
  subsample: 0.8961185528547606
  colsample_bytree: 0.7303698751544976
  gamma: 0.050879559154835045
  min_child_weight: 8
  reg_alpha: 4.76721529233421
  reg_lambda: 3.3814729277275077

【测试集最终AUC】: 0.789192


In [14]:
# 保存模型
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

# 保存最佳参数
with open('xgb_best_params.json', 'w') as f:
    json.dump(best_params_xgb, f)

## 4. LightGBM模型优化

In [9]:
# LightGBM参数优化函数
def objective_lgb(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'random_state': 42,
        'verbose': -1
    }
    
    # 5折交叉验证
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []
    
    for train_idx, val_idx in skf.split(X_train_resampled, y_train_resampled):
        X_train_fold, X_val_fold = X_train_resampled.iloc[train_idx], X_train_resampled.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_resampled.iloc[train_idx], y_train_resampled.iloc[val_idx]
        
        model = lgb.LGBMClassifier(**param)
        model.fit(X_train_fold, y_train_fold)
        
        y_val_pred_proba = model.predict_proba(X_val_fold)[:, 1]
        auc_score = roc_auc_score(y_val_fold, y_val_pred_proba)
        auc_scores.append(auc_score)
    
    return np.mean(auc_scores)

# 运行Optuna优化
print('开始LightGBM参数优化...')
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=50)

print('LightGBM最佳参数:')
for key, value in study_lgb.best_params.items():
    print('{}: {}'.format(key, value))

print('交叉验证最佳AUC: {:.4f}'.format(study_lgb.best_value))

# 使用最佳参数训练最终模型
best_params_lgb = study_lgb.best_params
best_params_lgb['random_state'] = 42
best_params_lgb['verbose'] = -1

lgb_model = lgb.LGBMClassifier(**best_params_lgb)
lgb_model.fit(X_train_resampled, y_train_resampled)

# 预测和评估
y_test_pred_proba_lgb = lgb_model.predict_proba(X_test)[:, 1]
lgb_auc = roc_auc_score(y_test, y_test_pred_proba_lgb)

print('LightGBM测试集 AUC: {:.4f}'.format(lgb_auc))

[I 2025-11-20 16:02:04,313] A new study created in memory with name: no-name-fae5dda7-be4d-45b1-856b-0ae3b8aab2ce


开始LightGBM参数优化...


[I 2025-11-20 16:02:13,441] Trial 0 finished with value: 0.8039115865367508 and parameters: {'n_estimators': 411, 'max_depth': 8, 'learning_rate': 0.10243774399350054, 'subsample': 0.947319491588874, 'colsample_bytree': 0.6045739672105842, 'reg_alpha': 0.8942885100804299, 'reg_lambda': 0.07062078083064471, 'num_leaves': 55, 'min_child_samples': 90}. Best is trial 0 with value: 0.8039115865367508.
[I 2025-11-20 16:02:28,215] Trial 1 finished with value: 0.7841863071596235 and parameters: {'n_estimators': 477, 'max_depth': 14, 'learning_rate': 0.027006766786775348, 'subsample': 0.8247377010707932, 'colsample_bytree': 0.6642981746918198, 'reg_alpha': 3.3525612060862575, 'reg_lambda': 7.265601515190201, 'num_leaves': 82, 'min_child_samples': 30}. Best is trial 0 with value: 0.8039115865367508.
[I 2025-11-20 16:02:32,841] Trial 2 finished with value: 0.7720567788557356 and parameters: {'n_estimators': 193, 'max_depth': 8, 'learning_rate': 0.07420569937182112, 'subsample': 0.863979111617436,

LightGBM最佳参数:
n_estimators: 876
max_depth: 15
learning_rate: 0.1997576770286354
subsample: 0.914859086744174
colsample_bytree: 0.8477897996564947
reg_alpha: 1.8189730243432607
reg_lambda: 0.9710205052469383
num_leaves: 79
min_child_samples: 20
交叉验证最佳AUC: 0.8720
LightGBM测试集 AUC: 0.7917


In [15]:

# 保存模型
with open('lgb_model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)

# 保存最佳参数
with open('lgb_best_params.json', 'w') as f:
    json.dump(best_params_lgb, f)

## 5. CatBoost模型优化

In [13]:
from optuna.pruners import MedianPruner  

def objective_catboost(trial):
    # 核心修改：基于最优实验的参数，设置极小的微调范围（仅最优值附近）
    param = {
        # 最优值650，微调范围600-700，步长50（仅3个候选值）
        'n_estimators': trial.suggest_int('n_estimators', 600, 700, step=50),  
        # 最优值11/12，微调范围11-12（仅2个候选值）
        'max_depth': trial.suggest_int('max_depth', 11, 12),  
        # 最优值0.0957/0.1262，微调范围0.09-0.13（窄区间）
        'learning_rate': trial.suggest_float('learning_rate', 0.09, 0.13),  
        # 最优值0.7085/0.8033，微调范围0.70-0.82（窄区间）
        'subsample': trial.suggest_float('subsample', 0.70, 0.82),  
        # 最优值0.6725/0.6854，微调范围0.67-0.69（仅0.02的区间）
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.67, 0.69),  
        # 最优值0.4288/4.8741，分两个小区间（覆盖两个最优值）
        'reg_lambda': trial.suggest_float('reg_lambda', 0.4, 5.0),  
        # 最优值70/86，微调范围70-90（步长5，减少候选）
        'min_child_samples': trial.suggest_int('min_child_samples', 70, 90, step=5),  
        'random_seed': 42,
        'verbose': False,
        # 保留加速配置
        'thread_count': -1,  # 使用所有CPU核心
        'early_stopping_rounds': 50  # 验证集指标50轮不提升则早停
    }
    
    # 可选：GPU加速（如有NVIDIA GPU，取消注释）
    # param['task_type'] = 'GPU'
    # param['gpu_ram_part'] = 0.8  # 分配80%的GPU显存

    # 保留3折交叉验证（加速）
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    auc_scores = []
    
    for train_idx, val_idx in skf.split(X_train_resampled, y_train_resampled):
        X_train_fold, X_val_fold = X_train_resampled.iloc[train_idx], X_train_resampled.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_resampled.iloc[train_idx], y_train_resampled.iloc[val_idx]
        
        model = cb.CatBoostClassifier(**param)
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=(X_val_fold, y_val_fold),
            verbose=False
        )
        
        y_val_pred_proba = model.predict_proba(X_val_fold)[:, 1]
        auc_score = roc_auc_score(y_val_fold, y_val_pred_proba)
        auc_scores.append(auc_score)
    
    # 计算平均AUC
    mean_auc = np.mean(auc_scores)
    
    # 剪枝机制（因试验次数少，实际剪枝概率低，可保留）
    trial.report(mean_auc, step=0)
    if trial.should_prune():
        raise optuna.TrialPruned()
    
    return mean_auc

# 运行Optuna优化
print('开始CatBoost参数微调（5次以内）...')
# 保留剪枝器（试验次数少，可简化为无剪枝，注释掉pruner即可）
study_catboost = optuna.create_study(
    direction='maximize',
    pruner=MedianPruner(n_warmup_steps=1)  # 仅前1个试验不剪枝
)
# 核心修改：将试验次数设为5次（也可改为3次，如n_trials=3）
study_catboost.optimize(objective_catboost, n_trials=5)  

print('CatBoost最佳参数:')
for key, value in study_catboost.best_params.items():
    print('{}: {}'.format(key, value))

print('交叉验证最佳AUC: {:.4f}'.format(study_catboost.best_value))

# 使用最佳参数训练最终模型（保留加速配置）
best_params_catboost = study_catboost.best_params
best_params_catboost.update({
    'random_seed': 42,
    'verbose': False,
    'thread_count': -1,
    'early_stopping_rounds': 50
    # 如有GPU，添加：'task_type': 'GPU', 'gpu_ram_part': 0.8
})

catboost_model = cb.CatBoostClassifier(**best_params_catboost)
catboost_model.fit(
    X_train_resampled, y_train_resampled,
    eval_set=(X_test, y_test),  # 最终训练加入测试集早停
    verbose=False
)

# 预测和评估
y_test_pred_proba_catboost = catboost_model.predict_proba(X_test)[:, 1]
catboost_auc = roc_auc_score(y_test, y_test_pred_proba_catboost)

print('CatBoost测试集 AUC: {:.4f}'.format(catboost_auc))

[I 2025-11-20 18:25:54,469] A new study created in memory with name: no-name-a3c83d5f-54d6-435f-ab91-c9a650f25f0c


开始CatBoost参数微调（5次以内）...


[I 2025-11-20 18:32:16,177] Trial 0 finished with value: 0.8698226089374591 and parameters: {'n_estimators': 700, 'max_depth': 12, 'learning_rate': 0.1058954229672149, 'subsample': 0.7052271673323328, 'colsample_bylevel': 0.6807416702319337, 'reg_lambda': 3.570574415195616, 'min_child_samples': 70}. Best is trial 0 with value: 0.8698226089374591.
[I 2025-11-20 18:37:52,029] Trial 1 finished with value: 0.8640145910800566 and parameters: {'n_estimators': 700, 'max_depth': 11, 'learning_rate': 0.09301796524321296, 'subsample': 0.7151403828025522, 'colsample_bylevel': 0.6785755728566395, 'reg_lambda': 3.2319055052563246, 'min_child_samples': 70}. Best is trial 0 with value: 0.8698226089374591.
[I 2025-11-20 19:34:26,794] Trial 2 finished with value: 0.8639991305109911 and parameters: {'n_estimators': 650, 'max_depth': 11, 'learning_rate': 0.11284189014351063, 'subsample': 0.7401761089470932, 'colsample_bylevel': 0.6701912530901575, 'reg_lambda': 2.9909859830267895, 'min_child_samples': 85

CatBoost最佳参数:
n_estimators: 650
max_depth: 12
learning_rate: 0.11612879005438699
subsample: 0.7534834328845864
colsample_bylevel: 0.6887655251280405
reg_lambda: 3.750863688329549
min_child_samples: 75
交叉验证最佳AUC: 0.8699
CatBoost测试集 AUC: 0.7881


In [16]:
# 保存模型
with open('catboost_model.pkl', 'wb') as f:
    pickle.dump(catboost_model, f)

# 保存最佳参数
with open('catboost_best_params.json', 'w') as f:
    json.dump(best_params_catboost, f)

## 6. 加权软投票集成方法

In [17]:
# 方法1: 按AUC分数分配权重
total_auc = xgb_auc + lgb_auc + catboost_auc
weights_by_auc = [xgb_auc/total_auc, lgb_auc/total_auc, catboost_auc/total_auc]

print('基于AUC的权重分配:')
print('XGBoost权重: {:.4f}'.format(weights_by_auc[0]))
print('LightGBM权重: {:.4f}'.format(weights_by_auc[1]))
print('CatBoost权重: {:.4f}'.format(weights_by_auc[2]))

# 方法2: 用户指定权重（可调整）
user_weights = [0.33, 0.33, 0.34]  # 近似平均分配

# 实现加权软投票集成
def weighted_voting_predict(models, X, weights):
    # 获取所有模型的预测概率
    predictions = []
    for model in models:
        y_pred_proba = model.predict_proba(X)[:, 1]
        predictions.append(y_pred_proba)
    
    # 加权平均
    weighted_avg = np.zeros_like(predictions[0])
    for i, pred in enumerate(predictions):
        weighted_avg += pred * weights[i]
    
    return weighted_avg

# 集成模型列表
models = [xgb_model, lgb_model, catboost_model]

# 使用AUC权重的集成预测
y_test_pred_proba_ensemble_auc = weighted_voting_predict(models, X_test, weights_by_auc)
ensemble_auc_auc = roc_auc_score(y_test, y_test_pred_proba_ensemble_auc)

# 使用用户权重的集成预测
y_test_pred_proba_ensemble_user = weighted_voting_predict(models, X_test, user_weights)
ensemble_auc_user = roc_auc_score(y_test, y_test_pred_proba_ensemble_user)

print('基于AUC权重的集成测试集 AUC: {:.4f}'.format(ensemble_auc_auc))
print('基于用户权重的集成测试集 AUC: {:.4f}'.format(ensemble_auc_user))

# 也可以使用sklearn的VotingClassifier
voting_ensemble = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('catboost', catboost_model)
    ],
    voting='soft',
    weights=weights_by_auc
).fit(X_train_resampled, y_train_resampled)

y_test_pred_proba_voting = voting_ensemble.predict_proba(X_test)[:, 1]
voting_auc = roc_auc_score(y_test, y_test_pred_proba_voting)

print('Sklearn VotingClassifier测试集 AUC: {:.4f}'.format(voting_auc))

基于AUC的权重分配:
XGBoost权重: 0.3331
LightGBM权重: 0.3342
CatBoost权重: 0.3327
基于AUC权重的集成测试集 AUC: 0.8039
基于用户权重的集成测试集 AUC: 0.8039
Sklearn VotingClassifier测试集 AUC: 0.8039



===== 基于AUC权重的加权投票集成 评估结果 =====
AUC: 0.8039
准确率(Accuracy): 0.8329
精确率(Precision): 0.4341
召回率(Recall): 0.5088
F1分数(F1-Score): 0.4685

分类报告：
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     20365
           1       0.43      0.51      0.47      3447

    accuracy                           0.83     23812
   macro avg       0.67      0.70      0.68     23812
weighted avg       0.84      0.83      0.84     23812


===== 基于用户权重的加权投票集成 评估结果 =====
AUC: 0.8039
准确率(Accuracy): 0.8332
精确率(Precision): 0.4348
召回率(Recall): 0.5088
F1分数(F1-Score): 0.4689

分类报告：
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     20365
           1       0.43      0.51      0.47      3447

    accuracy                           0.83     23812
   macro avg       0.67      0.70      0.68     23812
weighted avg       0.84      0.83      0.84     23812


===== Sklearn VotingClassifier软投票集成 评估结果 =====
AUC: 0.8039
准确率(Acc

In [18]:

# 保存集成模型性能
ensemble_performance = {
    'xgb_auc': xgb_auc,
    'lgb_auc': lgb_auc,
    'catboost_auc': catboost_auc,
    'ensemble_auc_auc': ensemble_auc_auc,
    'ensemble_auc_user': ensemble_auc_user,
    'voting_auc': voting_auc
}

with open('ensemble_performance.json', 'w') as f:
    json.dump(ensemble_performance, f)

## 7. 堆叠集成(Stacking)方法

In [19]:
# 实现堆叠集成
class StackingEnsemble:
    def __init__(self, base_models, meta_model, n_splits=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_splits = n_splits
    
    def fit(self, X, y):
        # 使用分层K折交叉验证生成元特征
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        
        # 创建元特征矩阵
        meta_features = np.zeros((X.shape[0], len(self.base_models)))
        
        # 对每个基础模型进行训练并生成元特征
        for i, base_model in enumerate(self.base_models):
            for train_idx, val_idx in skf.split(X, y):
                # 在训练折叠上训练基础模型
                base_model.fit(X.iloc[train_idx], y.iloc[train_idx])
                # 在验证折叠上生成预测概率作为元特征
                meta_features[val_idx, i] = base_model.predict_proba(X.iloc[val_idx])[:, 1]
        
        # 使用元特征训练元模型
        self.meta_model.fit(meta_features, y)
        
        # 在完整训练集上重新训练所有基础模型
        for base_model in self.base_models:
            base_model.fit(X, y)
        
        return self
    
    def predict_proba(self, X):
        # 生成测试集的元特征
        meta_features = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, base_model in enumerate(self.base_models):
            meta_features[:, i] = base_model.predict_proba(X)[:, 1]
        
        # 使用元模型预测最终概率
        return self.meta_model.predict_proba(meta_features)
    
    def predict(self, X):
        # 返回类别预测
        return self.predict_proba(X)[:, 1] > 0.5

# 使用逻辑回归作为元模型
meta_model_logistic = LogisticRegression(random_state=42, max_iter=1000)

# 创建堆叠集成模型
stacking_ensemble = StackingEnsemble(
    base_models=[xgb_model, lgb_model, catboost_model],
    meta_model=meta_model_logistic
)

# 训练堆叠集成模型
print('训练堆叠集成模型...')
stacking_ensemble.fit(X_train_resampled, y_train_resampled)

# 预测和评估
y_test_pred_proba_stacking = stacking_ensemble.predict_proba(X_test)[:, 1]
stacking_auc = roc_auc_score(y_test, y_test_pred_proba_stacking)

print('堆叠集成测试集 AUC: {:.4f}'.format(stacking_auc))

# 可选：使用ExtraTrees作为元模型
meta_model_extratrees = ExtraTreesClassifier(random_state=42, n_estimators=100)

stacking_ensemble_extratrees = StackingEnsemble(
    base_models=[xgb_model, lgb_model, catboost_model],
    meta_model=meta_model_extratrees
)

stacking_ensemble_extratrees.fit(X_train_resampled, y_train_resampled)

y_test_pred_proba_stacking_extra = stacking_ensemble_extratrees.predict_proba(X_test)[:, 1]
stacking_auc_extra = roc_auc_score(y_test, y_test_pred_proba_stacking_extra)

print('ExtraTrees元模型堆叠集成测试集 AUC: {:.4f}'.format(stacking_auc_extra))

训练堆叠集成模型...
堆叠集成测试集 AUC: 0.8023
ExtraTrees元模型堆叠集成测试集 AUC: 0.7791


In [20]:

# 更新性能记录
ensemble_performance['stacking_auc'] = stacking_auc
ensemble_performance['stacking_auc_extra'] = stacking_auc_extra

with open('ensemble_performance.json', 'w') as f:
    json.dump(ensemble_performance, f)

In [26]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
def evaluate_model(y_true, y_pred_proba, model_name="集成模型", threshold=0.5):
    """
    输入真实标签、预测概率，计算并打印分类指标
    :param y_true: 真实标签数组
    :param y_pred_proba: 正类预测概率数组
    :param model_name: 模型名称（用于打印）
    :param threshold: 分类阈值（默认0.5，不平衡数据可调整）
    """
    # 将概率转换为类别预测（核心：概率≥阈值为正类1，否则为负类0）
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    # 计算核心指标
    auc = roc_auc_score(y_true, y_pred_proba)
    acc = accuracy_score(y_true, y_pred)  # 准确率
    prec = precision_score(y_true, y_pred)  # 精确率
    rec = recall_score(y_true, y_pred)  # 召回率
    f1 = f1_score(y_true, y_pred)  # F1分数
    
    # 打印指标结果
    print(f"\n===== {model_name} 评估结果 =====")
    print(f"AUC: {auc:.4f}")
    print(f"准确率(Accuracy): {acc:.4f}")
    print(f"精确率(Precision): {prec:.4f}")
    print(f"召回率(Recall): {rec:.4f}")
    print(f"F1分数(F1-Score): {f1:.4f}")
    
    # 打印详细分类报告（包含各类别的精确率、召回率、F1）
    print("\n分类报告：")
    print(classification_report(y_true, y_pred))

# ===================== 对原有集成结果进行多指标评估 =====================
# 1. 评估基于AUC权重的手动加权投票集成
evaluate_model(
    y_true=y_test,
    y_pred_proba=y_test_pred_proba_ensemble_auc,
    model_name="基于AUC权重的加权投票集成"
)

# 2. 评估基于用户权重的手动加权投票集成
evaluate_model(
    y_true=y_test,
    y_pred_proba=y_test_pred_proba_ensemble_user,
    model_name="基于用户权重的加权投票集成"
)

# 3. 评估sklearn的VotingClassifier集成
evaluate_model(
    y_true=y_test,
    y_pred_proba=y_test_pred_proba_voting,
    model_name="Sklearn VotingClassifier软投票集成"
)

evaluate_model(
    y_true=y_test,
    y_pred_proba=y_test_pred_proba_stacking_extra,
    model_name="ExtraTrees元模型堆叠集成"
)

evaluate_model(
    y_true=y_test,
    y_pred_proba=y_test_pred_proba_stacking,
    model_name="逻辑回归元模型堆叠集成"
)


===== 基于AUC权重的加权投票集成 评估结果 =====
AUC: 0.8039
准确率(Accuracy): 0.8329
精确率(Precision): 0.4341
召回率(Recall): 0.5088
F1分数(F1-Score): 0.4685

分类报告：
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     20365
           1       0.43      0.51      0.47      3447

    accuracy                           0.83     23812
   macro avg       0.67      0.70      0.68     23812
weighted avg       0.84      0.83      0.84     23812


===== 基于用户权重的加权投票集成 评估结果 =====
AUC: 0.8039
准确率(Accuracy): 0.8332
精确率(Precision): 0.4348
召回率(Recall): 0.5088
F1分数(F1-Score): 0.4689

分类报告：
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     20365
           1       0.43      0.51      0.47      3447

    accuracy                           0.83     23812
   macro avg       0.67      0.70      0.68     23812
weighted avg       0.84      0.83      0.84     23812


===== Sklearn VotingClassifier软投票集成 评估结果 =====
AUC: 0.8039
准确率(Acc

## 8. 比较不同集成方法的性能并可视化结果

In [21]:
# 确保所有模型性能都已记录
if 'stacking_auc' not in ensemble_performance:
    ensemble_performance['stacking_auc'] = stacking_auc
if 'stacking_auc_extra' not in ensemble_performance:
    ensemble_performance['stacking_auc_extra'] = stacking_auc_extra

# 整理性能数据
model_names = ['XGBoost', 'LightGBM', 'CatBoost', 
              '加权集成(AUC)', '加权集成(用户)', 'VotingClassifier', 
              '堆叠集成(Logistic)', '堆叠集成(ExtraTrees)']
aucs = [ensemble_performance['xgb_auc'],
        ensemble_performance['lgb_auc'],
        ensemble_performance['catboost_auc'],
        ensemble_performance['ensemble_auc_auc'],
        ensemble_performance['ensemble_auc_user'],
        ensemble_performance['voting_auc'],
        ensemble_performance['stacking_auc'],
        ensemble_performance['stacking_auc_extra']]

# 创建性能比较表格
performance_df = pd.DataFrame({
    '模型': model_names,
    'AUC': aucs
})

# 按AUC降序排序
performance_df = performance_df.sort_values('AUC', ascending=False)

print('模型性能比较:')
print(performance_df.round(4))



# 保存性能比较结果
performance_df.to_csv('model_performance_comparison.csv', index=False)

模型性能比较:
                 模型     AUC
4          加权集成(用户)  0.8039
3         加权集成(AUC)  0.8039
5  VotingClassifier  0.8039
6    堆叠集成(Logistic)  0.8023
1          LightGBM  0.7917
0           XGBoost  0.7892
2          CatBoost  0.7881
7  堆叠集成(ExtraTrees)  0.7791


In [None]:
# 可视化AUC性能
plt.figure(figsize=(12, 8))
sns.barplot(x='AUC', y='模型', data=performance_df)
plt.title('不同模型的AUC性能比较')
plt.xlabel('AUC分数')
plt.ylabel('模型类型')
plt.xlim(0.5, 1.0)
plt.grid(axis='x', linestyle='--', alpha=0.7)

# 在柱状图上添加数值标签
for i, v in enumerate(performance_df['AUC']):
    plt.text(v + 0.005, i, '{:.4f}'.format(v), va='center')

plt.tight_layout()
plt.savefig('model_performance_comparison.png', dpi=300)
plt.show()

In [None]:
# 绘制ROC曲线比较
from sklearn.metrics import roc_curve

plt.figure(figsize=(10, 8))

# 获取所有模型的ROC曲线数据
models_to_compare = {
    'XGBoost': y_test_pred_proba_xgb,
    'LightGBM': y_test_pred_proba_lgb,
    'CatBoost': y_test_pred_proba_catboost,
    '加权集成(AUC)': y_test_pred_proba_ensemble_auc,
    '堆叠集成': y_test_pred_proba_stacking
}

# 绘制每个模型的ROC曲线
for name, y_pred_proba in models_to_compare.items():
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, lw=2, label='{} (AUC = {:.4f})'.format(name, roc_auc))

# 绘制随机猜测的对角线
plt.plot([0, 1], [0, 1], 'k--', lw=2)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('假正例率')
plt.ylabel('真正例率')
plt.title('不同模型的ROC曲线比较')
plt.legend(loc='lower right')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('roc_curves_comparison.png', dpi=300)
plt.show()

In [22]:
# 找出最佳模型
best_model_name = performance_df.iloc[0]['模型']
best_auc = performance_df.iloc[0]['AUC']

print('最佳模型: {}，AUC: {:.4f}'.format(best_model_name, best_auc))

最佳模型: 加权集成(用户)，AUC: 0.8039


In [None]:
# 获取最佳模型的预测
if best_model_name == 'XGBoost':
    best_pred_proba = y_test_pred_proba_xgb
elif best_model_name == 'LightGBM':
    best_pred_proba = y_test_pred_proba_lgb
elif best_model_name == 'CatBoost':
    best_pred_proba = y_test_pred_proba_catboost
elif best_model_name == '加权集成(AUC)':
    best_pred_proba = y_test_pred_proba_ensemble_auc
elif best_model_name == '加权集成(用户)':
    best_pred_proba = y_test_pred_proba_ensemble_user
elif best_model_name == 'VotingClassifier':
    best_pred_proba = y_test_pred_proba_voting
elif best_model_name == '堆叠集成(Logistic)':
    best_pred_proba = y_test_pred_proba_stacking
else:  # 堆叠集成(ExtraTrees)
    best_pred_proba = y_test_pred_proba_stacking_extra
# 绘制混淆矩阵
best_pred = (best_pred_proba > 0.5).astype(int)
cm = confusion_matrix(y_test, best_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('{}的混淆矩阵'.format(best_model_name))
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300)
plt.show()

In [None]:
# 总结
print('=== 模型集成总结 ===')
print('1. 最佳单模型: {}，AUC: {:.4f}'.format(model_names[np.argmax(aucs[:3])], max(aucs[:3])))
print('2. 最佳集成方法: {}，AUC: {:.4f}'.format(best_model_name, best_auc))
print('3. 集成提升: {:.2f}%'.format((best_auc - max(aucs[:3])) * 100))