Bank Marketing（bank-full）数据集 BTTWD 实验

本 notebook 按步骤运行：环境准备 → 加载配置 → 读取数据 → 预处理 → 桶树划分 → 基线与 BTTWD 实验 → 桶级分析。



In [1]:
# 步骤0：环境与路径设置
import os, sys
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# 将项目根目录加入路径，便于导入 bttwdlib
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_path not in sys.path:
    sys.path.append(root_path)

from bttwdlib import (
    load_yaml_cfg,
    show_cfg,
    load_dataset,
    prepare_features_and_labels,
    BucketTree,
    run_holdout_experiment,
    run_kfold_experiments,
    log_info,
    set_global_seed,
)

cfg_path = os.path.join(root_path, 'configs', 'bank_bttwd.yaml')
cfg = load_yaml_cfg(cfg_path)
set_global_seed(cfg.get('SEED', {}).get('global_seed', 42))
log_info('【步骤0摘要】环境准备完毕，路径与随机种子已设置。')



【INFO】【2025-12-03 20:50:06】【配置加载】已读取 e:\yan\组\三支决策\机器学习\BT_TWD\configs\bank_bttwd.yaml
【INFO】【2025-12-03 20:50:10】【步骤0摘要】环境准备完毕，路径与随机种子已设置。


In [2]:
# 步骤1：加载配置
show_cfg(cfg)
log_info('【步骤1摘要】配置文件加载完成，关键参数检查通过。')



【INFO】【2025-12-03 20:50:10】【配置-数据】数据集=bank_full, k折=5, 目标列=y, 正类="yes"
【INFO】【2025-12-03 20:50:10】【配置-BTTWD】阈值模式=None, 全局模型=xgb, 桶内模型=none, 后验估计器(兼容字段)=logreg
【INFO】【2025-12-03 20:50:10】【配置-基线】LogReg启用=True, RandomForest启用=False, KNN启用=True, XGBoost启用=True
【INFO】【2025-12-03 20:50:10】【步骤1摘要】配置文件加载完成，关键参数检查通过。


In [3]:
# 步骤2：加载原始数据
df_raw, target_col_model = load_dataset(cfg)  # 返回用于建模的标签列，例如 "y"

display(df_raw.head())
print('用于建模的标签列:', target_col_model)

# 1）画 0/1 标签比例
class_counts = df_raw[target_col_model].value_counts(normalize=True).sort_index()
ax = class_counts.plot(kind='bar', title='订购 vs 未订购比例')
plt.ylabel('比例')

fig_path = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bank_class_distribution.png')
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, bbox_inches='tight')
plt.close()

log_info('【步骤2摘要】Bank Marketing 数据加载与标签分布完成。')



【INFO】【2025-12-03 20:50:10】【数据加载】文本表格 E:\yan\组\三支决策\机器学习\BT_TWD\data\bank\bank-full.csv 已读取，样本数=45211，列数=17
【INFO】【2025-12-03 20:50:10】【数据加载】银行营销数据集已读取，标签已映射为0/1，样本数=45211，正类比例=11.70%
【INFO】【2025-12-03 20:50:10】【数据集信息】名称=bank_full，样本数=45211，目标列=y，正类比例=11.70%


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


用于建模的标签列: y
【INFO】【2025-12-03 20:50:10】【步骤2摘要】Bank Marketing 数据加载与标签分布完成。


In [4]:
# 步骤3：预处理与特征工程
X, y, meta = prepare_features_and_labels(df_raw, cfg)
log_info(f'【预处理】编码特征维度={X.shape[1]}，样本数={X.shape[0]}')
log_info(f"【步骤3摘要】特征预处理完成：连续={len(meta['continuous_cols'])}，类别={len(meta['categorical_cols'])}，编码维度={X.shape[1]}。")



【INFO】【2025-12-03 20:50:10】【预处理】连续特征=7个，类别特征=9个
【INFO】【2025-12-03 20:50:11】【预处理】编码后维度=42
【INFO】【2025-12-03 20:50:11】【预处理】编码特征维度=42，样本数=45211
【INFO】【2025-12-03 20:50:11】【步骤3摘要】特征预处理完成：连续=7，类别=9，编码维度=42。


In [5]:
# 步骤4：构建桶树并检查划分
feature_df_for_bucket = df_raw.drop(columns=[target_col_model])
bucket_tree = BucketTree(cfg['BTTWD']['bucket_levels'], feature_names=feature_df_for_bucket.columns.tolist())
bucket_ids_full = bucket_tree.assign_buckets(feature_df_for_bucket)
group_df = df_raw.groupby(bucket_ids_full)[target_col_model].agg(['size', 'mean']).reset_index()
group_df.columns = ['bucket_id', 'count', 'pos_rate']
bucket_df = group_df.sort_values('count', ascending=False).reset_index(drop=True)

display(bucket_df.head())
bucket_df.set_index('bucket_id')['count'].plot(kind='bar', figsize=(12,4), title='桶样本数分布')
fig_bucket = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bucket_metrics_bar.png')
plt.savefig(fig_bucket, bbox_inches='tight')
plt.close()
log_info(f'【步骤4摘要】桶树划分完成，共有 {bucket_ids_full.nunique()} 个叶子桶。')




【INFO】【2025-12-03 20:50:11】【桶树】已为样本生成桶ID，共 87 个组合


Unnamed: 0,bucket_id,count,pos_rate
0,L1_age=30-40|L2_job=white_collar|L3_contact=ce...,7237,0.141219
1,L1_age=40-50|L2_job=white_collar|L3_contact=ce...,3747,0.133974
2,L1_age=30-40|L2_job=blue_collar|L3_contact=cel...,3380,0.099704
3,L1_age=<=30|L2_job=white_collar|L3_contact=cel...,2434,0.199671
4,L1_age=30-40|L2_job=blue_collar|L3_contact=OTHER,2399,0.036682


【INFO】【2025-12-03 20:50:12】【步骤4摘要】桶树划分完成，共有 87 个叶子桶。


In [6]:
# 步骤5：运行基线模型实验占位
# 基线部分在 run_kfold_experiments 内统一调度（仅在 use_kfold=True 时执行）
log_info('【步骤5】基线模型（LogReg / XGBoost）将在交叉验证模式中一并运行。')
log_info('【步骤5摘要】基线模型性能将作为后续对比基准。')



【INFO】【2025-12-03 20:50:12】【步骤5】基线模型（LogReg / XGBoost）将在交叉验证模式中一并运行。
【INFO】【2025-12-03 20:50:12】【步骤5摘要】基线模型性能将作为后续对比基准。


In [7]:
# 步骤6：运行 BTTWD 实验（k 折或单次留出）
use_kfold_raw = cfg.get('DATA', {}).get('use_kfold', False)
if isinstance(use_kfold_raw, str):
    use_kfold = use_kfold_raw.strip().lower() in ['true', '1', 'yes']
else:
    use_kfold = bool(use_kfold_raw)

if use_kfold:
    log_info('【步骤6】检测到 use_kfold=True，进入 k 折实验。')
    results = run_kfold_experiments(X, y, feature_df_for_bucket, cfg)
    summary_df = pd.read_csv(os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'metrics_kfold_summary.csv'))
    display(summary_df)
    summary_df.plot(x='model', kind='bar', figsize=(8,4), title='模型指标对比')
    fig_compare = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'metrics_compare.png')
    plt.savefig(fig_compare, bbox_inches='tight')
    plt.close()
    log_info('【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。')
else:
    log_info('【步骤6】use_kfold=False，执行单次留出验证流程。')
    holdout_metrics = run_holdout_experiment(X, y, feature_df_for_bucket, cfg)
    display(pd.DataFrame(holdout_metrics))
    log_info('【步骤6摘要】单次留出验证完成，指标已列出。')



【INFO】【2025-12-03 20:50:12】【步骤6】检测到 use_kfold=True，进入 k 折实验。
【INFO】【2025-12-03 20:50:12】【基线-LogReg】使用模型自定义阈值=0.400（per_model 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-12-03 20:50:14】【基线-LogReg】整体指标：AUC_mean=0.907, AUC_std=0.005, BAC_mean=0.701, BAC_std=0.006, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.513, F1_std=0.008, Kappa_mean=0.461, Kappa_std=0.008, MCC_mean=0.471, MCC_std=0.008, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.624, Precision_std=0.018, Recall_mean=0.437, Recall_std=0.015, Regret_mean=0.155, Regret_std=0.002
【INFO】【2025-12-03 20:50:14】【基线-KNN】使用通用阈值=0.400（per_model 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-03 20:50:17】【基线-KNN】整体指标：AUC_mean=0.877, AUC_std=0.005, BAC_mean=0.728, BAC_std=0.003, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.533, F1_std=0.007, Kappa_mean=0.474, Kappa_std=0.009, MCC_mean=0.475, MCC_std=0.009, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.560, Precision_std=0.013, Recall_mean=0.508, Recall_std=0.006, Regret_mean=0.150, Regret_std=0.002
【INFO】【2025-12-03 20:50:17】【基线-XGB】使用模型自定义阈值=0.400（per_model 模式）


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-03 20:50:20】【基线-XGB】整体指标：AUC_mean=0.935, AUC_std=0.003, BAC_mean=0.776, BAC_std=0.007, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.605, F1_std=0.010, Kappa_mean=0.553, Kappa_std=0.011, MCC_mean=0.553, MCC_std=0.011, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.606, Precision_std=0.011, Recall_mean=0.604, Recall_std=0.013, Regret_mean=0.127, Regret_std=0.003
【INFO】【2025-12-03 20:50:20】【K折实验】正在执行第 1/5 折...
【INFO】【2025-12-03 20:50:20】[BT] 使用桶评分配置：mode=f1_regret_bnd, f1_weight=1.0, regret_weight=1.0, bnd_weight=1.0
【INFO】【2025-12-03 20:50:20】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-03 20:50:21] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=36168
[INFO][BT][2025-12-03 20:50:21] 创建桶 bucket_id=L1_age=30-40，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="30-40"，n_samples=14173
[INFO][BT][2025-12-03 20:50:21] 创建桶 bucket_id=L1_age=40-50，level=1，parent_id=ROOT，split_name=L1



【INFO】【2025-12-03 20:50:23】【阈值】桶 L1_age=30-40（n_val=1295）使用本地阈值 α=0.3000, β=0.2000
[INFO][BT][2025-12-03 20:50:23] 桶 bucket_id=L1_age=30-40 level=1：
    n_train=2977, n_val=1295,
    BAC=0.813, F1=0.644, AUC=0.944,
    Regret=0.097, BND_ratio=0.042, POS_coverage=0.137,
    Score(f1_regret_bnd )=0.505
[INFO][BT][2025-12-03 20:50:23] 桶 bucket_id=L1_age=30-40：
    parent_id=ROOT，parent_Score=0.474, bucket_Score=0.505,
    Gain=+0.031, is_weak=False
【INFO】【2025-12-03 20:50:23】【阈值】桶 L1_age=40-50 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:50:23] 桶 bucket_id=L1_age=40-50 level=1：
    n_train=1859, n_val=840,
    BAC=0.747, F1=0.608, AUC=0.945,
    Regret=0.082, BND_ratio=0.058, POS_coverage=0.069,
    Score(f1_regret_bnd )=0.468
[INFO][BT][2025-12-03 20:50:23] 桶 bucket_id=L1_age=40-50：
    parent_id=ROOT，parent_Score=0.474, bucket_Score=0.468,
    Gain=-0.006, is_weak=True
【INFO】【2025-12-03 20:50:23】【阈值】桶 L1_age=50-60（n_val=563）使用本地阈值 α=0.3000, β=0.2000
[INFO][BT][2025-12-03 20:50:23] 桶 

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-03 20:50:31】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-03 20:50:31] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=36169
[INFO][BT][2025-12-03 20:50:31] 创建桶 bucket_id=L1_age=30-40，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="30-40"，n_samples=14083
[INFO][BT][2025-12-03 20:50:31] 创建桶 bucket_id=L1_age=40-50，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="40-50"，n_samples=9058
[INFO][BT][2025-12-03 20:50:31] 创建桶 bucket_id=L1_age=50-60，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="50-60"，n_samples=6487
[INFO][BT][2025-12-03 20:50:31] 创建桶 bucket_id=L1_age=<=30，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="<=30"，n_samples=5587
[INFO][BT][2025-12-03 20:50:31] 创建桶 bucket_id=L1_age=>60，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule=">60"，n_samples=954
[INFO][BT][2025-12-03 20:50:31] 



【INFO】【2025-12-03 20:50:33】【阈值】桶 L1_age=30-40 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:50:33] 桶 bucket_id=L1_age=30-40 level=1：
    n_train=2969, n_val=1295,
    BAC=0.792, F1=0.608, AUC=0.933,
    Regret=0.108, BND_ratio=0.038, POS_coverage=0.136,
    Score(f1_regret_bnd )=0.462
[INFO][BT][2025-12-03 20:50:33] 桶 bucket_id=L1_age=30-40：
    parent_id=ROOT，parent_Score=0.472, bucket_Score=0.462,
    Gain=-0.010, is_weak=True
【INFO】【2025-12-03 20:50:33】【阈值】桶 L1_age=40-50 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:50:33] 桶 bucket_id=L1_age=40-50 level=1：
    n_train=1921, n_val=806,
    BAC=0.774, F1=0.575, AUC=0.939,
    Regret=0.095, BND_ratio=0.038, POS_coverage=0.103,
    Score(f1_regret_bnd )=0.442
[INFO][BT][2025-12-03 20:50:33] 桶 bucket_id=L1_age=40-50：
    parent_id=ROOT，parent_Score=0.472, bucket_Score=0.442,
    Gain=-0.031, is_weak=True
【INFO】【2025-12-03 20:50:34】【阈值】桶 L1_age=50-60（n_val=603）使用本地阈值 α=0.3000, β=0.2000
[INFO][BT][2025-12-03 20:50:34] 桶 bucket_id=L1_ag

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-03 20:50:41】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-03 20:50:41] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=36169
[INFO][BT][2025-12-03 20:50:41] 创建桶 bucket_id=L1_age=30-40，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="30-40"，n_samples=14168
[INFO][BT][2025-12-03 20:50:41] 创建桶 bucket_id=L1_age=40-50，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="40-50"，n_samples=8935
[INFO][BT][2025-12-03 20:50:41] 创建桶 bucket_id=L1_age=50-60，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="50-60"，n_samples=6450
[INFO][BT][2025-12-03 20:50:41] 创建桶 bucket_id=L1_age=<=30，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="<=30"，n_samples=5673
[INFO][BT][2025-12-03 20:50:41] 创建桶 bucket_id=L1_age=>60，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule=">60"，n_samples=943
[INFO][BT][2025-12-03 20:50:41] 



【INFO】【2025-12-03 20:50:44】【阈值】桶 L1_age=30-40（n_val=1289）使用本地阈值 α=0.3000, β=0.2000
[INFO][BT][2025-12-03 20:50:44] 桶 bucket_id=L1_age=30-40 level=1：
    n_train=3010, n_val=1289,
    BAC=0.839, F1=0.662, AUC=0.948,
    Regret=0.085, BND_ratio=0.039, POS_coverage=0.133,
    Score(f1_regret_bnd )=0.538
[INFO][BT][2025-12-03 20:50:44] 桶 bucket_id=L1_age=30-40：
    parent_id=ROOT，parent_Score=0.479, bucket_Score=0.538,
    Gain=+0.059, is_weak=False
【INFO】【2025-12-03 20:50:44】【阈值】桶 L1_age=40-50（n_val=826）使用本地阈值 α=0.3000, β=0.1000
[INFO][BT][2025-12-03 20:50:44] 桶 bucket_id=L1_age=40-50 level=1：
    n_train=1883, n_val=826,
    BAC=0.804, F1=0.663, AUC=0.946,
    Regret=0.086, BND_ratio=0.086, POS_coverage=0.116,
    Score(f1_regret_bnd )=0.491
[INFO][BT][2025-12-03 20:50:44] 桶 bucket_id=L1_age=40-50：
    parent_id=ROOT，parent_Score=0.479, bucket_Score=0.491,
    Gain=+0.012, is_weak=False
【INFO】【2025-12-03 20:50:44】【阈值】桶 L1_age=50-60 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:50:44] 桶

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-03 20:50:51】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-03 20:50:52] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=36169
[INFO][BT][2025-12-03 20:50:52] 创建桶 bucket_id=L1_age=30-40，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="30-40"，n_samples=14179
[INFO][BT][2025-12-03 20:50:52] 创建桶 bucket_id=L1_age=40-50，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="40-50"，n_samples=8981
[INFO][BT][2025-12-03 20:50:52] 创建桶 bucket_id=L1_age=50-60，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="50-60"，n_samples=6436
[INFO][BT][2025-12-03 20:50:52] 创建桶 bucket_id=L1_age=<=30，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="<=30"，n_samples=5614
[INFO][BT][2025-12-03 20:50:52] 创建桶 bucket_id=L1_age=>60，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule=">60"，n_samples=959
[INFO][BT][2025-12-03 20:50:52] 



【INFO】【2025-12-03 20:50:54】【阈值】桶 L1_age=30-40 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:50:54] 桶 bucket_id=L1_age=30-40 level=1：
    n_train=3026, n_val=1276,
    BAC=0.801, F1=0.638, AUC=0.934,
    Regret=0.102, BND_ratio=0.046, POS_coverage=0.131,
    Score(f1_regret_bnd )=0.490
[INFO][BT][2025-12-03 20:50:54] 桶 bucket_id=L1_age=30-40：
    parent_id=ROOT，parent_Score=0.528, bucket_Score=0.490,
    Gain=-0.038, is_weak=True
【INFO】【2025-12-03 20:50:54】【阈值】桶 L1_age=40-50 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:50:54] 桶 bucket_id=L1_age=40-50 level=1：
    n_train=1863, n_val=869,
    BAC=0.799, F1=0.634, AUC=0.952,
    Regret=0.087, BND_ratio=0.025, POS_coverage=0.098,
    Score(f1_regret_bnd )=0.522
[INFO][BT][2025-12-03 20:50:54] 桶 bucket_id=L1_age=40-50：
    parent_id=ROOT，parent_Score=0.528, bucket_Score=0.522,
    Gain=-0.006, is_weak=True
【INFO】【2025-12-03 20:50:54】【阈值】桶 L1_age=50-60 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:50:54] 桶 bucket_id=L1_age=50-60 level

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-03 20:51:03】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-03 20:51:03] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=36169
[INFO][BT][2025-12-03 20:51:03] 创建桶 bucket_id=L1_age=30-40，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="30-40"，n_samples=14145
[INFO][BT][2025-12-03 20:51:03] 创建桶 bucket_id=L1_age=40-50，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="40-50"，n_samples=9039
[INFO][BT][2025-12-03 20:51:03] 创建桶 bucket_id=L1_age=50-60，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="50-60"，n_samples=6488
[INFO][BT][2025-12-03 20:51:03] 创建桶 bucket_id=L1_age=<=30，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule="<=30"，n_samples=5571
[INFO][BT][2025-12-03 20:51:03] 创建桶 bucket_id=L1_age=>60，level=1，parent_id=ROOT，split_name=L1_age，split_type=numeric_bin，split_rule=">60"，n_samples=926
[INFO][BT][2025-12-03 20:51:03] 



【INFO】【2025-12-03 20:51:06】【阈值】桶 L1_age=30-40 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:51:06] 桶 bucket_id=L1_age=30-40 level=1：
    n_train=2970, n_val=1320,
    BAC=0.802, F1=0.627, AUC=0.930,
    Regret=0.099, BND_ratio=0.030, POS_coverage=0.124,
    Score(f1_regret_bnd )=0.498
[INFO][BT][2025-12-03 20:51:06] 桶 bucket_id=L1_age=30-40：
    parent_id=ROOT，parent_Score=0.525, bucket_Score=0.498,
    Gain=-0.027, is_weak=True
【INFO】【2025-12-03 20:51:07】【阈值】桶 L1_age=40-50 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:51:07] 桶 bucket_id=L1_age=40-50 level=1：
    n_train=1895, n_val=848,
    BAC=0.771, F1=0.617, AUC=0.952,
    Regret=0.092, BND_ratio=0.077, POS_coverage=0.108,
    Score(f1_regret_bnd )=0.449
[INFO][BT][2025-12-03 20:51:07] 桶 bucket_id=L1_age=40-50：
    parent_id=ROOT，parent_Score=0.525, bucket_Score=0.449,
    Gain=-0.076, is_weak=True
【INFO】【2025-12-03 20:51:07】【阈值】桶 L1_age=50-60 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-03 20:51:07] 桶 bucket_id=L1_age=50-60 level

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-03 20:51:16】[BASELINE] 全局 XGB 模型训练完成
【INFO】【2025-12-03 20:51:16】[BASELINE] 阈值搜索开始
【INFO】【2025-12-03 20:51:16】[BASELINE] 最佳阈值找到: alpha=0.3000, beta=0.2000, regret=0.1096
【INFO】【2025-12-03 20:51:16】【桶树】已为样本生成桶ID，共 81 个组合
【INFO】【2025-12-03 20:51:16】[BASELINE] 测试集桶映射完成，共 81 个桶


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expecte

【INFO】【2025-12-03 20:51:18】[BASELINE] 桶 L1_age=30-40|L2_job=blue_collar|L3_contact=OTHER: BAC=0.8364, Regret=0.0454, Precision=0.4286, Recall=0.7059
【INFO】【2025-12-03 20:51:18】[BASELINE] 桶 L1_age=40-50|L2_job=white_collar|L3_contact=OTHER: BAC=0.6532, Regret=0.0596, Precision=0.2727, Recall=0.3333
【INFO】【2025-12-03 20:51:18】[BASELINE] 桶 L1_age=30-40|L2_job=blue_collar|L3_contact=cellular: BAC=0.8189, Regret=0.1164, Precision=0.5258, Recall=0.7183
【INFO】【2025-12-03 20:51:18】[BASELINE] 桶 L1_age=40-50|L2_job=white_collar|L3_contact=cellular: BAC=0.8305, Regret=0.1193, Precision=0.5556, Recall=0.7447
【INFO】【2025-12-03 20:51:18】[BASELINE] 桶 L1_age=40-50|L2_job=self_emp|L3_contact=OTHER: BAC=0.7914, Regret=0.0754, Precision=0.7500, Recall=0.6000
【INFO】【2025-12-03 20:51:18】[BASELINE] 桶 L1_age=>60|L2_job=retired|L3_contact=cellular: BAC=0.5870, Regret=0.4045, Precision=0.5000, Recall=0.7708
【INFO】【2025-12-03 20:51:18】[BASELINE] 桶 L1_age=30-40|L2_job=white_collar|L3_contact=cellular: BAC=0.8061

Unnamed: 0,model,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,BAC_mean,BAC_std,AUC_mean,...,MCC_mean,MCC_std,Kappa_mean,Kappa_std,BND_ratio_mean,BND_ratio_std,POS_Coverage_mean,POS_Coverage_std,Regret_mean,Regret_std
0,BTTWD,0.548081,0.009321,0.732271,0.01477,0.62689,0.010255,0.826132,0.007536,0.932804,...,0.577294,0.012045,0.569252,0.011773,0.050187,0.004988,0.139612,0.002874,0.11385,0.004633
1,LogReg,0.623769,0.018441,0.436565,0.014859,0.513241,0.00834,0.700773,0.006167,0.906632,...,0.470631,0.008159,0.461357,0.008428,0.0,0.0,,,0.155018,0.002173
2,KNN,0.560451,0.013462,0.508036,0.006213,0.532867,0.007157,0.727591,0.00321,0.87708,...,0.475154,0.008858,0.474378,0.008585,0.0,0.0,,,0.150107,0.002215
3,XGBoost,0.606381,0.010677,0.60389,0.013179,0.605067,0.010262,0.775969,0.006619,0.935239,...,0.552913,0.011429,0.552873,0.011455,0.0,0.0,,,0.127082,0.003287


【INFO】【2025-12-03 20:51:18】【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。


In [8]:
# 步骤7：桶级别分析
bucket_metrics_path = os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'bucket_metrics.csv')
if os.path.exists(bucket_metrics_path):
    bucket_metrics_df = pd.read_csv(bucket_metrics_path)
    display(bucket_metrics_df.head())
    bucket_metrics_df.plot(x='bucket_id', y='pos_rate_all', kind='bar', figsize=(12,4), title='桶正类比例')
    plt.ylabel('正类比例')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(fig_bucket, bbox_inches='tight')
    plt.close()
log_info('【步骤7摘要】桶级指标（如存在）已整理，可用于局部化分析。')



Unnamed: 0,bucket_id,layer,parent_bucket_id,n_train,n_val,pos_rate_train,pos_rate_val,alpha,beta,regret_val,...,is_weak,threshold_source_bucket,parent_with_threshold,n_test,pos_rate_test,BND_ratio_test,POS_Coverage_test,regret_test,fold,pos_rate
0,ROOT,L1,,7594,3252,0.115881,0.118081,0.3,0.2,0.112162,...,False,ROOT,,,,,,,1,0.116982
1,L1_age=30-40,L1,ROOT,2977,1295,0.100437,0.110425,0.3,0.2,0.096718,...,False,L1_age=30-40,,,,,,,1,0.101531
2,L1_age=40-50,L1,ROOT,1859,840,0.086606,0.079762,0.3,0.2,0.082143,...,True,ROOT,,,,,,,1,0.090015
3,L1_age=30-40|L2_job=white_collar,L2,L1_age=30-40,1648,702,0.113471,0.133903,0.3,0.2,0.097578,...,False,L1_age=30-40|L2_job=white_collar,,,,,,,1,0.117017
4,L1_age=50-60,L1,ROOT,1368,563,0.096491,0.090586,0.3,0.2,0.071048,...,False,L1_age=50-60,,,,,,,1,0.099423


  plt.tight_layout()


【INFO】【2025-12-03 20:51:24】【步骤7摘要】桶级指标（如存在）已整理，可用于局部化分析。


In [9]:
# 步骤8：结果汇总
log_info('【步骤8】检查结果文件与图表。')
results_dir = os.path.join(root_path, cfg['OUTPUT']['results_dir'])
figs_dir = os.path.join(root_path, cfg['OUTPUT']['figs_dir'])
os.makedirs(results_dir, exist_ok=True)
os.makedirs(figs_dir, exist_ok=True)
print(os.listdir(results_dir))
print(os.listdir(figs_dir))
log_info('【全部步骤完成】Bank Marketing 数据集的 BT-TWD 实验结束。')



【INFO】【2025-12-03 20:51:24】【步骤8】检查结果文件与图表。
['baseline_bucket_metrics.csv', 'bucket_fallback_stats.csv', 'bucket_metrics.csv', 'bucket_metrics_gain.csv', 'bucket_thresholds.csv', 'bucket_thresholds_per_fold.csv', 'bucket_tree_structure.csv', 'metrics_kfold_per_fold.csv', 'metrics_kfold_summary.csv', 'metrics_overview.csv']
['bank_class_distribution.png', 'bucket_metrics_bar.png', 'class_distribution.png', 'metrics_compare.png']
【INFO】【2025-12-03 20:51:24】【全部步骤完成】Bank Marketing 数据集的 BT-TWD 实验结束。
