Bank Marketing（bank-full）数据集 BTTWD 实验

本 notebook 按步骤运行：环境准备 → 加载配置 → 读取数据 → 预处理 → 桶树划分 → 基线与 BTTWD 实验 → 桶级分析。



In [1]:
# 步骤0：环境与路径设置
import os, sys
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# 将项目根目录加入路径，便于导入 bttwdlib
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_path not in sys.path:
    sys.path.append(root_path)

from bttwdlib import (
    load_yaml_cfg,
    show_cfg,
    load_dataset,
    prepare_features_and_labels,
    BucketTree,
    run_holdout_experiment,
    run_kfold_experiments,
    log_info,
    set_global_seed,
)

cfg_path = os.path.join(root_path, 'configs', 'bank_bttwd.yaml')
cfg = load_yaml_cfg(cfg_path)
set_global_seed(cfg.get('SEED', {}).get('global_seed', 42))
log_info('【步骤0摘要】环境准备完毕，路径与随机种子已设置。')



【INFO】【2025-11-26 20:19:52】【配置加载】已读取 e:\yan\组\三支决策\机器学习\BT_TWD\configs\bank_bttwd.yaml
【INFO】【2025-11-26 20:19:54】【步骤0摘要】环境准备完毕，路径与随机种子已设置。


In [2]:
# 步骤1：加载配置
show_cfg(cfg)
log_info('【步骤1摘要】配置文件加载完成，关键参数检查通过。')



【INFO】【2025-11-26 20:19:54】【配置-数据】数据集=bank_full, k折=5, 目标列=y, 正类="yes"
【INFO】【2025-11-26 20:19:54】【配置-BTTWD】阈值模式=None, 全局模型=xgb, 桶内模型=none, 后验估计器(兼容字段)=logreg
【INFO】【2025-11-26 20:19:54】【配置-基线】LogReg启用=True, RandomForest启用=False, KNN启用=True, XGBoost启用=True
【INFO】【2025-11-26 20:19:54】【步骤1摘要】配置文件加载完成，关键参数检查通过。


In [3]:
# 步骤2：加载原始数据
df_raw, target_col_model = load_dataset(cfg)  # 返回用于建模的标签列，例如 "y"

display(df_raw.head())
print('用于建模的标签列:', target_col_model)

# 1）画 0/1 标签比例
class_counts = df_raw[target_col_model].value_counts(normalize=True).sort_index()
ax = class_counts.plot(kind='bar', title='订购 vs 未订购比例')
plt.ylabel('比例')

fig_path = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bank_class_distribution.png')
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, bbox_inches='tight')
plt.close()

log_info('【步骤2摘要】Bank Marketing 数据加载与标签分布完成。')



【INFO】【2025-11-26 20:19:54】【数据加载】文本表格 E:\yan\组\三支决策\机器学习\BT_TWD\data\bank\bank-full.csv 已读取，样本数=45211，列数=17
【INFO】【2025-11-26 20:19:55】【数据加载】银行营销数据集已读取，标签已映射为0/1，样本数=45211，正类比例=11.70%
【INFO】【2025-11-26 20:19:55】【数据集信息】名称=bank_full，样本数=45211，目标列=y，正类比例=11.70%


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


用于建模的标签列: y
【INFO】【2025-11-26 20:19:55】【步骤2摘要】Bank Marketing 数据加载与标签分布完成。


In [4]:
# 步骤3：预处理与特征工程
X, y, meta = prepare_features_and_labels(df_raw, cfg)
log_info(f'【预处理】编码特征维度={X.shape[1]}，样本数={X.shape[0]}')
log_info(f"【步骤3摘要】特征预处理完成：连续={len(meta['continuous_cols'])}，类别={len(meta['categorical_cols'])}，编码维度={X.shape[1]}。")



【INFO】【2025-11-26 20:19:55】【预处理】连续特征=7个，类别特征=9个
【INFO】【2025-11-26 20:19:55】【预处理】编码后维度=42
【INFO】【2025-11-26 20:19:55】【预处理】编码特征维度=42，样本数=45211
【INFO】【2025-11-26 20:19:55】【步骤3摘要】特征预处理完成：连续=7，类别=9，编码维度=42。


In [5]:
# 步骤4：构建桶树并检查划分
feature_df_for_bucket = df_raw.drop(columns=[target_col_model])
bucket_tree = BucketTree(cfg['BTTWD']['bucket_levels'], feature_names=feature_df_for_bucket.columns.tolist())
bucket_ids_full = bucket_tree.assign_buckets(feature_df_for_bucket)
group_df = df_raw.groupby(bucket_ids_full)[target_col_model].agg(['size', 'mean']).reset_index()
group_df.columns = ['bucket_id', 'count', 'pos_rate']
bucket_df = group_df.sort_values('count', ascending=False).reset_index(drop=True)

display(bucket_df.head())
bucket_df.set_index('bucket_id')['count'].plot(kind='bar', figsize=(12,4), title='桶样本数分布')
fig_bucket = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bucket_metrics_bar.png')
plt.savefig(fig_bucket, bbox_inches='tight')
plt.close()
log_info(f'【步骤4摘要】桶树划分完成，共有 {bucket_ids_full.nunique()} 个叶子桶。')




【INFO】【2025-11-26 20:19:55】【桶树】已为样本生成桶ID，共 87 个组合


Unnamed: 0,bucket_id,count,pos_rate
0,L1_age=30-40|L2_job=white_collar|L3_contact=ce...,7237,0.141219
1,L1_age=40-50|L2_job=white_collar|L3_contact=ce...,3747,0.133974
2,L1_age=30-40|L2_job=blue_collar|L3_contact=cel...,3380,0.099704
3,L1_age=<=30|L2_job=white_collar|L3_contact=cel...,2434,0.199671
4,L1_age=30-40|L2_job=blue_collar|L3_contact=OTHER,2399,0.036682


【INFO】【2025-11-26 20:19:56】【步骤4摘要】桶树划分完成，共有 87 个叶子桶。


In [6]:
# 步骤5：运行基线模型实验占位
# 基线部分在 run_kfold_experiments 内统一调度（仅在 use_kfold=True 时执行）
log_info('【步骤5】基线模型（LogReg / XGBoost）将在交叉验证模式中一并运行。')
log_info('【步骤5摘要】基线模型性能将作为后续对比基准。')



【INFO】【2025-11-26 20:19:56】【步骤5】基线模型（LogReg / XGBoost）将在交叉验证模式中一并运行。
【INFO】【2025-11-26 20:19:56】【步骤5摘要】基线模型性能将作为后续对比基准。


In [7]:
# 步骤6：运行 BTTWD 实验（k 折或单次留出）
use_kfold_raw = cfg.get('DATA', {}).get('use_kfold', False)
if isinstance(use_kfold_raw, str):
    use_kfold = use_kfold_raw.strip().lower() in ['true', '1', 'yes']
else:
    use_kfold = bool(use_kfold_raw)

if use_kfold:
    log_info('【步骤6】检测到 use_kfold=True，进入 k 折实验。')
    results = run_kfold_experiments(X, y, feature_df_for_bucket, cfg)
    summary_df = pd.read_csv(os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'metrics_kfold_summary.csv'))
    display(summary_df)
    summary_df.plot(x='model', kind='bar', figsize=(8,4), title='模型指标对比')
    fig_compare = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'metrics_compare.png')
    plt.savefig(fig_compare, bbox_inches='tight')
    plt.close()
    log_info('【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。')
else:
    log_info('【步骤6】use_kfold=False，执行单次留出验证流程。')
    holdout_metrics = run_holdout_experiment(X, y, feature_df_for_bucket, cfg)
    display(pd.DataFrame(holdout_metrics))
    log_info('【步骤6摘要】单次留出验证完成，指标已列出。')



【INFO】【2025-11-26 20:19:56】【步骤6】检测到 use_kfold=True，进入 k 折实验。
【INFO】【2025-11-26 20:19:56】【基线-LogReg】使用决策阈值=0.300（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-11-26 20:19:57】【基线-LogReg】整体指标：AUC_mean=0.907, AUC_std=0.005, BAC_mean=0.743, BAC_std=0.006, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.557, F1_std=0.008, Kappa_mean=0.501, Kappa_std=0.009, MCC_mean=0.501, MCC_std=0.009, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.577, Precision_std=0.013, Recall_mean=0.539, Recall_std=0.014, Regret_mean=0.116, Regret_std=0.002
【INFO】【2025-11-26 20:19:57】【基线-KNN】使用决策阈值=0.300（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-26 20:19:59】【基线-KNN】整体指标：AUC_mean=0.877, AUC_std=0.005, BAC_mean=0.777, BAC_std=0.003, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.562, F1_std=0.007, Kappa_mean=0.496, Kappa_std=0.009, MCC_mean=0.501, MCC_std=0.008, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.502, Precision_std=0.010, Recall_mean=0.639, Recall_std=0.005, Regret_mean=0.119, Regret_std=0.002
【INFO】【2025-11-26 20:19:59】【基线-XGB】使用决策阈值=0.300（fixed 模式）


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-26 20:20:02】【基线-XGB】整体指标：AUC_mean=0.935, AUC_std=0.003, BAC_mean=0.821, BAC_std=0.006, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.630, F1_std=0.010, Kappa_mean=0.575, Kappa_std=0.011, MCC_mean=0.580, MCC_std=0.011, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.564, Precision_std=0.010, Recall_mean=0.714, Recall_std=0.010, Regret_mean=0.099, Regret_std=0.003
【INFO】【2025-11-26 20:20:02】【K折实验】正在执行第 1/5 折...
【INFO】【2025-11-26 20:20:02】【桶树】已为样本生成桶ID，共 87 个组合
【INFO】【2025-11-26 20:20:02】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-26 20:20:03] 桶 L1_age=30-40 子桶样本不足（最小子桶 n=24 < 30），不再细分
[INFO][BT][2025-11-26 20:20:03] 桶 L1_age=40-50 分裂前 Score=0.5713，层级 L1，样本 n=8943；子桶Score=[0.569035365715289, 0.5205853174603174, 0.5493084501705191, 0.5071894301247928, 0.4621807880940301, 0.5784697372119986]，Gain=-0.0150
[INFO][BT][2025-11-26 20:20:03] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:03] 桶 L1_age=50-60 分裂前 Score=0.5630，层级 L1，样本 n=6407；子桶Score=[0.540972833723682

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-26 20:20:04】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-26 20:20:05] 桶 L1_age=30-40 子桶样本不足（最小子桶 n=25 < 30），不再细分
[INFO][BT][2025-11-26 20:20:05] 桶 L1_age=40-50 分裂前 Score=0.5772，层级 L1，样本 n=9058；子桶Score=[0.5849560519490246, 0.5383317175221961, 0.5527906304222093, 0.5108210066699267, 0.69209271037182, 0.5733835308684774]，Gain=-0.0145
[INFO][BT][2025-11-26 20:20:05] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:05] 桶 L1_age=50-60 分裂前 Score=0.5860，层级 L1，样本 n=6487；子桶Score=[0.5707134811066785, 0.5957935446906035, 0.5783315133315133, 0.5974188541824402, 0.6605311355311355, 0.5773504859636537]，Gain=-0.0175
[INFO][BT][2025-11-26 20:20:05] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:05] 桶 L1_age=<=30 子桶样本不足（最小子桶 n=2 < 30），不再细分
[INFO][BT][2025-11-26 20:20:05] 桶 L1_age=>60 子桶样本不足（最小子桶 n=7 < 30），不再细分
[INFO][BT][2025-11-26 20:20:05] 桶 L1_age=40-50|L2_job=blue_collar 分裂前 Score=0.5850，层级 L2，样本 n=3428；子桶Score=[0.5737061113157397, 0.5713429585021856, 0.6034296601227866]，Gain=-0.0144
[INFO][BT

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-26 20:20:07】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-26 20:20:07] 桶 L1_age=30-40 分裂前 Score=0.5747，层级 L1，样本 n=14168；子桶Score=[0.5803238960127147, 0.5828986906751321, 0.42634408602150536, 0.5793303029801078, 0.6473915776241358, 0.5617220011155122]，Gain=-0.0147
[INFO][BT][2025-11-26 20:20:07] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:07] 桶 L1_age=40-50 分裂前 Score=0.5902，层级 L1，样本 n=8935；子桶Score=[0.5663836287428476, 0.5647715494066327, 0.41643214509068166, 0.511581360373609, 0.4997244268077601, 0.6072248022258795]，Gain=-0.0199
[INFO][BT][2025-11-26 20:20:07] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:07] 桶 L1_age=50-60 分裂前 Score=0.5781，层级 L1，样本 n=6450；子桶Score=[0.5857450796763812, 0.5232633604256596, 0.584423079623924, 0.5351892587701602, 0.5733091787439614, 0.5676988352453973]，Gain=-0.0161
[INFO][BT][2025-11-26 20:20:07] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:07] 桶 L1_age=<=30 子桶样本不足（最小子桶 n=3 < 30），不再细分
[INFO][BT][2025-11-26 20:20:07] 桶 L1_age=>60 子桶样本不足（最小子桶 n=9 < 

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-26 20:20:09】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-26 20:20:09] 桶 L1_age=30-40 分裂前 Score=0.5617，层级 L1，样本 n=14179；子桶Score=[0.5598154365288631, 0.521824360392487, 0.8802923387096775, 0.5715292106496018, 0.6396920175989943, 0.5534915280740749]，Gain=-0.0150
[INFO][BT][2025-11-26 20:20:09] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:09] 桶 L1_age=40-50 分裂前 Score=0.5715，层级 L1，样本 n=8981；子桶Score=[0.5646966841694908, 0.5159821680147628, 0.4333166833166833, 0.48923635120089315, 0.6471135454703529, 0.5819701917249618]，Gain=-0.0163
[INFO][BT][2025-11-26 20:20:09] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:09] 桶 L1_age=50-60 分裂前 Score=0.5648，层级 L1，样本 n=6436；子桶Score=[0.5580923743236399, 0.5521298185218242, 0.5283869815651947, 0.5091338385702452, 0.7091049382716048, 0.5697702384446294]，Gain=-0.0181
[INFO][BT][2025-11-26 20:20:09] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:09] 桶 L1_age=<=30 子桶样本不足（最小子桶 n=4 < 30），不再细分
[INFO][BT][2025-11-26 20:20:09] 桶 L1_age=>60 子桶样本不足（最小子桶 n=9 < 

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-26 20:20:11】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-26 20:20:12] 桶 L1_age=30-40 分裂前 Score=0.5825，层级 L1，样本 n=14145；子桶Score=[0.5667879829830779, 0.552852136468168, 0.42867943548387094, 0.622328073155168, 0.7008928571428571, 0.5782720030085479]，Gain=-0.0161
[INFO][BT][2025-11-26 20:20:12] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:12] 桶 L1_age=40-50 分裂前 Score=0.6072，层级 L1，样本 n=9039；子桶Score=[0.5766894301280192, 0.5949975111996018, 0.5049625468164795, 0.5983360575692075, 0.6301911667765326, 0.6155000767224028]，Gain=-0.0192
[INFO][BT][2025-11-26 20:20:12] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:12] 桶 L1_age=50-60 分裂前 Score=0.5819，层级 L1，样本 n=6488；子桶Score=[0.5748376889348006, 0.49127230323680754, 0.580794842132335, 0.5268081616666055, 0.5547619047619048, 0.5862066927616459]，Gain=-0.0178
[INFO][BT][2025-11-26 20:20:12] Gain 足够，进入下一层 L2
[INFO][BT][2025-11-26 20:20:12] 桶 L1_age=<=30 子桶样本不足（最小子桶 n=4 < 30），不再细分
[INFO][BT][2025-11-26 20:20:12] 桶 L1_age=>60 子桶样本不足（最小子桶 n=8 < 3

Unnamed: 0,model,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,BAC_mean,BAC_std,AUC_mean,...,MCC_mean,MCC_std,Kappa_mean,Kappa_std,BND_ratio_mean,BND_ratio_std,POS_Coverage_mean,POS_Coverage_std,Regret_mean,Regret_std
0,BTTWD,0.580311,0.009448,0.670446,0.011425,0.622103,0.009286,0.803098,0.005993,0.932804,...,0.569868,0.010637,0.567913,0.010609,0.124527,0.009573,0.05888,0.002894,0.107485,0.002059
1,LogReg,0.576542,0.013104,0.539229,0.013567,0.557057,0.007894,0.743338,0.005856,0.906632,...,0.501061,0.008821,0.500589,0.008741,0.0,0.0,,,0.115658,0.001931
2,KNN,0.501684,0.010269,0.638873,0.004895,0.561968,0.007008,0.777367,0.002961,0.87708,...,0.500742,0.00804,0.495888,0.008641,0.0,0.0,,,0.119092,0.00242
3,XGBoost,0.564129,0.010188,0.71431,0.009625,0.630377,0.009539,0.820584,0.005548,0.935239,...,0.580119,0.010965,0.574785,0.01112,0.0,0.0,,,0.098571,0.002866


【INFO】【2025-11-26 20:20:14】【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。


In [8]:
# 步骤7：桶级别分析
bucket_metrics_path = os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'bucket_metrics.csv')
if os.path.exists(bucket_metrics_path):
    bucket_metrics_df = pd.read_csv(bucket_metrics_path)
    display(bucket_metrics_df.head())
    bucket_metrics_df.plot(x='bucket_id', y='pos_rate_all', kind='bar', figsize=(12,4), title='桶正类比例')
    plt.ylabel('正类比例')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(fig_bucket, bbox_inches='tight')
    plt.close()
log_info('【步骤7摘要】桶级指标（如存在）已整理，可用于局部化分析。')



Unnamed: 0,bucket_id,layer,parent_bucket_id,n_train,n_val,pos_rate_train,pos_rate_val,alpha,beta,regret_val,...,threshold_n_samples,n_all,pos_rate_all,n_test,pos_rate_test,BND_ratio_test,POS_Coverage_test,regret_test,fold,pos_rate
0,L1_age=30-40,L1,,9897,4276,0.102455,0.099392,0.6,0.2,0.091265,...,4276,14173,0.101531,,,,,,1,0.101531
1,L1_age=<=30,L1,,3970,1705,0.15995,0.17478,0.6,0.2,0.153226,...,1705,5675,0.164405,,,,,,1,0.164405
2,L1_age=40-50|L2_job=white_collar,L2,L1_age=40-50,3079,1370,0.105229,0.111679,0.6,0.2,0.103285,...,1370,4449,0.107215,,,,,,1,0.107215
3,L1_age=50-60|L2_job=white_collar,L2,L1_age=50-60,1940,775,0.119588,0.122581,0.6,0.3,0.114516,...,775,2715,0.120442,,,,,,1,0.120442
4,L1_age=40-50|L2_job=blue_collar|L3_contact=cel...,L3,L1_age=40-50|L2_job=blue_collar,1245,576,0.096386,0.086806,0.6,0.3,0.085069,...,576,1821,0.093355,498.0,0.090361,0.0,0.018072,0.096888,1,0.093355


  plt.tight_layout()


【INFO】【2025-11-26 20:20:15】【步骤7摘要】桶级指标（如存在）已整理，可用于局部化分析。


In [9]:
# 步骤8：结果汇总
log_info('【步骤8】检查结果文件与图表。')
results_dir = os.path.join(root_path, cfg['OUTPUT']['results_dir'])
figs_dir = os.path.join(root_path, cfg['OUTPUT']['figs_dir'])
os.makedirs(results_dir, exist_ok=True)
os.makedirs(figs_dir, exist_ok=True)
print(os.listdir(results_dir))
print(os.listdir(figs_dir))
log_info('【全部步骤完成】Bank Marketing 数据集的 BT-TWD 实验结束。')



【INFO】【2025-11-26 20:20:15】【步骤8】检查结果文件与图表。
['bucket_metrics.csv', 'bucket_thresholds_per_fold.csv', 'metrics_kfold_per_fold.csv', 'metrics_kfold_summary.csv']
['bank_class_distribution.png', 'bucket_metrics_bar.png', 'class_distribution.png', 'metrics_compare.png']
【INFO】【2025-11-26 20:20:15】【全部步骤完成】Bank Marketing 数据集的 BT-TWD 实验结束。
