hospital 数据集 BT-TWD 可行性实验

本 notebook 按步骤运行：加载配置 → 读取数据 → 预处理 → 桶树划分 → 基线与 BTTWD k 折实验 → 桶级分析。

In [1]:
# 步骤0：环境与路径设置
import os, sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# 将项目根目录加入路径，便于导入 bttwdlib
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_path not in sys.path:
    sys.path.append(root_path)

from bttwdlib import (
    load_yaml_cfg,
    show_cfg,
    load_dataset,
    prepare_features_and_labels,
    BucketTree,
    run_kfold_experiments,
    log_info,
    set_global_seed,
)

cfg_path = Path(root_path) / "configs" / "hospital_bttwd.yaml"
cfg = load_yaml_cfg(cfg_path)
set_global_seed(cfg.get('SEED', {}).get('global_seed', 42))
log_info('【步骤0摘要】环境准备完毕，路径与随机种子已设置。')

【INFO】【2025-12-22 15:31:58】【配置加载】已读取 e:\yan\组\三支决策\机器学习\BT_TWD\configs\hospital_bttwd.yaml
【INFO】【2025-12-22 15:32:05】【步骤0摘要】环境准备完毕，路径与随机种子已设置。


In [2]:
# 步骤1：加载配置
show_cfg(cfg)
log_info('【步骤1摘要】配置文件加载完成，关键参数检查通过。')

【INFO】【2025-12-22 15:32:05】【配置-数据】数据集=hospital_readmissions, k折=5, 目标列=readmitted, 正类="yes"
【INFO】【2025-12-22 15:32:05】【配置-BTTWD】阈值模式=bucket_wise, 全局模型=xgb, 桶内模型=none, 后验估计器(兼容字段)=logreg
【INFO】【2025-12-22 15:32:05】【配置-基线】LogReg启用=True, RandomForest启用=True, KNN启用=True, XGBoost启用=True
【INFO】【2025-12-22 15:32:05】【步骤1摘要】配置文件加载完成，关键参数检查通过。


In [3]:
# 步骤2：加载原始数据
df_raw, target_col_model = load_dataset(cfg)  # 这里返回的是用于建模的标签列，例如 "label"

display(df_raw.head())
print("用于建模的标签列:", target_col_model)

# 1）画 0/1 标签（流失/未流失）的比例
class_counts = df_raw[target_col_model].value_counts(normalize=True)
ax = class_counts.plot(kind='bar', title='流失 vs 未流失比例')
plt.ylabel('比例')

fig_path = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'class_distribution.png')
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, bbox_inches='tight')
plt.close()

# 2）如果想看原始标签列的分布，可以另外单独分析：
raw_target_col = cfg['DATA']['target_col']  # 这里是原始标签列
print("原始目标列:", raw_target_col)
print(df_raw[raw_target_col].describe())

log_info('【步骤2摘要】Telco Churn 原始数据加载与基本统计完成。')


【INFO】【2025-12-22 15:32:06】【数据加载】文本表格 ..\data\hospital\hospital_readmissions.csv 已读取，样本数=25000，列数=17
【INFO】【2025-12-22 15:32:06】【数据加载】标签列 readmitted 已处理完成：dropna_target=False, 丢弃样本=0, 最终样本数=25000, 正类比例=47.02%
【INFO】【2025-12-22 15:32:06】【数据集信息】名称=hospital_readmissions，样本数=25000，目标列=readmitted，正类比例=47.02%


Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,0
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,0
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,1
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,1
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,0


用于建模的标签列: readmitted
原始目标列: readmitted
count    25000.000000
mean         0.470160
std          0.499119
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: readmitted, dtype: float64
【INFO】【2025-12-22 15:32:06】【步骤2摘要】Telco Churn 原始数据加载与基本统计完成。


In [4]:
# 步骤3：预处理与特征工程
X, y, meta = prepare_features_and_labels(df_raw, cfg)
log_info(f'【预处理】编码特征维度={X.shape[1]}，样本数={X.shape[0]}')
log_info(f"【步骤3摘要】特征预处理完成：连续={len(meta['continuous_cols'])}，类别={len(meta['categorical_cols'])}，编码维度={X.shape[1]}。")

【INFO】【2025-12-22 15:32:06】【预处理】连续特征=7个，类别特征=9个
【INFO】【2025-12-22 15:32:06】【预处理】编码后维度=45
【INFO】【2025-12-22 15:32:06】【预处理】编码特征维度=45，样本数=25000
【INFO】【2025-12-22 15:32:06】【步骤3摘要】特征预处理完成：连续=7，类别=9，编码维度=45。


In [5]:
# 步骤4：构建桶树并检查划分
feature_cols_for_bucket = [c for c in df_raw.columns if c != target_col_model]

bucket_tree = BucketTree(
    cfg['BTTWD']['bucket_levels'],
    feature_names=feature_cols_for_bucket
)

bucket_ids_full = bucket_tree.assign_buckets(df_raw[feature_cols_for_bucket])

bucket_df = bucket_ids_full.value_counts().reset_index()
bucket_df.columns = ['bucket_id', 'count']

bucket_df['pos_rate'] = (
    df_raw.groupby(bucket_ids_full)[target_col_model]
    .apply(lambda s: (s == 1).mean())
    .values
)
display(bucket_df.head())
bucket_df.set_index('bucket_id')['count'].plot(kind='bar', figsize=(12,4), title='桶样本数分布')
fig_bucket = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bucket_metrics_bar.png')
plt.savefig(fig_bucket, bbox_inches='tight')
plt.close()
log_info(f'【步骤4摘要】桶树划分完成，共有 {bucket_ids_full.nunique()} 个叶子桶。')


【INFO】【2025-12-22 15:32:06】【桶树】已为样本生成桶ID，共 45 个组合


Unnamed: 0,bucket_id,count,pos_rate
0,L1_age=young_old|L2_medical_specialty=Others|L...,3671,0.333333
1,L1_age=young_old|L2_medical_specialty=Others|L...,2874,0.44086
2,L1_age=mid_age|L2_medical_specialty=Others|L3_...,2148,0.403636
3,L1_age=mid_age|L2_medical_specialty=Others|L3_...,1464,0.471429
4,L1_age=oldest|L2_medical_specialty=Others|L3_t...,1315,0.471299


【INFO】【2025-12-22 15:32:07】【步骤4摘要】桶树划分完成，共有 45 个叶子桶。


In [6]:
# 步骤5：运行基线模型 k 折实验
# 基线部分在 run_kfold_experiments 内统一调度
log_info('【步骤5】基线模型将在整体交叉验证中一并运行。')
log_info('【步骤5摘要】基线模型性能将作为后续对比基准。')

【INFO】【2025-12-22 15:32:07】【步骤5】基线模型将在整体交叉验证中一并运行。
【INFO】【2025-12-22 15:32:07】【步骤5摘要】基线模型性能将作为后续对比基准。


In [7]:
import numpy as np
import pandas as pd

print("y 全局标签分布：", np.unique(y, return_counts=True))

print("原始标签列分布：")
print(df_raw[cfg['DATA']['target_col']].value_counts())


y 全局标签分布： (array([0, 1]), array([13246, 11754], dtype=int64))
原始标签列分布：
readmitted
0    13246
1    11754
Name: count, dtype: int64


In [8]:
# 步骤6：运行 BTTWD k 折实验（含基线）
results = run_kfold_experiments(X, y, df_raw.drop(columns=[cfg['DATA']['target_col']]), cfg)
summary_df = pd.read_csv(os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'metrics_kfold_summary.csv'))
display(summary_df)
summary_df.plot(x='model', kind='bar', figsize=(8,4), title='模型指标对比')
fig_compare = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'metrics_compare.png')
plt.savefig(fig_compare, bbox_inches='tight')
plt.close()
log_info('【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。')

【INFO】【2025-12-22 15:32:07】【基线-LogReg】使用决策阈值=0.200（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-12-22 15:32:09】【基线-LogReg】整体指标：AUC_mean=0.646, AUC_std=0.003, BAC_mean=0.501, BAC_std=0.000, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.640, F1_std=0.000, Kappa_mean=0.001, Kappa_std=0.001, MCC_mean=0.018, MCC_std=0.010, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.470, Precision_std=0.000, Recall_mean=1.000, Recall_std=0.000, Regret_mean=0.529, Regret_std=0.000
【INFO】【2025-12-22 15:32:09】【基线-RF】使用决策阈值=0.200（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-12-22 15:32:16】【基线-RF】整体指标：AUC_mean=0.639, AUC_std=0.005, BAC_mean=0.508, BAC_std=0.000, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.641, F1_std=0.001, Kappa_mean=0.015, Kappa_std=0.001, MCC_mean=0.057, MCC_std=0.004, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.474, Precision_std=0.000, Recall_mean=0.989, Recall_std=0.002, Regret_mean=0.531, Regret_std=0.002
【INFO】【2025-12-22 15:32:16】【基线-KNN】使用决策阈值=0.200（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:32:18】【基线-KNN】整体指标：AUC_mean=0.585, AUC_std=0.003, BAC_mean=0.536, BAC_std=0.003, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.622, F1_std=0.002, Kappa_mean=0.069, Kappa_std=0.005, MCC_mean=0.091, MCC_std=0.007, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.492, Precision_std=0.002, Recall_mean=0.843, Recall_std=0.005, Regret_mean=0.630, Regret_std=0.006
【INFO】【2025-12-22 15:32:18】【基线-XGB】使用决策阈值=0.200（fixed 模式）


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:32:21】【基线-XGB】整体指标：AUC_mean=0.656, AUC_std=0.005, BAC_mean=0.511, BAC_std=0.002, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.642, F1_std=0.001, Kappa_mean=0.022, Kappa_std=0.003, MCC_mean=0.073, MCC_std=0.009, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.476, Precision_std=0.001, Recall_mean=0.987, Recall_std=0.001, Regret_mean=0.529, Regret_std=0.002
【INFO】【2025-12-22 15:32:21】【K折实验】正在执行第 1/5 折...
【INFO】【2025-12-22 15:32:21】[BT] 使用桶评分配置：mode=f1_regret_bnd, f1_weight=1.0, regret_weight=1.0, bnd_weight=0.5
【INFO】【2025-12-22 15:32:21】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-22 15:32:21] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=20000
[INFO][BT][2025-12-22 15:32:21] 创建桶 bucket_id=L1_age=mid_age，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="mid_age"，n_samples=5571
[INFO][BT][2025-12-22 15:32:21] 创建桶 bucket_id=L1_age=oldest，level=1，parent_id=ROOT，split_



【INFO】【2025-12-22 15:32:23】【阈值】桶 ROOT（n_val=1198，source=val) 使用本地阈值 α=0.3000, β=0.1000
[INFO][BT][2025-12-22 15:32:23] 桶 bucket_id=ROOT level=0：
    n_train=2799, n_val=1198,
    BAC=0.453, F1=0.654, AUC=0.656,
    Regret=0.513, BND_ratio=0.144, POS_coverage=0.856,
    Score(f1_regret_bnd )=0.069，threshold_source=val
【INFO】【2025-12-22 15:32:23】【阈值】桶 L1_age=mid_age 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-22 15:32:23] 桶 bucket_id=L1_age=mid_age level=1：
    n_train=787, n_val=396,
    BAC=0.381, F1=0.627, AUC=0.682,
    Regret=0.528, BND_ratio=0.422, POS_coverage=0.525,
    Score(f1_regret_bnd )=-0.112，threshold_source=val
[INFO][BT][2025-12-22 15:32:23] 桶 bucket_id=L1_age=mid_age：
    parent_id=ROOT，parent_Score=0.069, bucket_Score=-0.112,
    Gain=-0.181, is_weak=True
【INFO】【2025-12-22 15:32:23】【阈值】桶 L1_age=oldest（n_val=300，source=val) 使用本地阈值 α=0.2000, β=0.0000
[INFO][BT][2025-12-22 15:32:23] 桶 bucket_id=L1_age=oldest level=1：
    n_train=579, n_val=300,
    BAC=0.496, F1=0.648, AUC=

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:32:37】[BASELINE] 全局 XGB 模型训练完成
【INFO】【2025-12-22 15:32:37】[BASELINE] 阈值搜索开始
【INFO】【2025-12-22 15:32:37】[BASELINE] 最佳阈值找到: alpha=0.2000, beta=0.1000, regret=0.5241
【INFO】【2025-12-22 15:32:37】【桶树】已为样本生成桶ID，共 45 个组合
【INFO】【2025-12-22 15:32:37】[BASELINE] 测试集桶映射完成，共 45 个桶
【INFO】【2025-12-22 15:32:38】[BASELINE] 桶 L1_age=young_old|L2_medical_specialty=Others|L3_time_in_hospital=long_stay: BAC=0.4961, Regret=0.5000, Precision=0.5020, Recall=0.9921
【INFO】【2025-12-22 15:32:38】[BASELINE] 桶 L1_age=young_old|L2_medical_specialty=Others|L3_time_in_hospital=short_stay: BAC=0.5104, Regret=0.5596, Precision=0.4431, Recall=0.9755
【INFO】【2025-12-22 15:32:38】[BASELINE] 桶 L1_age=oldest|L2_medical_specialty=Others|L3_time_in_hospital=long_stay: BAC=0.5000, Regret=0.5052, Precision=0.5000, Recall=0.9792
【INFO】【2025-12-22 15:32:38】[BASELINE] 桶 L1_age=mid_age|L2_medical_specialty=InternalMed|L3_time_in_hospital=mid_stay: BAC=0.5000, Regret=0.4521, Precision=0.5479, Recall=1.0000
【INFO】【2025

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:32:39】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-22 15:32:39] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=20000
[INFO][BT][2025-12-22 15:32:39] 创建桶 bucket_id=L1_age=mid_age，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="mid_age"，n_samples=5610
[INFO][BT][2025-12-22 15:32:39] 创建桶 bucket_id=L1_age=oldest，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="oldest"，n_samples=4239
[INFO][BT][2025-12-22 15:32:39] 创建桶 bucket_id=L1_age=young_old，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="young_old"，n_samples=10151
[INFO][BT][2025-12-22 15:32:39] 创建桶 bucket_id=L1_age=mid_age|L2_medical_specialty=Cardio，level=2，parent_id=L1_age=mid_age，split_name=L2_medical_specialty，split_type=category_group，split_rule="Cardio"，n_samples=321
[INFO][BT][2025-12-22 15:32:39] 创建桶 bucket_id=L1_age=mid_age|L2_medical_specialty=InternalMed，level=2，



【INFO】【2025-12-22 15:32:41】【阈值】桶 ROOT（n_val=1200，source=val) 使用本地阈值 α=0.1000, β=0.0000
[INFO][BT][2025-12-22 15:32:41] 桶 bucket_id=ROOT level=0：
    n_train=2799, n_val=1200,
    BAC=0.499, F1=0.638, AUC=0.639,
    Regret=0.531, BND_ratio=0.005, POS_coverage=0.995,
    Score(f1_regret_bnd )=0.104，threshold_source=val
【INFO】【2025-12-22 15:32:41】【阈值】桶 L1_age=mid_age 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-22 15:32:41] 桶 bucket_id=L1_age=mid_age level=1：
    n_train=787, n_val=402,
    BAC=0.423, F1=0.618, AUC=0.663,
    Regret=0.544, BND_ratio=0.244, POS_coverage=0.756,
    Score(f1_regret_bnd )=-0.047，threshold_source=val
[INFO][BT][2025-12-22 15:32:41] 桶 bucket_id=L1_age=mid_age：
    parent_id=ROOT，parent_Score=0.104, bucket_Score=-0.047,
    Gain=-0.152, is_weak=True
【INFO】【2025-12-22 15:32:41】【阈值】桶 L1_age=oldest（n_val=305，source=val) 使用本地阈值 α=0.1000, β=0.0000
[INFO][BT][2025-12-22 15:32:41] 桶 bucket_id=L1_age=oldest level=1：
    n_train=580, n_val=305,
    BAC=0.497, F1=0.653, AUC=

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:32:54】[BASELINE] 全局 XGB 模型训练完成
【INFO】【2025-12-22 15:32:54】[BASELINE] 阈值搜索开始
【INFO】【2025-12-22 15:32:54】[BASELINE] 最佳阈值找到: alpha=0.3000, beta=0.0000, regret=0.5253
【INFO】【2025-12-22 15:32:54】【桶树】已为样本生成桶ID，共 45 个组合
【INFO】【2025-12-22 15:32:54】[BASELINE] 测试集桶映射完成，共 45 个桶
【INFO】【2025-12-22 15:32:55】[BASELINE] 桶 L1_age=young_old|L2_medical_specialty=Others|L3_time_in_hospital=short_stay: BAC=0.5449, Regret=0.5300, Precision=0.4946, Recall=0.8914
【INFO】【2025-12-22 15:32:55】[BASELINE] 桶 L1_age=young_old|L2_medical_specialty=PrimaryCare|L3_time_in_hospital=long_stay: BAC=0.5000, Regret=0.5455, Precision=0.4545, Recall=1.0000
【INFO】【2025-12-22 15:32:55】[BASELINE] 桶 L1_age=mid_age|L2_medical_specialty=Others|L3_time_in_hospital=short_stay: BAC=0.5937, Regret=0.5819, Precision=0.4308, Recall=0.8187
【INFO】【2025-12-22 15:32:55】[BASELINE] 桶 L1_age=mid_age|L2_medical_specialty=Others|L3_time_in_hospital=mid_stay: BAC=0.5684, Regret=0.4700, Precision=0.5565, Recall=0.9275
【INFO】【20

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:32:56】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-22 15:32:56] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=20000
[INFO][BT][2025-12-22 15:32:56] 创建桶 bucket_id=L1_age=mid_age，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="mid_age"，n_samples=5651
[INFO][BT][2025-12-22 15:32:56] 创建桶 bucket_id=L1_age=oldest，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="oldest"，n_samples=4165
[INFO][BT][2025-12-22 15:32:56] 创建桶 bucket_id=L1_age=young_old，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="young_old"，n_samples=10184
[INFO][BT][2025-12-22 15:32:56] 创建桶 bucket_id=L1_age=mid_age|L2_medical_specialty=Cardio，level=2，parent_id=L1_age=mid_age，split_name=L2_medical_specialty，split_type=category_group，split_rule="Cardio"，n_samples=326
[INFO][BT][2025-12-22 15:32:56] 创建桶 bucket_id=L1_age=mid_age|L2_medical_specialty=InternalMed，level=2，



【INFO】【2025-12-22 15:32:58】【阈值】桶 ROOT（n_val=1199，source=val) 使用本地阈值 α=0.3000, β=0.0000
[INFO][BT][2025-12-22 15:32:58] 桶 bucket_id=ROOT level=0：
    n_train=2799, n_val=1199,
    BAC=0.458, F1=0.642, AUC=0.640,
    Regret=0.524, BND_ratio=0.142, POS_coverage=0.858,
    Score(f1_regret_bnd )=0.047，threshold_source=val
【INFO】【2025-12-22 15:32:58】【阈值】桶 L1_age=mid_age 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-22 15:32:58] 桶 bucket_id=L1_age=mid_age level=1：
    n_train=804, n_val=399,
    BAC=0.490, F1=0.600, AUC=0.661,
    Regret=0.565, BND_ratio=0.073, POS_coverage=0.925,
    Score(f1_regret_bnd )=-0.001，threshold_source=val
[INFO][BT][2025-12-22 15:32:58] 桶 bucket_id=L1_age=mid_age：
    parent_id=ROOT，parent_Score=0.047, bucket_Score=-0.001,
    Gain=-0.048, is_weak=True
【INFO】【2025-12-22 15:32:59】【阈值】桶 L1_age=oldest（n_val=376，source=val) 使用本地阈值 α=0.2000, β=0.1000
[INFO][BT][2025-12-22 15:32:59] 桶 bucket_id=L1_age=oldest level=1：
    n_train=575, n_val=376,
    BAC=0.500, F1=0.667, AUC=

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:33:12】[BASELINE] 全局 XGB 模型训练完成
【INFO】【2025-12-22 15:33:12】[BASELINE] 阈值搜索开始
【INFO】【2025-12-22 15:33:13】[BASELINE] 最佳阈值找到: alpha=0.3000, beta=0.0000, regret=0.5215
【INFO】【2025-12-22 15:33:13】【桶树】已为样本生成桶ID，共 45 个组合
【INFO】【2025-12-22 15:33:13】[BASELINE] 测试集桶映射完成，共 45 个桶
【INFO】【2025-12-22 15:33:14】[BASELINE] 桶 L1_age=mid_age|L2_medical_specialty=Others|L3_time_in_hospital=short_stay: BAC=0.5974, Regret=0.5475, Precision=0.5061, Recall=0.7622
【INFO】【2025-12-22 15:33:14】[BASELINE] 桶 L1_age=mid_age|L2_medical_specialty=Others|L3_time_in_hospital=mid_stay: BAC=0.5903, Regret=0.4831, Precision=0.5403, Recall=0.9306
【INFO】【2025-12-22 15:33:14】[BASELINE] 桶 L1_age=young_old|L2_medical_specialty=Others|L3_time_in_hospital=short_stay: BAC=0.5667, Regret=0.5120, Precision=0.5172, Recall=0.8902
【INFO】【2025-12-22 15:33:14】[BASELINE] 桶 L1_age=young_old|L2_medical_specialty=Others|L3_time_in_hospital=long_stay: BAC=0.5506, Regret=0.5000, Precision=0.5138, Recall=0.9492
【INFO】【2025-12

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:33:18】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-22 15:33:18] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=20000
[INFO][BT][2025-12-22 15:33:18] 创建桶 bucket_id=L1_age=mid_age，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="mid_age"，n_samples=5519
[INFO][BT][2025-12-22 15:33:18] 创建桶 bucket_id=L1_age=oldest，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="oldest"，n_samples=4263
[INFO][BT][2025-12-22 15:33:18] 创建桶 bucket_id=L1_age=young_old，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="young_old"，n_samples=10218
[INFO][BT][2025-12-22 15:33:18] 创建桶 bucket_id=L1_age=mid_age|L2_medical_specialty=Cardio，level=2，parent_id=L1_age=mid_age，split_name=L2_medical_specialty，split_type=category_group，split_rule="Cardio"，n_samples=320
[INFO][BT][2025-12-22 15:33:18] 创建桶 bucket_id=L1_age=mid_age|L2_medical_specialty=InternalMed，level=2，



【INFO】【2025-12-22 15:33:19】【阈值】桶 ROOT（n_val=1199，source=val) 使用本地阈值 α=0.2000, β=0.1000
[INFO][BT][2025-12-22 15:33:19] 桶 bucket_id=ROOT level=0：
    n_train=2799, n_val=1199,
    BAC=0.497, F1=0.664, AUC=0.641,
    Regret=0.502, BND_ratio=0.027, POS_coverage=0.971,
    Score(f1_regret_bnd )=0.149，threshold_source=val
【INFO】【2025-12-22 15:33:20】【阈值】桶 L1_age=mid_age 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-22 15:33:20] 桶 bucket_id=L1_age=mid_age level=1：
    n_train=783, n_val=389,
    BAC=0.324, F1=0.611, AUC=0.682,
    Regret=0.539, BND_ratio=0.512, POS_coverage=0.488,
    Score(f1_regret_bnd )=-0.183，threshold_source=val
[INFO][BT][2025-12-22 15:33:20] 桶 bucket_id=L1_age=mid_age：
    parent_id=ROOT，parent_Score=0.149, bucket_Score=-0.183,
    Gain=-0.332, is_weak=True
【INFO】【2025-12-22 15:33:20】【阈值】桶 L1_age=oldest 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-22 15:33:20] 桶 bucket_id=L1_age=oldest level=1：
    n_train=592, n_val=374,
    BAC=0.495, F1=0.662, AUC=0.584,
    Regret=0.505, 

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:33:32】[BASELINE] 全局 XGB 模型训练完成
【INFO】【2025-12-22 15:33:32】[BASELINE] 阈值搜索开始
【INFO】【2025-12-22 15:33:32】[BASELINE] 最佳阈值找到: alpha=0.3000, beta=0.0000, regret=0.5232
【INFO】【2025-12-22 15:33:32】【桶树】已为样本生成桶ID，共 45 个组合
【INFO】【2025-12-22 15:33:32】[BASELINE] 测试集桶映射完成，共 45 个桶
【INFO】【2025-12-22 15:33:34】[BASELINE] 桶 L1_age=young_old|L2_medical_specialty=InternalMed|L3_time_in_hospital=short_stay: BAC=0.5844, Regret=0.5432, Precision=0.4771, Recall=0.8814
【INFO】【2025-12-22 15:33:34】[BASELINE] 桶 L1_age=oldest|L2_medical_specialty=InternalMed|L3_time_in_hospital=mid_stay: BAC=0.5039, Regret=0.5256, Precision=0.5152, Recall=0.8500
【INFO】【2025-12-22 15:33:34】[BASELINE] 桶 L1_age=mid_age|L2_medical_specialty=InternalMed|L3_time_in_hospital=mid_stay: BAC=0.5740, Regret=0.5000, Precision=0.5156, Recall=0.9429
【INFO】【2025-12-22 15:33:34】[BASELINE] 桶 L1_age=young_old|L2_medical_specialty=Others|L3_time_in_hospital=long_stay: BAC=0.5167, Regret=0.5258, Precision=0.4851, Recall=0.9500
【I

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:33:34】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-22 15:33:35] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=20000
[INFO][BT][2025-12-22 15:33:35] 创建桶 bucket_id=L1_age=mid_age，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="mid_age"，n_samples=5585
[INFO][BT][2025-12-22 15:33:35] 创建桶 bucket_id=L1_age=oldest，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="oldest"，n_samples=4192
[INFO][BT][2025-12-22 15:33:35] 创建桶 bucket_id=L1_age=young_old，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="young_old"，n_samples=10223
[INFO][BT][2025-12-22 15:33:35] 创建桶 bucket_id=L1_age=mid_age|L2_medical_specialty=Cardio，level=2，parent_id=L1_age=mid_age，split_name=L2_medical_specialty，split_type=category_group，split_rule="Cardio"，n_samples=326
[INFO][BT][2025-12-22 15:33:35] 创建桶 bucket_id=L1_age=mid_age|L2_medical_specialty=InternalMed，level=2，



【INFO】【2025-12-22 15:33:36】【阈值】桶 ROOT（n_val=1199，source=val) 使用本地阈值 α=0.2000, β=0.0000
[INFO][BT][2025-12-22 15:33:36] 桶 bucket_id=ROOT level=0：
    n_train=2799, n_val=1199,
    BAC=0.495, F1=0.659, AUC=0.621,
    Regret=0.508, BND_ratio=0.028, POS_coverage=0.972,
    Score(f1_regret_bnd )=0.137，threshold_source=val
【INFO】【2025-12-22 15:33:36】【阈值】桶 L1_age=mid_age 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-22 15:33:36] 桶 bucket_id=L1_age=mid_age level=1：
    n_train=780, n_val=334,
    BAC=0.439, F1=0.634, AUC=0.672,
    Regret=0.528, BND_ratio=0.210, POS_coverage=0.787,
    Score(f1_regret_bnd )=0.001，threshold_source=val
[INFO][BT][2025-12-22 15:33:36] 桶 bucket_id=L1_age=mid_age：
    parent_id=ROOT，parent_Score=0.137, bucket_Score=0.001,
    Gain=-0.136, is_weak=True
【INFO】【2025-12-22 15:33:36】【阈值】桶 L1_age=oldest 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-22 15:33:36] 桶 bucket_id=L1_age=oldest level=1：
    n_train=582, n_val=380,
    BAC=0.474, F1=0.634, AUC=0.600,
    Regret=0.534, BN

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-22 15:33:50】[BASELINE] 全局 XGB 模型训练完成
【INFO】【2025-12-22 15:33:50】[BASELINE] 阈值搜索开始
【INFO】【2025-12-22 15:33:50】[BASELINE] 最佳阈值找到: alpha=0.3000, beta=0.1000, regret=0.5204
【INFO】【2025-12-22 15:33:50】【桶树】已为样本生成桶ID，共 45 个组合
【INFO】【2025-12-22 15:33:50】[BASELINE] 测试集桶映射完成，共 45 个桶
【INFO】【2025-12-22 15:33:52】[BASELINE] 桶 L1_age=mid_age|L2_medical_specialty=Others|L3_time_in_hospital=mid_stay: BAC=0.5840, Regret=0.4863, Precision=0.5301, Recall=0.9429
【INFO】【2025-12-22 15:33:52】[BASELINE] 桶 L1_age=oldest|L2_medical_specialty=PrimaryCare|L3_time_in_hospital=short_stay: BAC=0.5577, Regret=0.5698, Precision=0.4250, Recall=1.0000
【INFO】【2025-12-22 15:33:52】[BASELINE] 桶 L1_age=oldest|L2_medical_specialty=InternalMed|L3_time_in_hospital=short_stay: BAC=0.6034, Regret=0.4918, Precision=0.5192, Recall=0.9643
【INFO】【2025-12-22 15:33:52】[BASELINE] 桶 L1_age=oldest|L2_medical_specialty=InternalMed|L3_time_in_hospital=mid_stay: BAC=0.5278, Regret=0.5986, Precision=0.4098, Recall=0.8929
【INFO】【

Unnamed: 0,model,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,BAC_mean,BAC_std,AUC_mean,...,MCC_mean,MCC_std,Kappa_mean,Kappa_std,BND_ratio_mean,BND_ratio_std,POS_Coverage_mean,POS_Coverage_std,Regret_mean,Regret_std
0,BTTWD,0.487137,0.001709,0.955504,0.007202,0.645287,0.002922,0.531429,0.003141,0.649473,...,0.117409,0.01379,0.059628,0.005949,0.10504,0.032246,0.88756,0.031898,0.52256,0.001286
1,LogReg,0.470421,0.00027,0.999915,0.00017,0.639828,0.000223,0.500524,0.000428,0.646378,...,0.018265,0.010084,0.000985,0.000806,0.0,0.0,,,0.52936,0.000388
2,RandomForest,0.474191,0.000304,0.988684,0.001839,0.640964,0.000556,0.507931,0.000485,0.639255,...,0.057103,0.003986,0.014949,0.000916,0.0,0.0,,,0.5314,0.001876
3,KNN,0.492499,0.001811,0.843117,0.005035,0.621782,0.00228,0.536083,0.002764,0.584971,...,0.090894,0.006801,0.069386,0.005331,0.0,0.0,,,0.62976,0.006158
4,XGBoost,0.475988,0.000858,0.987409,0.000876,0.642334,0.000894,0.511408,0.001664,0.655854,...,0.073222,0.008968,0.021514,0.003141,0.0,0.0,,,0.52884,0.002271


【INFO】【2025-12-22 15:33:53】【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。


In [9]:
# 步骤7：桶级别分析
bucket_metrics_path = os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'bucket_metrics.csv')
if os.path.exists(bucket_metrics_path):
    bucket_metrics_df = pd.read_csv(bucket_metrics_path)
    display(bucket_metrics_df.head())
    bucket_metrics_df.plot(x='bucket_id', y='pos_rate_all', kind='bar', figsize=(12,4), title='桶正类比例')
    plt.ylabel('正类比例')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(fig_bucket, bbox_inches='tight')
    plt.close()
log_info('【步骤7摘要】桶级指标已整理，可用于局部化分析。')

Unnamed: 0,bucket_id,layer,parent_bucket_id,n_train,n_val,pos_rate_train,pos_rate_val,alpha,beta,regret_val,...,use_gain_weak_backoff,threshold_data_source,parent_with_threshold,n_test,pos_rate_test,BND_ratio_test,POS_Coverage_test,regret_test,fold,pos_rate
0,ROOT,L1,,2799,1198,0.478742,0.484975,0.3,0.1,0.512521,...,True,val,,,,,,,1,0.4702
1,L1_age=young_old,L1,ROOT,1428,612,0.460784,0.47549,0.2,0.0,0.520425,...,True,val,,,,,,,1,0.482394
2,L1_age=young_old|L2_medical_specialty=Others,L2,L1_age=young_old,4378,1847,0.481498,0.489442,0.2,0.1,0.506497,...,True,val,,,,,,,1,0.483855
3,L1_age=mid_age,L1,ROOT,787,396,0.443456,0.441919,0.3,0.1,0.527778,...,True,val,,,,,,,1,0.439239
4,L1_age=oldest,L1,ROOT,579,300,0.483592,0.473333,0.2,0.0,0.52,...,True,val,,,,,,,1,0.48157


  plt.tight_layout()


【INFO】【2025-12-22 15:33:57】【步骤7摘要】桶级指标已整理，可用于局部化分析。


In [10]:
# 步骤8：结果汇总
log_info('【步骤8】检查结果文件与图表。')
print(os.listdir(os.path.join(root_path, cfg['OUTPUT']['results_dir'])))
print(os.listdir(os.path.join(root_path, cfg['OUTPUT']['figs_dir'])))
log_info('【全部步骤完成】Telco Churn 数据集上的 BT-TWD 实验结束。')

【INFO】【2025-12-22 15:33:57】【步骤8】检查结果文件与图表。
['baseline_bucket_metrics.csv', 'bucket_fallback_stats.csv', 'bucket_metrics.csv', 'bucket_metrics_gain.csv', 'bucket_metrics_gain_test_per_fold.csv', 'bucket_thresholds.csv', 'bucket_thresholds_per_fold.csv', 'bucket_tree_structure.csv', 'metrics_kfold_per_fold.csv', 'metrics_kfold_summary.csv', 'metrics_overview.csv', 'per_sample_test_predictions.csv']
['bank_class_distribution.png', 'bucket_metrics_bar.png', 'class_distribution.png', 'metrics_compare.png']
【INFO】【2025-12-22 15:33:57】【全部步骤完成】Telco Churn 数据集上的 BT-TWD 实验结束。
