diabetic 数据集 BT-TWD 可行性实验

本 notebook 按步骤运行：加载配置 → 读取数据 → 预处理 → 桶树划分 → 基线与 BTTWD k 折实验 → 桶级分析。

In [1]:
# 步骤0：环境与路径设置
import os, sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# 将项目根目录加入路径，便于导入 bttwdlib
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_path not in sys.path:
    sys.path.append(root_path)

from bttwdlib import (
    load_yaml_cfg,
    show_cfg,
    load_dataset,
    prepare_features_and_labels,
    BucketTree,
    run_kfold_experiments,
    log_info,
    set_global_seed,
)

cfg_path = Path(root_path) / "configs" / "diabetic_bttwd.yaml"
cfg = load_yaml_cfg(cfg_path)
set_global_seed(cfg.get('SEED', {}).get('global_seed', 42))
log_info('【步骤0摘要】环境准备完毕，路径与随机种子已设置。')

【INFO】【2025-12-12 19:23:59】【配置加载】已读取 e:\yan\组\三支决策\机器学习\BT_TWD\configs\diabetic_bttwd.yaml
【INFO】【2025-12-12 19:24:03】【步骤0摘要】环境准备完毕，路径与随机种子已设置。


In [2]:
# 步骤1：加载配置
show_cfg(cfg)
log_info('【步骤1摘要】配置文件加载完成，关键参数检查通过。')

【INFO】【2025-12-12 19:24:03】【配置-数据】数据集=diabetic, k折=5, 目标列=readmitted, 正类="<30"
【INFO】【2025-12-12 19:24:03】【配置-BTTWD】阈值模式=bucket_wise, 全局模型=xgb, 桶内模型=none, 后验估计器(兼容字段)=logreg
【INFO】【2025-12-12 19:24:03】【配置-基线】LogReg启用=True, RandomForest启用=True, KNN启用=True, XGBoost启用=True
【INFO】【2025-12-12 19:24:03】【步骤1摘要】配置文件加载完成，关键参数检查通过。


In [3]:
# 步骤2：加载原始数据
df_raw, target_col_model = load_dataset(cfg)  # 这里返回的是用于建模的标签列，例如 "label"

display(df_raw.head())
print("用于建模的标签列:", target_col_model)

# 1）画 0/1 标签（流失/未流失）的比例
class_counts = df_raw[target_col_model].value_counts(normalize=True)
ax = class_counts.plot(kind='bar', title='流失 vs 未流失比例')
plt.ylabel('比例')

fig_path = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'class_distribution.png')
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, bbox_inches='tight')
plt.close()

# 2）如果想看原始标签列的分布，可以另外单独分析：
raw_target_col = cfg['DATA']['target_col']  # 这里是原始标签列
print("原始目标列:", raw_target_col)
print(df_raw[raw_target_col].describe())

log_info('【步骤2摘要】Telco Churn 原始数据加载与基本统计完成。')


【INFO】【2025-12-12 19:24:04】【数据加载】文本表格 ..\data\diabetic\diabetic_data.csv 已读取，样本数=101766，列数=50
【INFO】【2025-12-12 19:24:04】【数据加载】糖尿病数据集已读取，标签已二值化，样本数=101766，正类比例=11.16%
【INFO】【2025-12-12 19:24:04】【数据集信息】名称=diabetic，样本数=101766，目标列=readmitted，正类比例=11.16%


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,0
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,0
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,0
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,0
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,0


用于建模的标签列: readmitted
原始目标列: readmitted
count    101766.000000
mean          0.111599
std           0.314874
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: readmitted, dtype: float64
【INFO】【2025-12-12 19:24:05】【步骤2摘要】Telco Churn 原始数据加载与基本统计完成。


In [4]:
# 步骤3：预处理与特征工程
X, y, meta = prepare_features_and_labels(df_raw, cfg)
log_info(f'【预处理】编码特征维度={X.shape[1]}，样本数={X.shape[0]}')
log_info(f"【步骤3摘要】特征预处理完成：连续={len(meta['continuous_cols'])}，类别={len(meta['categorical_cols'])}，编码维度={X.shape[1]}。")

【INFO】【2025-12-12 19:24:05】【预处理】连续特征=8个，类别特征=35个
【INFO】【2025-12-12 19:24:06】【预处理】编码后维度=215
【INFO】【2025-12-12 19:24:07】【预处理】编码特征维度=215，样本数=101766
【INFO】【2025-12-12 19:24:07】【步骤3摘要】特征预处理完成：连续=8，类别=35，编码维度=215。


In [5]:
# 步骤4：构建桶树并检查划分
feature_cols_for_bucket = [c for c in df_raw.columns if c != target_col_model]

bucket_tree = BucketTree(
    cfg['BTTWD']['bucket_levels'],
    feature_names=feature_cols_for_bucket
)

bucket_ids_full = bucket_tree.assign_buckets(df_raw[feature_cols_for_bucket])

bucket_df = bucket_ids_full.value_counts().reset_index()
bucket_df.columns = ['bucket_id', 'count']

bucket_df['pos_rate'] = (
    df_raw.groupby(bucket_ids_full)[target_col_model]
    .apply(lambda s: (s == 1).mean())
    .values
)
display(bucket_df.head())
bucket_df.set_index('bucket_id')['count'].plot(kind='bar', figsize=(12,4), title='桶样本数分布')
fig_bucket = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bucket_metrics_bar.png')
plt.savefig(fig_bucket, bbox_inches='tight')
plt.close()
log_info(f'【步骤4摘要】桶树划分完成，共有 {bucket_ids_full.nunique()} 个叶子桶。')


【INFO】【2025-12-12 19:24:07】【桶树】已为样本生成桶ID，共 113 个组合


Unnamed: 0,bucket_id,count,pos_rate
0,L1_age=age_60_80|L2_admission_type_id=Emergent...,17788,0.0
1,L1_age=age_60_80|L2_admission_type_id=Emergent...,13799,0.0
2,L1_age=age_40_60|L2_admission_type_id=Emergent...,11256,0.206823
3,L1_age=age_40_60|L2_admission_type_id=Emergent...,6950,0.288136
4,L1_age=age_ge_80|L2_admission_type_id=Emergent...,6378,0.110065


【INFO】【2025-12-12 19:24:10】【步骤4摘要】桶树划分完成，共有 113 个叶子桶。


In [6]:
# 步骤5：运行基线模型 k 折实验
# 基线部分在 run_kfold_experiments 内统一调度
log_info('【步骤5】基线模型将在整体交叉验证中一并运行。')
log_info('【步骤5摘要】基线模型性能将作为后续对比基准。')

【INFO】【2025-12-12 19:24:10】【步骤5】基线模型将在整体交叉验证中一并运行。
【INFO】【2025-12-12 19:24:10】【步骤5摘要】基线模型性能将作为后续对比基准。


In [7]:
import numpy as np
import pandas as pd

print("y 全局标签分布：", np.unique(y, return_counts=True))

print("原始标签列分布：")
print(df_raw[cfg['DATA']['target_col']].value_counts())


y 全局标签分布： (array([0, 1]), array([90409, 11357], dtype=int64))
原始标签列分布：
readmitted
0    90409
1    11357
Name: count, dtype: int64


In [8]:
# 步骤6：运行 BTTWD k 折实验（含基线）
results = run_kfold_experiments(X, y, df_raw.drop(columns=[cfg['DATA']['target_col']]), cfg)
summary_df = pd.read_csv(os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'metrics_kfold_summary.csv'))
display(summary_df)
summary_df.plot(x='model', kind='bar', figsize=(8,4), title='模型指标对比')
fig_compare = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'metrics_compare.png')
plt.savefig(fig_compare, bbox_inches='tight')
plt.close()
log_info('【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。')

【INFO】【2025-12-12 19:24:10】【基线-LogReg】使用决策阈值=0.400（fixed 模式）


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

【INFO】【2025-12-12 19:26:14】【基线-LogReg】整体指标：AUC_mean=0.669, AUC_std=0.007, BAC_mean=0.515, BAC_std=0.003, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.065, F1_std=0.010, Kappa_mean=0.048, Kappa_std=0.008, MCC_mean=0.095, MCC_std=0.013, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.420, Precision_std=0.037, Recall_mean=0.035, Recall_std=0.005, Regret_mean=0.328, Regret_std=0.002
【INFO】【2025-12-12 19:26:14】【基线-RF】使用决策阈值=0.400（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-12-12 19:28:13】【基线-RF】整体指标：AUC_mean=0.651, AUC_std=0.007, BAC_mean=0.514, BAC_std=0.002, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.062, F1_std=0.005, Kappa_mean=0.049, Kappa_std=0.005, MCC_mean=0.106, MCC_std=0.011, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.496, Precision_std=0.043, Recall_mean=0.033, Recall_std=0.003, Regret_mean=0.327, Regret_std=0.001
【INFO】【2025-12-12 19:28:13】【基线-KNN】使用决策阈值=0.400（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-12-12 19:28:52】【基线-KNN】整体指标：AUC_mean=0.568, AUC_std=0.009, BAC_mean=0.520, BAC_std=0.002, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.107, F1_std=0.006, Kappa_mean=0.059, Kappa_std=0.006, MCC_mean=0.072, MCC_std=0.007, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.235, Precision_std=0.012, Recall_mean=0.069, Recall_std=0.004, Regret_mean=0.337, Regret_std=0.002
【INFO】【2025-12-12 19:28:52】【基线-XGB】使用决策阈值=0.400（fixed 模式）


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-12-12 19:29:27】【基线-XGB】整体指标：AUC_mean=0.679, AUC_std=0.006, BAC_mean=0.518, BAC_std=0.002, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.075, F1_std=0.006, Kappa_mean=0.059, Kappa_std=0.005, MCC_mean=0.116, MCC_std=0.006, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.487, Precision_std=0.017, Recall_mean=0.041, Recall_std=0.003, Regret_mean=0.326, Regret_std=0.001
【INFO】【2025-12-12 19:29:27】【K折实验】正在执行第 1/5 折...
【INFO】【2025-12-12 19:29:27】[BT] 使用桶评分配置：mode=f1_regret_bnd, f1_weight=1.0, regret_weight=1.0, bnd_weight=0.5


Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-12 19:29:30】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-12 19:29:31] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=81412
[INFO][BT][2025-12-12 19:29:31] 创建桶 bucket_id=L1_age=age_40_60，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_40_60"，n_samples=21590
[INFO][BT][2025-12-12 19:29:31] 创建桶 bucket_id=L1_age=age_60_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_60_80"，n_samples=38746
[INFO][BT][2025-12-12 19:29:31] 创建桶 bucket_id=L1_age=age_ge_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_ge_80"，n_samples=16012
[INFO][BT][2025-12-12 19:29:31] 创建桶 bucket_id=L1_age=age_le_40，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_le_40"，n_samples=5064
[INFO][BT][2025-12-12 19:29:31] 创建桶 bucket_id=L1_age=age_40_60|L2_admission_type_id=EmergentUrgent，level=2，parent_id=L1_age=age_40_60，sp



【INFO】【2025-12-12 19:29:42】【阈值】桶 ROOT（n_val=4883）使用本地阈值 α=0.6000, β=0.4000
[INFO][BT][2025-12-12 19:29:42] 桶 bucket_id=ROOT level=0：
    n_train=11396, n_val=4883,
    BAC=0.502, F1=0.023, AUC=0.639,
    Regret=0.310, BND_ratio=0.008, POS_coverage=0.002,
    Score(f1_regret_bnd )=-0.291
【INFO】【2025-12-12 19:29:42】【阈值】桶 L1_age=age_40_60（n_val=1494）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:29:42] 桶 bucket_id=L1_age=age_40_60 level=1：
    n_train=3033, n_val=1494,
    BAC=0.511, F1=0.082, AUC=0.692,
    Regret=0.298, BND_ratio=0.019, POS_coverage=0.013,
    Score(f1_regret_bnd )=-0.225
[INFO][BT][2025-12-12 19:29:42] 桶 bucket_id=L1_age=age_40_60：
    parent_id=ROOT，parent_Score=-0.291, bucket_Score=-0.225,
    Gain=+0.065, is_weak=False
【INFO】【2025-12-12 19:29:42】【阈值】桶 L1_age=age_60_80（n_val=2327）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:29:42] 桶 bucket_id=L1_age=age_60_80 level=1：
    n_train=5424, n_val=2327,
    BAC=0.503, F1=0.056, AUC=0.660,
    Regret=0.334, BND_ra

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-12 19:30:02】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-12 19:30:03] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=81413
[INFO][BT][2025-12-12 19:30:03] 创建桶 bucket_id=L1_age=age_40_60，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_40_60"，n_samples=21510
[INFO][BT][2025-12-12 19:30:03] 创建桶 bucket_id=L1_age=age_60_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_60_80"，n_samples=38852
[INFO][BT][2025-12-12 19:30:03] 创建桶 bucket_id=L1_age=age_ge_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_ge_80"，n_samples=16070
[INFO][BT][2025-12-12 19:30:03] 创建桶 bucket_id=L1_age=age_le_40，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_le_40"，n_samples=4981
[INFO][BT][2025-12-12 19:30:03] 创建桶 bucket_id=L1_age=age_40_60|L2_admission_type_id=EmergentUrgent，level=2，parent_id=L1_age=age_40_60，sp



【INFO】【2025-12-12 19:30:15】【阈值】桶 ROOT（n_val=4884）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:30:15] 桶 bucket_id=ROOT level=0：
    n_train=11396, n_val=4884,
    BAC=0.510, F1=0.074, AUC=0.670,
    Regret=0.330, BND_ratio=0.020, POS_coverage=0.009,
    Score(f1_regret_bnd )=-0.266
【INFO】【2025-12-12 19:30:15】【阈值】桶 L1_age=age_40_60（n_val=1531）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:30:15] 桶 bucket_id=L1_age=age_40_60 level=1：
    n_train=3010, n_val=1531,
    BAC=0.509, F1=0.073, AUC=0.694,
    Regret=0.296, BND_ratio=0.022, POS_coverage=0.007,
    Score(f1_regret_bnd )=-0.234
[INFO][BT][2025-12-12 19:30:15] 桶 bucket_id=L1_age=age_40_60：
    parent_id=ROOT，parent_Score=-0.266, bucket_Score=-0.234,
    Gain=+0.032, is_weak=False
【INFO】【2025-12-12 19:30:15】【阈值】桶 L1_age=age_60_80 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-12 19:30:15] 桶 bucket_id=L1_age=age_60_80 level=1：
    n_train=5441, n_val=2331,
    BAC=0.501, F1=0.015, AUC=0.673,
    Regret=0.328, BND_ratio=0.005, POS

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-12 19:30:38】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-12 19:30:38] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=81413
[INFO][BT][2025-12-12 19:30:38] 创建桶 bucket_id=L1_age=age_40_60，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_40_60"，n_samples=21505
[INFO][BT][2025-12-12 19:30:38] 创建桶 bucket_id=L1_age=age_60_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_60_80"，n_samples=38982
[INFO][BT][2025-12-12 19:30:38] 创建桶 bucket_id=L1_age=age_ge_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_ge_80"，n_samples=15874
[INFO][BT][2025-12-12 19:30:38] 创建桶 bucket_id=L1_age=age_le_40，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_le_40"，n_samples=5052
[INFO][BT][2025-12-12 19:30:38] 创建桶 bucket_id=L1_age=age_40_60|L2_admission_type_id=EmergentUrgent，level=2，parent_id=L1_age=age_40_60，sp



【INFO】【2025-12-12 19:30:48】【阈值】桶 ROOT（n_val=4884）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:30:48] 桶 bucket_id=ROOT level=0：
    n_train=11396, n_val=4884,
    BAC=0.507, F1=0.070, AUC=0.669,
    Regret=0.317, BND_ratio=0.023, POS_coverage=0.009,
    Score(f1_regret_bnd )=-0.258
【INFO】【2025-12-12 19:30:48】【阈值】桶 L1_age=age_40_60（n_val=1523）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:30:48] 桶 bucket_id=L1_age=age_40_60 level=1：
    n_train=2992, n_val=1523,
    BAC=0.523, F1=0.114, AUC=0.671,
    Regret=0.297, BND_ratio=0.018, POS_coverage=0.011,
    Score(f1_regret_bnd )=-0.192
[INFO][BT][2025-12-12 19:30:48] 桶 bucket_id=L1_age=age_40_60：
    parent_id=ROOT，parent_Score=-0.258, bucket_Score=-0.192,
    Gain=+0.066, is_weak=False
【INFO】【2025-12-12 19:30:48】【阈值】桶 L1_age=age_60_80 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-12 19:30:48] 桶 bucket_id=L1_age=age_60_80 level=1：
    n_train=5476, n_val=2319,
    BAC=0.502, F1=0.021, AUC=0.679,
    Regret=0.359, BND_ratio=0.005, POS

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-12 19:31:10】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-12 19:31:11] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=81413
[INFO][BT][2025-12-12 19:31:11] 创建桶 bucket_id=L1_age=age_40_60，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_40_60"，n_samples=21483
[INFO][BT][2025-12-12 19:31:11] 创建桶 bucket_id=L1_age=age_60_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_60_80"，n_samples=38790
[INFO][BT][2025-12-12 19:31:11] 创建桶 bucket_id=L1_age=age_ge_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_ge_80"，n_samples=16117
[INFO][BT][2025-12-12 19:31:11] 创建桶 bucket_id=L1_age=age_le_40，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_le_40"，n_samples=5023
[INFO][BT][2025-12-12 19:31:11] 创建桶 bucket_id=L1_age=age_40_60|L2_admission_type_id=EmergentUrgent，level=2，parent_id=L1_age=age_40_60，sp



【INFO】【2025-12-12 19:31:21】【阈值】桶 ROOT（n_val=4884）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:31:21] 桶 bucket_id=ROOT level=0：
    n_train=11396, n_val=4884,
    BAC=0.517, F1=0.098, AUC=0.685,
    Regret=0.306, BND_ratio=0.019, POS_coverage=0.010,
    Score(f1_regret_bnd )=-0.217
【INFO】【2025-12-12 19:31:21】【阈值】桶 L1_age=age_40_60 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-12 19:31:21] 桶 bucket_id=L1_age=age_40_60 level=1：
    n_train=3019, n_val=1489,
    BAC=0.514, F1=0.071, AUC=0.679,
    Regret=0.305, BND_ratio=0.010, POS_coverage=0.008,
    Score(f1_regret_bnd )=-0.239
[INFO][BT][2025-12-12 19:31:21] 桶 bucket_id=L1_age=age_40_60：
    parent_id=ROOT，parent_Score=-0.217, bucket_Score=-0.239,
    Gain=-0.021, is_weak=True
【INFO】【2025-12-12 19:31:21】【阈值】桶 L1_age=age_60_80 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-12 19:31:21] 桶 bucket_id=L1_age=age_60_80 level=1：
    n_train=5421, n_val=2336,
    BAC=0.505, F1=0.064, AUC=0.659,
    Regret=0.324, BND_ratio=0.023, POS_coverage=0.011

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-12 19:31:43】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-12 19:31:44] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=81413
[INFO][BT][2025-12-12 19:31:44] 创建桶 bucket_id=L1_age=age_40_60，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_40_60"，n_samples=21676
[INFO][BT][2025-12-12 19:31:44] 创建桶 bucket_id=L1_age=age_60_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_60_80"，n_samples=38834
[INFO][BT][2025-12-12 19:31:44] 创建桶 bucket_id=L1_age=age_ge_80，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_ge_80"，n_samples=15887
[INFO][BT][2025-12-12 19:31:44] 创建桶 bucket_id=L1_age=age_le_40，level=1，parent_id=ROOT，split_name=L1_age，split_type=category_group，split_rule="age_le_40"，n_samples=5016
[INFO][BT][2025-12-12 19:31:44] 创建桶 bucket_id=L1_age=age_40_60|L2_admission_type_id=EmergentUrgent，level=2，parent_id=L1_age=age_40_60，sp



【INFO】【2025-12-12 19:31:54】【阈值】桶 ROOT（n_val=4884）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:31:54] 桶 bucket_id=ROOT level=0：
    n_train=11396, n_val=4884,
    BAC=0.509, F1=0.067, AUC=0.683,
    Regret=0.329, BND_ratio=0.016, POS_coverage=0.010,
    Score(f1_regret_bnd )=-0.269
【INFO】【2025-12-12 19:31:54】【阈值】桶 L1_age=age_40_60（n_val=1496）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:31:54] 桶 bucket_id=L1_age=age_40_60 level=1：
    n_train=3038, n_val=1496,
    BAC=0.515, F1=0.094, AUC=0.718,
    Regret=0.298, BND_ratio=0.019, POS_coverage=0.012,
    Score(f1_regret_bnd )=-0.214
[INFO][BT][2025-12-12 19:31:54] 桶 bucket_id=L1_age=age_40_60：
    parent_id=ROOT，parent_Score=-0.269, bucket_Score=-0.214,
    Gain=+0.056, is_weak=False
【INFO】【2025-12-12 19:31:54】【阈值】桶 L1_age=age_60_80（n_val=2319）使用本地阈值 α=0.4000, β=0.3000
[INFO][BT][2025-12-12 19:31:54] 桶 bucket_id=L1_age=age_60_80 level=1：
    n_train=5451, n_val=2319,
    BAC=0.516, F1=0.094, AUC=0.662,
    Regret=0.312, BND_ra

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-12 19:32:16】[BASELINE] 全局 XGB 模型训练完成
【INFO】【2025-12-12 19:32:16】[BASELINE] 阈值搜索开始
【INFO】【2025-12-12 19:32:17】[BASELINE] 最佳阈值找到: alpha=0.4000, beta=0.3000, regret=0.3290
【INFO】【2025-12-12 19:32:17】【桶树】已为样本生成桶ID，共 107 个组合
【INFO】【2025-12-12 19:32:17】[BASELINE] 测试集桶映射完成，共 107 个桶


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expecte

【INFO】【2025-12-12 19:32:23】[BASELINE] 桶 L1_age=age_ge_80|L2_admission_type_id=EmergentUrgent|L3_time_in_hospital=mid_stay|L4_number_inpatient=no_prev_inpt: BAC=0.5049, Regret=0.3943, Precision=0.5000, Recall=0.0116
【INFO】【2025-12-12 19:32:23】[BASELINE] 桶 L1_age=age_60_80|L2_admission_type_id=EmergentUrgent|L3_time_in_hospital=mid_stay|L4_number_inpatient=no_prev_inpt: BAC=0.5063, Regret=0.3188, Precision=0.6667, Recall=0.0134
【INFO】【2025-12-12 19:32:23】[BASELINE] 桶 L1_age=age_60_80|L2_admission_type_id=EmergentUrgent|L3_time_in_hospital=short_stay|L4_number_inpatient=no_prev_inpt: BAC=0.5118, Regret=0.2683, Precision=0.7273, Recall=0.0245
【INFO】【2025-12-12 19:32:23】[BASELINE] 桶 L1_age=age_40_60|L2_admission_type_id=EmergentUrgent|L3_time_in_hospital=mid_stay|L4_number_inpatient=no_prev_inpt: BAC=0.5099, Regret=0.2129, Precision=1.0000, Recall=0.0198
【INFO】【2025-12-12 19:32:23】[BASELINE] 桶 L1_age=age_60_80|L2_admission_type_id=OtherType|L3_time_in_hospital=long_stay|L4_number_inpatient=

Unnamed: 0,model,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,BAC_mean,BAC_std,AUC_mean,...,MCC_mean,MCC_std,Kappa_mean,Kappa_std,BND_ratio_mean,BND_ratio_std,POS_Coverage_mean,POS_Coverage_std,Regret_mean,Regret_std
0,BTTWD,0.329824,0.022082,0.151272,0.013409,0.207234,0.015529,0.556313,0.006285,0.676102,...,0.160875,0.016216,0.147444,0.015364,0.020577,0.001815,0.009394,0.001272,0.327064,0.001477
1,LogReg,0.420439,0.037458,0.035132,0.005478,0.064768,0.009518,0.514524,0.002533,0.669471,...,0.094969,0.012731,0.04842,0.008141,0.0,0.0,,,0.32844,0.001571
2,RandomForest,0.496365,0.043043,0.033019,0.002823,0.061904,0.005219,0.514397,0.001502,0.651329,...,0.105596,0.01097,0.048639,0.005056,0.0,0.0,,,0.327496,0.001148
3,KNN,0.234976,0.012184,0.069385,0.004072,0.107093,0.005736,0.52049,0.001964,0.567914,...,0.072288,0.006816,0.059216,0.005598,0.0,0.0,,,0.336802,0.001686
4,XGBoost,0.486811,0.017426,0.040768,0.003424,0.075193,0.00581,0.517679,0.001514,0.678575,...,0.115619,0.006082,0.058963,0.004755,0.0,0.0,,,0.325954,0.000847


【INFO】【2025-12-12 19:32:24】【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。


In [9]:
# 步骤7：桶级别分析
bucket_metrics_path = os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'bucket_metrics.csv')
if os.path.exists(bucket_metrics_path):
    bucket_metrics_df = pd.read_csv(bucket_metrics_path)
    display(bucket_metrics_df.head())
    bucket_metrics_df.plot(x='bucket_id', y='pos_rate_all', kind='bar', figsize=(12,4), title='桶正类比例')
    plt.ylabel('正类比例')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(fig_bucket, bbox_inches='tight')
    plt.close()
log_info('【步骤7摘要】桶级指标已整理，可用于局部化分析。')

Unnamed: 0,bucket_id,layer,parent_bucket_id,n_train,n_val,pos_rate_train,pos_rate_val,alpha,beta,regret_val,...,is_weak,threshold_source_bucket,parent_with_threshold,n_test,pos_rate_test,BND_ratio_test,POS_Coverage_test,regret_test,fold,pos_rate
0,ROOT,L1,,11396,4883,0.111794,0.104034,0.6,0.4,0.310055,...,False,ROOT,,,,,,,1,0.111593
1,L1_age=age_60_80,L1,ROOT,5424,2327,0.116335,0.113021,0.4,0.3,0.333691,...,False,L1_age=age_60_80,,,,,,,1,0.115186
2,L1_age=age_60_80|L2_admission_type_id=Emergent...,L2,L1_age=age_60_80,4862,2087,0.114973,0.125539,0.4,0.3,0.35817,...,False,L1_age=age_60_80|L2_admission_type_id=Emergent...,,,,,,,1,0.11622
3,L1_age=age_40_60,L1,ROOT,3033,1494,0.108144,0.100402,0.4,0.3,0.298193,...,False,L1_age=age_40_60,,,,,,,1,0.099398
4,L1_age=age_40_60|L2_admission_type_id=Emergent...,L2,L1_age=age_40_60,2719,1155,0.097095,0.090043,0.4,0.3,0.267532,...,True,L1_age=age_40_60,L1_age=age_40_60,,,,,,1,0.099721


  plt.tight_layout()


【INFO】【2025-12-12 19:32:37】【步骤7摘要】桶级指标已整理，可用于局部化分析。


In [10]:
# 步骤8：结果汇总
log_info('【步骤8】检查结果文件与图表。')
print(os.listdir(os.path.join(root_path, cfg['OUTPUT']['results_dir'])))
print(os.listdir(os.path.join(root_path, cfg['OUTPUT']['figs_dir'])))
log_info('【全部步骤完成】Telco Churn 数据集上的 BT-TWD 实验结束。')

【INFO】【2025-12-12 19:32:37】【步骤8】检查结果文件与图表。
['baseline_bucket_metrics.csv', 'bucket_fallback_stats.csv', 'bucket_metrics.csv', 'bucket_metrics_gain.csv', 'bucket_thresholds.csv', 'bucket_thresholds_per_fold.csv', 'bucket_tree_structure.csv', 'metrics_kfold_per_fold.csv', 'metrics_kfold_summary.csv', 'metrics_overview.csv']
['bank_class_distribution.png', 'bucket_metrics_bar.png', 'class_distribution.png', 'metrics_compare.png']
【INFO】【2025-12-12 19:32:37】【全部步骤完成】Telco Churn 数据集上的 BT-TWD 实验结束。
