Telco Churn 数据集 BT-TWD 可行性实验

本 notebook 按步骤运行：加载配置 → 读取数据 → 预处理 → 桶树划分 → 基线与 BTTWD k 折实验 → 桶级分析。

In [1]:
# 步骤0：环境与路径设置
import os, sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# 将项目根目录加入路径，便于导入 bttwdlib
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_path not in sys.path:
    sys.path.append(root_path)

from bttwdlib import (
    load_yaml_cfg,
    show_cfg,
    load_dataset,
    prepare_features_and_labels,
    BucketTree,
    run_kfold_experiments,
    log_info,
    set_global_seed,
)

cfg_path = Path(root_path) / "configs" / "telco_churn.yaml"
cfg = load_yaml_cfg(cfg_path)
set_global_seed(cfg.get('SEED', {}).get('global_seed', 42))
log_info('【步骤0摘要】环境准备完毕，路径与随机种子已设置。')

【INFO】【2025-12-11 19:36:45】【配置加载】已读取 e:\yan\组\三支决策\机器学习\BT_TWD\configs\telco_churn.yaml
【INFO】【2025-12-11 19:36:49】【步骤0摘要】环境准备完毕，路径与随机种子已设置。


In [2]:
# 步骤1：加载配置
show_cfg(cfg)
log_info('【步骤1摘要】配置文件加载完成，关键参数检查通过。')

【INFO】【2025-12-11 19:36:49】【配置-数据】数据集=telco_churn, k折=5, 目标列=Churn, 正类="Yes"
【INFO】【2025-12-11 19:36:49】【配置-BTTWD】阈值模式=None, 全局模型=xgb, 桶内模型=none, 后验估计器(兼容字段)=logreg
【INFO】【2025-12-11 19:36:49】【配置-基线】LogReg启用=True, RandomForest启用=True, KNN启用=True, XGBoost启用=True
【INFO】【2025-12-11 19:36:49】【步骤1摘要】配置文件加载完成，关键参数检查通过。


In [3]:
# 步骤2：加载原始数据
df_raw, target_col_model = load_dataset(cfg)  # 这里返回的是用于建模的标签列，例如 "label"

display(df_raw.head())
print("用于建模的标签列:", target_col_model)

# 1）画 0/1 标签（流失/未流失）的比例
class_counts = df_raw[target_col_model].value_counts(normalize=True)
ax = class_counts.plot(kind='bar', title='流失 vs 未流失比例')
plt.ylabel('比例')

fig_path = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'class_distribution.png')
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, bbox_inches='tight')
plt.close()

# 2）如果想看原始标签列的分布，可以另外单独分析：
raw_target_col = cfg['DATA']['target_col']  # 这里是原始标签列
print("原始目标列:", raw_target_col)
print(df_raw[raw_target_col].describe())

log_info('【步骤2摘要】Telco Churn 原始数据加载与基本统计完成。')


【INFO】【2025-12-11 19:36:49】【数据加载】文本表格 ..\data\Telco-Customer-Churn\Telco-Customer-Churn.csv 已读取，样本数=7043，列数=21
【INFO】【2025-12-11 19:36:49】【数据集信息】名称=telco_churn，样本数=7043，目标列=Churn，正类比例=26.54%


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


用于建模的标签列: Churn
原始目标列: Churn
count    7043.000000
mean        0.265370
std         0.441561
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Churn, dtype: float64
【INFO】【2025-12-11 19:36:49】【步骤2摘要】Telco Churn 原始数据加载与基本统计完成。


In [4]:
# 步骤3：预处理与特征工程
X, y, meta = prepare_features_and_labels(df_raw, cfg)
log_info(f'【预处理】编码特征维度={X.shape[1]}，样本数={X.shape[0]}')
log_info(f"【步骤3摘要】特征预处理完成：连续={len(meta['continuous_cols'])}，类别={len(meta['categorical_cols'])}，编码维度={X.shape[1]}。")

【INFO】【2025-12-11 19:36:49】【预处理】缺失值填充策略=most_frequent
【INFO】【2025-12-11 19:36:49】【预处理】连续特征=3个，类别特征=16个
【INFO】【2025-12-11 19:36:49】【预处理】编码后维度=30
【INFO】【2025-12-11 19:36:49】【预处理】编码特征维度=30，样本数=7043
【INFO】【2025-12-11 19:36:49】【步骤3摘要】特征预处理完成：连续=3，类别=16，编码维度=30。


In [5]:
# 步骤4：构建桶树并检查划分
feature_cols_for_bucket = [c for c in df_raw.columns if c != target_col_model]

bucket_tree = BucketTree(
    cfg['BTTWD']['bucket_levels'],
    feature_names=feature_cols_for_bucket
)

bucket_ids_full = bucket_tree.assign_buckets(df_raw[feature_cols_for_bucket])

bucket_df = bucket_ids_full.value_counts().reset_index()
bucket_df.columns = ['bucket_id', 'count']

bucket_df['pos_rate'] = (
    df_raw.groupby(bucket_ids_full)[target_col_model]
    .apply(lambda s: (s == 1).mean())
    .values
)
display(bucket_df.head())
bucket_df.set_index('bucket_id')['count'].plot(kind='bar', figsize=(12,4), title='桶样本数分布')
fig_bucket = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bucket_metrics_bar.png')
plt.savefig(fig_bucket, bbox_inches='tight')
plt.close()
log_info(f'【步骤4摘要】桶树划分完成，共有 {bucket_ids_full.nunique()} 个叶子桶。')


【INFO】【2025-12-11 19:36:50】【桶树】已为样本生成桶ID，共 35 个组合


Unnamed: 0,bucket_id,count,pos_rate
0,L1_Contract=month_to_month|L2_tenure=new|L3_In...,916,1.0
1,L1_Contract=month_to_month|L2_tenure=mid_term|...,695,0.0
2,L1_Contract=month_to_month|L2_tenure=new|L3_In...,690,0.25
3,L1_Contract=month_to_month|L2_tenure=short_ter...,425,0.036364
4,L1_Contract=two_year|L2_tenure=long_term|L3_In...,402,0.160839


【INFO】【2025-12-11 19:36:50】【步骤4摘要】桶树划分完成，共有 35 个叶子桶。


In [6]:
# 步骤5：运行基线模型 k 折实验
# 基线部分在 run_kfold_experiments 内统一调度
log_info('【步骤5】基线模型将在整体交叉验证中一并运行。')
log_info('【步骤5摘要】基线模型性能将作为后续对比基准。')

【INFO】【2025-12-11 19:36:50】【步骤5】基线模型将在整体交叉验证中一并运行。
【INFO】【2025-12-11 19:36:50】【步骤5摘要】基线模型性能将作为后续对比基准。


In [7]:
import numpy as np
import pandas as pd

print("y 全局标签分布：", np.unique(y, return_counts=True))

print("原始标签列分布：")
print(df_raw[cfg['DATA']['target_col']].value_counts())


y 全局标签分布： (array([0, 1]), array([5174, 1869], dtype=int64))
原始标签列分布：
Churn
0    5174
1    1869
Name: count, dtype: int64


In [8]:
# 步骤6：运行 BTTWD k 折实验（含基线）
results = run_kfold_experiments(X, y, df_raw.drop(columns=[cfg['DATA']['target_col']]), cfg)
summary_df = pd.read_csv(os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'metrics_kfold_summary.csv'))
display(summary_df)
summary_df.plot(x='model', kind='bar', figsize=(8,4), title='模型指标对比')
fig_compare = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'metrics_compare.png')
plt.savefig(fig_compare, bbox_inches='tight')
plt.close()
log_info('【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。')

【INFO】【2025-12-11 19:36:51】【基线-LogReg】使用决策阈值=0.200（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-12-11 19:36:51】【基线-LogReg】整体指标：AUC_mean=0.845, AUC_std=0.013, BAC_mean=0.756, BAC_std=0.013, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.610, F1_std=0.014, Kappa_mean=0.405, Kappa_std=0.022, MCC_mean=0.453, MCC_std=0.022, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.471, Precision_std=0.013, Recall_mean=0.863, Recall_std=0.015, Regret_mean=0.439, Regret_std=0.027
【INFO】【2025-12-11 19:36:51】【基线-RF】使用决策阈值=0.200（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-12-11 19:36:54】【基线-RF】整体指标：AUC_mean=0.825, AUC_std=0.013, BAC_mean=0.743, BAC_std=0.008, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.596, F1_std=0.009, Kappa_mean=0.385, Kappa_std=0.015, MCC_mean=0.429, MCC_std=0.015, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.462, Precision_std=0.009, Recall_mean=0.839, Recall_std=0.012, Regret_mean=0.473, Regret_std=0.018
【INFO】【2025-12-11 19:36:54】【基线-KNN】使用决策阈值=0.200（fixed 模式）


  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-11 19:36:56】【基线-KNN】整体指标：AUC_mean=0.790, AUC_std=0.018, BAC_mean=0.726, BAC_std=0.011, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.580, F1_std=0.012, Kappa_mean=0.367, Kappa_std=0.018, MCC_mean=0.401, MCC_std=0.019, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.457, Precision_std=0.010, Recall_mean=0.792, Recall_std=0.021, Regret_mean=0.526, Regret_std=0.027
【INFO】【2025-12-11 19:36:56】【基线-XGB】使用决策阈值=0.200（fixed 模式）


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-11 19:37:13】【基线-XGB】整体指标：AUC_mean=0.835, AUC_std=0.009, BAC_mean=0.753, BAC_std=0.008, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.609, F1_std=0.009, Kappa_mean=0.412, Kappa_std=0.015, MCC_mean=0.449, MCC_std=0.015, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.482, Precision_std=0.010, Recall_mean=0.829, Recall_std=0.010, Regret_mean=0.464, Regret_std=0.017
【INFO】【2025-12-11 19:37:13】【K折实验】正在执行第 1/5 折...
【INFO】【2025-12-11 19:37:13】[BT] 使用桶评分配置：mode=f1_regret_bnd, f1_weight=1.0, regret_weight=1.0, bnd_weight=0.5
【INFO】【2025-12-11 19:37:16】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-11 19:37:17] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=5634
[INFO][BT][2025-12-11 19:37:17] 创建桶 bucket_id=L1_Contract=month_to_month，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="month_to_month"，n_samples=3108
[INFO][BT][2025-12-11 19:37:17] 创建桶 bucket_id=L1_Contract=one_y



【INFO】【2025-12-11 19:37:18】【阈值】桶 ROOT（n_val=337）使用本地阈值 α=0.4000, β=0.2000
[INFO][BT][2025-12-11 19:37:18] 桶 bucket_id=ROOT level=0：
    n_train=788, n_val=337,
    BAC=0.659, F1=0.568, AUC=0.822,
    Regret=0.441, BND_ratio=0.151, POS_coverage=0.303,
    Score(f1_regret_bnd )=0.052
【INFO】【2025-12-11 19:37:18】【阈值】桶 L1_Contract=month_to_month 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:37:18] 桶 bucket_id=L1_Contract=month_to_month level=1：
    n_train=434, n_val=198,
    BAC=0.448, F1=0.619, AUC=0.728,
    Regret=0.561, BND_ratio=0.263, POS_coverage=0.737,
    Score(f1_regret_bnd )=-0.073
[INFO][BT][2025-12-11 19:37:18] 桶 bucket_id=L1_Contract=month_to_month：
    parent_id=ROOT，parent_Score=0.052, bucket_Score=-0.073,
    Gain=-0.125, is_weak=True
【INFO】【2025-12-11 19:37:19】【阈值】桶 L1_Contract=one_year 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:37:19] 桶 bucket_id=L1_Contract=one_year level=1：
    n_train=165, n_val=126,
    BAC=0.345, F1=0.000, AUC=0.653,
    Regret=0.520, BND_rat

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-11 19:37:26】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-11 19:37:27] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=5634
[INFO][BT][2025-12-11 19:37:27] 创建桶 bucket_id=L1_Contract=month_to_month，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="month_to_month"，n_samples=3090
[INFO][BT][2025-12-11 19:37:27] 创建桶 bucket_id=L1_Contract=one_year，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="one_year"，n_samples=1160
[INFO][BT][2025-12-11 19:37:27] 创建桶 bucket_id=L1_Contract=two_year，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="two_year"，n_samples=1384
[INFO][BT][2025-12-11 19:37:27] 创建桶 bucket_id=L1_Contract=month_to_month|L2_tenure=long_term，level=2，parent_id=L1_Contract=month_to_month，split_name=L2_tenure，split_type=numeric_bin，split_rule="long_term"，n_samples=88
[INFO][BT][2025-12-11 19:37:27] 创建桶 bucket_id



【INFO】【2025-12-11 19:37:28】【阈值】桶 ROOT（n_val=337）使用本地阈值 α=0.3000, β=0.1000
[INFO][BT][2025-12-11 19:37:28] 桶 bucket_id=ROOT level=0：
    n_train=788, n_val=337,
    BAC=0.646, F1=0.667, AUC=0.832,
    Regret=0.488, BND_ratio=0.211, POS_coverage=0.356,
    Score(f1_regret_bnd )=0.073
【INFO】【2025-12-11 19:37:28】【阈值】桶 L1_Contract=month_to_month 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:37:28] 桶 bucket_id=L1_Contract=month_to_month level=1：
    n_train=427, n_val=207,
    BAC=0.449, F1=0.602, AUC=0.722,
    Regret=0.575, BND_ratio=0.242, POS_coverage=0.758,
    Score(f1_regret_bnd )=-0.094
[INFO][BT][2025-12-11 19:37:28] 桶 bucket_id=L1_Contract=month_to_month：
    parent_id=ROOT，parent_Score=0.073, bucket_Score=-0.094,
    Gain=-0.167, is_weak=True
【INFO】【2025-12-11 19:37:28】【阈值】桶 L1_Contract=one_year 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:37:28] 桶 bucket_id=L1_Contract=one_year level=1：
    n_train=165, n_val=123,
    BAC=0.500, F1=0.000, AUC=0.739,
    Regret=0.244, BND_rat

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-11 19:37:40】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-11 19:37:40] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=5634
[INFO][BT][2025-12-11 19:37:40] 创建桶 bucket_id=L1_Contract=month_to_month，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="month_to_month"，n_samples=3105
[INFO][BT][2025-12-11 19:37:40] 创建桶 bucket_id=L1_Contract=one_year，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="one_year"，n_samples=1188
[INFO][BT][2025-12-11 19:37:40] 创建桶 bucket_id=L1_Contract=two_year，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="two_year"，n_samples=1341
[INFO][BT][2025-12-11 19:37:40] 创建桶 bucket_id=L1_Contract=month_to_month|L2_tenure=long_term，level=2，parent_id=L1_Contract=month_to_month，split_name=L2_tenure，split_type=numeric_bin，split_rule="long_term"，n_samples=86
[INFO][BT][2025-12-11 19:37:40] 创建桶 bucket_id



【INFO】【2025-12-11 19:37:41】【阈值】桶 ROOT（n_val=338）使用本地阈值 α=0.2000, β=0.1000
[INFO][BT][2025-12-11 19:37:41] 桶 bucket_id=ROOT level=0：
    n_train=788, n_val=338,
    BAC=0.638, F1=0.586, AUC=0.793,
    Regret=0.516, BND_ratio=0.157, POS_coverage=0.414,
    Score(f1_regret_bnd )=-0.008
【INFO】【2025-12-11 19:37:42】【阈值】桶 L1_Contract=month_to_month 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:37:42] 桶 bucket_id=L1_Contract=month_to_month level=1：
    n_train=436, n_val=199,
    BAC=0.434, F1=0.617, AUC=0.744,
    Regret=0.565, BND_ratio=0.307, POS_coverage=0.693,
    Score(f1_regret_bnd )=-0.102
[INFO][BT][2025-12-11 19:37:42] 桶 bucket_id=L1_Contract=month_to_month：
    parent_id=ROOT，parent_Score=-0.008, bucket_Score=-0.102,
    Gain=-0.093, is_weak=True
【INFO】【2025-12-11 19:37:42】【阈值】桶 L1_Contract=one_year 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:37:42] 桶 bucket_id=L1_Contract=one_year level=1：
    n_train=160, n_val=143,
    BAC=0.389, F1=0.125, AUC=0.665,
    Regret=0.364, BND_r

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-11 19:37:51】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-11 19:37:51] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=5635
[INFO][BT][2025-12-11 19:37:51] 创建桶 bucket_id=L1_Contract=month_to_month，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="month_to_month"，n_samples=3113
[INFO][BT][2025-12-11 19:37:51] 创建桶 bucket_id=L1_Contract=one_year，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="one_year"，n_samples=1183
[INFO][BT][2025-12-11 19:37:51] 创建桶 bucket_id=L1_Contract=two_year，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="two_year"，n_samples=1339
[INFO][BT][2025-12-11 19:37:51] 创建桶 bucket_id=L1_Contract=month_to_month|L2_tenure=long_term，level=2，parent_id=L1_Contract=month_to_month，split_name=L2_tenure，split_type=numeric_bin，split_rule="long_term"，n_samples=83
[INFO][BT][2025-12-11 19:37:51] 创建桶 bucket_id



【INFO】【2025-12-11 19:37:52】【阈值】桶 ROOT（n_val=336）使用本地阈值 α=0.2000, β=0.1000
[INFO][BT][2025-12-11 19:37:52] 桶 bucket_id=ROOT level=0：
    n_train=788, n_val=336,
    BAC=0.701, F1=0.573, AUC=0.822,
    Regret=0.399, BND_ratio=0.107, POS_coverage=0.464,
    Score(f1_regret_bnd )=0.120
【INFO】【2025-12-11 19:37:52】【阈值】桶 L1_Contract=month_to_month 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:37:52] 桶 bucket_id=L1_Contract=month_to_month level=1：
    n_train=477, n_val=209,
    BAC=0.438, F1=0.609, AUC=0.715,
    Regret=0.572, BND_ratio=0.282, POS_coverage=0.718,
    Score(f1_regret_bnd )=-0.104
[INFO][BT][2025-12-11 19:37:52] 桶 bucket_id=L1_Contract=month_to_month：
    parent_id=ROOT，parent_Score=0.120, bucket_Score=-0.104,
    Gain=-0.224, is_weak=True
【INFO】【2025-12-11 19:37:53】【阈值】桶 L1_Contract=one_year 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:37:53] 桶 bucket_id=L1_Contract=one_year level=1：
    n_train=158, n_val=138,
    BAC=0.414, F1=0.133, AUC=0.708,
    Regret=0.399, BND_rat

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-11 19:38:02】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-12-11 19:38:02] 创建桶 bucket_id=ROOT，level=0，parent_id=ROOT，split_name=ROOT，split_type=ROOT，split_rule="all"，n_samples=5635
[INFO][BT][2025-12-11 19:38:02] 创建桶 bucket_id=L1_Contract=month_to_month，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="month_to_month"，n_samples=3084
[INFO][BT][2025-12-11 19:38:02] 创建桶 bucket_id=L1_Contract=one_year，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="one_year"，n_samples=1179
[INFO][BT][2025-12-11 19:38:02] 创建桶 bucket_id=L1_Contract=two_year，level=1，parent_id=ROOT，split_name=L1_Contract，split_type=categorical_group，split_rule="two_year"，n_samples=1372
[INFO][BT][2025-12-11 19:38:02] 创建桶 bucket_id=L1_Contract=month_to_month|L2_tenure=long_term，level=2，parent_id=L1_Contract=month_to_month，split_name=L2_tenure，split_type=numeric_bin，split_rule="long_term"，n_samples=84
[INFO][BT][2025-12-11 19:38:02] 创建桶 bucket_id



【INFO】【2025-12-11 19:38:03】【阈值】桶 ROOT（n_val=337）使用本地阈值 α=0.2000, β=0.1000
[INFO][BT][2025-12-11 19:38:03] 桶 bucket_id=ROOT level=0：
    n_train=788, n_val=337,
    BAC=0.674, F1=0.599, AUC=0.823,
    Regret=0.466, BND_ratio=0.107, POS_coverage=0.481,
    Score(f1_regret_bnd )=0.080
【INFO】【2025-12-11 19:38:03】【阈值】桶 L1_Contract=month_to_month 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:38:03] 桶 bucket_id=L1_Contract=month_to_month level=1：
    n_train=476, n_val=206,
    BAC=0.416, F1=0.688, AUC=0.749,
    Regret=0.519, BND_ratio=0.388, POS_coverage=0.612,
    Score(f1_regret_bnd )=-0.025
[INFO][BT][2025-12-11 19:38:03] 桶 bucket_id=L1_Contract=month_to_month：
    parent_id=ROOT，parent_Score=0.080, bucket_Score=-0.025,
    Gain=-0.105, is_weak=True
【INFO】【2025-12-11 19:38:04】【阈值】桶 L1_Contract=one_year 标记为弱桶，阈值将回退使用 ROOT 的阈值
[INFO][BT][2025-12-11 19:38:04] 桶 bucket_id=L1_Contract=one_year level=1：
    n_train=162, n_val=130,
    BAC=0.494, F1=0.300, AUC=0.789,
    Regret=0.354, BND_rat

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-12-11 19:38:12】[BASELINE] 全局 XGB 模型训练完成
【INFO】【2025-12-11 19:38:12】[BASELINE] 阈值搜索开始
【INFO】【2025-12-11 19:38:13】[BASELINE] 最佳阈值找到: alpha=0.2000, beta=0.1000, regret=0.4382
【INFO】【2025-12-11 19:38:13】【桶树】已为样本生成桶ID，共 33 个组合
【INFO】【2025-12-11 19:38:13】[BASELINE] 测试集桶映射完成，共 33 个桶


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expecte

【INFO】【2025-12-11 19:38:14】[BASELINE] 桶 L1_Contract=two_year|L2_tenure=long_term|L3_InternetService=fiber: BAC=0.4919, Regret=0.3712, Precision=0.0000, Recall=0.0000
【INFO】【2025-12-11 19:38:14】[BASELINE] 桶 L1_Contract=month_to_month|L2_tenure=new|L3_InternetService=fiber: BAC=0.5039, Regret=0.3441, Precision=0.6630, Recall=0.9919
【INFO】【2025-12-11 19:38:14】[BASELINE] 桶 L1_Contract=one_year|L2_tenure=mid_term|L3_InternetService=dsl: BAC=0.5000, Regret=0.4067, Precision=0.0000, Recall=0.0000
【INFO】【2025-12-11 19:38:14】[BASELINE] 桶 L1_Contract=month_to_month|L2_tenure=short_term|L3_InternetService=fiber: BAC=0.5100, Regret=0.5160, Precision=0.4945, Recall=0.9783
【INFO】【2025-12-11 19:38:14】[BASELINE] 桶 L1_Contract=two_year|L2_tenure=long_term|L3_InternetService=dsl: BAC=0.5000, Regret=0.0704, Precision=0.0000, Recall=0.0000
【INFO】【2025-12-11 19:38:14】[BASELINE] 桶 L1_Contract=month_to_month|L2_tenure=short_term|L3_InternetService=OTHER: BAC=0.9231, Regret=0.0769, Precision=0.0000, Recall=0.

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Unnamed: 0,model,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,BAC_mean,BAC_std,AUC_mean,...,MCC_mean,MCC_std,Kappa_mean,Kappa_std,BND_ratio_mean,BND_ratio_std,POS_Coverage_mean,POS_Coverage_std,Regret_mean,Regret_std
0,BTTWD,0.468779,0.008144,0.846979,0.011842,0.603445,0.006118,0.750027,0.005369,0.830345,...,0.442064,0.009531,0.397625,0.011432,0.147382,0.026182,0.400539,0.015658,0.437177,0.019105
1,LogReg,0.471242,0.012526,0.863016,0.015203,0.609571,0.013644,0.756497,0.012515,0.845129,...,0.453251,0.022273,0.405416,0.021902,0.0,0.0,,,0.438883,0.026687
2,RandomForest,0.461734,0.008967,0.839479,0.012181,0.595722,0.009,0.742895,0.008224,0.824637,...,0.429325,0.014665,0.385184,0.014774,0.0,0.0,,,0.472814,0.017776
3,KNN,0.457309,0.010108,0.791847,0.021452,0.579689,0.011935,0.726132,0.010911,0.789977,...,0.400808,0.019274,0.366602,0.017845,0.0,0.0,,,0.525636,0.027284
4,XGBoost,0.481943,0.009569,0.828787,0.010403,0.609425,0.009224,0.753397,0.00808,0.834643,...,0.449308,0.014635,0.412106,0.01515,0.0,0.0,,,0.463726,0.01689


【INFO】【2025-12-11 19:38:15】【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。


In [9]:
# 步骤7：桶级别分析
bucket_metrics_path = os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'bucket_metrics.csv')
if os.path.exists(bucket_metrics_path):
    bucket_metrics_df = pd.read_csv(bucket_metrics_path)
    display(bucket_metrics_df.head())
    bucket_metrics_df.plot(x='bucket_id', y='pos_rate_all', kind='bar', figsize=(12,4), title='桶正类比例')
    plt.ylabel('正类比例')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(fig_bucket, bbox_inches='tight')
    plt.close()
log_info('【步骤7摘要】桶级指标已整理，可用于局部化分析。')

Unnamed: 0,bucket_id,layer,parent_bucket_id,n_train,n_val,pos_rate_train,pos_rate_val,alpha,beta,regret_val,...,is_weak,threshold_source_bucket,parent_with_threshold,n_test,pos_rate_test,BND_ratio_test,POS_Coverage_test,regret_test,fold,pos_rate
0,ROOT,L1,,788,337,0.263959,0.240356,0.4,0.2,0.440653,...,False,ROOT,,,,,,,1,0.265353
1,L1_Contract=month_to_month,L1,ROOT,434,198,0.421659,0.388889,0.4,0.2,0.560606,...,True,ROOT,,,,,,,1,0.425032
2,L1_Contract=month_to_month|L2_tenure=new,L2,L1_Contract=month_to_month,224,96,0.450893,0.489583,0.2,0.0,0.479167,...,False,L1_Contract=month_to_month|L2_tenure=new,,,,,,,1,0.508369
3,L1_Contract=two_year,L1,ROOT,252,111,0.015873,0.0,0.4,0.2,,...,True,ROOT,,,,,,,1,0.029762
4,L1_Contract=one_year,L1,ROOT,165,126,0.09697,0.126984,0.4,0.2,0.519841,...,True,ROOT,,,,,,,1,0.113367


  plt.tight_layout()


【INFO】【2025-12-11 19:38:18】【步骤7摘要】桶级指标已整理，可用于局部化分析。


In [10]:
# 步骤8：结果汇总
log_info('【步骤8】检查结果文件与图表。')
print(os.listdir(os.path.join(root_path, cfg['OUTPUT']['results_dir'])))
print(os.listdir(os.path.join(root_path, cfg['OUTPUT']['figs_dir'])))
log_info('【全部步骤完成】Telco Churn 数据集上的 BT-TWD 实验结束。')

【INFO】【2025-12-11 19:38:18】【步骤8】检查结果文件与图表。
['baseline_bucket_metrics.csv', 'bucket_fallback_stats.csv', 'bucket_metrics.csv', 'bucket_metrics_gain.csv', 'bucket_thresholds.csv', 'bucket_thresholds_per_fold.csv', 'bucket_tree_structure.csv', 'metrics_kfold_per_fold.csv', 'metrics_kfold_summary.csv', 'metrics_overview.csv']
['bank_class_distribution.png', 'bucket_metrics_bar.png', 'class_distribution.png', 'metrics_compare.png']
【INFO】【2025-12-11 19:38:18】【全部步骤完成】Telco Churn 数据集上的 BT-TWD 实验结束。
