# 成人收入数据集 BT-TWD 可行性实验

本 notebook 按步骤运行：加载配置 → 读取数据 → 预处理 → 桶树划分 → 基线与 BTTWD k 折实验 → 桶级分析。

In [1]:
# 步骤0：环境与路径设置
import os, sys
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# 将项目根目录加入路径，便于导入 bttwdlib
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_path not in sys.path:
    sys.path.append(root_path)

from bttwdlib import (
    load_yaml_cfg,
    show_cfg,
    load_adult_raw,
    prepare_features_and_labels,
    BucketTree,
    run_kfold_experiments,
    log_info,
    set_global_seed,
)

cfg_path = os.path.join(root_path, 'configs', 'adult_bttwd.yaml')
cfg = load_yaml_cfg(cfg_path)
set_global_seed(cfg.get('SEED', {}).get('global_seed', 42))
log_info('【步骤0摘要】环境准备完毕，路径与随机种子已设置。')

【INFO】【2025-11-22 22:09:31】【配置加载】已读取 e:\yan\组\三支决策\机器学习\BT_TWD\configs\adult_bttwd.yaml
【INFO】【2025-11-22 22:09:34】【步骤0摘要】环境准备完毕，路径与随机种子已设置。


In [2]:
# 步骤1：加载配置
show_cfg(cfg)
log_info('【步骤1摘要】配置文件加载完成，关键参数检查通过。')

【INFO】【2025-11-22 22:09:34】【配置-数据】数据集=adult, k折=5, 目标列=income, 正类=">50K"
【INFO】【2025-11-22 22:09:34】【配置-BTTWD】阈值模式=bucket_wise, 全局模型=xgb, 桶内模型=knn, 后验估计器(兼容字段)=logreg
【INFO】【2025-11-22 22:09:34】【配置-基线】LogReg启用=True, RandomForest启用=True, KNN启用=True, XGBoost启用=True
【INFO】【2025-11-22 22:09:34】【步骤1摘要】配置文件加载完成，关键参数检查通过。


In [3]:
# 步骤2：加载原始数据
df_raw = load_adult_raw(cfg)
display(df_raw.head())
target_col = cfg['DATA']['target_col']
class_counts = df_raw[target_col].value_counts(normalize=True)
ax = class_counts.plot(kind='bar', title='正负类比例')
plt.ylabel('比例')
fig_path = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'class_distribution.png')
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, bbox_inches='tight')
plt.close()
log_info('【步骤2摘要】Adult 原始数据加载与基本统计完成。')

【INFO】【2025-11-22 22:09:35】【数据加载完毕】样本数=32561，特征数=14，正类比例=0.24


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


【INFO】【2025-11-22 22:09:35】【步骤2摘要】Adult 原始数据加载与基本统计完成。


In [4]:
# 步骤3：预处理与特征工程
X, y, meta = prepare_features_and_labels(df_raw, cfg)
log_info(f'【预处理】编码特征维度={X.shape[1]}，样本数={X.shape[0]}')
log_info(f"【步骤3摘要】特征预处理完成：连续={len(meta['continuous_cols'])}，类别={len(meta['categorical_cols'])}，编码维度={X.shape[1]}。")

【INFO】【2025-11-22 22:09:35】【预处理】连续特征=6个，类别特征=8个
【INFO】【2025-11-22 22:09:35】【预处理】编码后维度=100
【INFO】【2025-11-22 22:09:35】【预处理】编码特征维度=100，样本数=32561
【INFO】【2025-11-22 22:09:35】【步骤3摘要】特征预处理完成：连续=6，类别=8，编码维度=100。


In [5]:
# 步骤4：构建桶树并检查划分
bucket_tree = BucketTree(cfg['BTTWD']['bucket_levels'], feature_names=df_raw.drop(columns=[cfg['DATA']['target_col']]).columns.tolist())
bucket_ids_full = bucket_tree.assign_buckets(df_raw.drop(columns=[cfg['DATA']['target_col']]))
bucket_df = bucket_ids_full.value_counts().reset_index()
bucket_df.columns = ['bucket_id', 'count']
bucket_df['pos_rate'] = df_raw.groupby(bucket_ids_full)[cfg['DATA']['target_col']].apply(lambda s: (s == cfg['DATA']['positive_label']).mean()).values
display(bucket_df.head())
bucket_df.set_index('bucket_id')['count'].plot(kind='bar', figsize=(12,4), title='桶样本数分布')
fig_bucket = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bucket_metrics_bar.png')
plt.savefig(fig_bucket, bbox_inches='tight')
plt.close()
log_info(f'【步骤4摘要】桶树划分完成，共有 {bucket_ids_full.nunique()} 个叶子桶。')

【INFO】【2025-11-22 22:09:35】【桶树】已为样本生成桶ID，共 48 个组合


Unnamed: 0,bucket_id,count,pos_rate
0,L1_age=old|L2_education=high|L3_hours-per-week...,3211,0.36
1,L1_age=old|L2_education=mid|L3_hours-per-week=...,2713,0.121723
2,L1_age=mid|L2_education=high|L3_hours-per-week...,2584,0.206269
3,L1_age=mid|L2_education=mid|L3_hours-per-week=...,2321,0.03125
4,L1_age=very_old|L2_education=mid|L3_hours-per-...,1654,0.0


【INFO】【2025-11-22 22:09:36】【步骤4摘要】桶树划分完成，共有 48 个叶子桶。


In [6]:
# 步骤5：运行基线模型 k 折实验
# 基线部分在 run_kfold_experiments 内统一调度
log_info('【步骤5】基线模型将在整体交叉验证中一并运行。')
log_info('【步骤5摘要】基线模型性能将作为后续对比基准。')

【INFO】【2025-11-22 22:09:36】【步骤5】基线模型将在整体交叉验证中一并运行。
【INFO】【2025-11-22 22:09:36】【步骤5摘要】基线模型性能将作为后续对比基准。


In [7]:
# 步骤6：运行 BTTWD k 折实验（含基线）
results = run_kfold_experiments(X, y, df_raw.drop(columns=[cfg['DATA']['target_col']]), cfg)
summary_df = pd.read_csv(os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'metrics_kfold_summary.csv'))
display(summary_df)
summary_df.plot(x='model', kind='bar', figsize=(8,4), title='模型指标对比')
fig_compare = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'metrics_compare.png')
plt.savefig(fig_compare, bbox_inches='tight')
plt.close()
log_info('【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。')

【INFO】【2025-11-22 22:09:52】【基线-LogReg】整体指标：Precision=0.735, Recall=0.602, F1=0.662, BAC=0.767, AUC=0.907, MCC=0.573, Kappa=0.568
【INFO】【2025-11-22 22:10:17】【基线-RF】整体指标：Precision=0.739, Recall=0.627, F1=0.678, BAC=0.778, AUC=0.906, MCC=0.590, Kappa=0.587
【INFO】【2025-11-22 22:10:25】【基线-KNN】整体指标：Precision=0.711, Recall=0.525, F1=0.604, BAC=0.729, AUC=0.869, MCC=0.512, Kappa=0.502


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-22 22:10:30】【基线-XGB】整体指标：Precision=0.786, Recall=0.655, F1=0.714, BAC=0.799, AUC=0.929, MCC=0.639, Kappa=0.634
【INFO】【2025-11-22 22:10:30】【K折实验】正在执行第 1/5 折...
【INFO】【2025-11-22 22:10:30】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:30】【BTTWD】桶 L1_age=mid|L2_education=high|L3_hours-per-week=high_hours 向父桶 L1_age=mid|L2_education=high 贡献 132 个典型样本
【INFO】【2025-11-22 22:10:30】【BTTWD】桶 L1_age=mid|L2_education=high|L3_hours-per-week=low_hours 向父桶 L1_age=mid|L2_education=high 贡献 63 个典型样本
【INFO】【2025-11-22 22:10:30】【BTTWD】桶 L1_age=mid|L2_education=high|L3_hours-per-week=normal_hours 向父桶 L1_age=mid|L2_education=high 贡献 314 个典型样本
【INFO】【2025-11-22 22:10:30】【BTTWD】桶 L1_age=mid|L2_education=low|L3_hours-per-week=high_hours 样本太少(n=24)，全部并入父桶 L1_age=mid|L2_education=low
【INFO】【2025-11-22 22:10:30】【BTTWD】桶 L1_age=mid|L2_education=low|L3_hours-per-week=low_hours 样本太少(n=35)，全部并入父桶 L1_age=mid|L2_education=low
【INFO】【2025-11-22 22:10:30】【BTTWD】桶 L1_age=mid|L2_education=low|L3_hours-per-we

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-22 22:10:31】【BTTWD】全局模型训练完成，用于兜底预测
【INFO】【2025-11-22 22:10:31】【BTTWD】父桶 L1_age=mid|L2_education=low 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:32】【BTTWD】父桶 L1_age=very_old|L2_education=low 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:32】【BTTWD】父桶 L1_age=young|L2_education=top 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:33】【BTTWD】叶子桶 L1_age=mid|L2_education=top|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:33】【BTTWD】叶子桶 L1_age=old|L2_education=low|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:35】【BTTWD】叶子桶 L1_age=young|L2_education=mid|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:35】【BTTWD】共生成 48 个叶子桶，其中有效桶 50 个（样本数 ≥ 50）
【INFO】【2025-11-22 22:10:35】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:36】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:37】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:37】【BTTWD】本折指标：Precision=0.501, Recall=0.813, F1=0.620, BAC=0.778, AUC=0.869, MCC=0.487, Kappa=0.458
【INFO】【2

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-22 22:10:37】【BTTWD】全局模型训练完成，用于兜底预测
【INFO】【2025-11-22 22:10:39】【BTTWD】父桶 L1_age=young|L2_education=top 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:40】【BTTWD】叶子桶 L1_age=mid|L2_education=top|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:40】【BTTWD】叶子桶 L1_age=old|L2_education=low|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:41】【BTTWD】叶子桶 L1_age=very_old|L2_education=low|L3_hours-per-week=high_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:42】【BTTWD】共生成 48 个叶子桶，其中有效桶 52 个（样本数 ≥ 50）
【INFO】【2025-11-22 22:10:42】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:43】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:43】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:43】【BTTWD】本折指标：Precision=0.522, Recall=0.786, F1=0.627, BAC=0.779, AUC=0.878, MCC=0.495, Kappa=0.475
【INFO】【2025-11-22 22:10:43】【K折实验】正在执行第 3/5 折...
【INFO】【2025-11-22 22:10:44】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:44】【BTTWD】桶 L1_age=mid|L2_education=high|L3_hours-per-w

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-22 22:10:44】【BTTWD】全局模型训练完成，用于兜底预测
【INFO】【2025-11-22 22:10:45】【BTTWD】父桶 L1_age=young|L2_education=top 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:46】【BTTWD】叶子桶 L1_age=mid|L2_education=top|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:47】【BTTWD】叶子桶 L1_age=very_old|L2_education=low|L3_hours-per-week=high_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:48】【BTTWD】叶子桶 L1_age=young|L2_education=mid|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:48】【BTTWD】共生成 48 个叶子桶，其中有效桶 51 个（样本数 ≥ 50）
【INFO】【2025-11-22 22:10:48】【桶树】已为样本生成桶ID，共 47 个组合
【INFO】【2025-11-22 22:10:49】【桶树】已为样本生成桶ID，共 47 个组合
【INFO】【2025-11-22 22:10:50】【桶树】已为样本生成桶ID，共 47 个组合
【INFO】【2025-11-22 22:10:50】【BTTWD】本折指标：Precision=0.507, Recall=0.793, F1=0.618, BAC=0.774, AUC=0.872, MCC=0.484, Kappa=0.460
【INFO】【2025-11-22 22:10:50】【K折实验】正在执行第 4/5 折...
【INFO】【2025-11-22 22:10:50】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:50】【BTTWD】桶 L1_age=mid|L2_education=high|L3_hours-per

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-22 22:10:50】【BTTWD】全局模型训练完成，用于兜底预测
【INFO】【2025-11-22 22:10:52】【BTTWD】父桶 L1_age=young|L2_education=top 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:52】【BTTWD】叶子桶 L1_age=mid|L2_education=top|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:55】【BTTWD】共生成 48 个叶子桶，其中有效桶 53 个（样本数 ≥ 50）
【INFO】【2025-11-22 22:10:55】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:56】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:57】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:57】【BTTWD】本折指标：Precision=0.515, Recall=0.781, F1=0.621, BAC=0.774, AUC=0.872, MCC=0.486, Kappa=0.466
【INFO】【2025-11-22 22:10:57】【K折实验】正在执行第 5/5 折...
【INFO】【2025-11-22 22:10:57】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:10:57】【BTTWD】桶 L1_age=mid|L2_education=high|L3_hours-per-week=high_hours 向父桶 L1_age=mid|L2_education=high 贡献 135 个典型样本
【INFO】【2025-11-22 22:10:57】【BTTWD】桶 L1_age=mid|L2_education=high|L3_hours-per-week=low_hours 向父桶 L1_age=mid|L2_education=high 贡献 65 个典型样本
【INFO】【2025-11-22 22:10:57】【BTTWD

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-22 22:10:57】【BTTWD】全局模型训练完成，用于兜底预测
【INFO】【2025-11-22 22:10:59】【BTTWD】父桶 L1_age=young|L2_education=low 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:10:59】【BTTWD】父桶 L1_age=young|L2_education=top 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:11:00】【BTTWD】叶子桶 L1_age=mid|L2_education=top|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:11:00】【BTTWD】叶子桶 L1_age=old|L2_education=low|L3_hours-per-week=low_hours 训练样本不足或单类，跳过局部模型训练
【INFO】【2025-11-22 22:11:02】【BTTWD】共生成 48 个叶子桶，其中有效桶 52 个（样本数 ≥ 50）
【INFO】【2025-11-22 22:11:02】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:11:03】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:11:04】【桶树】已为样本生成桶ID，共 48 个组合
【INFO】【2025-11-22 22:11:04】【BTTWD】本折指标：Precision=0.502, Recall=0.816, F1=0.622, BAC=0.780, AUC=0.878, MCC=0.490, Kappa=0.461
【INFO】【2025-11-22 22:11:04】【K折实验】所有结果已写入 results 目录


Unnamed: 0,Precision,Recall,F1,BAC,AUC,MCC,Kappa,model
0,0.509242,0.797855,0.621527,0.776861,0.873857,0.488583,0.463921,BTTWD
1,0.735088,0.601964,0.661899,0.766577,0.906751,0.572938,0.56829,LogReg
2,0.738954,0.627088,0.678441,0.778411,0.905658,0.59044,0.587168,RandomForest
3,0.711249,0.524933,0.60405,0.728668,0.868943,0.511519,0.502254,KNN
4,0.785889,0.654891,0.714435,0.799149,0.929218,0.638742,0.634399,XGBoost


【INFO】【2025-11-22 22:11:04】【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。


In [8]:
# 步骤7：桶级别分析
bucket_metrics_path = os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'bucket_metrics.csv')
if os.path.exists(bucket_metrics_path):
    bucket_metrics_df = pd.read_csv(bucket_metrics_path)
    display(bucket_metrics_df.head())
    bucket_metrics_df.plot(x='bucket_id', y='pos_rate', kind='bar', figsize=(12,4), title='桶正类比例')
    plt.ylabel('正类比例')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(fig_bucket, bbox_inches='tight')
    plt.close()
log_info('【步骤7摘要】桶级指标已整理，可用于局部化分析。')

Unnamed: 0,bucket_id,level,n_samples_all,n_samples_train,n_samples_val,pos_rate,alpha,beta,threshold_score,use_full_bucket_for_threshold,fold
0,L1_age=old|L2_education=high|L3_hours-per-week...,leaf,2586,2102,484,0.37935,0.35,0.05,0.693208,False,1
1,L1_age=old|L2_education=mid|L3_hours-per-week=...,leaf,2179,1707,472,0.208811,0.2,0.05,0.563686,False,1
2,L1_age=mid|L2_education=high|L3_hours-per-week...,leaf,2096,1686,410,0.209924,0.25,0.05,0.617886,False,1
3,L1_age=mid|L2_education=mid|L3_hours-per-week=...,leaf,1856,1486,370,0.078664,0.2,0.05,0.285714,False,1
4,L1_age=very_old|L2_education=mid|L3_hours-per-...,leaf,1296,1043,253,0.225309,0.2,0.05,0.548223,False,1


  plt.tight_layout()


【INFO】【2025-11-22 22:11:06】【步骤7摘要】桶级指标已整理，可用于局部化分析。


In [9]:
# 步骤8：结果汇总
log_info('【步骤8】检查结果文件与图表。')
print(os.listdir(os.path.join(root_path, cfg['OUTPUT']['results_dir'])))
print(os.listdir(os.path.join(root_path, cfg['OUTPUT']['figs_dir'])))
log_info('【全部步骤完成】Adult 数据集上的 BT-TWD 可行性实验结束。')

【INFO】【2025-11-22 22:11:06】【步骤8】检查结果文件与图表。
['bucket_metrics.csv', 'metrics_kfold_per_fold.csv', 'metrics_kfold_summary.csv']
['bucket_metrics_bar.png', 'class_distribution.png', 'metrics_compare.png']
【INFO】【2025-11-22 22:11:06】【全部步骤完成】Adult 数据集上的 BT-TWD 可行性实验结束。
