Airlines 延误数据集 BTTWD 实验

本 notebook 按步骤运行：环境准备 → 加载配置 → 读取数据 → 预处理 → 桶树划分 → 基线与 BTTWD 实验 → 桶级分析。


In [1]:
# 步骤0：环境与路径设置
import os, sys
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

# 将项目根目录加入路径，便于导入 bttwdlib
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_path not in sys.path:
    sys.path.append(root_path)

from bttwdlib import (
    load_yaml_cfg,
    show_cfg,
    load_dataset,
    prepare_features_and_labels,
    BucketTree,
    run_holdout_experiment,
    run_kfold_experiments,
    log_info,
    set_global_seed,
)

cfg_path = os.path.join(root_path, 'configs', 'airlines_delay.yaml')
cfg = load_yaml_cfg(cfg_path)
set_global_seed(cfg.get('SEED', {}).get('global_seed', 42))
log_info('【步骤0摘要】环境准备完毕，路径与随机种子已设置。')


【INFO】【2025-11-27 18:38:55】【配置加载】已读取 e:\yan\组\三支决策\机器学习\BT_TWD\configs\airlines_delay.yaml
【INFO】【2025-11-27 18:38:59】【步骤0摘要】环境准备完毕，路径与随机种子已设置。


In [2]:
# 步骤1：加载配置
show_cfg(cfg)
log_info('【步骤1摘要】配置文件加载完成，关键参数检查通过。')


【INFO】【2025-11-27 18:38:59】【配置-数据】数据集=airlines_delay_1m, k折=5, 目标列=DepDelay, 正类="1"
【INFO】【2025-11-27 18:38:59】【配置-BTTWD】阈值模式=None, 全局模型=xgb, 桶内模型=none, 后验估计器(兼容字段)=logreg
【INFO】【2025-11-27 18:38:59】【配置-基线】LogReg启用=False, RandomForest启用=False, KNN启用=False, XGBoost启用=True
【INFO】【2025-11-27 18:38:59】【步骤1摘要】配置文件加载完成，关键参数检查通过。


In [3]:
# 步骤2：加载原始数据
df_raw, target_col_model = load_dataset(cfg)  # 这里返回的是用于建模的标签列，例如 "label"

display(df_raw.head())
print("用于建模的标签列:", target_col_model)

# 1）画 0/1 标签（延误/不延误）的比例
class_counts = df_raw[target_col_model].value_counts(normalize=True)
ax = class_counts.plot(kind='bar', title='延误 vs 未延误比例')
plt.ylabel('比例')

fig_path = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'class_distribution.png')
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, bbox_inches='tight')
plt.close()

# 2）如果想看原始 DepDelay 的分布，可以另外单独分析：
raw_target_col = cfg['DATA']['target_col']  # 这里是 "DepDelay"
print("原始目标列:", raw_target_col)
print(df_raw[raw_target_col].describe())

log_info('【步骤2摘要】Airlines 原始数据加载与基本统计完成。')


【INFO】【2025-11-27 18:39:12】【数据加载】ARFF 文件 ..\data\airline\airlines_train_regression_1000000.arff 已读取，含 1000000 条记录，10 列
【INFO】【2025-11-27 18:39:12】【目标变换】已按阈值 15.0 生成二分类标签列 label，正类取 > 15.0
【INFO】【2025-11-27 18:39:12】【数据集信息】名称=airlines_delay_1m，样本数=1000000，目标列=label，正类比例=15.59%


Unnamed: 0,DepDelay,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,Origin,Dest,Distance,label
0,8.0,10.0,11.0,7.0,1300.0,1535.0,AA,LAX,HNL,2556.0,0
1,-3.0,10.0,10.0,6.0,2035.0,2110.0,AA,OGG,HNL,100.0,0
2,6.0,10.0,26.0,1.0,1200.0,1446.0,AA,JFK,LAX,2475.0,0
3,1.0,10.0,9.0,5.0,1145.0,1512.0,AA,JFK,SFO,2586.0,0
4,0.0,10.0,16.0,5.0,930.0,1149.0,AA,SFO,HNL,2399.0,0


用于建模的标签列: label
原始目标列: DepDelay
count    1000000.000000
mean           8.191935
std           28.877186
min        -1197.000000
25%           -3.000000
50%            0.000000
75%            7.000000
max         2119.000000
Name: DepDelay, dtype: float64
【INFO】【2025-11-27 18:39:13】【步骤2摘要】Airlines 原始数据加载与基本统计完成。


In [4]:
# 步骤3：预处理与特征工程
X, y, meta = prepare_features_and_labels(df_raw, cfg)
log_info(f'【预处理】编码特征维度={X.shape[1]}，样本数={X.shape[0]}')
log_info(f"【步骤3摘要】特征预处理完成：连续={len(meta['continuous_cols'])}，类别={len(meta['categorical_cols'])}，编码维度={X.shape[1]}。")


【INFO】【2025-11-27 18:39:13】【预处理】连续特征=6个，类别特征=3个
【INFO】【2025-11-27 18:39:16】【预处理】编码后维度=755
【INFO】【2025-11-27 18:39:16】【预处理】编码特征维度=755，样本数=1000000
【INFO】【2025-11-27 18:39:16】【步骤3摘要】特征预处理完成：连续=6，类别=3，编码维度=755。


In [5]:
# 步骤4：构建桶树并检查划分
feature_df_for_bucket = df_raw.drop(columns=[cfg['DATA']['target_col']])
bucket_tree = BucketTree(cfg['BTTWD']['bucket_levels'], feature_names=feature_df_for_bucket.columns.tolist())
bucket_ids_full = bucket_tree.assign_buckets(feature_df_for_bucket)
bucket_df = bucket_ids_full.value_counts().reset_index()
bucket_df.columns = ['bucket_id', 'count']
bucket_df['pos_rate'] = df_raw.groupby(bucket_ids_full)[cfg['DATA']['target_col']].apply(
    lambda s: (s == cfg['DATA']['positive_label']).mean()
).values

display(bucket_df.head())
bucket_df.set_index('bucket_id')['count'].plot(kind='bar', figsize=(12,4), title='桶样本数分布')
fig_bucket = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'bucket_metrics_bar.png')
plt.savefig(fig_bucket, bbox_inches='tight')
plt.close()
log_info(f'【步骤4摘要】桶树划分完成，共有 {bucket_ids_full.nunique()} 个叶子桶。')


【INFO】【2025-11-27 18:39:19】【桶树】已为样本生成桶ID，共 10378 个组合


Unnamed: 0,bucket_id,count,pos_rate
0,L1_Origin=ATL|L2_UniqueCarrier=DL|L3_CRSDepTim...,7562,0.045455
1,L1_Origin=ATL|L2_UniqueCarrier=DL|L3_CRSDepTim...,6048,0.090909
2,L1_Origin=DFW|L2_UniqueCarrier=AA|L3_CRSDepTim...,5600,0.08
3,L1_Origin=ATL|L2_UniqueCarrier=DL|L3_CRSDepTim...,5541,0.0
4,L1_Origin=DFW|L2_UniqueCarrier=AA|L3_CRSDepTim...,4998,0.045714


【INFO】【2025-11-27 18:45:45】【步骤4摘要】桶树划分完成，共有 10378 个叶子桶。


In [6]:
# 步骤5：运行基线模型实验占位
# 基线部分在 run_kfold_experiments 内统一调度（仅在 use_kfold=True 时执行）
log_info('【步骤5】基线模型将在交叉验证模式中一并运行。')
log_info('【步骤5摘要】基线模型性能将作为后续对比基准。')


【INFO】【2025-11-27 18:45:45】【步骤5】基线模型将在交叉验证模式中一并运行。
【INFO】【2025-11-27 18:45:45】【步骤5摘要】基线模型性能将作为后续对比基准。


In [7]:
# 步骤6：运行 BTTWD 实验（k 折或单次留出）
use_kfold_raw = cfg.get('DATA', {}).get('use_kfold', False)
if isinstance(use_kfold_raw, str):
    use_kfold = use_kfold_raw.strip().lower() in ['true', '1', 'yes']
else:
    use_kfold = bool(use_kfold_raw)

if use_kfold:
    log_info('【步骤6】检测到 use_kfold=True，进入 k 折实验。')
    results = run_kfold_experiments(X, y, feature_df_for_bucket, cfg)
    summary_df = pd.read_csv(os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'metrics_kfold_summary.csv'))
    display(summary_df)
    summary_df.plot(x='model', kind='bar', figsize=(8,4), title='模型指标对比')
    fig_compare = os.path.join(root_path, cfg['OUTPUT']['figs_dir'], 'metrics_compare.png')
    plt.savefig(fig_compare, bbox_inches='tight')
    plt.close()
    log_info('【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。')
else:
    log_info('【步骤6】use_kfold=False，执行单次留出验证流程。')
    holdout_metrics = run_holdout_experiment(X, y, feature_df_for_bucket, cfg)
    display(pd.DataFrame(holdout_metrics))
    log_info('【步骤6摘要】单次留出验证完成，指标已列出。')


【INFO】【2025-11-27 18:45:45】【步骤6】检测到 use_kfold=True，进入 k 折实验。
【INFO】【2025-11-27 18:45:45】【基线-XGB】使用模型自定义阈值=0.200（per_model 模式）


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  summary[f"{col}_mean"] = float(np.nanmean(arr))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


【INFO】【2025-11-27 18:51:11】【基线-XGB】整体指标：AUC_mean=0.685, AUC_std=0.001, BAC_mean=0.621, BAC_std=0.001, BND_ratio_mean=0.000, BND_ratio_std=0.000, F1_mean=0.348, F1_std=0.001, Kappa_mean=0.189, Kappa_std=0.001, MCC_mean=0.200, MCC_std=0.002, POS_Coverage_mean=nan, POS_Coverage_std=nan, Precision_mean=0.277, Precision_std=0.001, Recall_mean=0.468, Recall_std=0.002, Regret_mean=0.605, Regret_std=0.002
【INFO】【2025-11-27 18:51:11】【K折实验】正在执行第 1/5 折...
【INFO】【2025-11-27 18:51:42】【桶树】已为样本生成桶ID，共 9053 个组合


Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-27 18:52:36】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-27 18:53:07] 桶 L1_Origin=ABQ 分裂前 Score=0.4936，层级 L1，样本 n=4993；子桶Score=[0.491566265060241, 0.4954337899543379, 0.49506172839506174, 0.5, 0.1, 0.5059564329475834, 0.38461538461538464, 0.4931506849315068, 0.4588744588744589, 0.44642857142857145, 0.4777777777777778, 0.4826923076923077, 0.5551242236024845, 0.4714168629155557, 0.4594594594594595, 0.48148148148148145]，Gain=-0.0141
[INFO][BT][2025-11-27 18:53:07] Gain 不足（Gain=-0.0141 < 阈值=-0.0060），停止在本层
[INFO][BT][2025-11-27 18:53:07] 桶 L1_Origin=ATL 分裂前 Score=0.4487，层级 L1，样本 n=41906；子桶Score=[0.42463662790697676, 0.4596277620628869, 0.125, 0.25, 0.5354460093896714, 0.43904730330802283, 0.44487984136513303, 0.39324992960619076, 0.45660281083385634, 0.3920099875156055, 0.3945981554677207, 0.48098078710876974, 0.39336065573770496, 0.41498257839721253, 0.46495726495726497, 0.4473684210526316, 0.4313011152416357, 0.4307371717127169, 0.46680277940711995, 0.4968553459119497, 0.48885

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-27 18:55:43】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-27 18:56:01] 桶 L1_Origin=ABQ 分裂前 Score=0.4965，层级 L1，样本 n=4969；子桶Score=[0.4971938775510204, 0.49327354260089684, 0.4873417721518987, 0.5, 0.125, 0.4927990511690952, 0.36666666666666664, 0.49230769230769234, 0.44401317337097157, 0.4444444444444444, 0.46153846153846156, 0.47307692307692306, 0.5336927223719676, 0.4825710074142521, 0.44666666666666666, 0.4074074074074074]，Gain=-0.0148
[INFO][BT][2025-11-27 18:56:01] Gain 不足（Gain=-0.0148 < 阈值=-0.0060），停止在本层
[INFO][BT][2025-11-27 18:56:01] 桶 L1_Origin=ATL 分裂前 Score=0.4444，层级 L1，样本 n=41880；子桶Score=[0.4237007613373055, 0.4592008624755744, 0.125, 0.3333333333333333, 0.530586471556374, 0.4336693619530952, 0.43453966415749856, 0.3836943953872213, 0.46943724190915204, 0.3488735919899875, 0.35416666666666663, 0.4759891271519179, 0.37940459170801444, 0.3833333333333333, 0.5275628626692457, 0.4523809523809524, 0.42864054184808903, 0.4277289377289377, 0.4689112359919813, 0.54980842911

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-27 18:58:43】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-27 18:59:03] 桶 L1_Origin=ABQ 分裂前 Score=0.4964，层级 L1，样本 n=4982；子桶Score=[0.4880382775119617, 0.49521531100478466, 0.49120603015075376, 0.4444444444444444, 0.125, 0.45535714285714285, 0.4090909090909091, 0.5, 0.45198863636363634, 0.42857142857142855, 0.47674418604651164, 0.46525096525096526, 0.4787234042553192, 0.4769629294301091, 0.4605263157894737, 0.5]，Gain=-0.0200
[INFO][BT][2025-11-27 18:59:03] Gain 不足（Gain=-0.0200 < 阈值=-0.0060），停止在本层
[INFO][BT][2025-11-27 18:59:03] 桶 L1_Origin=ATL 分裂前 Score=0.4411，层级 L1，样本 n=41996；子桶Score=[0.38570691434468524, 0.44646437590877786, 0.14285714285714285, 0.25, 0.5261960417640642, 0.43429340855574516, 0.4550543280596472, 0.3722997870783717, 0.4464861158018732, 0.35723039215686275, 0.4241452991452992, 0.45465042204172634, 0.4052708504613136, 0.3948717948717949, 0.5086342229199372, 0.475, 0.4314685314685315, 0.4107411385606874, 0.4471863736774794, 0.3992974238875878, 0.5036630036630036, 

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-27 19:01:47】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-27 19:02:08] 桶 L1_Origin=ABQ 分裂前 Score=0.5000，层级 L1，样本 n=5035；子桶Score=[0.5039009497964722, 0.4977578475336323, 0.49125, 0.5, 0.3333333333333333, 0.4958662832494608, 0.2916666666666667, 0.5, 0.45606060606060606, 0.4583333333333333, 0.47701149425287354, 0.49070631970260226, 0.49065420560747663, 0.479330527014277, 0.4620253164556962, 0.4444444444444444]，Gain=-0.0170
[INFO][BT][2025-11-27 19:02:08] Gain 不足（Gain=-0.0170 < 阈值=-0.0060），停止在本层
[INFO][BT][2025-11-27 19:02:08] 桶 L1_Origin=ATL 分裂前 Score=0.4400，层级 L1，样本 n=41856；子桶Score=[0.40548675610595114, 0.45272448602082344, 0.1, 0.25, 0.5446327683615819, 0.42683305992891485, 0.4454691259021652, 0.38596636456369526, 0.4550605406888296, 0.3490691489361702, 0.446031746031746, 0.4646517917511832, 0.39947600661886373, 0.4208261617900172, 0.4131205673758865, 0.45454545454545453, 0.4144061302681992, 0.4205533596837945, 0.46378115128115127, 0.6008771929824561, 0.4508426966292135, 0.42

Parameters: { "use_label_encoder" } are not used.



【INFO】【2025-11-27 19:04:53】【BTTWD】全局模型训练完成，用于兜底预测
[INFO][BT][2025-11-27 19:05:14] 桶 L1_Origin=ABQ 分裂前 Score=0.4934，层级 L1，样本 n=4973；子桶Score=[0.4850374064837905, 0.49099099099099097, 0.49246231155778897, 0.45454545454545453, 0.125, 0.4897222222222222, 0.34615384615384615, 0.4931506849315068, 0.4789224245447743, 0.4482758620689655, 0.45, 0.4734848484848485, 0.5377207062600321, 0.47666270391651805, 0.4375, 0.4090909090909091]，Gain=-0.0143
[INFO][BT][2025-11-27 19:05:14] Gain 不足（Gain=-0.0143 < 阈值=-0.0060），停止在本层
[INFO][BT][2025-11-27 19:05:14] 桶 L1_Origin=ATL 分裂前 Score=0.4377，层级 L1，样本 n=41854；子桶Score=[0.41925624044829346, 0.4533119346367085, 0.125, 0.3333333333333333, 0.5693469785575048, 0.424665434091941, 0.4287975486818726, 0.37250975451241514, 0.4799632898366824, 0.29561781609195403, 0.39820675105485237, 0.46291511149392284, 0.39214670534604223, 0.399910434393193, 0.46448863636363635, 0.4722222222222222, 0.40990425364934857, 0.41490159325210874, 0.4403226516418699, 0.5415584415584416, 0.4

Unnamed: 0,model,Precision_mean,Precision_std,Recall_mean,Recall_std,F1_mean,F1_std,BAC_mean,BAC_std,AUC_mean,...,MCC_mean,MCC_std,Kappa_mean,Kappa_std,BND_ratio_mean,BND_ratio_std,POS_Coverage_mean,POS_Coverage_std,Regret_mean,Regret_std
0,BTTWD,0.24504,0.00169,0.617349,0.003069,0.350825,0.001931,0.633074,0.001895,0.684313,...,0.197688,0.002951,0.164352,0.002796,0.205318,0.004785,0.07457,0.008763,0.682242,0.000876
1,XGBoost,0.277064,0.000987,0.468221,0.0018,0.348127,0.001196,0.621323,0.000954,0.684799,...,0.199814,0.00157,0.189378,0.001497,0.0,0.0,,,0.604833,0.001504


【INFO】【2025-11-27 19:06:31】【步骤6摘要】BTTWD 与基线的 k 折结果已生成并保存。


In [8]:
# 步骤7：桶级别分析
bucket_metrics_path = os.path.join(root_path, cfg['OUTPUT']['results_dir'], 'bucket_metrics.csv')
if os.path.exists(bucket_metrics_path):
    bucket_metrics_df = pd.read_csv(bucket_metrics_path)
    display(bucket_metrics_df.head())
    bucket_metrics_df.plot(x='bucket_id', y='pos_rate_all', kind='bar', figsize=(12,4), title='桶正类比例')
    plt.ylabel('正类比例')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(fig_bucket, bbox_inches='tight')
    plt.close()
log_info('【步骤7摘要】桶级指标（如存在）已整理，可用于局部化分析。')


Unnamed: 0,bucket_id,layer,parent_bucket_id,n_train,n_val,pos_rate_train,pos_rate_val,alpha,beta,regret_val,...,n_all,pos_rate_all,parent_with_threshold,n_test,pos_rate_test,BND_ratio_test,POS_Coverage_test,regret_test,fold,pos_rate
0,L1_Origin=OTHER,L1,,84841,36477,0.128299,0.126052,0.3,0.2,0.572525,...,121318,0.127623,,,,,,,1,0.127623
1,L1_Origin=ATL,L1,,29298,12608,0.183664,0.187976,0.3,0.2,0.768123,...,41906,0.184962,,,,,,,1,0.184962
2,L1_Origin=ORD,L1,,29328,12364,0.2113,0.204626,0.3,0.2,0.809689,...,41692,0.209321,,,,,,,1,0.209321
3,L1_Origin=LAX,L1,,18706,8039,0.157703,0.153004,0.3,0.2,0.703508,...,26745,0.156291,,,,,,,1,0.156291
4,L1_Origin=DEN,L1,,16439,6995,0.17057,0.180272,0.3,0.2,0.793996,...,23434,0.173466,,,,,,,1,0.173466


  plt.tight_layout()


【INFO】【2025-11-27 19:08:02】【步骤7摘要】桶级指标（如存在）已整理，可用于局部化分析。


In [9]:
# 步骤8：结果汇总
log_info('【步骤8】检查结果文件与图表。')
results_dir = os.path.join(root_path, cfg['OUTPUT']['results_dir'])
figs_dir = os.path.join(root_path, cfg['OUTPUT']['figs_dir'])
os.makedirs(results_dir, exist_ok=True)
os.makedirs(figs_dir, exist_ok=True)
print(os.listdir(results_dir))
print(os.listdir(figs_dir))
log_info('【全部步骤完成】Airlines 数据集的 BT-TWD 实验结束。')


【INFO】【2025-11-27 19:08:02】【步骤8】检查结果文件与图表。
['bucket_metrics.csv', 'bucket_thresholds_per_fold.csv', 'metrics_kfold_per_fold.csv', 'metrics_kfold_summary.csv']
['bank_class_distribution.png', 'bucket_metrics_bar.png', 'class_distribution.png', 'metrics_compare.png']
【INFO】【2025-11-27 19:08:02】【全部步骤完成】Airlines 数据集的 BT-TWD 实验结束。
