# Airlines 延误数据 BTTWD 实验（Notebook 版）

本 Notebook 展示如何在不依赖命令行脚本的情况下，直接用 Python 代码运行 airlines 延误配置的 BTTWD 训练/评估。注意：后续单元格把当前工作目录当作仓库根目录（用于导入 `bttwdlib` 并定位 `configs/airlines_delay.yaml`），请在仓库根目录下启动 Notebook。

In [None]:
from pathlib import Path
import sys

# Treat the current working directory as the repository root and make sure
# it is importable, so that the project package can be imported below.
REPO_ROOT = Path.cwd().resolve()
if not any(entry == str(REPO_ROOT) for entry in sys.path):
    # Prepend so the local checkout wins over any installed copy.
    sys.path.insert(0, str(REPO_ROOT))

# Echo the resolved root as the cell output.
REPO_ROOT

In [None]:
from bttwdlib.config_loader import load_yaml_cfg, show_cfg, flatten_cfg_to_vars
from bttwdlib.utils_seed import set_global_seed

# Load the airlines-delay YAML configuration and print it.
cfg_path = REPO_ROOT.joinpath("configs", "airlines_delay.yaml")
cfg = load_yaml_cfg(cfg_path)
show_cfg(cfg)

# Flatten the config, then seed using SEED.global_seed (default 42).
flat_cfg = flatten_cfg_to_vars(cfg)
seed_section = cfg.get("SEED", {})
set_global_seed(seed_section.get("global_seed", 42))

# Echo the flattened config as the cell output.
flat_cfg

In [None]:
from bttwdlib.data_loader import load_dataset
from bttwdlib.utils_logging import log_info

# Load the raw dataset together with the name of its label column.
df_raw, target_col = load_dataset(cfg)
data_cfg = cfg.get("DATA", {})
dataset_name = data_cfg.get('dataset_name')
n_samples = len(df_raw)
log_info(f"【Notebook入口】数据集={dataset_name}，样本数={n_samples}，标签列={target_col}")

# Preview the first rows, restricted to the first 10 columns.
show_columns = df_raw.columns[:10].tolist()
df_raw[show_columns].head(5)

In [None]:
from typing import List
from bttwdlib.preprocessing import prepare_features_and_labels

def build_bucket_feature_df(df, cfg):
    """Prepare model inputs and the raw-column frame used for bucketing.

    Args:
        df: Raw dataset frame; must contain every column listed under
            ``PREPROCESS.continuous_cols`` and ``PREPROCESS.categorical_cols``.
        cfg: Experiment configuration mapping.

    Returns:
        A 5-tuple ``(X, y, meta, bucket_df, bucket_cols)``: the three values
        returned by ``prepare_features_and_labels(df, cfg)``, the selected
        raw columns of ``df`` with a fresh 0..n-1 index, and the list of
        those column names (continuous first, then categorical).
    """
    features, labels, meta_info = prepare_features_and_labels(df, cfg)

    preprocess_section = cfg.get("PREPROCESS", {})
    # A key that is missing or explicitly null counts as "no columns".
    continuous = preprocess_section.get("continuous_cols") or []
    categorical = preprocess_section.get("categorical_cols") or []
    bucket_cols: List[str] = continuous + categorical

    selected = df[bucket_cols]
    bucket_df = selected.reset_index(drop=True)
    return features, labels, meta_info, bucket_df, bucket_cols

# Build the feature matrix, labels, metadata and bucketing frame, then log
# the matrix shape and the mean of the labels as the positive-class ratio.
X, y, meta, bucket_df, bucket_cols = build_bucket_feature_df(df_raw, cfg)
positive_rate = y.mean()
log_info(f"【Notebook入口】特征矩阵形状={X.shape}，正类占比={positive_rate:.2%}")


In [None]:
from bttwdlib import run_holdout_experiment, run_kfold_experiments

# Log the order in which the bucket tree splits, taken from BTTWD.bucket_levels.
bucket_levels = cfg.get("BTTWD", {}).get("bucket_levels", [])
log_info(f"【桶树层级】分裂顺序={[lvl.get('name') for lvl in bucket_levels]}")

# DATA.use_kfold may be a real boolean or a string such as "true"/"yes";
# normalise the string form to a boolean.
data_cfg = cfg.get("DATA", {})
use_kfold = data_cfg.get("use_kfold", False)
if isinstance(use_kfold, str):
    use_kfold = use_kfold.strip().lower() in {"1", "true", "yes", "y"}

# Run either the K-fold or the hold-out experiment. The branch-specific
# names are kept so that later cells can still refer to them.
if use_kfold:
    log_info("【模式选择】use_kfold=true，启动K折实验...")
    kfold_results = run_kfold_experiments(X, y, bucket_df, cfg)
    experiment_results = kfold_results
else:
    holdout_results = run_holdout_experiment(X, y, bucket_df, cfg, bucket_cols=bucket_cols)
    experiment_results = holdout_results

# A bare expression nested inside if/else is not rendered by the notebook;
# only a trailing top-level expression is. Echo the result here instead.
experiment_results
