# 03 · Airline Baselines (GWB & XGBoost)
本笔记在与 ST-TWD 相同的数据与划分方案下，评估两种全局静态二分类 baseline。

### Notebook 结构
1. 载入配置与航空数据，并打印基本统计。
2. 以 warmup 年份（1987–1999）训练 GWB baseline，按年份输出指标表。
3. 以相同训练集训练 XGBoost baseline，按年份输出指标表。
4. 简单汇总 GWB 与 XGBoost 的年度指标（AUC、F1、BAC），方便和 ST-TWD 对比。

## 1. 配置与数据加载

In [None]:
from pathlib import Path
import yaml
import pandas as pd

from s3wdlib import (
    load_table_auto,
    assign_year_from_month_sequence,
    enrich_airline_dataframe,
    evaluate_gwb_baseline_by_year,
    evaluate_xgb_baseline_by_year,
)


In [None]:
# Read the YAML experiment configuration. The path is relative, so it is
# resolved against the current working directory of the notebook kernel.
cfg_path = Path('configs/s3wd_airline_v02.yaml')
with cfg_path.open(encoding='utf-8') as config_stream:
    # safe_load only builds plain Python objects (no arbitrary constructors).
    cfg = yaml.safe_load(config_stream)
# Bare expression: the notebook renders the parsed configuration.
cfg


In [None]:
# --- Settings from the DATA section of the config ---------------------------
data_cfg = cfg['DATA']
start_year = data_cfg.get('start_year', 1987)
positive_label = data_cfg.get('positive_label', 1)
label_col = data_cfg.get('label_col', 'label')

# A relative data_dir is anchored at the folder containing the config file,
# not at the working directory; an absolute one is used untouched.
config_dir = cfg_path.parent
configured_dir = Path(data_cfg.get('data_dir', '.'))
data_dir = (
    configured_dir
    if configured_dir.is_absolute()
    else (config_dir / configured_dir).resolve()
)
data_path = data_dir / data_cfg['data_file']
print(f'使用数据文件: {data_path}')

# Load features and labels.
# NOTE(review): label_col is forwarded here WITHOUT the 'label' fallback used
# for the local label_col above, so a config lacking the key reaches the
# loader as None — confirm this is what load_table_auto expects.
X_raw, y = load_table_auto(
    str(data_path),
    label_col=data_cfg.get('label_col'),
    positive_label=positive_label,
    continuous_label=data_cfg.get('continuous_label'),
    threshold=data_cfg.get('threshold'),
    threshold_op=data_cfg.get('threshold_op', '>='),
)

# Build the modelling frame: year assignment, label column, then enrichment.
# Presumably these helpers provide the 'Year' column and the derived features
# listed below (dep_hour, arr_hour, ...) — verify against s3wdlib.
airline_df = assign_year_from_month_sequence(X_raw, start_year=start_year)
airline_df[label_col] = y
airline_df = enrich_airline_dataframe(airline_df)

# Training ("warmup") years are fixed to 1987..1999 inclusive.
# NOTE(review): the lower bound is hard-coded rather than taken from start_year.
warmup_years = list(range(1987, 2000))

# Column groups handed to the baseline evaluators.
categorical_features = [
    'UniqueCarrier',
    'Origin',
    'Dest',
    'DayOfWeek',
    'Month',
    'dep_block',
]
numeric_features = [
    'CRSDepTime',
    'CRSArrTime',
    'CRSElapsedTime',
    'Distance',
    'dep_hour',
    'arr_hour',
    'block_time_min',
    'Year',
]

# --- Basic statistics ---------------------------------------------------------
# The column count below is the full frame width, label column included.
print(f'样本数: {len(airline_df):,}')
print(f'特征数: {airline_df.shape[1]}')
print(f'正类比例: {float(airline_df[label_col].mean()):.4f}')
print(f'warmup 年份: {warmup_years[0]}–{warmup_years[-1]}')

# Every year present in the data that is not a warmup year, ascending.
observed_years = airline_df['Year'].unique()
stream_years = sorted(yr for yr in observed_years if yr not in warmup_years)
print(f'stream 年份样例: {stream_years[:3]} … {stream_years[-3:]}')


## 2. GWB baseline 按年评估

In [None]:
# Evaluate the GWB baseline per year. Model hyper-parameters come from the
# optional GWB section of the config (empty dict when the section is absent).
gwb_eval_kwargs = {
    'label_col': label_col,
    'positive_label': positive_label,
    'warmup_years': warmup_years,
    'categorical_features': categorical_features,
    'numeric_features': numeric_features,
    'gwb_params': cfg.get('GWB', {}),
}
gwb_results = evaluate_gwb_baseline_by_year(airline_df, **gwb_eval_kwargs)
# Bare expression: the notebook renders the resulting metrics table.
gwb_results


In [None]:
# Persist the GWB metrics under ./targets, creating the folder on demand.
# targets_dir is reused by the XGBoost export cell further down.
targets_dir = Path('targets')
targets_dir.mkdir(exist_ok=True, parents=True)
gwb_csv = targets_dir.joinpath('baseline_gwb_by_year.csv')
gwb_results.to_csv(gwb_csv)
print(f'GWB baseline 结果已保存到: {gwb_csv}')


## 3. XGBoost baseline 按年评估

In [None]:
# Evaluate the XGBoost baseline per year with the same frame, warmup years
# and feature groups as the GWB baseline. random_state is pinned to 42,
# presumably so repeated runs give the same model.
xgb_eval_kwargs = {
    'label_col': label_col,
    'positive_label': positive_label,
    'warmup_years': warmup_years,
    'categorical_features': categorical_features,
    'numeric_features': numeric_features,
    'xgb_params': {'random_state': 42},
}
xgb_results = evaluate_xgb_baseline_by_year(airline_df, **xgb_eval_kwargs)
# Bare expression: the notebook renders the resulting metrics table.
xgb_results


In [None]:
# Persist the XGBoost metrics next to the GWB ones. Relies on targets_dir
# having been defined (and created) by the GWB export cell.
xgb_csv = targets_dir.joinpath('baseline_xgb_by_year.csv')
xgb_results.to_csv(xgb_csv)
print(f'XGBoost baseline 结果已保存到: {xgb_csv}')


## 4. 简单对比：GWB vs. XGBoost

In [None]:
# Side-by-side view of the two baselines: the same three metric columns from
# each result table, joined column-wise under a top-level 'GWB' / 'XGB' key.
metric_columns = ['AUC', 'F1', 'BAC']
comparison = pd.concat(
    [gwb_results[metric_columns], xgb_results[metric_columns]],
    axis=1,
    keys=['GWB', 'XGB'],
)
# Bare expression: the notebook renders the combined table.
comparison
