## 基线：全局 XGBoost + 传统三支决策 (TWD)

In [4]:
import os,sys
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier

root_path = Path(os.path.abspath(os.path.join(os.getcwd(), '..')))
if str(root_path) not in sys.path:
    sys.path.append(str(root_path))

from bttwdlib.config_loader import load_yaml_cfg, show_cfg
from bttwdlib.data_loader import load_dataset
from bttwdlib.preprocessing import prepare_features_and_labels
from bttwdlib.metrics import (
    compute_binary_metrics,
    compute_s3_metrics,
    log_metrics,
    predict_binary_by_cost,
)
from bttwdlib.threshold_search import (
    search_thresholds_with_regret,
    compute_regret,
)
from bttwdlib.utils_logging import log_info


log_info(f"[基线-XGB+TWD] 项目根路径: {root_path}")

【INFO】【2025-11-29 21:40:53】[基线-XGB+TWD] 项目根路径: e:\yan\组\三支决策\机器学习\BT_TWD


In [5]:
# 读取 YAML 配置
cfg_name = "airlines_delay.yaml"  # 可切换为 "adult_bttwd.yaml" 或 "airlines_delay.yaml""bank_bttwd.yaml"
cfg_path = root_path / "configs" / cfg_name
cfg = load_yaml_cfg(str(cfg_path))
show_cfg(cfg)

【INFO】【2025-11-29 21:40:54】【配置加载】已读取 e:\yan\组\三支决策\机器学习\BT_TWD\configs\airlines_delay.yaml
【INFO】【2025-11-29 21:40:54】【配置-数据】数据集=airlines_delay_1m, k折=5, 目标列=DepDelay, 正类="1"
【INFO】【2025-11-29 21:40:54】【配置-BTTWD】阈值模式=None, 全局模型=xgb, 桶内模型=none, 后验估计器(兼容字段)=logreg
【INFO】【2025-11-29 21:40:54】【配置-基线】LogReg启用=False, RandomForest启用=False, KNN启用=False, XGBoost启用=True


In [7]:
# 数据加载与预处理（与 BT-TWD 主流程保持一致）
df_raw, target_col = load_dataset(cfg)
X, y, meta = prepare_features_and_labels(df_raw, cfg)
log_info(
    f"[基线-XGB+TWD] 数据加载完成，样本数={len(y)}，特征维度={X.shape[1]}，正类比例={y.mean():.4f}"
)

【INFO】【2025-11-29 21:42:15】【数据加载】ARFF 文件 ..\data\airline\airlines_train_regression_1000000.arff 已读取，含 1000000 条记录，10 列
【INFO】【2025-11-29 21:42:15】【目标变换】已按阈值 15.0 生成二分类标签列 label，正类取 > 15.0
【INFO】【2025-11-29 21:42:15】【数据集信息】名称=airlines_delay_1m，样本数=1000000，目标列=label，正类比例=15.59%
【INFO】【2025-11-29 21:42:15】【预处理】连续特征=6个，类别特征=3个
【INFO】【2025-11-29 21:42:21】【预处理】编码后维度=755
【INFO】【2025-11-29 21:42:22】[基线-XGB+TWD] 数据加载完成，样本数=1000000，特征维度=755，正类比例=0.1559


In [8]:
# 辅助函数定义

def predict_s3_by_thresholds(y_score: np.ndarray, alpha: float, beta: float) -> np.ndarray:
    """基于全局 alpha/beta 生成三支预测 (1=POS, 0=NEG, -1=BND)。"""
    return np.where(y_score >= alpha, 1, np.where(y_score <= beta, 0, -1))

def run_xgb_twd_fold(X: np.ndarray, y: np.ndarray, train_idx: np.ndarray, test_idx: np.ndarray, cfg: dict) -> dict:
    data_cfg = cfg.get("DATA", {})
    thresh_cfg = cfg.get("THRESHOLDS", {})
    metrics_cfg = cfg.get("METRICS", {})
    xgb_cfg = cfg.get("BTTWD", {}).get("global_xgb", {})

    random_state = data_cfg.get("random_state", 42)
    bcfg = cfg.get("BTTWD", {})
    val_ratio = bcfg.get("val_ratio", 0.2)

    costs = thresh_cfg.get(
        "costs",
        {"C_TP": 0.0, "C_TN": 0.0, "C_FP": 2.0, "C_FN": 2.0, "C_BP": 1.5, "C_BN": 1.5},
    )
    alpha_grid = thresh_cfg.get("alpha_grid", np.linspace(0.1, 0.9, 9))
    beta_grid = thresh_cfg.get("beta_grid", np.linspace(0.0, 0.5, 6))
    gap_min = thresh_cfg.get("gap_min", 0.0)

    X_train_full, X_test = X[train_idx], X[test_idx]
    y_train_full, y_test = y[train_idx], y[test_idx]

    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full,
        y_train_full,
        test_size=val_ratio,
        stratify=y_train_full,
        random_state=random_state,
    )

    model = XGBClassifier(
        n_estimators=xgb_cfg.get("n_estimators", 300),
        max_depth=xgb_cfg.get("max_depth", 4),
        learning_rate=xgb_cfg.get("learning_rate", 0.1),
        subsample=xgb_cfg.get("subsample", 0.8),
        colsample_bytree=xgb_cfg.get("colsample_bytree", 0.8),
        reg_lambda=xgb_cfg.get("reg_lambda", 1.0),
        random_state=xgb_cfg.get("random_state", 42),
        n_jobs=xgb_cfg.get("n_jobs", -1),
        eval_metric="logloss",
    )
    model.fit(X_train, y_train)
    log_info("[基线-XGB+TWD] 全局 XGB 模型训练完成。")

    y_proba_val = model.predict_proba(X_val)[:, 1]
    y_proba_test = model.predict_proba(X_test)[:, 1]

    best_alpha, best_beta, best_stats = search_thresholds_with_regret(
        prob=y_proba_val,
        y_true=y_val,
        alpha_grid=alpha_grid,
        beta_grid=beta_grid,
        costs=costs,
        gap_min=gap_min,
    )
    log_info(
    f"[基线-XGB+TWD] 全局阈值搜索完成: "
    f"alpha={best_alpha:.3f}, beta={best_beta:.3f}, "
    f"Regret={best_stats.get('Regret', best_stats.get('regret', float('nan'))):.4f}"
    )

    y_pred_s3 = predict_s3_by_thresholds(y_proba_test, best_alpha, best_beta)
    s3_metrics = compute_s3_metrics(y_true=y_test, y_s3_pred=y_pred_s3, y_score=y_proba_test, cfg_metrics=metrics_cfg, costs=costs)
    log_metrics("[基线-XGB+TWD] 测试集指标: ", s3_metrics)

    return {
        "metrics": s3_metrics,
        "alpha": best_alpha,
        "beta": best_beta,
        "val_stats": best_stats,
    }

In [9]:
# K 折训练 + 阈值搜索 + 指标计算
fold_metrics = []

data_cfg = cfg.get("DATA", {})
n_splits = data_cfg.get("n_splits", 5)
shuffle = data_cfg.get("shuffle", True)
random_state = data_cfg.get("random_state", 42)

skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    log_info(f"[基线-XGB+TWD] 开始第 {fold_id}/{n_splits} 折")
    result = run_xgb_twd_fold(X, y, train_idx, test_idx, cfg)
    metrics = result["metrics"]
    metrics["alpha"] = result["alpha"]
    metrics["beta"] = result["beta"]
    metrics["fold"] = fold_id
    fold_metrics.append(metrics)

fold_metrics_df = pd.DataFrame(fold_metrics)
fold_metrics_df

【INFO】【2025-11-29 21:42:33】[基线-XGB+TWD] 开始第 1/5 折
【INFO】【2025-11-29 21:44:00】[基线-XGB+TWD] 全局 XGB 模型训练完成。
【INFO】【2025-11-29 21:44:08】[基线-XGB+TWD] 全局阈值搜索完成: alpha=0.300, beta=0.200, Regret=0.6794
【INFO】【2025-11-29 21:44:09】[基线-XGB+TWD] 测试集指标: Precision=0.247, Recall=0.617, F1=0.353, BAC=0.635, AUC=0.685, MCC=0.201, Kappa=0.168, BND_ratio=0.199, POS_Coverage=0.061, Regret=0.679
【INFO】【2025-11-29 21:44:09】[基线-XGB+TWD] 开始第 2/5 折
【INFO】【2025-11-29 21:46:05】[基线-XGB+TWD] 全局 XGB 模型训练完成。
【INFO】【2025-11-29 21:46:12】[基线-XGB+TWD] 全局阈值搜索完成: alpha=0.300, beta=0.200, Regret=0.6807
【INFO】【2025-11-29 21:46:13】[基线-XGB+TWD] 测试集指标: Precision=0.246, Recall=0.619, F1=0.352, BAC=0.634, AUC=0.687, MCC=0.199, Kappa=0.165, BND_ratio=0.206, POS_Coverage=0.057, Regret=0.681
【INFO】【2025-11-29 21:46:14】[基线-XGB+TWD] 开始第 3/5 折
【INFO】【2025-11-29 21:47:55】[基线-XGB+TWD] 全局 XGB 模型训练完成。
【INFO】【2025-11-29 21:48:03】[基线-XGB+TWD] 全局阈值搜索完成: alpha=0.300, beta=0.200, Regret=0.6847
【INFO】【2025-11-29 21:48:03】[基线-XGB+TWD] 测试集指标: Pre

Unnamed: 0,Precision,Recall,F1,BAC,AUC,MCC,Kappa,BND_ratio,POS_Coverage,Regret,alpha,beta,fold
0,0.247479,0.616515,0.353184,0.635191,0.684743,0.201235,0.168156,0.19866,0.06097,0.679305,0.3,0.2,1
1,0.245556,0.619017,0.351626,0.63393,0.686586,0.198935,0.165343,0.205695,0.057235,0.681458,0.3,0.2,2
2,0.243599,0.618953,0.349605,0.632047,0.683254,0.195868,0.162202,0.208945,0.05876,0.68261,0.3,0.2,3
3,0.245281,0.619915,0.351489,0.633864,0.685189,0.198747,0.164992,0.20686,0.05903,0.680967,0.3,0.2,4
4,0.243286,0.612344,0.348223,0.630339,0.681793,0.193654,0.161068,0.20594,0.05881,0.681913,0.3,0.2,5


In [10]:
# 汇总 K 折均值与标准差，并保存为 CSV
metric_names = [
    "Precision",
    "Recall",
    "F1",
    "BAC",
    "AUC",
    "MCC",
    "Kappa",
    "BND_ratio",
    "POS_Coverage",
    "Regret",
]

summary = {"model": "Baseline_XGB_TWD"}
for name in metric_names:
    values = fold_metrics_df[name] if name in fold_metrics_df else []
    summary[f"{name}_mean"] = float(np.mean(values)) if len(values) else np.nan
    summary[f"{name}_std"] = float(np.std(values)) if len(values) else np.nan

summary_df = pd.DataFrame([summary])
summary_df

results_dir = root_path / "results"
results_dir.mkdir(parents=True, exist_ok=True)
out_path = results_dir / "metrics_kfold_summary_xgb_twd.csv"
summary_df.to_csv(out_path, index=False)
log_info(f"[基线-XGB+TWD] K折指标汇总已保存到: {out_path}")

【INFO】【2025-11-29 21:52:17】[基线-XGB+TWD] K折指标汇总已保存到: e:\yan\组\三支决策\机器学习\BT_TWD\results\metrics_kfold_summary_xgb_twd.csv


In [11]:
# （可选）单次 holdout 流程示例
use_holdout = True  # 可切换为 False 以跳过 holdout 流程
if use_holdout:
    data_cfg = cfg.get("DATA", {})
    test_ratio = data_cfg.get("test_size", data_cfg.get("test_ratio", 0.2))
    random_state = data_cfg.get("random_state", 42)

    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X,
        y,
        test_size=test_ratio,
        stratify=y,
        random_state=random_state,
    )

    bcfg = cfg.get("BTTWD", {})
    val_ratio = bcfg.get("val_ratio", 0.2)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full,
        y_train_full,
        test_size=val_ratio,
        stratify=y_train_full,
        random_state=random_state,
    )

    result = run_xgb_twd_fold(
        X=np.vstack([X_train, X_val, X_test]),
        y=np.hstack([y_train, y_val, y_test]),
        train_idx=np.arange(len(y_train) + len(y_val)),
        test_idx=np.arange(len(y_train) + len(y_val), len(y_train) + len(y_val) + len(y_test)),
        cfg=cfg,
    )
    holdout_metrics = result["metrics"]
    log_metrics("[基线-XGB+TWD] Holdout 指标: ", holdout_metrics)

【INFO】【2025-11-29 21:54:59】[基线-XGB+TWD] 全局 XGB 模型训练完成。
【INFO】【2025-11-29 21:55:07】[基线-XGB+TWD] 全局阈值搜索完成: alpha=0.300, beta=0.200, Regret=0.6804
【INFO】【2025-11-29 21:55:07】[基线-XGB+TWD] 测试集指标: Precision=0.245, Recall=0.616, F1=0.350, BAC=0.633, AUC=0.684, MCC=0.197, Kappa=0.164, BND_ratio=0.207, POS_Coverage=0.059, Regret=0.681
【INFO】【2025-11-29 21:55:08】[基线-XGB+TWD] Holdout 指标: Precision=0.245, Recall=0.616, F1=0.350, BAC=0.633, AUC=0.684, MCC=0.197, Kappa=0.164, BND_ratio=0.207, POS_Coverage=0.059, Regret=0.681
