# 00 · S3WD Baseline（中文）

In [1]:
# ================================
# 0. 环境初始化与模块导入（中文）
# ================================
import os, sys, platform, importlib, inspect, warnings
import numpy as np
import pandas as pd
from sklearn.metrics import (f1_score, balanced_accuracy_score, precision_score, recall_score,
                             matthews_corrcoef, cohen_kappa_score, roc_auc_score)
# 1) 确保项目根目录在 sys.path[0]（notebooks/ 的上一级）
sys.path.insert(0, os.path.abspath('..'))

# 2) 导入自研库（s3wdlib）各模块，并强制重载确保拿到最新版实现
import s3wdlib.zh_utils as zh_utils
import s3wdlib.data_io as data_io
import s3wdlib.features as features
import s3wdlib.gwb as gwb
import s3wdlib.objective as objective
import s3wdlib.trainer as trainer

importlib.reload(zh_utils)
importlib.reload(data_io)
importlib.reload(features)
importlib.reload(gwb)
importlib.reload(objective)
importlib.reload(trainer)

# 3) 把常用符号直接引入（可读性更好）
from s3wdlib.zh_utils import set_chinese_font, fix_minus
from s3wdlib.data_io import load_table_auto, minmax_scale_fit_transform
from s3wdlib.features import rank_features_mi, make_levels
from s3wdlib.gwb import GWBProbEstimator
from s3wdlib.objective import S3WDParams
from s3wdlib.trainer import PSOParams, pso_learn_thresholds
from s3wdlib.config_loader import load_yaml_cfg, extract_vars, show_cfg

# 4) 可视化中文设置（宋体优先，负号正常）
set_chinese_font(); fix_minus()
print("【可视化字体】已设置为宋体优先（若系统缺失则自动回退）。")

# 5) 版本与路径自检（定位是否导入了正确文件）
print("【Python】", platform.python_version())
print("【Pandas/Numpy】", pd.__version__, np.__version__)
print("【模块路径】")
print("  zh_utils   ->", zh_utils.__file__)
print("  data_io    ->", data_io.__file__)
print("  features   ->", features.__file__)
print("  gwb        ->", gwb.__file__)
print("  objective  ->", objective.__file__)
print("  trainer    ->", trainer.__file__)
print("【函数签名】load_table_auto:", inspect.signature(data_io.load_table_auto))

# 6) 随机种子（方便复现；如需完全一致可统一设置）
np.random.seed(42)

# 7) 警告精简（可选）
warnings.filterwarnings("ignore")




【可视化字体】已设置为宋体优先（若系统缺失则自动回退）。
【Python】 3.11.5
【Pandas/Numpy】 2.0.3 1.26.4
【模块路径】
  zh_utils   -> e:\yan\组\三支决策\机器学习\C三支决策与不平衡数据集分类\S3WD实验\s3wdlib\zh_utils.py
  data_io    -> e:\yan\组\三支决策\机器学习\C三支决策与不平衡数据集分类\S3WD实验\s3wdlib\data_io.py
  features   -> e:\yan\组\三支决策\机器学习\C三支决策与不平衡数据集分类\S3WD实验\s3wdlib\features.py
  gwb        -> e:\yan\组\三支决策\机器学习\C三支决策与不平衡数据集分类\S3WD实验\s3wdlib\gwb.py
  objective  -> e:\yan\组\三支决策\机器学习\C三支决策与不平衡数据集分类\S3WD实验\s3wdlib\objective.py
  trainer    -> e:\yan\组\三支决策\机器学习\C三支决策与不平衡数据集分类\S3WD实验\s3wdlib\trainer.py
【函数签名】load_table_auto: (path: 'str', label_col: 'Optional[str | int]' = None, positive_label=1, continuous_label: 'Optional[str]' = None, threshold: 'Optional[float]' = None, threshold_op: 'str' = '>=') -> 'Tuple[pd.DataFrame, pd.Series]'


In [2]:
# === 配置接入（YAML → dataclass → 变量字典）===
# wine
# CFG = load_yaml_cfg("../configs/s3wd_wine.yaml")  # ← 如换配置文件，只改这里
# Heart
# CFG = load_yaml_cfg("../configs/s3wd_heart.yaml")
# Credit
# CFG = load_yaml_cfg("../configs/s3wd_credit.yaml")
# airline
CFG = load_yaml_cfg("../configs/s3wd_airline.yaml")


V   = extract_vars(CFG)
show_cfg(CFG)

# ✅ 兼容“连续列二值化”和“已有二值标签”两种配置
if "CONT_LABEL" in V:
    label_desc = f"{V['CONT_LABEL']}{V['CONT_OP']}{V['CONT_THRESH']}"
elif "LABEL_COL" in V:
    label_desc = f"{V['LABEL_COL']}=={V.get('POSITIVE_LABEL', 1)}"
else:
    label_desc = "(未检测到标签配置)"

gwb_desc = {
    "k": V["GWB_K"],
    "mode": V.get("GWB_mode"),
    "bandwidth": V.get("GWB_bandwidth"),
    "use_faiss": bool(V.get("GWB_use_faiss", True)),
    "faiss_gpu": bool(V.get("GWB_faiss_gpu", True)),
}
print("【参数就绪（来自 YAML）】", {
    "DATA_PATH": V["DATA_PATH"],
    "label": label_desc,
    "splits": f"test={V['TEST_SIZE']}, val={V['VAL_SIZE']}, seed={V['SEED']}",
    "gwb": gwb_desc,
    "pso": {"particles": V["PSO_particles"], "iters": V["PSO_iters"]}
})




【配置快照】
- DATA: {'data_dir': '../data', 'data_file': 'airlines_train_regression_1000000.arff', 'continuous_label': 'DepDelay', 'threshold': 15, 'threshold_op': '>', 'label_col': None, 'positive_label': None, 'test_size': 0.3, 'val_size': 0.3, 'random_state': 42}
- LEVEL: {'level_pcts': [0.6, 0.8, 1.0], 'ranker': 'mi'}
- KWB: {'k': 6, 'metric': 'euclidean', 'eps': 1e-06, 'use_faiss': True, 'faiss_gpu': True}
- GWB: {'k': 6, 'metric': 'euclidean', 'eps': 1e-06, 'mode': 'epanechnikov', 'bandwidth': 0.8, 'bandwidth_scale': 1.0, 'use_faiss': True, 'faiss_gpu': True}
- S3WD: {'c1': 0.37, 'c2': 0.63, 'xi_min': 0.1, 'theta_pos': 0.9, 'theta_neg': 0.1, 'sigma': 3.0, 'regret_mode': 'utility', 'penalty_large': 1000000.0, 'gamma_last': True, 'gap': 0.02}
- PSO: {'particles': 20, 'iters': 20, 'w_max': 0.9, 'w_min': 0.4, 'c1': 2.8, 'c2': 1.3, 'seed': 42, 'use_gpu': True}
【参数就绪（来自 YAML）】 {'DATA_PATH': '../data\\airlines_train_regression_1000000.arff', 'label': 'DepDelay>15', 'splits': 'test=0.3, val=0

In [3]:
# === 读取数据（兼容“连续列二值化 / 已有二值标签”两种配置）+ 切分 ===
from s3wdlib.data_io import load_table_auto

# 统一参数打包（不存在的键用 .get() 不报错）
kw = dict(
    path=V["DATA_PATH"],
    label_col=V.get("LABEL_COL"),
    positive_label=V.get("POSITIVE_LABEL"),
    continuous_label=V.get("CONT_LABEL"),
    threshold=V.get("CONT_THRESH"),
    threshold_op=V.get("CONT_OP"),
)

# 友好提示
if V.get("CONT_LABEL") is not None:
    print(f"【标签策略】连续列二值化：{V['CONT_LABEL']} {V['CONT_OP']} {V['CONT_THRESH']}")
elif V.get("LABEL_COL") is not None:
    print(f"【标签策略】已有标签列：{V['LABEL_COL']} == {V.get('POSITIVE_LABEL', 1)} 视为正类")
else:
    raise RuntimeError("未检测到标签配置（既无 CONT_* 也无 LABEL_COL）。请检查 YAML。")

# 读取数据（保持原始顺序）
X_all, y_all = load_table_auto(**kw)
print("【数据加载】X_all, y_all =", X_all.shape, y_all.shape)

# 按时间顺序划分 70/30（不打乱）
test_size = float(V["TEST_SIZE"])
cut = int((1.0 - test_size) * len(X_all))
cut = max(1, min(len(X_all) - 1, cut))
Xtr, Xte = X_all.iloc[:cut], X_all.iloc[cut:]
ytr, yte = y_all.iloc[:cut], y_all.iloc[cut:]
print("【划分方式】按时间顺序：train=%d, test=%d" % (len(Xtr), len(Xte)))


【标签策略】连续列二值化：DepDelay > 15
【数据加载完毕】样本数=1000000，特征数=9，正类比例=0.1559
【数据加载】X_all, y_all = (1000000, 9) (1000000,)
【划分方式】按时间顺序：train=700000, test=300000


In [4]:
# 1) 训练集内切出验证集（按时间顺序，仅用于阈值寻优）
val_size = float(V["VAL_SIZE"])
if 0 < val_size < 1:
    vcut = int((1.0 - val_size) * len(Xtr))
    vcut = max(1, min(len(Xtr) - 1, vcut))
    Xtr_sub, Xva = Xtr.iloc[:vcut], Xtr.iloc[vcut:]
    ytr_sub, yva = ytr.iloc[:vcut], ytr.iloc[vcut:]
else:
    raise ValueError("VAL_SIZE 需在 (0,1) 内以进行验证划分")

# 2) 归一化（仅在训练子集拟合）
Xtr2, Xva2, scaler = minmax_scale_fit_transform(Xtr_sub, Xva)
Xte2 = pd.DataFrame(scaler.transform(Xte), columns=Xte.columns)

# 3) 分层（训练子集上）
feat_rank, mi_vals = rank_features_mi(Xtr2, ytr_sub)
L1, L2, L3 = make_levels(feat_rank)
print(f"【分层复核】总特征={len(feat_rank)} | L1={len(L1)} L2={len(L2)} L3={len(L3)}")


【归一化】已对训练/测试集进行 MinMax 缩放到 [0,1]。
【分层复核】总特征=9 | L1=5 L2=7 L3=9


## GWB（Algorithm 1）训练与三层概率

In [5]:

gwb_kwargs = {
    "k": int(V["GWB_K"]),
    "mode": V.get("GWB_mode", "epanechnikov"),
    "bandwidth": V.get("GWB_bandwidth"),
    "bandwidth_scale": V.get("GWB_bandwidth_scale", 1.0),
    "use_faiss": bool(V.get("GWB_use_faiss", True)),
    "faiss_gpu": bool(V.get("GWB_faiss_gpu", True)),
}
gwb_kwargs = {k: v for k, v in gwb_kwargs.items() if v is not None}
gwb1 = GWBProbEstimator(**gwb_kwargs).fit(Xtr2[L1], ytr_sub)
gwb2 = GWBProbEstimator(**gwb_kwargs).fit(Xtr2[L2], ytr_sub)
gwb3 = GWBProbEstimator(**gwb_kwargs).fit(Xtr2[L3], ytr_sub)
p1_va = gwb1.predict_proba(Xva2[L1]); p2_va = gwb2.predict_proba(Xva2[L2]); p3_va = gwb3.predict_proba(Xva2[L3])
p1_te = gwb1.predict_proba(Xte2[L1]); p2_te = gwb2.predict_proba(Xte2[L2]); p3_te = gwb3.predict_proba(Xte2[L3])
print("【GWB 完成】验证/测试三层概率就绪。")


【GWB 完成】验证/测试三层概率就绪。


## 验证集 PSO 学阈值（信息增益−后悔值 + 单调序 + ξ）

In [6]:
# 5) 验证集 PSO 学阈值（信息增益−后悔值 + 单调序 + ξ）
s3 = S3WDParams(
    c1=V["S3_c1"], c2=V["S3_c2"], xi_min=V["S3_xi_min"],
    theta_pos=V["S3_theta_pos"], theta_neg=V["S3_theta_neg"],
    penalty_large=V["S3_penalty_large"],
    gamma_last=V.get("S3_gamma_last"),   # ← 用 gamma_last（True 或 0.5）
    gap=V.get("S3_gap", 0.02)
)
pso = PSOParams(
    particles=V["PSO_particles"], iters=V["PSO_iters"],
    w_max=V["PSO_w_max"], w_min=V["PSO_w_min"],
    c1=V["PSO_c1"], c2=V["PSO_c2"], seed=V["PSO_seed"], use_gpu=V["PSO_use_gpu"]
)
(best_th, best_fit, detail) = pso_learn_thresholds([p1_va, p2_va, p3_va], yva.values, s3, pso)

alphas, betas, gamma3 = best_th
print("【PSO 学到阈值（验证集）】", [f"α{i+1}={a:.4f}/β{i+1}={b:.4f}" for i,(a,b) in enumerate(zip(alphas,betas))], f"γ3={gamma3:.4f}")
print("【适应度/约束】", {"fit":round(best_fit,4), "pen_bnd":detail.get("pen_bnd",None), "pen_mono":detail.get("pen_mono",None)})


【PSO 学到阈值（验证集）】 ['α1=0.5958/β1=0.2658', 'α2=0.5958/β2=0.2876', 'α3=0.5958/β3=0.2876'] γ3=0.5000
【适应度/约束】 {'fit': -0.0065, 'pen_bnd': 0.0, 'pen_mono': 0.0}


## 测试集序贯三支决策 + 评估

In [7]:

# 6) 测试集序贯三支决策 + 评估
def _seq_predict_eval(p1, p2, p3, y_true, a1, b1, a2, b2, g3):
    POS1 = (p1 >= a1); NEG1 = (p1 <= b1); BND1 = (~POS1) & (~NEG1)
    p2s = p2[BND1]; POS2 = np.zeros_like(BND1, bool); NEG2 = np.zeros_like(BND1, bool)
    POS2[BND1] = (p2s >= a2); NEG2[BND1] = (p2s <= b2)
    BND2 = BND1 & (~POS2) & (~NEG2)
    p3s = p3[BND2]; POS3 = np.zeros_like(BND2, bool); NEG3 = np.zeros_like(BND2, bool)
    POS3[BND2] = (p3s >= g3); NEG3[BND2] = ~POS3[BND2]
    y_hat = np.full_like(y_true, -1, int)
    y_hat[POS1] = 1; y_hat[NEG1] = 0; y_hat[POS2] = 1; y_hat[NEG2] = 0; y_hat[POS3] = 1; y_hat[NEG3] = 0
    flow = {"L1": (int(POS1.sum()), int(BND1.sum()), int(NEG1.sum())),
            "L2": (int(POS2.sum()), int(BND2.sum()), int(NEG2.sum())),
            "L3": (int(POS3.sum()), int(NEG3.sum()))}
    return y_hat, flow

def _safe_round(val, ndigits=4):
    if val is None:
        return np.nan
    try:
        return round(float(val), ndigits)
    except Exception:
        return np.nan

y_hat, flow = _seq_predict_eval(
    p1_te, p2_te, p3_te, yte.values,
    float(alphas[0]), float(betas[0]),
    float(alphas[1]), float(betas[1]),
    float(gamma3)
)
mask = (y_hat >= 0)
yt, yp = yte[mask], y_hat[mask]

try:
    auc_val = roc_auc_score(yt, yp)
except Exception:
    auc_val = np.nan

metrics = {
    "指标-F1分数": _safe_round(f1_score(yt, yp)),
    "指标-平衡准确率": _safe_round(balanced_accuracy_score(yt, yp)),
    "指标-精确率": _safe_round(precision_score(yt, yp)),
    "指标-召回率": _safe_round(recall_score(yt, yp)),
    "指标-MCC相关系数": _safe_round(matthews_corrcoef(yt, yp)),
    "指标-Kappa一致性": _safe_round(cohen_kappa_score(yt, yp)),
    "指标-AUC曲线下面积": _safe_round(auc_val),
}
flow_stats = {
    "样本流-一级正类数量": flow['L1'][0],
    "样本流-一级待定数量": flow['L1'][1],
    "样本流-一级负类数量": flow['L1'][2],
    "样本流-二级正类数量": flow['L2'][0],
    "样本流-二级待定数量": flow['L2'][1],
    "样本流-二级负类数量": flow['L2'][2],
    "样本流-三级正类数量": flow['L3'][0],
    "样本流-三级负类数量": flow['L3'][1],
}
threshold_stats = {
    "阈值-一级正判阈α1": _safe_round(alphas[0]),
    "阈值-一级负判阈β1": _safe_round(betas[0]),
    "阈值-二级正判阈α2": _safe_round(alphas[1]),
    "阈值-二级负判阈β2": _safe_round(betas[1]),
    "阈值-三级正判阈γ3": _safe_round(gamma3),
}
penalty_stats = {
    "约束-PSO适应度": _safe_round(best_fit),
    "约束-边界惩罚": _safe_round(detail.get('pen_bnd')),
    "约束-单调性惩罚": _safe_round(detail.get('pen_mono')),
}

summary = pd.DataFrame([{**metrics, **flow_stats, **threshold_stats, **penalty_stats}])
print("【样本流转（测试集）】", flow)
print("【核心指标（测试集）】", metrics)
display(summary)


【样本流转（学到的阈值）】L1 POS/BND/NEG = 4781 162746 132473  | L2 POS/BND/NEG = 3311 42809 116626  | L3 POS/NEG = 3680 39129
【评估（测试集）】 {'F1': 0.1041, 'BAC': 0.5147, 'Prec': 0.2908, 'Rec': 0.0634, 'MCC': 0.0583, 'Kappa': 0.0424, 'AUC': 0.5147}


In [None]:
# 7) 导出结果表（含中文表头说明）
from pathlib import Path
targets_dir = Path('..') / 'targets'
targets_dir.mkdir(parents=True, exist_ok=True)
dataset_name = Path(V['DATA_PATH']).stem
output_path = targets_dir / f"{dataset_name}_gwb_70_30单次结果.xlsx"
summary.to_excel(output_path, index=False)
print(f"【结果已保存】{output_path}")
