# 01 · S3WD-GWB 动态循环实验（Airline）


In [None]:
# Step 0 · 环境初始化与依赖检查
from __future__ import annotations

import json
import math
import os
import sys
import platform
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

try:
    from IPython.display import display
except Exception:
    # IPython could not be imported: provide a stand-in with the same
    # one-argument call shape so later cells can call display(...) unchanged.
    def display(obj):
        """Print ``obj`` to stdout (plain-text fallback for IPython's display)."""
        print(obj)

# Keep numeric output compact and let wide frames show up to 50 columns.
pd.set_option('display.max_columns', 50)
np.set_printoptions(suppress=True, precision=4)

# The project root is taken to be the parent of the current working
# directory; it is put first on sys.path (once) so project packages import.
PROJECT_ROOT = Path.cwd().resolve().parent
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

# Snapshot of the runtime, echoed below for reproducibility.
RUNTIME_INFO = dict(
    python=sys.version.split()[0],
    platform=platform.platform(),
    project_root=_root_entry,
)
print('【步骤0摘要】已初始化运行环境：', RUNTIME_INFO)



In [None]:
# Step 1 · 载入配置并展开变量
from s3wdlib.config_loader import load_yaml_cfg, extract_vars, show_cfg

# The experiment configuration is read from <PROJECT_ROOT>/configs.
CONFIG_PATH = PROJECT_ROOT / 'configs' / 's3wd_airline_dynamic.yaml'
CFG = load_yaml_cfg(str(CONFIG_PATH))
# V is the mapping that later cells read settings from (V['KEY'] / V.get('KEY')).
# NOTE(review): how extract_vars derives V from CFG is defined in
# s3wdlib.config_loader and is not visible in this notebook.
V = extract_vars(CFG)
show_cfg(CFG)
print('【步骤1摘要】配置文件加载完成，关键键数=', len(V))



In [None]:
# Step 2 · 加载数据集并准备标签
from s3wdlib.data_io import load_table_auto
from sklearn.datasets import make_classification

# Resolve the dataset location: an absolute DATA_PATH is used as-is, a
# relative one is anchored at the directory that holds the config file.
raw_data_path = Path(V['DATA_PATH'])
data_path = (
    raw_data_path
    if raw_data_path.is_absolute()
    else (CONFIG_PATH.parent / raw_data_path).resolve()
)

if not data_path.exists():
    # No file at the resolved path: generate a synthetic binary
    # classification problem so the remaining cells can still run.
    n_samples = 12000
    n_features = 32
    X_arr, y_arr = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=18,
        n_redundant=6,
        n_repeated=0,
        n_clusters_per_class=2,
        weights=[0.6, 0.4],
        class_sep=1.2,
        random_state=int(V.get('SEED', 42)),
    )
    X_df = pd.DataFrame(X_arr, columns=[f'feat_{i:02d}' for i in range(n_features)])
    y_sr = pd.Series(y_arr, name='label')
    data_source = '合成数据 (make_classification) 用于演示'
    print(f'⚠️ 未找到航空数据，自动生成 {n_samples} 条合成样本。')
else:
    # Real data: label handling is delegated to load_table_auto, driven by
    # the label-related settings found in the configuration.
    X_df, y_sr = load_table_auto(
        str(data_path),
        label_col=V.get('LABEL_COL'),
        positive_label=V.get('POSITIVE_LABEL', 1),
        continuous_label=V.get('CONT_LABEL'),
        threshold=V.get('CONT_THRESH'),
        threshold_op=V.get('CONT_OP', '>='),
    )
    data_source = f'航空延误真实数据: {data_path.name}'

print('【步骤2摘要】数据来源：', data_source, '；样本形状=', X_df.shape)



In [None]:
# Step 3 · 按配置划分训练/验证/测试集
from sklearn.model_selection import train_test_split

test_size = float(V['TEST_SIZE'])
val_size = float(V['VAL_SIZE'])
seed = int(V['SEED'])

# A non-positive (or NaN) validation share used to fall through to the
# positional fallback below with split_idx >= len(X_train_full), silently
# producing an EMPTY validation set that only failed later during threshold
# learning. Reject it up front. "not > 0" is used so NaN is caught as well.
if not val_size > 0:
    raise ValueError(f'VAL_SIZE 必须为正数，当前值={val_size!r}')

# First cut: hold out the test set, stratified on the label.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_df, y_sr, test_size=test_size, stratify=y_sr, random_state=seed
)

if val_size < 1:
    # Regular case: VAL_SIZE is a fraction of the remaining training data.
    X_train_sub, X_val, y_train_sub, y_val = train_test_split(
        X_train_full, y_train_full, test_size=val_size, stratify=y_train_full, random_state=seed
    )
else:
    # VAL_SIZE >= 1 is not a usable fraction: cap the validation share at 0.5
    # and cut positionally (the rows were already shuffled by the first split).
    split_idx = int((1.0 - min(val_size, 0.5)) * len(X_train_full))
    X_train_sub, X_val = X_train_full.iloc[:split_idx].copy(), X_train_full.iloc[split_idx:].copy()
    y_train_sub, y_val = y_train_full.iloc[:split_idx].copy(), y_train_full.iloc[split_idx:].copy()

print('【步骤3摘要】训练/验证/测试规模=', len(X_train_sub), len(X_val), len(X_test))



In [None]:
# Step 4 · 归一化与特征分层
from sklearn.preprocessing import MinMaxScaler
from s3wdlib.features import rank_features_mi, make_levels

# Fit the scaler on the training subset only, then apply it to every split.
scaler = MinMaxScaler()
scaler.fit(X_train_sub)


def _scaled_like(frame: pd.DataFrame) -> pd.DataFrame:
    """Return ``frame`` scaled by ``scaler``, keeping its index and columns.

    Keeping the original index matters: the label Series (y_train_sub, y_val,
    y_test) still carry the shuffled row labels produced by the split, so a
    frame rebuilt with a fresh RangeIndex would no longer be label-aligned
    with them.
    """
    return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)


Xtr2 = _scaled_like(X_train_sub)
Xva2 = _scaled_like(X_val)
Xte2 = _scaled_like(X_test)
Xtr_full2 = _scaled_like(X_train_full)

# Rank features on the scaled training subset and cut the ranking into three
# feature levels; LEVEL_PCTS defaults to [0.6, 0.8, 1.0] when not configured.
feat_rank, feat_scores = rank_features_mi(Xtr2, y_train_sub)
L1, L2, L3 = make_levels(feat_rank, V.get('LEVEL_PCTS', [0.6, 0.8, 1.0]))

print('【步骤4摘要】特征按互信息排序完成，层级规模=', len(L1), len(L2), len(L3))



In [None]:
# Step 5 · 拟合 GWB 三层概率估计器
from s3wdlib.gwb import GWBProbEstimator

# Estimator options read from the config. Options whose value is None are
# not passed to the GWBProbEstimator constructor at all.
_gwb_options = {
    'k': int(V['GWB_K']),
    'mode': V.get('GWB_mode', 'epanechnikov'),
    'bandwidth': V.get('GWB_bandwidth'),
    'bandwidth_scale': V.get('GWB_bandwidth_scale', 1.0),
    'use_faiss': bool(V.get('GWB_use_faiss', True)),
    'faiss_gpu': bool(V.get('GWB_faiss_gpu', True)),
}
gwb_kwargs = {name: value for name, value in _gwb_options.items() if value is not None}

# One estimator per feature level, each fitted on that level's columns of
# the scaled training subset.
gwb_L1, gwb_L2, gwb_L3 = [
    GWBProbEstimator(**gwb_kwargs).fit(Xtr2[cols], y_train_sub.values)
    for cols in (L1, L2, L3)
]
_level_models = ((gwb_L1, L1), (gwb_L2, L2), (gwb_L3, L3))

# Per-level probabilities on the validation split, then on the test split.
p1_val, p2_val, p3_val = [model.predict_proba(Xva2[cols]) for model, cols in _level_models]
p1_test, p2_test, p3_test = [model.predict_proba(Xte2[cols]) for model, cols in _level_models]

print('【步骤5摘要】验证/测试概率已生成，示例=', float(np.mean(p1_val[:3])))



In [None]:
# Step 6 · PSO 学习静态阈值
from s3wdlib.objective import S3WDParams
from s3wdlib.trainer import PSOParams, pso_learn_thresholds

# Objective settings. Keys read with V[...] are mandatory in the config;
# those read with V.get(...) fall back to the defaults written here.
_s3_settings = {
    'c1': V['S3_c1'],
    'c2': V['S3_c2'],
    'xi_min': V['S3_xi_min'],
    'theta_pos': V['S3_theta_pos'],
    'theta_neg': V['S3_theta_neg'],
    'sigma': V.get('S3_sigma', 3.0),
    'regret_mode': V.get('S3_regret_mode', 'utility'),
    'penalty_large': V['S3_penalty_large'],
    'gamma_last': V.get('S3_gamma_last', True),
    'gap': V.get('S3_gap', 0.02),
}
s3_params = S3WDParams(**_s3_settings)

# PSO settings: every PSO_<name> key below is mandatory; only use_gpu has a default.
_pso_required = ('particles', 'iters', 'w_max', 'w_min', 'c1', 'c2', 'seed')
pso_params = PSOParams(
    **{name: V[f'PSO_{name}'] for name in _pso_required},
    use_gpu=V.get('PSO_use_gpu', True),
)

# Learn the static thresholds from the validation-split probabilities.
static_thresholds, static_fitness, static_detail = pso_learn_thresholds(
    [p1_val, p2_val, p3_val], y_val.values, s3_params, pso_params,
)
alpha_static, beta_static, gamma_static = static_thresholds


def _fmt_levels(symbol, values):
    """Render ``values`` as '<symbol>1=..., <symbol>2=...' with 4 decimals."""
    return ', '.join(f'{symbol}{level}={value:.4f}' for level, value in enumerate(values, start=1))


alpha_msg = _fmt_levels('α', alpha_static)
beta_msg = _fmt_levels('β', beta_static)
print(f'【步骤6摘要】静态阈值：{alpha_msg}；{beta_msg}；γ3={float(gamma_static):.4f}')



In [None]:
# Step 7 · 静态基线评估与可视化准备
import matplotlib.pyplot as plt
from s3wdlib.evalx import classification_metrics, layer_stats
from s3wdlib.viz import probability_histogram, threshold_trajectory, drift_timeline
from s3wdlib.drift import DriftEvent

def sequential_predict(prob1, prob2, prob3, y_true, a1, b1, a2, b2, g3):
    """Run the three-level sequential decision and report the per-level flow.

    Level 1 accepts samples with ``prob1 >= a1`` and rejects those with
    ``prob1 <= b1``; the rest are deferred to level 2, which applies
    ``a2``/``b2`` to ``prob2`` in the same way. Samples still undecided reach
    level 3, where ``prob3 >= g3`` means accept and everything else reject.

    Args:
        prob1, prob2, prob3: per-level scores, flattened to 1-D float arrays.
        y_true: labels; only used to size/type the prediction array.
        a1, b1: accept / reject thresholds of level 1.
        a2, b2: accept / reject thresholds of level 2.
        g3: single cut-off of level 3.

    Returns:
        ``(y_hat, flow)`` where ``y_hat`` holds 1 (accepted), 0 (rejected) or
        -1 (never assigned), and ``flow`` maps 'L1'/'L2' to
        ``(accepted, deferred, rejected)`` counts and 'L3' to
        ``(accepted, rejected)`` counts.
    """
    score1, score2, score3 = (
        np.asarray(p, dtype=float).ravel() for p in (prob1, prob2, prob3)
    )
    labels = np.asarray(y_true, dtype=int).ravel()

    # Level 1: every sample is examined.
    accept1 = score1 >= a1
    reject1 = score1 <= b1
    defer1 = ~(accept1 | reject1)

    # Level 2: only samples deferred by level 1 are examined.
    accept2 = np.zeros_like(accept1, bool)
    reject2 = np.zeros_like(accept1, bool)
    if defer1.any():
        deferred_scores = score2[defer1]
        accept2[defer1] = deferred_scores >= a2
        reject2[defer1] = deferred_scores <= b2
    defer2 = defer1 & ~(accept2 | reject2)

    # Level 3: forced binary decision for whatever is still undecided.
    accept3 = np.zeros_like(accept1, bool)
    reject3 = np.zeros_like(accept1, bool)
    if defer2.any():
        accept3[defer2] = score3[defer2] >= g3
        reject3[defer2] = ~accept3[defer2]

    # Rejections are written last, so they win if a sample is in both masks.
    y_hat = np.full_like(labels, -1, int)
    y_hat[accept1 | accept2 | accept3] = 1
    y_hat[reject1 | reject2 | reject3] = 0

    def _count(mask):
        return int(mask.sum())

    flow = {
        'L1': (_count(accept1), _count(defer1), _count(reject1)),
        'L2': (_count(accept2), _count(defer2), _count(reject2)),
        'L3': (_count(accept3), _count(reject3)),
    }
    return y_hat, flow

# Static thresholds in the positional order sequential_predict expects:
# (alpha1, beta1, alpha2, beta2, gamma3).
_static_cutoffs = (
    float(alpha_static[0]), float(beta_static[0]),
    float(alpha_static[1]), float(beta_static[1]),
    float(gamma_static),
)
y_pred_static, flow_static = sequential_predict(
    p1_test, p2_test, p3_test, y_test.values, *_static_cutoffs
)

# Score only samples that received a decision (-1 marks "unassigned").
mask_static = y_pred_static >= 0
metrics_static = classification_metrics(y_test.values[mask_static], y_pred_static[mask_static])

# One-row summary table; AUC is optional in the metrics dict.
_metric_row = {
    name: metrics_static[name]
    for name in ('F1', 'BAC', 'Prec', 'Rec', 'MCC', 'Kappa')
}
_metric_row['AUC'] = metrics_static.get('AUC', np.nan)
summary_static = pd.DataFrame([_metric_row])

static_results = dict(
    predictions=y_pred_static,
    flow=flow_static,
    metrics=summary_static,
    probabilities=dict(L1=p1_test, L2=p2_test, L3=p3_test),
)

print('【步骤7摘要】静态基线 F1={:.4f}, BAC={:.4f}'.format(summary_static['F1'][0], summary_static['BAC'][0]))



In [None]:
# Step 8 · 构建动态流程工厂函数
from s3wdlib.dyn_threshold import adapt_thresholds_windowed_pso, adapt_thresholds_rule_based
from s3wdlib.incremental import PosteriorUpdater, latest_estimator_for_flow
from s3wdlib.drift import DriftDetector

def make_updater(feature_names: List[str]) -> PosteriorUpdater:
    """Create a fresh PosteriorUpdater configured for this experiment.

    Args:
        feature_names: columns of the level the updater is meant for. The
            body does not read this argument; it is kept for the call sites.

    Returns:
        A new PosteriorUpdater whose estimator factory builds a
        GWBProbEstimator from the notebook-level ``gwb_kwargs``.
    """
    def _new_estimator():
        # Looks up gwb_kwargs at call time, not at updater creation time.
        return GWBProbEstimator(**gwb_kwargs)

    updater_config = dict(
        estimator_factory=_new_estimator,
        buffer_size=min(8192, len(X_train_full)),  # cap the cache so it fits a 6 GB GPU
        cache_strategy='sliding',
        rebuild_interval=4096,  # wider rebuild interval to rebuild less often
        min_rebuild_interval=1024,
        drift_shrink=0.6,
        random_state=seed,
    )
    return PosteriorUpdater(**updater_config)

def init_dynamic_components():
    """Build the per-level updaters and the drift detector for one stream run.

    Each of the three updaters is reset and then fed the scaled training
    subset (restricted to its level's columns) together with the training
    labels. A new KSWIN-configured DriftDetector is created alongside.

    Returns:
        ``(up1, up2, up3, det)``: the updaters for levels 1-3 and the detector.
    """
    level_columns = (L1, L2, L3)
    updaters = [make_updater(cols) for cols in level_columns]
    for updater, cols in zip(updaters, level_columns):
        updater.reset()
        updater.update(Xtr2[cols].to_numpy(), y_train_sub.values)
    det = DriftDetector(
        method='kswin',
        window_size=160,
        stat_size=48,
        significance=0.01,
        cooldown=120,
    )
    up1, up2, up3 = updaters
    return up1, up2, up3, det

print('【步骤8摘要】动态组件工厂函数就绪，可按需重置。')



In [None]:
# Step 9 · 封装动态循环实验函数
from dataclasses import asdict

def _threshold_row(step, alphas, betas, gamma):
    """Build one threshold-history record from the level-1/2 pairs and gamma."""
    return {
        'step': step,
        'alpha1': float(alphas[0]),
        'beta1': float(betas[0]),
        'alpha2': float(alphas[1]),
        'beta2': float(betas[1]),
        'gamma3': float(gamma),
    }


def run_streaming(enable_dynamic: bool = True, window_size: int = 384):
    """Evaluate the three-level classifier on the test split.

    Static mode (``enable_dynamic=False``) applies the static thresholds to
    the precomputed test probabilities in a single pass.

    Dynamic mode walks the test split in consecutive windows using a
    test-then-train protocol:

    1. score the window with the latest estimators (falling back to the
       offline-fitted ones when an updater has none);
    2. classify it with thresholds that were fixed BEFORE the window's labels
       were seen (the static thresholds for the first window, afterwards the
       ones adapted on the previous window);
    3. only then use the window's labels to adapt the thresholds for the next
       window, feed the drift detector and update the estimators.

    Previously the thresholds were fitted on a window's labels and then used
    to classify that very window, which leaks labels into the evaluation.

    Args:
        enable_dynamic: False for the static pass, True for the stream loop.
        window_size: rows per window in dynamic mode; values below 64 are
            raised to 64.

    Returns:
        dict with keys 'mode', 'y_pred', 'flows', 'metrics',
        'threshold_history', 'probabilities', 'drift_events' and, in dynamic
        mode, 'chunk_metrics'. In dynamic mode, row k of 'threshold_history'
        holds the thresholds fitted on window k (applied to window k+1)
        together with their 'fitness' and 'bnd_ratio'.
    """
    if not enable_dynamic:
        y_hat, flow = sequential_predict(
            p1_test, p2_test, p3_test, y_test.values,
            float(alpha_static[0]), float(beta_static[0]),
            float(alpha_static[1]), float(beta_static[1]),
            float(gamma_static),
        )
        return {
            'mode': 'static',
            'y_pred': y_hat,
            'flows': [flow],
            'metrics': classification_metrics(y_test.values, y_hat),
            'threshold_history': [_threshold_row(0, alpha_static, beta_static, gamma_static)],
            'probabilities': {'L1': p1_test, 'L2': p2_test, 'L3': p3_test},
            'drift_events': [],
        }

    window_size = max(64, int(window_size))
    up1, up2, up3, det = init_dynamic_components()
    history_state = {}
    threshold_history = []
    flows = []
    drift_events = []
    preds_all: List[np.ndarray] = []
    truths_all: List[np.ndarray] = []
    prob_collect_L1: List[np.ndarray] = []
    prob_collect_L2: List[np.ndarray] = []
    prob_collect_L3: List[np.ndarray] = []
    metrics_chunks: List[Dict[str, float]] = []

    # Thresholds in force for the next window to be classified. They start at
    # the static baseline because no stream labels have been observed yet.
    baseline_gamma = float(gamma_static)
    live_alpha = np.asarray(alpha_static, dtype=float)
    live_beta = np.asarray(beta_static, dtype=float)
    live_gamma = baseline_gamma

    total = len(Xte2)
    # The context manager closes the progress bar even if an iteration raises.
    with tqdm(range(0, total, window_size), desc='动态流推理', leave=False) as progress:
        for start in progress:
            end = min(start + window_size, total)
            X_batch = Xte2.iloc[start:end]
            y_batch = y_test.iloc[start:end]
            if X_batch.empty:
                continue

            # Score the window; use the offline estimator for a level whose
            # updater has no estimator to offer.
            est1 = latest_estimator_for_flow(up1)
            est2 = latest_estimator_for_flow(up2)
            est3 = latest_estimator_for_flow(up3)
            prob1 = (est1 if est1 is not None else gwb_L1).predict_proba(X_batch[L1])
            prob2 = (est2 if est2 is not None else gwb_L2).predict_proba(X_batch[L2])
            prob3 = (est3 if est3 is not None else gwb_L3).predict_proba(X_batch[L3])
            prob_collect_L1.append(prob1)
            prob_collect_L2.append(prob2)
            prob_collect_L3.append(prob3)

            # TEST: classify with thresholds fixed before this window's labels
            # are used (sequential_predict reads y only for the output shape).
            y_hat_chunk, flow_chunk = sequential_predict(
                prob1, prob2, prob3, y_batch.values,
                float(live_alpha[0]), float(live_beta[0]),
                float(live_alpha[1]), float(live_beta[1]),
                float(live_gamma),
            )
            preds_all.append(y_hat_chunk)
            truths_all.append(y_batch.values)
            flows.append(flow_chunk)
            metrics_chunks.append(classification_metrics(y_batch.values, y_hat_chunk))

            # TRAIN: labels are now revealed; adapt thresholds for the NEXT window.
            adapt_result = adapt_thresholds_windowed_pso(
                [prob1, prob2, prob3],
                y_batch.values,
                s3_params,
                keep_gap=s3_params.gap,
                history=history_state,
                window_size=len(X_batch),
                seed=seed + start,
                ema_alpha=0.6,
                median_window=3,
                fallback_rule=True,
            )
            history_state = adapt_result.history
            cur_alpha = adapt_result.alphas
            cur_beta = adapt_result.betas
            cur_gamma = adapt_result.gamma if adapt_result.gamma is not None else baseline_gamma

            # The drift detector is fed one value per window: the mean level-1 score.
            event = det.update(float(np.mean(prob1)), index=end)
            if event is not None:
                drift_events.append(event)

            threshold_history.append({
                **_threshold_row(len(threshold_history), cur_alpha, cur_beta, cur_gamma),
                'fitness': float(adapt_result.fitness),
                'bnd_ratio': float(adapt_result.details.get('bnd_ratio', np.nan)),
            })

            progress.set_postfix({
                'chunk': f'{end}/{total}',
                'α1': f'{cur_alpha[0]:.3f}',
                'β1': f'{cur_beta[0]:.3f}',
                'drifts': len(drift_events),
            })

            up1.update(X_batch[L1].to_numpy(), y_batch.values, drift_event=event)
            up2.update(X_batch[L2].to_numpy(), y_batch.values, drift_event=event)
            up3.update(X_batch[L3].to_numpy(), y_batch.values, drift_event=event)

            # Freshly adapted thresholds take effect from the next window on.
            live_alpha, live_beta, live_gamma = cur_alpha, cur_beta, cur_gamma

    y_pred_all = np.concatenate(preds_all)
    y_true_all = np.concatenate(truths_all)
    metrics_total = classification_metrics(y_true_all, y_pred_all)

    return {
        'mode': 'dynamic',
        'y_pred': y_pred_all,
        'flows': flows,
        'metrics': metrics_total,
        'threshold_history': threshold_history,
        'probabilities': {
            'L1': np.concatenate(prob_collect_L1),
            'L2': np.concatenate(prob_collect_L2),
            'L3': np.concatenate(prob_collect_L3),
        },
        'drift_events': drift_events,
        'chunk_metrics': metrics_chunks,
    }

print('【步骤9摘要】动态循环函数已定义，可复用 run_streaming(enable_dynamic=...) 调用。')



In [None]:
# Step 10 · 对比实验：静态流与动态流
# Run both modes over the same test split.
stream_static = run_streaming(enable_dynamic=False)
stream_dynamic = run_streaming(enable_dynamic=True, window_size=384)
static_f1, dynamic_f1 = (
    float(run['metrics']['F1']) for run in (stream_static, stream_dynamic)
)

# Sanity check: the static pass through run_streaming must reproduce the F1
# stored in static_results.
baseline_f1 = float(static_results['metrics']['F1'][0])
_f1_consistent = np.isclose(static_f1, baseline_f1, atol=1e-6)
if not _f1_consistent:
    raise AssertionError('静态 streaming 结果与基线不一致')

# Side-by-side metrics table, one row per mode.
comparison_df = pd.DataFrame([
    {'模式': label, **run['metrics']}
    for label, run in (('静态基线', stream_static), ('动态循环', stream_dynamic))
])
display(comparison_df)
print('【步骤10摘要】静态 F1={:.4f}，动态 F1={:.4f}（一致性校验通过）'.format(static_f1, dynamic_f1))


In [None]:
# Step 11 · 可视化阈值轨迹、漂移告警与概率分布
# Per-level flow tables for both runs, built from the recorded flow dicts.
flow_table_static = layer_stats(stream_static['flows'])
flow_table_dynamic = layer_stats(stream_dynamic['flows'])

display(flow_table_static)
display(flow_table_dynamic)

# Threshold trajectory of the dynamic run; skipped when no window was recorded.
threshold_df = pd.DataFrame(stream_dynamic['threshold_history'])
if not threshold_df.empty:
    # NOTE(review): presumably threshold_trajectory draws onto the figure
    # opened here — confirm against s3wdlib.viz.
    plt.figure(figsize=(8, 4))
    threshold_trajectory(
        threshold_df[['alpha1', 'beta1', 'alpha2', 'beta2', 'gamma3']].to_dict('records'),
        metric_history={'fitness': threshold_df['fitness'].tolist()},
    )
    plt.title('动态阈值轨迹')
    plt.show()

# Histogram of the level-1 probabilities collected during the dynamic run.
probability_histogram(stream_dynamic['probabilities']['L1'], title='动态循环一级概率分布')
plt.show()

# Drift timeline is drawn only if at least one drift event was recorded.
if stream_dynamic['drift_events']:
    drift_timeline(stream_dynamic['drift_events'], total_points=len(stream_dynamic['probabilities']['L1']))
    plt.show()

print('【步骤11摘要】完成动态与静态多维度对比展示。')



In [None]:
# Headline numbers of both runs.
dynamic_f1 = float(stream_dynamic['metrics']['F1'])
dynamic_bac = float(stream_dynamic['metrics']['BAC'])
static_f1 = float(stream_static['metrics']['F1'])
static_bac = float(stream_static['metrics']['BAC'])
summary_lines = [
    f'静态基线 F1={static_f1:.4f}, BAC={static_bac:.4f}',
    f'动态循环 F1={dynamic_f1:.4f}, BAC={dynamic_bac:.4f}',
    f"漂移告警次数={len(stream_dynamic['drift_events'])}",
]
print('中文小结：')
for line in summary_lines:
    print(' -', line)

# The closing sentence used to state unconditionally that the dynamic mode
# beats the static baseline. Derive the verdict from the measured F1 instead
# (same tolerance as the consistency check in Step 10), so the printed
# conclusion can never contradict the numbers listed just above it.
if np.isclose(dynamic_f1, static_f1, atol=1e-6):
    verdict = '动态模式整体表现与静态基线持平'
elif dynamic_f1 > static_f1:
    verdict = '动态模式整体表现优于静态基线'
else:
    verdict = '动态模式整体表现低于静态基线'
print(f'中文摘要：本实验构建航空延误数据的 S3WD-GWB 动态流程，展示动态阈值、漂移检测、增量更新与可视化，{verdict}并可在漂移时自适应调整。')
