# 27. Delta 特徵消融實驗（綜合版）

比較加入與移除 Delta 特徵後的模型效能，使用與主實驗相同的設定：
- 資料：滑動視窗（13,514 筆樣本）
- 交叉驗證：StratifiedGroupKFold（防止資料洩漏）
- 模型：LR、RF、XGBoost

**比較框架：**
1. 完整模型 (Y-2 + Y-1 + Delta) vs 無 Delta (Y-2 + Y-1)
2. Y-1 + Delta vs 僅 Y-1（單獨評估 Delta 貢獻）

日期：2026-01-22

In [None]:
# 匯入套件
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

## 1. 載入資料

In [None]:
# Load the sliding-window dataset from disk.
data_path = Path("../../data/01_primary/SUA/processed/SUA_sliding_window.csv")
df = pd.read_csv(data_path)

# Report the row count and the number of distinct patient_id values.
n_samples = len(df)
n_patients = df['patient_id'].nunique()
print(f"資料載入完成：{n_samples:,} 筆樣本，{n_patients:,} 位患者")

## 2. 定義特徵集

In [None]:
# Covariates included in every feature set except the delta-only one.
base_features = ['sex', 'Age']

# Columns with the "_Tinput1" suffix; the feature-set labels below call these Y-2.
y2_features = [
    'FBG_Tinput1', 'TC_Tinput1', 'Cr_Tinput1', 'UA_Tinput1',
    'GFR_Tinput1', 'BMI_Tinput1', 'SBP_Tinput1', 'DBP_Tinput1',
]

# Columns with the "_Tinput2" suffix; the feature-set labels below call these Y-1.
y1_features = [
    'FBG_Tinput2', 'TC_Tinput2', 'Cr_Tinput2', 'UA_Tinput2',
    'GFR_Tinput2', 'BMI_Tinput2', 'SBP_Tinput2', 'DBP_Tinput2',
]

# Change-score columns, one per measurement.
delta_features = [
    'Delta_FBG', 'Delta_TC', 'Delta_Cr', 'Delta_UA',
    'Delta_GFR', 'Delta_BMI', 'Delta_SBP', 'Delta_DBP',
]

# Shared prefixes, built once so each "with delta" set is its baseline plus
# the delta columns.
both_years = base_features + y2_features + y1_features
y1_only = base_features + y1_features
y2_only = base_features + y2_features

# Insertion order matters: results are printed and stored in this order.
feature_sets = {}

# Primary comparison: with vs. without delta (both contain Y-2 and Y-1).
feature_sets['完整 (Y-2+Y-1+Δ)'] = both_years + delta_features
feature_sets['無 Delta (Y-2+Y-1)'] = both_years

# Delta contribution on top of a Y-1-only baseline.
feature_sets['Y-1 + Δ'] = y1_only + delta_features
feature_sets['僅 Y-1'] = y1_only

# Additional comparisons.
feature_sets['Y-2 + Δ'] = y2_only + delta_features
feature_sets['僅 Y-2'] = y2_only
feature_sets['僅 Δ'] = delta_features

print("特徵集：")
for set_name, columns in feature_sets.items():
    print(f"  {set_name}：{len(columns)} 個特徵")

## 3. 定義模型與交叉驗證函式

In [None]:
def get_models():
    """Build and return a fresh set of unfitted classifiers.

    Returns:
        dict mapping the short names 'LR', 'RF' and 'XGB' to newly
        constructed estimator instances. Nothing is cached, so every call
        yields independent objects.
    """
    seed = 42

    logistic = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=seed)
    forest = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=seed, n_jobs=-1)
    # XGBoost takes a fixed positive-class weight here rather than
    # class_weight='balanced' as the two sklearn models do.
    boosted = XGBClassifier(n_estimators=100, scale_pos_weight=5, random_state=seed, eval_metric='logloss', verbosity=0)

    return {'LR': logistic, 'RF': forest, 'XGB': boosted}


def run_cv(X, y, groups, model, n_splits=5):
    """Cross-validate ``model`` with StratifiedGroupKFold and return per-fold ROC AUC.

    Args:
        X: Feature table; rows are selected positionally via ``.iloc``.
        y: Labels aligned with ``X``; also selected via ``.iloc``.
        groups: Per-row group labels handed to the splitter.
        model: Estimator exposing ``fit`` and ``predict_proba``. The same
            object is refit on every fold.
        n_splits: Number of folds (default 5).

    Returns:
        numpy.ndarray holding one AUC value per fold, in split order.
    """
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def score_fold(train_idx, test_idx):
        # Fit the scaler on the training rows only, then apply it to the
        # held-out rows, so no test-fold statistics reach training.
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(X.iloc[train_idx])
        test_scaled = scaler.transform(X.iloc[test_idx])

        model.fit(train_scaled, y.iloc[train_idx])
        # Column 1 of predict_proba is the probability of the positive class.
        positive_prob = model.predict_proba(test_scaled)[:, 1]
        return roc_auc_score(y.iloc[test_idx], positive_prob)

    fold_scores = [
        score_fold(train_idx, test_idx)
        for train_idx, test_idx in splitter.split(X, y, groups)
    ]
    return np.array(fold_scores)

## 4. 執行消融實驗

In [None]:
# Binary targets: a row is positive when its *_target column equals 2.
targets = {
    '高血壓': (df['hypertension_target'] == 2).astype(int),
    '高血糖': (df['hyperglycemia_target'] == 2).astype(int),
    '高血脂': (df['dyslipidemia_target'] == 2).astype(int),
}

# Group labels for StratifiedGroupKFold: rows sharing a patient_id stay in one fold.
groups = df['patient_id']

results = []

banner = "=" * 80
print(banner)
print("Delta 特徵消融實驗")
print(banner)
print("資料：滑動視窗 + StratifiedGroupKFold（與主實驗相同設定）")

# Sweep every (target, model, feature set) combination.
for target_name, y in targets.items():
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"目標：{target_name}（陽性率：{y.mean():.1%}）")
    print(f"{rule}")

    for model_name in ['LR', 'RF', 'XGB']:
        print(f"\n  {model_name}：")

        for feature_set_name, features in feature_sets.items():
            n_features = len(features)

            # get_models() is called per run so each evaluation starts from
            # a newly constructed estimator.
            fold_aucs = run_cv(df[features], y, groups, get_models()[model_name])
            mean_auc = fold_aucs.mean()
            std_auc = fold_aucs.std()

            record = {
                'Target': target_name,
                'Model': model_name,
                'Feature_Set': feature_set_name,
                'N_Features': n_features,
                'AUC_mean': mean_auc,
                'AUC_std': std_auc,
                'AUC_scores': fold_aucs.tolist(),
            }
            results.append(record)

            print(f"    {feature_set_name:20s}（{n_features:2d} 個特徵）：{mean_auc:.3f} ± {std_auc:.3f}")

# One row per (target, model, feature set) run.
results_df = pd.DataFrame(results)

## 5. Delta 貢獻分析

In [None]:
section_rule = "=" * 80
print(section_rule)
print("Delta 特徵貢獻分析")
print(section_rule)

# Comparison 1: full model vs. the same model without delta (both keep Y-2 and Y-1).
print("\n【比較 1】完整 (Y-2+Y-1+Δ) vs 無 Delta (Y-2+Y-1)")
print("-" * 60)
print("主實驗的比較框架：移除 Delta 後效能變化")
print()
print(f"{'目標':<8} {'模型':<6} {'含 Δ':>8} {'無 Δ':>10} {'效果':>8}")
print("-" * 50)

full_set = '完整 (Y-2+Y-1+Δ)'
no_delta_set = '無 Delta (Y-2+Y-1)'

for target_name in ['高血壓', '高血糖', '高血脂']:
    # Narrow step by step: target first, then model, then feature set.
    target_rows = results_df[results_df['Target'] == target_name]

    for model_name in ['LR', 'RF', 'XGB']:
        model_rows = target_rows[target_rows['Model'] == model_name]

        auc_full = model_rows[model_rows['Feature_Set'] == full_set]['AUC_mean'].iloc[0]
        auc_no_delta = model_rows[model_rows['Feature_Set'] == no_delta_set]['AUC_mean'].iloc[0]

        # Positive gain means the delta features raised the mean AUC.
        gain = auc_full - auc_no_delta
        print(f"{target_name:<8} {model_name:<6} {auc_full:>8.3f} {auc_no_delta:>10.3f} {gain:>+8.3f}")

In [None]:
# Comparison 2: Y-1 + delta vs. Y-1 only (delta contribution over a Y-1 baseline).
def _mean_auc(target, model, feature_set):
    """Return the stored mean AUC of the results_df row matching all three keys."""
    row_mask = (
        (results_df['Target'] == target)
        & (results_df['Model'] == model)
        & (results_df['Feature_Set'] == feature_set)
    )
    return results_df[row_mask]['AUC_mean'].values[0]


print("【比較 2】Y-1+Δ vs 僅 Y-1（單獨評估 Delta 貢獻）")
print("-" * 60)
print("只用 Y-1 資料，加入 Delta 後的效能變化")
print()
print(f"{'目標':<8} {'模型':<6} {'Y-1+Δ':>8} {'僅 Y-1':>10} {'Δ 效果':>10}")
print("-" * 50)

delta_effects = []
for target_name in ['高血壓', '高血糖', '高血脂']:
    for model_name in ['LR', 'RF', 'XGB']:
        auc_with = _mean_auc(target_name, model_name, 'Y-1 + Δ')
        auc_without = _mean_auc(target_name, model_name, '僅 Y-1')
        gain = auc_with - auc_without

        # Keep the numbers in a table as well as printing them.
        delta_effects.append({
            '目標': target_name,
            '模型': model_name,
            'Y-1+Δ': auc_with,
            '僅 Y-1': auc_without,
            'Δ 效果': gain,
        })
        print(f"{target_name:<8} {model_name:<6} {auc_with:>8.3f} {auc_without:>10.3f} {gain:>+10.3f}")

delta_effects_df = pd.DataFrame(delta_effects)

## 6. 產生簡報用表格

In [None]:
title_rule = "=" * 80
print(title_rule)
print("簡報用表格（比較 2：Y-1+Δ vs 僅 Y-1，LR 模型）")
print(title_rule)

# Markdown table header; numeric columns are right-aligned.
print("\n| 特徵組合 | 高血壓 | 高血糖 | 高血脂 |")
print("|----------|-------:|-------:|-------:|")

target_order = ['高血壓', '高血糖', '高血脂']

# Mean AUC per target for each printed row, kept so the improvement row can
# reuse the same values.
row_aucs = {}
for feature_set in ['僅 Y-1', 'Y-1 + Δ']:
    row_aucs[feature_set] = [
        results_df[(results_df['Target'] == target_name) &
                   (results_df['Model'] == 'LR') &
                   (results_df['Feature_Set'] == feature_set)]['AUC_mean'].values[0]
        for target_name in target_order
    ]
    cells = "".join(f" {auc:.3f} |" for auc in row_aucs[feature_set])
    print(f"| {feature_set:<8} |" + cells)

# Improvement per target: (Y-1 + delta) minus (Y-1 only), in AUC units.
effects = [
    auc_with - auc_without
    for auc_with, auc_without in zip(row_aucs['Y-1 + Δ'], row_aucs['僅 Y-1'])
]

print(f"| **提升** | **{effects[0]:+.1%}** | {effects[1]:+.1%} | {effects[2]:+.1%} |")

## 7. 儲存結果

In [None]:
# Write the summary table to CSV. The per-fold 'AUC_scores' list column is
# dropped so every remaining column holds a single value per row.
output_path = Path("../../results/delta_ablation_comprehensive.csv")

# to_csv does not create missing parent directories; create the folder first
# so a fresh checkout does not lose the results of the whole run at this step.
output_path.parent.mkdir(parents=True, exist_ok=True)

results_df.drop(columns=['AUC_scores']).to_csv(output_path, index=False)
print(f"已儲存：{output_path}")

## 8. 總結

### 結論

1. **【比較 1】完整模型 vs 無 Delta (Y-2+Y-1+Δ vs Y-2+Y-1)**
   - Delta 效果幾乎為 0
   - 原因：模型已有 Y-2 和 Y-1，可自行學到變化量

2. **【比較 2】Y-1+Δ vs 僅 Y-1（推薦用於簡報）**
   - 這是更公平的比較框架
   - 只用 Y-1 資料時，Delta 提供了 Y-2 的「隱含資訊」
   - 這個比較能展示 Delta 特徵的真正價值

### 建議
- 簡報使用【比較 2】的結果
- 解釋：Delta =（Y-1 的數值）−（Y-2 的數值），因此在已有 Y-1 的情況下，Delta 隱含了 Y-2 的資訊
- 結論：Delta 特徵能用 1 個特徵編碼 2 個時間點的關係