# 25. PySR 樹深度實驗

## 目的
探索如何產生更深（depth >= 2）的 PySR 公式。

## 實驗設計

| 實驗 | 特徵 | parsimony | 說明 |
|------|------|-----------|------|
| A | Top 5 | 0.0001 | 重現 Nb16 的 depth=2 公式 |
| B | Top 5 | 0 | 嘗試 depth=3+ |
| C | 全部 26 | 0 | 對照組 |

## 背景
- Nb16 用 Top 5 特徵得到 depth=2：`abs((|Delta_SBP| + SBP_T1 + 1.46) * 0.15)`
- Nb23 用全部 26 特徵，公式都是 depth=1
- 假設：減少特徵數可讓 PySR 更容易搜索到複雜公式

## 日期：2026-01-14

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import time
import os
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

from pysr import PySRRegressor

# Font fallbacks for plot text (presumably chosen so the Chinese labels render —
# confirm on the target machine); unicode_minus is switched off alongside them.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft JhengHei', 'SimHei', 'Arial Unicode MS'],
    'axes.unicode_minus': False,
})

# Create the PySR scratch directory and the results directory up front so
# later cells can write into them without checking.
for required_dir in ('./pysr_temp', '../../results/tables'):
    os.makedirs(required_dir, exist_ok=True)

print("套件載入完成")

Detected Jupyter notebook. Loading juliacall extension. Set `PYSR_AUTOLOAD_EXTENSIONS=no` to disable.
套件載入完成


## 1. 載入資料

In [2]:
# Load the sliding-window dataset
df = pd.read_csv('../../data/01_primary/SUA/processed/SUA_sliding_window.csv')
print(f"資料形狀: {df.shape}")
print(f"病患數: {df['patient_id'].nunique()}")

# Full feature set, grouped as: demographics, first input visit,
# second input visit, and the Delta_* columns.
all_feature_cols = (
    ['sex', 'Age']
    + ['FBG_Tinput1', 'TC_Tinput1', 'Cr_Tinput1', 'UA_Tinput1',
       'GFR_Tinput1', 'BMI_Tinput1', 'SBP_Tinput1', 'DBP_Tinput1']
    + ['FBG_Tinput2', 'TC_Tinput2', 'Cr_Tinput2', 'UA_Tinput2',
       'GFR_Tinput2', 'BMI_Tinput2', 'SBP_Tinput2', 'DBP_Tinput2']
    + ['Delta_FBG', 'Delta_TC', 'Delta_Cr', 'Delta_UA',
       'Delta_GFR', 'Delta_BMI', 'Delta_SBP', 'Delta_DBP']
)

# Shortened display names, position-for-position aligned with all_feature_cols.
all_feature_names = (
    ['sex', 'Age']
    + ['FBG_T1', 'TC_T1', 'Cr_T1', 'UA_T1', 'GFR_T1', 'BMI_T1', 'SBP_T1', 'DBP_T1']
    + ['FBG_T2', 'TC_T2', 'Cr_T2', 'UA_T2', 'GFR_T2', 'BMI_T2', 'SBP_T2', 'DBP_T2']
    + ['D_FBG', 'D_TC', 'D_Cr', 'D_UA', 'D_GFR', 'D_BMI', 'D_SBP', 'D_DBP']
)

X_all = df[all_feature_cols].copy()
groups = df['patient_id'].values

# Binary targets: rows whose target column equals 2 become 1, all others 0.
target_columns = {
    '高血壓': 'hypertension_target',
    '高血糖': 'hyperglycemia_target',
    '高血脂': 'dyslipidemia_target',
}
targets = {
    label: df[column].eq(2).astype(int)
    for label, column in target_columns.items()
}

# Positive rate of each target
print("\n目標變數分佈:")
for label, target in targets.items():
    print(f"  {label}: {target.mean():.2%}")

資料形狀: (13514, 32)
病患數: 6056

目標變數分佈:
  高血壓: 19.29%
  高血糖: 5.93%
  高血脂: 7.94%


## 2. 篩選 Top 5 特徵（使用 RF）

In [3]:
def get_top_features(X, y, feature_names, top_n=5):
    """Rank features with a random forest and return the ``top_n`` best.

    ``X`` is standardized, a class-balanced 100-tree random forest
    (``random_state=42``) is fitted on all of ``X``/``y``, and the features
    are ordered by the forest's ``feature_importances_``.

    Args:
        X: Feature matrix; its column order must match ``feature_names``.
        y: Target labels for the rows of ``X``.
        feature_names: Display name for each column of ``X``.
        top_n: How many of the highest-ranked names to return.

    Returns:
        A tuple ``(top_features, importance_df)``: the list of the ``top_n``
        most important feature names, and a DataFrame with columns
        ``feature`` and ``importance`` sorted by descending importance.
    """
    standardized = StandardScaler().fit_transform(X)

    forest = RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(standardized, y)

    ranking = pd.DataFrame(
        {'feature': feature_names, 'importance': forest.feature_importances_}
    )
    ranking = ranking.sort_values('importance', ascending=False)

    return ranking['feature'].head(top_n).tolist(), ranking

# Select the top 5 features separately for each disease
print("=== 各疾病 Top 5 特徵 ===")
top5_features = {}
for name, y in targets.items():
    # Store the selected names directly; the full ranking is kept in importance_df.
    top5_features[name], importance_df = get_top_features(
        X_all, y, all_feature_names, top_n=5
    )
    print(f"\n{name}: {top5_features[name]}")

=== 各疾病 Top 5 特徵 ===

高血壓: ['SBP_T1', 'SBP_T2', 'D_SBP', 'D_GFR', 'FBG_T1']

高血糖: ['FBG_T2', 'FBG_T1', 'D_FBG', 'BMI_T1', 'BMI_T2']

高血脂: ['TC_T2', 'TC_T1', 'D_TC', 'D_GFR', 'GFR_T1']


## 3. PySR 實驗函數

In [4]:
def run_pysr_experiment(X, y, groups, feature_names, disease_name, exp_name,
                        maxsize=35, niterations=300, parsimony=0.0001, timeout=900):
    """Run one PySR configuration under 3-fold stratified group cross-validation.

    For every fold the features are standardized on the training split, a
    ``PySRRegressor`` is fitted on the training split, and the test-split
    predictions (clipped to ``[0, 1]``) are scored with ROC AUC. Progress,
    the selected formula and the first five rows of ``model.equations_`` are
    printed per fold.

    Args:
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Binary target as a pandas Series aligned with ``X``.
        groups: Group label per row, passed to the splitter so that rows of
            one group are not split across train and test.
        feature_names: Column names given to PySR; must match the column
            order of ``X``.
        disease_name: Label used in the printed header and the result dict.
        exp_name: Experiment label used in the printed output and the result.
        maxsize: Forwarded to ``PySRRegressor(maxsize=...)``.
        niterations: Forwarded to ``PySRRegressor(niterations=...)``.
        parsimony: Forwarded to ``PySRRegressor(parsimony=...)``.
        timeout: Forwarded to ``PySRRegressor(timeout_in_seconds=...)``.

    Returns:
        dict with keys ``disease``, ``exp_name``, ``AUC_mean``, ``AUC_std``,
        ``formulas`` (one formula string per fold), ``equations`` (one
        ``model.equations_`` table per fold) and ``total_time`` (minutes).
    """
    print(f"\n{'='*60}")
    print(f"{disease_name} - {exp_name}")
    print(f"{'='*60}")
    print(f"特徵: {feature_names}")
    print(f"parsimony={parsimony}, maxsize={maxsize}")

    # Single source of truth for the fold count (splitter and progress print).
    n_splits = 3
    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_results = []
    all_formulas = []
    all_equations = []

    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y, groups)):
        print(f"\n--- Fold {fold+1}/{n_splits} ---")

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit the scaler on the training split only so no test statistics leak in.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Rebuild DataFrames so PySR sees the simplified feature names, and
        # reset the target index so it lines up with the fresh 0..n-1 index.
        X_train_df = pd.DataFrame(X_train_scaled, columns=feature_names)
        X_test_df = pd.DataFrame(X_test_scaled, columns=feature_names)
        y_train_reset = y_train.reset_index(drop=True)
        y_test_reset = y_test.reset_index(drop=True)

        # Serial execution (procs=0, multithreading=False) accompanies
        # deterministic=True, as the PySR documentation asks; the seed is
        # varied per fold.
        model = PySRRegressor(
            niterations=niterations,
            binary_operators=["+", "-", "*", "/"],
            unary_operators=["exp", "log", "abs", "square"],
            maxsize=maxsize,
            populations=20,
            population_size=100,
            parsimony=parsimony,
            weight_optimize=0.01,
            timeout_in_seconds=timeout,
            temp_equation_file=True,
            tempdir="./pysr_temp",
            random_state=42 + fold,
            deterministic=True,
            procs=0,
            multithreading=False,
            progress=True,
            verbosity=0,
        )

        start_time = time.time()
        model.fit(X_train_df, y_train_reset)
        elapsed = time.time() - start_time

        # The regression output is used as a score; clip it into [0, 1].
        y_pred_proba = np.clip(model.predict(X_test_df), 0, 1)

        try:
            auc = roc_auc_score(y_test_reset, y_pred_proba)
        except ValueError as err:
            # roc_auc_score rejects unusable inputs with ValueError; fall
            # back to chance level but say why, instead of silently hiding
            # every exception (including KeyboardInterrupt) as before.
            print(f"  [warning] AUC undefined for fold {fold+1}: {err}; using 0.5")
            auc = 0.5

        best_eq = str(model.sympy())

        # Table of candidate equations (the Pareto front across complexities)
        equations_df = model.equations_

        print(f"  AUC: {auc:.3f}, 時間: {elapsed/60:.1f}分鐘")
        print(f"  最佳公式: {best_eq[:80]}..." if len(best_eq) > 80 else f"  最佳公式: {best_eq}")

        # Show the first five rows of the Pareto front
        if equations_df is not None and len(equations_df) > 0:
            print(f"  Pareto 前沿（前 5）:")
            for _, row in equations_df.head(5).iterrows():
                print(f"    C={row['complexity']:2.0f}: {row['equation'][:60]}")

        fold_results.append({'fold': fold+1, 'AUC': auc, 'time_min': elapsed/60})
        all_formulas.append(best_eq)
        all_equations.append(equations_df)

    results_df = pd.DataFrame(fold_results)

    print(f"\n--- {exp_name} 結果 ---")
    print(f"AUC: {results_df['AUC'].mean():.3f} ± {results_df['AUC'].std():.3f}")
    print(f"總時間: {results_df['time_min'].sum():.1f} 分鐘")

    return {
        'disease': disease_name,
        'exp_name': exp_name,
        'AUC_mean': results_df['AUC'].mean(),
        'AUC_std': results_df['AUC'].std(),
        'formulas': all_formulas,
        'equations': all_equations,
        'total_time': results_df['time_min'].sum()
    }

## 4. 實驗 A：Top 5 特徵 + parsimony=0.0001

In [5]:
# Experiment A: reproduce the Nb16 setup (hypertension only)
results_A = []

for disease_name in ['高血壓']:  # hypertension only for now, as a first test
    selected = top5_features[disease_name]

    # Translate the simplified names into column positions within X_all
    column_positions = [all_feature_names.index(feat) for feat in selected]

    results_A.append(
        run_pysr_experiment(
            X_all.iloc[:, column_positions],
            targets[disease_name],
            groups,
            selected,
            disease_name,
            exp_name="Top5_parsimony=0.0001",
            maxsize=35,
            niterations=300,
            parsimony=0.0001,
            timeout=900,
        )
    )


高血壓 - Top5_parsimony=0.0001
特徵: ['SBP_T1', 'SBP_T2', 'D_SBP', 'D_GFR', 'FBG_T1']
parsimony=0.0001, maxsize=35

--- Fold 1/3 ---
  AUC: 0.500, 時間: 15.6分鐘
  最佳公式: 0.189674480000000
  Pareto 前沿（前 5）:
    C= 1: 0.10778828
    C= 2: abs(0.18967448)
    C= 5: (1.8922251 + SBP_T2) * 0.10021353
    C= 6: abs(-0.10913244 * (-1.6972121 - SBP_T2))
    C= 7: (2.9165955 + (SBP_T2 + SBP_T1)) * 0.06502958

--- Fold 2/3 ---
  AUC: 0.698, 時間: 15.6分鐘
  最佳公式: 0.10172239*SBP_T2 + 0.198235647613773
  Pareto 前沿（前 5）:
    C= 1: 0.19847867
    C= 3: square(square(0.66741693))
    C= 5: 0.10172239 * (SBP_T2 - -1.9487907)
    C= 6: abs((SBP_T2 + 1.7521147) * 0.110747166)
    C= 7: 0.10172239 * (SBP_T1 - (-1.9487907 - D_SBP))

--- Fold 3/3 ---
  AUC: 0.500, 時間: 15.5分鐘
  最佳公式: 0.190536840000000
  Pareto 前沿（前 5）:
    C= 1: FBG_T1
    C= 2: abs(0.19053684)
    C= 5: (SBP_T2 - -1.8631719) * 0.102271855
    C= 6: abs(-0.10970054 * (SBP_T2 - -1.7045609))
    C= 7: (SBP_T2 + (SBP_T1 - -2.8602915)) * 0.06660664

--- Top5_parsimony=0.0001 結果 ---
AUC: 0.566 ± 0.114
總時間: 46.7 分鐘

## 5. 實驗 B：Top 5 特徵 + parsimony=0（無懲罰）

In [6]:
# Experiment B: parsimony=0, to see whether deeper formulas appear
results_B = []

for disease_name in ['高血壓']:
    chosen_names = top5_features[disease_name]

    # Column positions of the chosen features inside X_all
    positions = [all_feature_names.index(feat) for feat in chosen_names]
    X_subset = X_all.iloc[:, positions]

    settings = dict(
        exp_name="Top5_parsimony=0",
        maxsize=35,
        niterations=300,
        parsimony=0,  # no complexity penalty
        timeout=900,
    )
    results_B.append(
        run_pysr_experiment(
            X_subset, targets[disease_name], groups, chosen_names, disease_name,
            **settings
        )
    )


高血壓 - Top5_parsimony=0
特徵: ['SBP_T1', 'SBP_T2', 'D_SBP', 'D_GFR', 'FBG_T1']
parsimony=0, maxsize=35

--- Fold 1/3 ---
  AUC: 0.500, 時間: 15.7分鐘
  最佳公式: 0.189674470000000
  Pareto 前沿（前 5）:
    C= 1: D_GFR
    C= 2: abs(-0.18967447)
    C= 5: (SBP_T2 * 0.10018963) + 0.18967257
    C= 6: abs((-1.6987418 - SBP_T2) * -0.10906334)
    C= 7: ((SBP_T2 + SBP_T1) * 0.08345131) + 0.18967257

--- Fold 2/3 ---
  AUC: 0.500, 時間: 15.9分鐘
  最佳公式: 0.198478670000000
  Pareto 前沿（前 5）:
    C= 1: 1.1113906
    C= 2: abs(0.19847867)
    C= 5: 0.101526156 * (SBP_T2 + 1.9548389)
    C= 6: abs((-1.7520107 - SBP_T2) * 0.1107625)
    C= 7: ((3.0460725 + SBP_T2) + SBP_T1) * 0.06515484

--- Fold 3/3 ---
  AUC: 0.500, 時間: 15.5分鐘
  最佳公式: 0.190536885050366
  Pareto 前沿（前 5）:
    C= 1: SBP_T1
    C= 2: log(1.209899)
    C= 5: (SBP_T2 + 1.8628505) * 0.10228542
    C= 6: abs(-0.10974182 * (SBP_T2 + 1.7037925))
    C= 7: 0.06647109 * (SBP_T1 + (2.8657203 + SBP_T2))

--- Top5_parsimony=0 結果 ---
AUC: 0.500 ± 0.000
總時間: 47.2 分鐘

## 6. 實驗 C：全部特徵 + parsimony=0（對照組）

In [7]:
# Experiment C: all features + parsimony=0 (control)
results_C = []

for disease_name in ['高血壓']:
    outcome = run_pysr_experiment(
        X_all,
        targets[disease_name],
        groups,
        all_feature_names,
        disease_name,
        exp_name="All26_parsimony=0",
        maxsize=35,
        niterations=300,
        parsimony=0,
        timeout=900,
    )
    results_C.append(outcome)


高血壓 - All26_parsimony=0
特徵: ['sex', 'Age', 'FBG_T1', 'TC_T1', 'Cr_T1', 'UA_T1', 'GFR_T1', 'BMI_T1', 'SBP_T1', 'DBP_T1', 'FBG_T2', 'TC_T2', 'Cr_T2', 'UA_T2', 'GFR_T2', 'BMI_T2', 'SBP_T2', 'DBP_T2', 'D_FBG', 'D_TC', 'D_Cr', 'D_UA', 'D_GFR', 'D_BMI', 'D_SBP', 'D_DBP']
parsimony=0, maxsize=35

--- Fold 1/3 ---
  AUC: 0.713, 時間: 15.6分鐘
  最佳公式: 0.06502808*SBP_T1 + 0.06502808*SBP_T2 + 0.18967049
  Pareto 前沿（前 5）:
    C= 1: 0.18967447
    C= 5: (0.1002009 * SBP_T2) + 0.18966252
    C= 6: abs((0.10907269 * SBP_T2) + 0.18519574)
    C= 7: ((SBP_T2 + SBP_T1) * 0.06502808) + 0.18967049
    C= 8: abs((0.06689704 * (SBP_T2 + SBP_T1)) + 0.18796583)

--- Fold 2/3 ---
  AUC: 0.500, 時間: 15.4分鐘
  最佳公式: 0.198479260000000
  Pareto 前沿（前 5）:
    C= 1: SBP_T2
    C= 2: abs(0.19847926)
    C= 6: abs((SBP_T2 * -0.11070968) + -0.19410582)
    C= 8: abs(0.19697031 + (0.0669935 * (SBP_T1 + SBP_T2)))
    C= 9: abs(((SBP_T2 + SBP_T1) * square(-0.2585116)) - -0.19700283)

--- Fold 3/3 ---
  AUC: 0.500, 時間: 15.5分鐘
  

## 7. 結果比較

In [8]:
# Gather the results of all three experiments
all_results = results_A + results_B + results_C

banner = "="*70
print("\n" + banner)
print("實驗結果比較")
print(banner)

for res in all_results:
    print(f"\n{res['exp_name']}:")
    print(f"  AUC: {res['AUC_mean']:.3f} ± {res['AUC_std']:.3f}")
    print(f"  時間: {res['total_time']:.1f} 分鐘")
    print(f"  公式:")
    for fold_no, formula in enumerate(res['formulas'], start=1):
        # Long formulas are cut at 70 characters and marked with an ellipsis.
        if len(formula) > 70:
            print(f"    Fold {fold_no}: {formula[:70]}...")
        else:
            print(f"    Fold {fold_no}: {formula}")


實驗結果比較

Top5_parsimony=0.0001:
  AUC: 0.566 ± 0.114
  時間: 46.7 分鐘
  公式:
    Fold 1: 0.189674480000000
    Fold 2: 0.10172239*SBP_T2 + 0.198235647613773
    Fold 3: 0.190536840000000

Top5_parsimony=0:
  AUC: 0.500 ± 0.000
  時間: 47.2 分鐘
  公式:
    Fold 1: 0.189674470000000
    Fold 2: 0.198478670000000
    Fold 3: 0.190536885050366

All26_parsimony=0:
  AUC: 0.571 ± 0.123
  時間: 46.5 分鐘
  公式:
    Fold 1: 0.06502808*SBP_T1 + 0.06502808*SBP_T2 + 0.18967049
    Fold 2: 0.198479260000000
    Fold 3: 0.190536840000000


In [9]:
# Save the results: one summary row per experiment
summary_data = [
    {
        '疾病': res['disease'],
        '實驗': res['exp_name'],
        'AUC': f"{res['AUC_mean']:.3f} ± {res['AUC_std']:.3f}",
        '時間(分)': f"{res['total_time']:.1f}",
        # Only the first fold's formula is stored, cut at 80 characters.
        '公式_Fold1': res['formulas'][0][:80] if res['formulas'] else '',
    }
    for res in all_results
]

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('../../results/tables/pysr_depth_experiment.csv', index=False)
print("已儲存: results/tables/pysr_depth_experiment.csv")
print(summary_df)

已儲存: results/tables/pysr_depth_experiment.csv
    疾病                     實驗            AUC 時間(分)  \
0  高血壓  Top5_parsimony=0.0001  0.566 ± 0.114  46.7   
1  高血壓       Top5_parsimony=0  0.500 ± 0.000  47.2   
2  高血壓      All26_parsimony=0  0.571 ± 0.123  46.5   

                                            公式_Fold1  
0                                  0.189674480000000  
1                                  0.189674470000000  
2  0.06502808*SBP_T1 + 0.06502808*SBP_T2 + 0.1896...  


## 8. 結論

In [None]:
# Written conclusion of the notebook. The degenerate-fold count is 7/9, not
# 6/9: experiments A, B and C ended in a constant formula in 2, 3 and 2 of
# their 3 folds respectively, as the table inside the text itself shows.
CONCLUSION_TEXT = """
實驗設計：
- A: Top 5 特徵 + parsimony=0.0001（重現 Nb16）
- B: Top 5 特徵 + parsimony=0（嘗試更深公式）
- C: 全部 26 特徵 + parsimony=0（對照組）

執行時間：約 140 分鐘（3 個實驗 × 3-Fold CV）

實際結果：
┌─────────────────────────┬─────────────────┬──────────────┐
│ 實驗                    │ AUC             │ 公式穩定性   │
├─────────────────────────┼─────────────────┼──────────────┤
│ A: Top5 + p=0.0001      │ 0.566 ± 0.114   │ ❌ 2/3 常數  │
│ B: Top5 + p=0           │ 0.500 ± 0.000   │ ❌ 3/3 常數  │
│ C: All26 + p=0          │ 0.571 ± 0.123   │ ❌ 2/3 常數  │
└─────────────────────────┴─────────────────┴──────────────┘

複雜公式 vs 簡單公式：
- 常數 (depth=0)：AUC ≈ 0.50（無預測能力）
- depth=1 `0.10*SBP_T2`：AUC ≈ 0.70
- depth=2 `0.065*(SBP_T1+SBP_T2)`：AUC ≈ 0.71
→ 複雜公式確實較好，但提升有限（+1-2%），且不穩定

關鍵發現：
1. 高血壓 PySR 搜索容易陷入常數解（7/9 folds 退化）
2. 即使產生 depth=2 公式，AUC 仍遠低於 RF (0.743) / LR (0.72)
3. 減少特徵數 + 調整 parsimony 無法解決不穩定問題

結論：
- PySR 對高血壓預測不適合，建議論文以高血糖公式為主
- 高血糖 `0.12*FBG_T2` 穩定且 AUC=0.918（Nb23 結果）
- 高血壓/高血脂作為「嘗試但不穩定」的補充說明
"""

print("="*70)
print("PySR 樹深度實驗結論")
print("="*70)
print(CONCLUSION_TEXT)