# 16. PySR 參數調整實驗

針對 notebook 15 發現的「常數解」問題，調整參數讓 PySR 產生包含重要特徵的公式。

## 調整策略
1. 降低 parsimony（複雜度懲罰）
2. 增加迭代次數
3. 限制特徵數（使用 Top 5 重要特徵）

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import warnings
import os
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

from pysr import PySRRegressor

# Matplotlib text setup: list CJK-capable sans-serif fonts as candidates and
# turn off the Unicode minus glyph for axis text.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft JhengHei', 'SimHei', 'Arial Unicode MS'],
    'axes.unicode_minus': False,
})

# Make sure the local scratch directory exists before anything writes to it.
os.makedirs('./pysr_temp', exist_ok=True)
print("套件載入完成")

套件載入完成


## 1. 載入資料與前處理

In [11]:
# Read the processed wide-format table.
df = pd.read_csv('../../data/processed/SUA_CVDs_wide_format.csv')
print(f"資料形狀: {df.shape}")

# Feature columns: sex and Age, followed by every measurement stem as a
# *_T1 column, a *_T2 column and a Delta1_* column (26 names in total).
_measurement_stems = ['FBG', 'TC', 'Cr', 'UA', 'GFR', 'BMI', 'SBP', 'DBP']
all_feature_names = (
    ['sex', 'Age']
    + [f'{stem}_T1' for stem in _measurement_stems]
    + [f'{stem}_T2' for stem in _measurement_stems]
    + [f'Delta1_{stem}' for stem in _measurement_stems]
)

X = df[all_feature_names]
# Binary target: 1 where hypertension_T3 equals 2, otherwise 0.
y = (df['hypertension_T3'] == 2).astype(int)

print(f"特徵數: {len(all_feature_names)}")
print(f"高血壓患病率: {y.mean():.2%}")

資料形狀: (6056, 107)
特徵數: 26
高血壓患病率: 16.68%


In [12]:
# Hold out 20% of the rows as a test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize: the scaler is fitted on the training split only and then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE oversampling is applied to the scaled training split only.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"訓練集: {len(X_train_scaled)} -> SMOTE後: {len(X_train_smote)}")
print(f"測試集: {len(X_test_scaled)}")

訓練集: 4844 -> SMOTE後: 8072
測試集: 1212


## 2. RF 篩選 Top 5 重要特徵

In [13]:
# Fit a random forest on the SMOTE-resampled training data.
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
).fit(X_train_smote, y_train_smote)

# One row per feature, ordered from most to least important.
importance_df = pd.DataFrame(
    {'feature': all_feature_names, 'importance': rf.feature_importances_}
).sort_values(by='importance', ascending=False)

print("=== RF 特徵重要性 Top 10 ===")
print(importance_df.head(10))

=== RF 特徵重要性 Top 10 ===
       feature  importance
8       SBP_T1    0.138711
16      SBP_T2    0.099733
24  Delta1_SBP    0.057603
17      DBP_T2    0.049500
9       DBP_T1    0.048007
6       GFR_T1    0.039652
1          Age    0.036843
22  Delta1_GFR    0.034789
2       FBG_T1    0.034275
21   Delta1_UA    0.031668


In [14]:
# Take the five highest-ranked feature names and look up each one's column
# position in the full feature list.
top_5_features = importance_df['feature'].head(5).tolist()
top_5_indices = list(map(all_feature_names.index, top_5_features))

# Slice the same five columns out of the training and test arrays.
X_train_top5 = X_train_smote[:, top_5_indices]
X_test_top5 = X_test_scaled[:, top_5_indices]

print(f"Top 5 特徵: {top_5_features}")
print(f"Top 5 訓練集形狀: {X_train_top5.shape}")

Top 5 特徵: ['SBP_T1', 'SBP_T2', 'Delta1_SBP', 'DBP_T2', 'DBP_T1']
Top 5 訓練集形狀: (8072, 5)


## 3. PySR 實驗函數

In [15]:
def run_pysr(X_train, y_train, X_test, y_test, feature_names,
             maxsize=25, niterations=200, parsimony=0.001, timeout=600, exp_name="exp"):
    """Fit one PySRRegressor configuration and score it on the test data.

    The model's raw predictions are clipped to [0, 1] and used as scores for
    AUC; they are thresholded at 0.5 to obtain class labels for recall.
    ``model.predict`` and ``model.sympy`` are called without an equation
    index, so both refer to whichever equation the fitted model selects.

    Args:
        X_train, y_train: Training features and target passed to ``fit``.
        X_test, y_test: Held-out features and target used for the metrics.
        feature_names: Passed to ``fit`` as ``variable_names``.
        maxsize: Forwarded to ``PySRRegressor(maxsize=...)``.
        niterations: Forwarded to ``PySRRegressor(niterations=...)``.
        parsimony: Forwarded to ``PySRRegressor(parsimony=...)``.
        timeout: Forwarded to ``PySRRegressor(timeout_in_seconds=...)``.
        exp_name: Label printed in the banner and stored in the result.

    Returns:
        dict with keys ``'exp_name'``, ``'AUC'``, ``'Recall'``, ``'公式'``
        (string form of ``model.sympy()``), ``'equations_df'``
        (``model.equations_``) and ``'model'`` (the fitted regressor).
        ``'AUC'`` is 0.5 when ``roc_auc_score`` raises ``ValueError``.
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"實驗: {exp_name}")
    print(f"maxsize={maxsize}, niter={niterations}, parsimony={parsimony}")
    print(f"特徵: {feature_names}")
    print(f"{banner}")

    model = PySRRegressor(
        niterations=niterations,
        binary_operators=["+", "-", "*", "/"],
        unary_operators=["exp", "log", "abs", "square"],
        maxsize=maxsize,
        populations=20,
        population_size=100,
        parsimony=parsimony,
        weight_optimize=0.01,
        timeout_in_seconds=timeout,
        temp_equation_file=True,
        tempdir="./pysr_temp",
        random_state=42,
        deterministic=True,
        procs=0,
        multithreading=False,
        progress=True,
        verbosity=1,
    )

    model.fit(X_train, y_train, variable_names=feature_names)

    # The regressor output is unbounded, so clip it into [0, 1] before using
    # it as a score and thresholding it into labels.
    y_pred_proba = np.clip(model.predict(X_test), 0, 1)
    y_pred = (y_pred_proba > 0.5).astype(int)

    try:
        auc = roc_auc_score(y_test, y_pred_proba)
    except ValueError as err:
        # Keep the best-effort 0.5 fallback, but say why instead of hiding
        # it; any other exception (including Ctrl-C) now propagates.
        print(f"警告: 無法計算 AUC（{err}），以 0.5 代替")
        auc = 0.5

    # Computed once and reused for both the printout and the returned dict.
    recall = recall_score(y_test, y_pred, zero_division=0)
    best_eq = str(model.sympy())

    print(f"\n結果: AUC={auc:.3f}, Recall={recall:.3f}")
    shown_eq = f"{best_eq[:120]}..." if len(best_eq) > 120 else best_eq
    print(f"公式: {shown_eq}")

    return {
        'exp_name': exp_name,
        'AUC': auc,
        'Recall': recall,
        '公式': best_eq,
        'equations_df': model.equations_,
        'model': model
    }

## 4. 實驗 1: Top 5 + 低 parsimony

In [16]:
# Experiment 1: Top-5 features with parsimony lowered to 0.0005.
result_1 = run_pysr(
    X_train_top5, y_train_smote, X_test_top5, y_test, top_5_features,
    exp_name="Top5_低parsimony",
    parsimony=0.0005,
    maxsize=25,
    niterations=200,
    timeout=600,
)


實驗: Top5_低parsimony
maxsize=25, niter=200, parsimony=0.0005
特徵: ['SBP_T1', 'SBP_T2', 'Delta1_SBP', 'DBP_T2', 'DBP_T1']
Compiling Julia backend...


[ Info: Started!



Expressions evaluated per second: 1.190e+04
Progress: 10 / 4000 total iterations (0.250%)
Hall of Fame:
---------------------------------------------------------------------------------------------------
Complexity  Loss       Score     Equation
2           2.500e-01  7.971e+00  y = abs(-0.50007)
4           2.260e-01  5.050e-02  y = exp(SBP_T1 + -1.574)
6           2.239e-01  4.564e-03  y = abs(0.42843 + (0.24017 * SBP_T2))
9           2.065e-01  2.700e-02  y = abs(1.1299 * ((square(0.43855) * SBP_T1) + 0.42843))
10          2.045e-01  9.951e-03  y = abs((0.57309 * (log(1.5026) * SBP_T1)) + log(1.5026))
11          2.038e-01  3.253e-03  y = abs((abs(-0.59329) * (log(1.5132) * SBP_T1)) + log(1.5326)...
                                  )
12          2.025e-01  6.377e-03  y = abs(-0.41654 + (-0.092525 * (SBP_T1 - ((-0.41654 * SBP_T1)...
                                   - SBP_T2))))
13          2.015e-01  5.113e-03  y = abs(abs((-0.092525 * (SBP_T1 - ((-0.55004 * SBP_T1) - SBP_...
   

In [17]:
# Show complexity, loss and equation for each row of experiment 1's
# equation table.
print("\n=== Pareto 前沿 ===")
print(result_1['equations_df'].loc[:, ['complexity', 'loss', 'equation']])


=== Pareto 前沿 ===
    complexity      loss                                           equation
0            1  0.941361                                             SBP_T2
1            2  0.250000                                   abs(-0.50007015)
2            4  0.225982                            exp(SBP_T1 + -1.573978)
3            5  0.204723                 0.4387614 - (-0.22708407 * SBP_T1)
4            6  0.203258        square(-0.6337315 + (-0.18875423 * SBP_T1))
5            8  0.201783  abs(0.43503353 + (0.1325171 * (SBP_T2 + SBP_T1)))
6            9  0.193727  abs(exp(-1.8595866 / (exp(SBP_T1) + exp(SBP_T2...
7           11  0.190036  exp(-1.7586297 / (exp(SBP_T2) + square(exp(SBP...
8           12  0.189353  square(exp((SBP_T1 + -1.811582) / (2.9942756 +...
9           14  0.187659  abs(exp(-0.2719723) * exp((-1.2178488 + SBP_T1...
10          16  0.187657  abs(exp(-0.27447298) * exp((-1.2082299 + SBP_T...
11          17  0.187514  abs(exp((SBP_T1 + -1.1018956) / (exp(SBP_T2

## 5. 實驗 2: Top 5 + 極低 parsimony

In [18]:
# Experiment 2: same Top-5 features, parsimony lowered further to 0.0001,
# with a larger size limit, more iterations and a longer timeout.
result_2 = run_pysr(
    X_train_top5, y_train_smote, X_test_top5, y_test, top_5_features,
    exp_name="Top5_極低parsimony",
    parsimony=0.0001,
    maxsize=35,
    niterations=250,
    timeout=900,
)


實驗: Top5_極低parsimony
maxsize=35, niter=250, parsimony=0.0001
特徵: ['SBP_T1', 'SBP_T2', 'Delta1_SBP', 'DBP_T2', 'DBP_T1']


[ Info: Started!



Expressions evaluated per second: 2.730e+03
Progress: 3 / 5000 total iterations (0.060%)
Hall of Fame:
---------------------------------------------------------------------------------------------------
Complexity  Loss       Score     Equation
2           2.500e-01  7.971e+00  y = exp(-0.69343)
6           2.057e-01  4.882e-02  y = abs((SBP_T1 * 0.19624) + 0.46052)
10          2.056e-01  9.743e-05  y = abs(0.48654 * (abs(-0.77986) + (SBP_T1 / square(1.374))))
11          2.040e-01  7.646e-03  y = abs(0.47657 * ((SBP_T1 / square(1.3923)) + (0.13921 - -0.7...
                                  8815)))
13          2.039e-01  2.846e-04  y = abs(0.47657 * ((SBP_T1 / square(1.3923)) + ((0.13921 * DBP...
                                  _T1) - -0.78815)))
14          1.984e-01  2.718e-02  y = abs(0.47657 * ((SBP_T1 / square(1.3923)) + ((0.13921 * abs...
                                  (Delta1_SBP)) - -0.78815)))
20          1.980e-01  3.980e-04  y = abs(0.47657 * (((0.13921 * log(square(-

In [19]:
# Show complexity, loss and equation for each row of experiment 2's
# equation table.
print("\n=== Pareto 前沿 ===")
print(result_2['equations_df'].loc[:, ['complexity', 'loss', 'equation']])


=== Pareto 前沿 ===
    complexity      loss                                           equation
0            1  0.250047                                          0.4931216
1            2  0.250000                                    abs(0.49999285)
2            4  0.220259                            exp(SBP_T1) * 0.2418046
3            5  0.204723                (0.22708504 * SBP_T1) - -0.43876034
4            6  0.203258         square((0.18874988 * SBP_T1) - -0.6337047)
5            7  0.201852       (SBP_T2 - (-3.311526 - SBP_T1)) * 0.13166021
6            8  0.201783  abs((SBP_T2 - (-3.2853353 - SBP_T1)) * 0.13243...
7            9  0.200237  abs((abs(Delta1_SBP) - (-1.4584364 - SBP_T1)) ...
8           10  0.197753  abs(exp(exp((-0.24826662 * Delta1_SBP) - SBP_T...
9           11  0.190750  abs(exp(exp(0.22650065 - SBP_T1) / (Delta1_SBP...
10          12  0.188191  exp(abs(1.1869891 - SBP_T1) / (Delta1_SBP - ex...
11          13  0.188190  abs(exp(abs(1.1869891 - SBP_T1) / (Delta1_S

## 6. 結果比較

In [20]:
# Collect both experiments and print each one's AUC and selected formula.
all_results = [result_1, result_2]

print("=== 實驗結果比較 ===")
for r in all_results:
    print(f"\n{r['exp_name']}: AUC={r['AUC']:.3f}\n  公式: {r['公式']}")

=== 實驗結果比較 ===

Top5_低parsimony: AUC=0.500
  公式: 0.500070150000000

Top5_極低parsimony: AUC=0.714
  公式: 0.22708504*SBP_T1 + 0.43876034


In [21]:
# Write one summary row per experiment (name, AUC, formula) to a CSV file.
os.makedirs('../../results/tables', exist_ok=True)

summary_rows = [
    {'實驗': r['exp_name'], 'AUC': r['AUC'], '公式': r['公式']}
    for r in all_results
]
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv('../../results/tables/pysr_tuning_results.csv', index=False)

print("結果已儲存")

結果已儲存


## 7. 不同複雜度公式的 AUC 評估

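PySR 預設（`model_selection="best"`）會從 Pareto 前沿中挑選準確度與複雜度的「最佳平衡點」作為最終公式，但較複雜的公式可能有更好的預測性能。例如實驗 1 預設選出的是常數公式（AUC=0.500），而其 Pareto 前沿上其實還有 loss 更低、包含 SBP 特徵的公式。
因此這裡我們評估實驗 2 中不同複雜度公式的實際 AUC。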