In [2]:
import pandas as pd

df = pd.read_csv('data_processed.csv')

我们首先使用多元线性回归（OLS）进行建模，得到 $R^2 \approx 0.1$，说明线性模型拟合效果较差。随后引入多项式回归与随机森林回归等非线性方法，结果显示 $R^2$ 显著提升，表明 Y 染色体浓度与年龄、孕妇 BMI、检测孕周及末次月经之间存在非线性关系。

In [3]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# 假设 df 是你的 DataFrame
features = ['年龄', '孕妇BMI', '检测孕周_周数', '末次月经']
target = 'Y染色体浓度'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. OLS
ols = LinearRegression().fit(X_train, y_train)
print("OLS R²:", r2_score(y_test, ols.predict(X_test)))

# 2. 多项式回归
poly_model = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('linear', LinearRegression())
]).fit(X_train, y_train)
print("Polynomial R²:", r2_score(y_test, poly_model.predict(X_test)))

# 3. 随机森林
rf = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)
print("RandomForest R²:", r2_score(y_test, rf.predict(X_test)))

# 4. SVR
svr = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.01).fit(X_train, y_train)
print("SVR R²:", r2_score(y_test, svr.predict(X_test)))


OLS R²: 0.14671240552091303
Polynomial R²: 0.17892760051567702
RandomForest R²: 0.43708420921476854
SVR R²: 0.10668572515170605


可以看到：

随机森林已经明显优于其他模型（R² 提升到 0.437）
多项式回归有一点提升，但幅度不大（说明非线性存在，但多项式形式不足以完全捕捉）
SVR效果反而下降（可能是参数不适合或数据特征不适合 RBF 核）

In [8]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.6,
    colsample_bytree=0.8,
    random_state=42
)
xgb.fit(X_train, y_train)
print("XGBoost R²:", r2_score(y_test, xgb.predict(X_test)))


print("XGBoost MAE:", mean_absolute_error(y_test, xgb.predict(X_test)))



XGBoost R²: 0.4891296107727223
XGBoost MAE: 0.4927215567388203
