In [1]:
import pandas as pd
import numpy as np
import time

# 引入贝叶斯优化库
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder # 去掉了 StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone

# ==========================================
# 1. Data loading and 80:20 split
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/标准化数据_无独热_Log变换12.18.csv'

# Read with pandas' default encoding first; only when decoding fails, retry
# as GBK. The message reports which of the two attempts succeeded.
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")
else:
    print("数据加载成功 (UTF-8)")

# Columns removed from the feature matrix (the target is one of them).
# errors='ignore' means labels missing from the file are skipped silently.
excluded_columns = ['log_Separation factor', 'polymer', 'DOI', 'Flux']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Separation factor']

# 80% is kept for tuning and cross-validation, 20% is held out for the
# final test; the fixed seed makes the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, **split_options
)

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. Preprocessing (numeric columns are NOT standardized)
# ==========================================
categorical_features = ['Permeation type']
# Every other feature column, kept in its original order, is handled as numeric.
numerical_features = [
    name for name in X.columns if name not in categorical_features
]

# handle_unknown='ignore' lets the encoder transform categories that were not
# present when it was fitted instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Numeric columns go through untouched ('passthrough'); only the categorical
# column is one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', one_hot_encoder, categorical_features),
    ]
)

# Pipeline: preprocessing -> decision tree (seeded for reproducibility)
tree_regressor = DecisionTreeRegressor(random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', tree_regressor),
    ]
)

# ==========================================
# 3. Stage 1: Bayesian hyperparameter optimization
# ==========================================
separator = "="*50
print("\n" + separator)
print(">>> Stage 1: 开始贝叶斯优化 (BayesSearchCV)...")
print(separator)

# Search space for the decision tree; the 'regressor__' prefix routes each
# parameter to the pipeline step named 'regressor'.
search_spaces = {
    'regressor__max_depth': Integer(3, 50),
    'regressor__min_samples_split': Integer(2, 20),
    'regressor__min_samples_leaf': Integer(1, 10),
    'regressor__max_features': Categorical(['sqrt', 'log2', None])
}

opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_spaces,
    n_iter=50,                          # 50 search iterations
    cv=5,                               # 5-fold inner cross-validation
    n_jobs=-1,
    random_state=42,
    scoring='neg_mean_absolute_error',  # i.e. the search minimizes MAE
    verbose=0,
)

# Wall-clock timing of the whole search.
start_time = time.time()
opt.fit(X_train_full, y_train_full)
end_time = time.time()

best_params = opt.best_params_
best_estimator = opt.best_estimator_

print(f"贝叶斯优化耗时: {end_time - start_time:.2f} 秒")
print("\n>>> 最佳参数:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# ==========================================
# 4. Stage 2: 10-fold cross-validation with the best parameters
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)


def _regression_scores(y_true, y_pred):
    """Return the (R2, RMSE, MAE) triple for one set of predictions."""
    return (
        r2_score(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
    )


# NOTE(review): the folds are drawn from the same training portion that was
# passed to the hyperparameter search, so these validation scores may be
# somewhat optimistic -- confirm this is the intended protocol.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    # Positional row selection for this fold
    X_train_fold, y_train_fold = X_train_full.iloc[train_idx], y_train_full.iloc[train_idx]
    X_val_fold, y_val_fold = X_train_full.iloc[val_idx], y_train_full.iloc[val_idx]

    # Fit a fresh, unfitted copy so nothing carries over between folds
    model = clone(best_estimator)
    model.fit(X_train_fold, y_train_fold)

    # Predict on both the fitting part and the held-out part of the fold
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    train_r2, train_rmse, train_mae = _regression_scores(y_train_fold, y_train_pred)
    val_r2, val_rmse, val_mae = _regression_scores(y_val_fold, y_val_pred)

    # One row of the per-fold table (fold numbers are reported 1-based)
    metrics = {
        "Fold": fold + 1,
        "Train R2": train_r2,
        "Train RMSE": train_rmse,
        "Train MAE": train_mae,
        "Val R2": val_r2,
        "Val RMSE": val_rmse,
        "Val MAE": val_mae
    }
    fold_results.append(metrics)
    print(f"Fold {fold+1}/10 完成 | Val R2: {metrics['Val R2']:.4f}")

# Per-fold table with a trailing "Average" row; the averaged fold number is
# overwritten by the label.
df_cv_results = pd.DataFrame(fold_results)
avg_row = (
    df_cv_results.mean(numeric_only=True)
    .to_frame()
    .T
    .assign(Fold="Average")
)
df_final_cv = pd.concat([df_cv_results, avg_row], ignore_index=True)

print("\n>>> 十折交叉验证详细结果 (Per-Fold Results):")
# Widen the display so the table is printed without column truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(df_final_cv.round(4))

# ==========================================
# 5. Stage 3: final evaluation on the hold-out test set
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

# Fit the best model on the full training portion, then predict the test set
best_estimator.fit(X_train_full, y_train_full)
y_test_pred = best_estimator.predict(X_test)

# The last row of the CV table is the "Average" row
cv_average = df_final_cv.iloc[-1]

test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# One column per evaluation setting, one row per metric
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Training (Avg)": [
        cv_average["Train MAE"],
        cv_average["Train RMSE"],
        cv_average["Train R2"],
    ],
    "CV Validation (Avg)": [
        cv_average["Val MAE"],
        cv_average["Val RMSE"],
        cv_average["Val R2"],
    ],
    "Test Set (Final)": [test_mae, test_rmse, test_r2],
}

print("\n>>> 模型性能最终汇总 (Pre-standardized Data):")
print(pd.DataFrame(final_summary).round(4))

数据加载成功 (UTF-8)
原始数据总量: 816
用于调优和CV的训练集 (80%): 652
独立测试集 (20%): 164

>>> Stage 1: 开始贝叶斯优化 (BayesSearchCV)...
贝叶斯优化耗时: 54.91 秒

>>> 最佳参数:
  regressor__max_depth: 46
  regressor__max_features: None
  regressor__min_samples_leaf: 1
  regressor__min_samples_split: 2

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...
Fold 1/10 完成 | Val R2: 0.6562
Fold 2/10 完成 | Val R2: 0.4065
Fold 3/10 完成 | Val R2: 0.7465
Fold 4/10 完成 | Val R2: 0.7354
Fold 5/10 完成 | Val R2: 0.7337
Fold 6/10 完成 | Val R2: 0.6991
Fold 7/10 完成 | Val R2: 0.6377
Fold 8/10 完成 | Val R2: 0.5265
Fold 9/10 完成 | Val R2: 0.8077
Fold 10/10 完成 | Val R2: 0.7237

>>> 十折交叉验证详细结果 (Per-Fold Results):
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1       1.0         0.0        0.0  0.6562    0.4483   0.2615
1         2       1.0         0.0        0.0  0.4065    0.4207   0.2125
2         3       1.0         0.0        0.0  0.7465    0.3599   0.1907
3         4       1.0         0.0        0.0  0.7354    0.3229   0.1

In [2]:
import pandas as pd
import numpy as np
import time

# 引入贝叶斯优化库
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder # 去掉了 StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone

# ==========================================
# 1. Data loading and 80:20 split
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/通量标准化数据_无独热_Log变换12.18.csv'

# Same encoding fallback as the separation-factor cell: try pandas' default
# encoding first and retry as GBK only when decoding fails, so a GBK-encoded
# CSV no longer aborts this cell. The success path stays silent as before.
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")

# Columns removed from the feature matrix (the target is one of them).
# errors='ignore' means labels missing from the file are skipped silently.
excluded_columns = ['log_Flux', 'polymer', 'DOI', 'Separation factor']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Flux']

# 80% is kept for tuning and cross-validation, 20% is held out for the
# final test; the fixed seed makes the split reproducible.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. Preprocessing (numeric columns are NOT standardized)
# ==========================================
categorical_features = ['Permeation type']
# Every other feature column, kept in its original order, is handled as numeric.
numerical_features = [
    name for name in X.columns if name not in categorical_features
]

# handle_unknown='ignore' lets the encoder transform categories that were not
# present when it was fitted instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Numeric columns go through untouched ('passthrough'); only the categorical
# column is one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', one_hot_encoder, categorical_features),
    ]
)

# Pipeline: preprocessing -> decision tree (seeded for reproducibility)
tree_regressor = DecisionTreeRegressor(random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', tree_regressor),
    ]
)

# ==========================================
# 3. Stage 1: Bayesian hyperparameter optimization
# ==========================================
separator = "="*50
print("\n" + separator)
print(">>> Stage 1: 开始贝叶斯优化 (BayesSearchCV)...")
print(separator)

# Search space for the decision tree; the 'regressor__' prefix routes each
# parameter to the pipeline step named 'regressor'.
search_spaces = {
    'regressor__max_depth': Integer(3, 50),
    'regressor__min_samples_split': Integer(2, 20),
    'regressor__min_samples_leaf': Integer(1, 10),
    'regressor__max_features': Categorical(['sqrt', 'log2', None])
}

opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_spaces,
    n_iter=50,                          # 50 search iterations
    cv=5,                               # 5-fold inner cross-validation
    n_jobs=-1,
    random_state=42,
    scoring='neg_mean_absolute_error',  # i.e. the search minimizes MAE
    verbose=0,
)

# Wall-clock timing of the whole search.
start_time = time.time()
opt.fit(X_train_full, y_train_full)
end_time = time.time()

best_params = opt.best_params_
best_estimator = opt.best_estimator_

print(f"贝叶斯优化耗时: {end_time - start_time:.2f} 秒")
print("\n>>> 最佳参数:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# ==========================================
# 4. Stage 2: 10-fold cross-validation with the best parameters
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)


def _regression_scores(y_true, y_pred):
    """Return the (R2, RMSE, MAE) triple for one set of predictions."""
    return (
        r2_score(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
    )


# NOTE(review): the folds are drawn from the same training portion that was
# passed to the hyperparameter search, so these validation scores may be
# somewhat optimistic -- confirm this is the intended protocol.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    # Positional row selection for this fold
    X_train_fold, y_train_fold = X_train_full.iloc[train_idx], y_train_full.iloc[train_idx]
    X_val_fold, y_val_fold = X_train_full.iloc[val_idx], y_train_full.iloc[val_idx]

    # Fit a fresh, unfitted copy so nothing carries over between folds
    model = clone(best_estimator)
    model.fit(X_train_fold, y_train_fold)

    # Predict on both the fitting part and the held-out part of the fold
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    train_r2, train_rmse, train_mae = _regression_scores(y_train_fold, y_train_pred)
    val_r2, val_rmse, val_mae = _regression_scores(y_val_fold, y_val_pred)

    # One row of the per-fold table (fold numbers are reported 1-based)
    metrics = {
        "Fold": fold + 1,
        "Train R2": train_r2,
        "Train RMSE": train_rmse,
        "Train MAE": train_mae,
        "Val R2": val_r2,
        "Val RMSE": val_rmse,
        "Val MAE": val_mae
    }
    fold_results.append(metrics)
    print(f"Fold {fold+1}/10 完成 | Val R2: {metrics['Val R2']:.4f}")

# Per-fold table with a trailing "Average" row; the averaged fold number is
# overwritten by the label.
df_cv_results = pd.DataFrame(fold_results)
avg_row = (
    df_cv_results.mean(numeric_only=True)
    .to_frame()
    .T
    .assign(Fold="Average")
)
df_final_cv = pd.concat([df_cv_results, avg_row], ignore_index=True)

print("\n>>> 十折交叉验证详细结果 (Per-Fold Results):")
# Widen the display so the table is printed without column truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(df_final_cv.round(4))

# ==========================================
# 5. Stage 3: final evaluation on the hold-out test set
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

# Fit the best model on the full training portion, then predict the test set
best_estimator.fit(X_train_full, y_train_full)
y_test_pred = best_estimator.predict(X_test)

# The last row of the CV table is the "Average" row
cv_average = df_final_cv.iloc[-1]

test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# One column per evaluation setting, one row per metric
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Training (Avg)": [
        cv_average["Train MAE"],
        cv_average["Train RMSE"],
        cv_average["Train R2"],
    ],
    "CV Validation (Avg)": [
        cv_average["Val MAE"],
        cv_average["Val RMSE"],
        cv_average["Val R2"],
    ],
    "Test Set (Final)": [test_mae, test_rmse, test_r2],
}

print("\n>>> 模型性能最终汇总 (Pre-standardized Data):")
print(pd.DataFrame(final_summary).round(4))

原始数据总量: 791
用于调优和CV的训练集 (80%): 632
独立测试集 (20%): 159

>>> Stage 1: 开始贝叶斯优化 (BayesSearchCV)...
贝叶斯优化耗时: 44.89 秒

>>> 最佳参数:
  regressor__max_depth: 22
  regressor__max_features: log2
  regressor__min_samples_leaf: 1
  regressor__min_samples_split: 2

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...
Fold 1/10 完成 | Val R2: -0.1407
Fold 2/10 完成 | Val R2: 0.5551
Fold 3/10 完成 | Val R2: 0.3752
Fold 4/10 完成 | Val R2: 0.5223
Fold 5/10 完成 | Val R2: 0.1061
Fold 6/10 完成 | Val R2: 0.6586
Fold 7/10 完成 | Val R2: 0.6000
Fold 8/10 完成 | Val R2: 0.0919
Fold 9/10 完成 | Val R2: 0.5191
Fold 10/10 完成 | Val R2: 0.3646

>>> 十折交叉验证详细结果 (Per-Fold Results):
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.9999      0.0073     0.0007 -0.1407    0.6358   0.3909
1         2    1.0000      0.0000     0.0000  0.5551    0.3845   0.2739
2         3    1.0000      0.0000     0.0000  0.3752    0.4286   0.2726
3         4    0.9997      0.0098     0.0009  0.5223    0.5208   0.3465
4         