In [1]:
import pandas as pd
import numpy as np
import time

# 引入贝叶斯优化库
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==========================================
# 1. Data loading and 80:20 split
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/标准化数据_无独热_Log变换12.18.csv'

# Read with pandas' default decoding first; only a decode failure triggers
# the GBK retry. The message reports which encoding actually worked.
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")
else:
    print("数据加载成功 (UTF-8)")

# The target column plus the listed columns are removed from the feature
# matrix. errors='ignore' means a listed column that is absent from the CSV
# is skipped silently instead of raising.
target_column = 'log_Separation factor'
excluded_columns = [target_column, 'polymer', 'DOI', 'Flux']
y = data[target_column]
X = data.drop(columns=excluded_columns, errors='ignore')

# Hold out 20% of the rows as an independent test set (fixed seed).
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_full, X_test, y_train_full, y_test = split_parts

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. Preprocessing + model pipeline
# ==========================================
categorical_features = ['Permeation type']
# Every other feature column is treated as numeric, in its original order.
numerical_features = [
    name for name in X.columns if name not in categorical_features
]

# Numeric columns are standardised; the categorical column is one-hot
# encoded. handle_unknown='ignore' keeps transform() from raising on a
# category that was not present when the encoder was fitted.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
)

# Pipeline: preprocessing -> SVR. The SVR is created with default
# hyperparameters here; the Bayesian search sets them afterwards.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR()),
])

# ==========================================
# 3. Stage 1: Bayesian hyperparameter optimisation
# ==========================================
print("\n" + "="*50)
print(">>> Stage 1: 开始贝叶斯优化 (BayesSearchCV)...")
print("="*50)

# Search space for the 'regressor' (SVR) step of the pipeline.
#   Real        -> continuous dimension (C, epsilon), sampled log-uniformly
#   Categorical -> discrete choice (gamma, kernel)
search_spaces = {
    'regressor__C': Real(0.1, 1000, prior='log-uniform'),
    'regressor__epsilon': Real(0.001, 1.0, prior='log-uniform'),
    'regressor__gamma': Categorical(['scale', 'auto']),
    # Single-option dimension: the kernel is fixed to RBF.
    'regressor__kernel': Categorical(['rbf']),
}

# Bayesian search over the pipeline, scored by 5-fold CV on the training split.
opt = BayesSearchCV(
    pipeline,
    search_spaces,
    n_iter=30,      # number of parameter settings to evaluate
    cv=5,           # inner 5-fold cross-validation
    n_jobs=-1,      # use all available cores
    random_state=42,
    scoring='neg_mean_absolute_error',  # maximising -MAE == minimising MAE
    verbose=0
)

# time.perf_counter() is the standard-library clock meant for measuring
# elapsed durations; time.time() is wall-clock time and can jump if the
# system clock is adjusted while the search is running.
start_time = time.perf_counter()
opt.fit(X_train_full, y_train_full)
end_time = time.perf_counter()

best_model = opt.best_estimator_
best_params = opt.best_params_

print(f"贝叶斯优化耗时: {end_time - start_time:.2f} 秒")
print("\n>>> 最佳参数 (Best Parameters):")
# best_params is a mapping (an OrderedDict); print one parameter per line.
for param, value in best_params.items():
    print(f"  {param}: {value}")

# ==========================================
# 4. Stage 2: 10-fold cross-validation with the best parameters
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)


def _score_split(label, y_true, y_pred):
    """Return R2, RMSE and MAE for one split, keyed '<label> R2' etc."""
    return {
        f"{label} R2": r2_score(y_true, y_pred),
        f"{label} RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        f"{label} MAE": mean_absolute_error(y_true, y_pred),
    }


kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_results = []

for fold_no, (train_idx, val_idx) in enumerate(kf.split(X_train_full), start=1):
    # Positional slices of the 80% training split for this fold.
    X_train_fold = X_train_full.iloc[train_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_val_fold = y_train_full.iloc[val_idx]

    # Refit the whole pipeline on the fold's training part, so the
    # preprocessing is learned from these rows only.
    best_model.fit(X_train_fold, y_train_fold)

    y_train_pred = best_model.predict(X_train_fold)
    y_val_pred = best_model.predict(X_val_fold)

    # One row per fold: fold number, then train metrics, then validation metrics.
    metrics = {"Fold": fold_no}
    metrics.update(_score_split("Train", y_train_fold, y_train_pred))
    metrics.update(_score_split("Val", y_val_fold, y_val_pred))
    fold_results.append(metrics)
    print(f"Fold {fold_no}/10 完成 | Val R2: {metrics['Val R2']:.4f}")

# Per-fold table with an extra "Average" row appended at the bottom.
df_cv_results = pd.DataFrame(fold_results)
column_means = df_cv_results.mean(numeric_only=True)
avg_row = column_means.to_frame().T.assign(Fold="Average")
df_final_cv = pd.concat([df_cv_results, avg_row], ignore_index=True)

print("\n>>> 十折交叉验证详细结果:")
# Widen the display so every metric column is printed on one line.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(df_final_cv.round(4))

# ==========================================
# 5. Stage 3: final evaluation on the independent test set
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

# Refit the tuned pipeline on the full 80% training split, then predict
# the 20% hold-out rows.
best_model.fit(X_train_full, y_train_full)
y_test_pred = best_model.predict(X_test)

# The last row of df_final_cv is the "Average" row of the 10-fold table.
cv_average = df_final_cv.iloc[-1]
cv_scores = [cv_average[col] for col in ("Val MAE", "Val RMSE", "Val R2")]

test_scores = [
    mean_absolute_error(y_test, y_test_pred),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
    r2_score(y_test, y_test_pred),
]

# Side-by-side comparison: cross-validation average vs. hold-out result.
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Validation (Avg)": cv_scores,
    "Test Set (Final)": test_scores,
}

print("\n>>> 模型性能最终汇总 (SVR + BayesOpt):")
print(pd.DataFrame(final_summary).round(4))

数据加载成功 (UTF-8)
原始数据总量: 816
用于调优和CV的训练集 (80%): 652
独立测试集 (20%): 164

>>> Stage 1: 开始贝叶斯优化 (BayesSearchCV)...
贝叶斯优化耗时: 30.17 秒

>>> 最佳参数 (Best Parameters):
  regressor__C: 21.09194643701318
  regressor__epsilon: 0.001
  regressor__gamma: scale
  regressor__kernel: rbf

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...
Fold 1/10 完成 | Val R2: 0.7699
Fold 2/10 完成 | Val R2: 0.7632
Fold 3/10 完成 | Val R2: 0.7576
Fold 4/10 完成 | Val R2: 0.6878
Fold 5/10 完成 | Val R2: 0.7204
Fold 6/10 完成 | Val R2: 0.7878
Fold 7/10 完成 | Val R2: 0.1876
Fold 8/10 完成 | Val R2: 0.4860
Fold 9/10 完成 | Val R2: 0.7040
Fold 10/10 完成 | Val R2: 0.7294

>>> 十折交叉验证详细结果:
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.8843      0.2499     0.1161  0.7699    0.3668   0.2214
1         2    0.8808      0.2614     0.1197  0.7632    0.2657   0.1624
2         3    0.8872      0.2490     0.1101  0.7576    0.3519   0.2190
3         4    0.8760      0.2638     0.1202  0.6878    0.3508   0.1839
4         

In [2]:
import pandas as pd
import numpy as np
import time

# 引入贝叶斯优化库
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==========================================
# 1. Data loading and 80:20 split
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/通量标准化数据_无独热_Log变换12.18.csv'

# Mirror the loader used for the separation-factor dataset in this notebook:
# try pandas' default decoding first and retry as GBK only when decoding
# fails, so a GBK-encoded CSV does not abort the cell with a
# UnicodeDecodeError.
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")
else:
    print("数据加载成功 (UTF-8)")

# For the flux model the target is 'log_Flux'. The target and the listed
# columns are removed from the feature matrix; errors='ignore' means a
# listed column that is absent from the CSV is skipped silently.
excluded_columns = ['log_Flux', 'polymer', 'DOI', 'Separation factor']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Flux']

# Hold out 20% of the rows as an independent test set (fixed seed).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. Preprocessing + model pipeline
# ==========================================
categorical_features = ['Permeation type']
# Every other feature column is treated as numeric, in its original order.
numerical_features = [
    name for name in X.columns if name not in categorical_features
]

# Numeric columns are standardised; the categorical column is one-hot
# encoded. handle_unknown='ignore' keeps transform() from raising on a
# category that was not present when the encoder was fitted.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
)

# Pipeline: preprocessing -> SVR. The SVR is created with default
# hyperparameters here; the Bayesian search sets them afterwards.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR()),
])

# ==========================================
# 3. Stage 1: Bayesian hyperparameter optimisation
# ==========================================
print("\n" + "="*50)
print(">>> Stage 1: 开始贝叶斯优化 (BayesSearchCV)...")
print("="*50)

# Search space for the 'regressor' (SVR) step of the pipeline.
#   Real        -> continuous dimension (C, epsilon), sampled log-uniformly
#   Categorical -> discrete choice (gamma, kernel)
search_spaces = {
    'regressor__C': Real(0.1, 1000, prior='log-uniform'),
    'regressor__epsilon': Real(0.001, 1.0, prior='log-uniform'),
    'regressor__gamma': Categorical(['scale', 'auto']),
    # Single-option dimension: the kernel is fixed to RBF.
    'regressor__kernel': Categorical(['rbf']),
}

# Bayesian search over the pipeline, scored by 5-fold CV on the training split.
opt = BayesSearchCV(
    pipeline,
    search_spaces,
    n_iter=30,      # number of parameter settings to evaluate
    cv=5,           # inner 5-fold cross-validation
    n_jobs=-1,      # use all available cores
    random_state=42,
    scoring='neg_mean_absolute_error',  # maximising -MAE == minimising MAE
    verbose=0
)

# time.perf_counter() is the standard-library clock meant for measuring
# elapsed durations; time.time() is wall-clock time and can jump if the
# system clock is adjusted while the search is running.
start_time = time.perf_counter()
opt.fit(X_train_full, y_train_full)
end_time = time.perf_counter()

best_model = opt.best_estimator_
best_params = opt.best_params_

print(f"贝叶斯优化耗时: {end_time - start_time:.2f} 秒")
print("\n>>> 最佳参数 (Best Parameters):")
# best_params is a mapping (an OrderedDict); print one parameter per line.
for param, value in best_params.items():
    print(f"  {param}: {value}")

# ==========================================
# 4. Stage 2: 10-fold cross-validation with the best parameters
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)


def _score_split(label, y_true, y_pred):
    """Return R2, RMSE and MAE for one split, keyed '<label> R2' etc."""
    return {
        f"{label} R2": r2_score(y_true, y_pred),
        f"{label} RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        f"{label} MAE": mean_absolute_error(y_true, y_pred),
    }


kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_results = []

for fold_no, (train_idx, val_idx) in enumerate(kf.split(X_train_full), start=1):
    # Positional slices of the 80% training split for this fold.
    X_train_fold = X_train_full.iloc[train_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_val_fold = y_train_full.iloc[val_idx]

    # Refit the whole pipeline on the fold's training part, so the
    # preprocessing is learned from these rows only.
    best_model.fit(X_train_fold, y_train_fold)

    y_train_pred = best_model.predict(X_train_fold)
    y_val_pred = best_model.predict(X_val_fold)

    # One row per fold: fold number, then train metrics, then validation metrics.
    metrics = {"Fold": fold_no}
    metrics.update(_score_split("Train", y_train_fold, y_train_pred))
    metrics.update(_score_split("Val", y_val_fold, y_val_pred))
    fold_results.append(metrics)
    print(f"Fold {fold_no}/10 完成 | Val R2: {metrics['Val R2']:.4f}")

# Per-fold table with an extra "Average" row appended at the bottom.
df_cv_results = pd.DataFrame(fold_results)
column_means = df_cv_results.mean(numeric_only=True)
avg_row = column_means.to_frame().T.assign(Fold="Average")
df_final_cv = pd.concat([df_cv_results, avg_row], ignore_index=True)

print("\n>>> 十折交叉验证详细结果:")
# Widen the display so every metric column is printed on one line.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(df_final_cv.round(4))

# ==========================================
# 5. Stage 3: final evaluation on the independent test set
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

# Refit the tuned pipeline on the full 80% training split, then predict
# the 20% hold-out rows.
best_model.fit(X_train_full, y_train_full)
y_test_pred = best_model.predict(X_test)

# The last row of df_final_cv is the "Average" row of the 10-fold table.
cv_average = df_final_cv.iloc[-1]
cv_scores = [cv_average[col] for col in ("Val MAE", "Val RMSE", "Val R2")]

test_scores = [
    mean_absolute_error(y_test, y_test_pred),
    np.sqrt(mean_squared_error(y_test, y_test_pred)),
    r2_score(y_test, y_test_pred),
]

# Side-by-side comparison: cross-validation average vs. hold-out result.
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Validation (Avg)": cv_scores,
    "Test Set (Final)": test_scores,
}

print("\n>>> 模型性能最终汇总 (SVR + BayesOpt):")
print(pd.DataFrame(final_summary).round(4))

原始数据总量: 791
用于调优和CV的训练集 (80%): 632
独立测试集 (20%): 159

>>> Stage 1: 开始贝叶斯优化 (BayesSearchCV)...
贝叶斯优化耗时: 21.14 秒

>>> 最佳参数 (Best Parameters):
  regressor__C: 15.69541931478427
  regressor__epsilon: 0.04143369067161429
  regressor__gamma: scale
  regressor__kernel: rbf

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...
Fold 1/10 完成 | Val R2: 0.0486
Fold 2/10 完成 | Val R2: 0.5746
Fold 3/10 完成 | Val R2: 0.7049
Fold 4/10 完成 | Val R2: 0.4450
Fold 5/10 完成 | Val R2: 0.5135
Fold 6/10 完成 | Val R2: 0.7844
Fold 7/10 完成 | Val R2: 0.5053
Fold 8/10 完成 | Val R2: 0.6492
Fold 9/10 完成 | Val R2: 0.4709
Fold 10/10 完成 | Val R2: 0.4929

>>> 十折交叉验证详细结果:
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.8291      0.2580     0.1387  0.0486    0.5806   0.3496
1         2    0.7781      0.2955     0.1488  0.5746    0.3760   0.2568
2         3    0.7707      0.3021     0.1523  0.7049    0.2945   0.2111
3         4    0.7771      0.2862     0.1441  0.4450    0.5613   0.3279
4         5