In [1]:
import pandas as pd
import numpy as np
import time

# 引入贝叶斯优化库
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone

# ==========================================
# 1. Data loading and splitting (80:20)
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/标准化数据_无独热_Log变换12.18.csv'

# Try pandas' default decoding first; only when the bytes cannot be decoded
# that way is the file re-read as GBK.
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")
else:
    print("数据加载成功 (UTF-8)")

# The regression target, followed by the columns that must not become features.
# errors='ignore' below means a listed column that is absent from the CSV is
# skipped silently instead of raising.
# NOTE(review): confirm these names match the CSV headers exactly; a name that
# does not match would be left in X as a feature.
target_column = 'log_Separation factor'
excluded_columns = [target_column, 'polymer', 'DOI', 'Flux']
y = data[target_column]
X = data.drop(columns=excluded_columns, errors='ignore')

# Hold out 20% as an independent test set; the remaining 80% (X_train_full)
# is what the Bayesian tuning and the 10-fold cross-validation operate on.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_full, X_test, y_train_full, y_test = holdout_split

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. Build the preprocessing + model pipeline
# ==========================================
categorical_features = ['Permeation type']
# Every feature column that is not listed as categorical is handled as numeric.
numerical_features = [
    col for col in X.columns if col not in categorical_features
]

# Preprocessor:
#   * numeric columns are passed through untouched (the input file is
#     described as already standardised),
#   * the categorical column is one-hot encoded into a dense array.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', one_hot_encoder, categorical_features),
    ]
)

# Pipeline: preprocessing -> random forest.
# n_jobs=-1 lets the forest use all available CPU cores.
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', random_forest),
])

# ==========================================
# 3. Stage 1: Bayesian hyper-parameter optimisation
# ==========================================
print("\n" + "="*50)
print(">>> Stage 1: 开始随机森林贝叶斯优化 (BayesSearchCV)...")
print("="*50)

# Search space for the random forest. Keys use the '<step>__<param>' form so
# they address the 'regressor' step of the pipeline.
search_spaces = {
    'regressor__n_estimators': Integer(50, 500),         # number of trees
    'regressor__max_depth': Integer(5, 50),              # maximum tree depth
    'regressor__min_samples_split': Integer(2, 10),      # min samples needed to split a node
    'regressor__min_samples_leaf': Integer(1, 10),       # min samples required in a leaf
    'regressor__max_features': Categorical(['sqrt', 'log2'])  # feature-subset strategy
}

# Bayesian search: n_iter=30 evaluates 30 parameter settings, each scored with
# 5-fold CV on negative MAE. Raise n_iter (e.g. to 50) if compute allows.
opt = BayesSearchCV(
    pipeline,
    search_spaces,
    n_iter=30,
    cv=5,           # inner 5-fold CV used only during tuning
    n_jobs=-1,      # run evaluations in parallel
    random_state=42,
    scoring='neg_mean_absolute_error',
    verbose=0
)

# perf_counter() is a monotonic clock intended for measuring intervals, so the
# reported duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
opt.fit(X_train_full, y_train_full)
end_time = time.perf_counter()

# best_estimator_ is the complete pipeline (preprocessing + tuned forest).
best_estimator = opt.best_estimator_
best_params = opt.best_params_

print(f"贝叶斯优化耗时: {end_time - start_time:.2f} 秒")
print("\n>>> 最佳参数:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# ==========================================
# 4. Stage 2: 10-fold cross-validation with the best parameters
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)


def _fold_metrics(prefix, y_true, y_pred):
    """Return R2, RMSE and MAE for one split, keyed as '<prefix> <metric>'."""
    return {
        f"{prefix} R2": r2_score(y_true, y_pred),
        f"{prefix} RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        f"{prefix} MAE": mean_absolute_error(y_true, y_pred),
    }


kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Take the fold count from the splitter so the progress message cannot drift
# out of sync with n_splits.
n_folds = kf.get_n_splits()
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    # 4.1 Slice this fold's training / validation rows (positional indices).
    X_train_fold = X_train_full.iloc[train_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_val_fold = y_train_full.iloc[val_idx]

    # 4.2 Clone the tuned pipeline so every fold trains from scratch.
    model = clone(best_estimator)

    # 4.3 Fit; the pipeline applies the one-hot encoding itself.
    model.fit(X_train_fold, y_train_fold)

    # 4.4 Predict on both the fitted rows and the held-out rows.
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    # 4.5 Record the metrics for this fold (1-based fold number).
    metrics = {
        "Fold": fold + 1,
        **_fold_metrics("Train", y_train_fold, y_train_pred),
        **_fold_metrics("Val", y_val_fold, y_val_pred),
    }
    fold_results.append(metrics)
    print(f"Fold {fold+1}/{n_folds} 完成 | Val R2: {metrics['Val R2']:.4f}")

# Per-fold table plus a final "Average" row. The mean of the Fold column is
# meaningless, so it is overwritten with the label.
df_cv_results = pd.DataFrame(fold_results)
avg_row = df_cv_results.mean(numeric_only=True).to_frame().T
avg_row["Fold"] = "Average"
df_final_cv = pd.concat([df_cv_results, avg_row], ignore_index=True)

print("\n>>> 十折交叉验证详细结果 (Per-Fold Results):")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(df_final_cv.round(4))

# ==========================================
# 5. Stage 3: final evaluation on the independent test set
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

# Fit the tuned pipeline on the full 80% training partition.
# NOTE(review): BayesSearchCV refits best_estimator_ by default, so this fit
# may be redundant - confirm before removing.
best_estimator.fit(X_train_full, y_train_full)

# Predict the 20% hold-out rows.
y_test_pred = best_estimator.predict(X_test)

# The last row of the CV table is the "Average" row.
cv_average = df_final_cv.iloc[-1]

test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# One row per metric; columns compare CV-train, CV-validation and test scores.
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Training (Avg)": [
        cv_average["Train MAE"],
        cv_average["Train RMSE"],
        cv_average["Train R2"],
    ],
    "CV Validation (Avg)": [
        cv_average["Val MAE"],
        cv_average["Val RMSE"],
        cv_average["Val R2"],
    ],
    "Test Set (Final)": [test_mae, test_rmse, test_r2],
}

print("\n>>> 模型性能最终汇总 (Random Forest + BayesOpt):")
summary_table = pd.DataFrame(final_summary)
print(summary_table.round(4))

数据加载成功 (UTF-8)
原始数据总量: 816
用于调优和CV的训练集 (80%): 652
独立测试集 (20%): 164

>>> Stage 1: 开始随机森林贝叶斯优化 (BayesSearchCV)...
贝叶斯优化耗时: 47.89 秒

>>> 最佳参数:
  regressor__max_depth: 25
  regressor__max_features: log2
  regressor__min_samples_leaf: 1
  regressor__min_samples_split: 2
  regressor__n_estimators: 500

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...
Fold 1/10 完成 | Val R2: 0.8123
Fold 2/10 完成 | Val R2: 0.8222
Fold 3/10 完成 | Val R2: 0.7910
Fold 4/10 完成 | Val R2: 0.8819
Fold 5/10 完成 | Val R2: 0.8761
Fold 6/10 完成 | Val R2: 0.9109
Fold 7/10 完成 | Val R2: 0.7537
Fold 8/10 完成 | Val R2: 0.8117
Fold 9/10 完成 | Val R2: 0.8360
Fold 10/10 完成 | Val R2: 0.7741

>>> 十折交叉验证详细结果 (Per-Fold Results):
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.9795      0.1051     0.0651  0.8123    0.3313   0.2091
1         2    0.9789      0.1100     0.0671  0.8222    0.2302   0.1353
2         3    0.9787      0.1081     0.0661  0.7910    0.3268   0.1891
3         4    0.9774      0.1125

In [2]:
import pandas as pd
import numpy as np
import time

# 引入贝叶斯优化库
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone

# ==========================================
# 1. Data loading and splitting (80:20)
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/通量标准化数据_无独热_Log变换12.18.csv'

# Same loading strategy as the separation-factor cell: try pandas' default
# decoding first and fall back to GBK only if the bytes cannot be decoded.
try:
    data = pd.read_csv(file_path)
    print("数据加载成功 (UTF-8)")
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")

# Columns kept out of the feature matrix for the flux model: the target
# itself, two other listed columns, and 'Separation factor'.
# errors='ignore' means a listed column that is absent from the CSV is
# skipped silently instead of raising.
# NOTE(review): confirm these names match the CSV headers exactly; a name that
# does not match would be left in X as a feature.
excluded_columns = ['log_Flux', 'polymer', 'DOI', 'Separation factor']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Flux']

# Hold out 20% as an independent test set; the remaining 80% (X_train_full)
# is what the Bayesian tuning and the 10-fold cross-validation operate on.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. Build the preprocessing + model pipeline
# ==========================================
categorical_features = ['Permeation type']
# Every feature column that is not listed as categorical is handled as numeric.
numerical_features = [
    col for col in X.columns if col not in categorical_features
]

# Preprocessor:
#   * numeric columns are passed through untouched (the input file is
#     described as already standardised),
#   * the categorical column is one-hot encoded into a dense array.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', one_hot_encoder, categorical_features),
    ]
)

# Pipeline: preprocessing -> random forest.
# n_jobs=-1 lets the forest use all available CPU cores.
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', random_forest),
])

# ==========================================
# 3. Stage 1: Bayesian hyper-parameter optimisation
# ==========================================
print("\n" + "="*50)
print(">>> Stage 1: 开始随机森林贝叶斯优化 (BayesSearchCV)...")
print("="*50)

# Search space for the random forest. Keys use the '<step>__<param>' form so
# they address the 'regressor' step of the pipeline.
search_spaces = {
    'regressor__n_estimators': Integer(50, 500),         # number of trees
    'regressor__max_depth': Integer(5, 50),              # maximum tree depth
    'regressor__min_samples_split': Integer(2, 10),      # min samples needed to split a node
    'regressor__min_samples_leaf': Integer(1, 10),       # min samples required in a leaf
    'regressor__max_features': Categorical(['sqrt', 'log2'])  # feature-subset strategy
}

# Bayesian search: n_iter=30 evaluates 30 parameter settings, each scored with
# 5-fold CV on negative MAE. Raise n_iter (e.g. to 50) if compute allows.
opt = BayesSearchCV(
    pipeline,
    search_spaces,
    n_iter=30,
    cv=5,           # inner 5-fold CV used only during tuning
    n_jobs=-1,      # run evaluations in parallel
    random_state=42,
    scoring='neg_mean_absolute_error',
    verbose=0
)

# perf_counter() is a monotonic clock intended for measuring intervals, so the
# reported duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
opt.fit(X_train_full, y_train_full)
end_time = time.perf_counter()

# best_estimator_ is the complete pipeline (preprocessing + tuned forest).
best_estimator = opt.best_estimator_
best_params = opt.best_params_

print(f"贝叶斯优化耗时: {end_time - start_time:.2f} 秒")
print("\n>>> 最佳参数:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# ==========================================
# 4. Stage 2: 10-fold cross-validation with the best parameters
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)


def _fold_metrics(prefix, y_true, y_pred):
    """Return R2, RMSE and MAE for one split, keyed as '<prefix> <metric>'."""
    return {
        f"{prefix} R2": r2_score(y_true, y_pred),
        f"{prefix} RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        f"{prefix} MAE": mean_absolute_error(y_true, y_pred),
    }


kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Take the fold count from the splitter so the progress message cannot drift
# out of sync with n_splits.
n_folds = kf.get_n_splits()
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    # 4.1 Slice this fold's training / validation rows (positional indices).
    X_train_fold = X_train_full.iloc[train_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_val_fold = y_train_full.iloc[val_idx]

    # 4.2 Clone the tuned pipeline so every fold trains from scratch.
    model = clone(best_estimator)

    # 4.3 Fit; the pipeline applies the one-hot encoding itself.
    model.fit(X_train_fold, y_train_fold)

    # 4.4 Predict on both the fitted rows and the held-out rows.
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    # 4.5 Record the metrics for this fold (1-based fold number).
    metrics = {
        "Fold": fold + 1,
        **_fold_metrics("Train", y_train_fold, y_train_pred),
        **_fold_metrics("Val", y_val_fold, y_val_pred),
    }
    fold_results.append(metrics)
    print(f"Fold {fold+1}/{n_folds} 完成 | Val R2: {metrics['Val R2']:.4f}")

# Per-fold table plus a final "Average" row. The mean of the Fold column is
# meaningless, so it is overwritten with the label.
df_cv_results = pd.DataFrame(fold_results)
avg_row = df_cv_results.mean(numeric_only=True).to_frame().T
avg_row["Fold"] = "Average"
df_final_cv = pd.concat([df_cv_results, avg_row], ignore_index=True)

print("\n>>> 十折交叉验证详细结果 (Per-Fold Results):")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(df_final_cv.round(4))

# ==========================================
# 5. Stage 3: final evaluation on the independent test set
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

# Fit the tuned pipeline on the full 80% training partition.
# NOTE(review): BayesSearchCV refits best_estimator_ by default, so this fit
# may be redundant - confirm before removing.
best_estimator.fit(X_train_full, y_train_full)

# Predict the 20% hold-out rows.
y_test_pred = best_estimator.predict(X_test)

# The last row of the CV table is the "Average" row.
cv_average = df_final_cv.iloc[-1]

test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# One row per metric; columns compare CV-train, CV-validation and test scores.
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Training (Avg)": [
        cv_average["Train MAE"],
        cv_average["Train RMSE"],
        cv_average["Train R2"],
    ],
    "CV Validation (Avg)": [
        cv_average["Val MAE"],
        cv_average["Val RMSE"],
        cv_average["Val R2"],
    ],
    "Test Set (Final)": [test_mae, test_rmse, test_r2],
}

print("\n>>> 模型性能最终汇总 (Random Forest + BayesOpt):")
summary_table = pd.DataFrame(final_summary)
print(summary_table.round(4))

原始数据总量: 791
用于调优和CV的训练集 (80%): 632
独立测试集 (20%): 159

>>> Stage 1: 开始随机森林贝叶斯优化 (BayesSearchCV)...
贝叶斯优化耗时: 40.93 秒

>>> 最佳参数:
  regressor__max_depth: 42
  regressor__max_features: sqrt
  regressor__min_samples_leaf: 2
  regressor__min_samples_split: 4
  regressor__n_estimators: 500

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...
Fold 1/10 完成 | Val R2: 0.4502
Fold 2/10 完成 | Val R2: 0.7810
Fold 3/10 完成 | Val R2: 0.6564
Fold 4/10 完成 | Val R2: 0.6615
Fold 5/10 完成 | Val R2: 0.5836
Fold 6/10 完成 | Val R2: 0.7438
Fold 7/10 完成 | Val R2: 0.6521
Fold 8/10 完成 | Val R2: 0.7471
Fold 9/10 完成 | Val R2: 0.6822
Fold 10/10 完成 | Val R2: 0.5341

>>> 十折交叉验证详细结果 (Per-Fold Results):
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.8850      0.2116     0.1407  0.4502    0.4414   0.2708
1         2    0.8751      0.2217     0.1479  0.7810    0.2698   0.2033
2         3    0.8827      0.2160     0.1433  0.6564    0.3179   0.2200
3         4    0.8727      0.2162     0.1410  0.