In [1]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb

# 引入贝叶斯优化库
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone

# ==========================================
# 1. Load the data and split it 80:20
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/标准化数据_无独热_Log变换12.18.csv'

# Read with pandas' default decoding first; if the bytes cannot be decoded
# that way, read the same file again as GBK.
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")
else:
    print("数据加載成功 (UTF-8)".replace("載", "载"))

# Everything except the target and the listed columns becomes a feature.
# NOTE(review): errors='ignore' silently skips names that are not in the
# file, so a misspelled or renamed column would stay in X -- confirm that no
# target-derived column is left among the features.
excluded_columns = ['log_Separation factor', 'polymer', 'DOI', 'Flux']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Separation factor']

# Hold out 20% of the rows as the independent test set; the remaining 80%
# is what the later tuning and cross-validation stages work on.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. Build the preprocessing + model pipeline
# ==========================================
categorical_features = ['Permeation type']
# Every remaining column of X is treated as a numerical feature.
numerical_features = [
    name for name in X.columns if name not in categorical_features
]

# Numerical columns are passed through untouched (the original author notes
# that they are already standardised); the categorical column is one-hot
# encoded, and categories unseen during fit are ignored at transform time.
preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numerical_features),
    (
        'cat',
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        categorical_features,
    ),
])

# Full pipeline: preprocessing followed by LightGBM's scikit-learn style
# regressor (LGBMRegressor).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(random_state=42, verbose=-1)),
])

# ==========================================
# 3. Stage 1: Bayesian hyper-parameter optimisation
# ==========================================
print("\n" + "="*50)
print(">>> Stage 1: 开始 LightGBM 贝叶斯优化 (BayesSearchCV)...")
print("="*50)

# Search space. The "regressor__" prefix routes each parameter to the
# 'regressor' step of the pipeline. num_leaves and max_depth are searched
# together because both govern tree complexity.
search_spaces = {
    # learning rate, sampled on a log scale
    'regressor__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    # number of boosted trees
    'regressor__n_estimators': Integer(100, 1000),
    # leaves per tree (controls complexity)
    'regressor__num_leaves': Integer(20, 100),
    # maximum tree depth
    'regressor__max_depth': Integer(3, 15),
    # minimum samples per leaf (guards against over-fitting)
    'regressor__min_child_samples': Integer(5, 50),
    # L1 regularisation strength
    'regressor__reg_alpha': Real(0.0, 1.0),
    # L2 regularisation strength
    'regressor__reg_lambda': Real(0.0, 1.0),
}

# 30 search iterations, each scored by 5-fold CV on negative MAE, run in
# parallel on all available cores.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_spaces,
    n_iter=30,
    cv=5,
    n_jobs=-1,
    random_state=42,
    scoring='neg_mean_absolute_error',
    verbose=0,
)

# Time the search on the 80% training portion only.
start_time = time.time()
opt.fit(X_train_full, y_train_full)
end_time = time.time()

# best_estimator_ is the complete pipeline (preprocessing included).
best_estimator = opt.best_estimator_
best_params = opt.best_params_

print(f"贝叶斯优化耗时: {end_time - start_time:.2f} 秒")
print("\n>>> 最佳参数:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# ==========================================
# 4. Stage 2: 10-fold cross-validation with the best parameters
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)


def _split_metrics(prefix, y_true, y_pred):
    """Return R2, RMSE and MAE for one data split.

    Args:
        prefix: Label prepended to every key, e.g. "Train" or "Val".
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys "<prefix> R2", "<prefix> RMSE" and "<prefix> MAE",
        in that order.
    """
    return {
        f"{prefix} R2": r2_score(y_true, y_pred),
        f"{prefix} RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        f"{prefix} MAE": mean_absolute_error(y_true, y_pred),
    }


kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Ask the splitter for the fold count so the progress message cannot drift
# from n_splits above.
n_folds = kf.get_n_splits()
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    # 4.1 Slice this fold's training / validation rows by position.
    X_train_fold = X_train_full.iloc[train_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_val_fold = y_train_full.iloc[val_idx]

    # 4.2 Clone the tuned pipeline so every fold starts from an unfitted
    # copy carrying the same hyper-parameters.
    model = clone(best_estimator)

    # 4.3 Fit; the pipeline applies the one-hot encoding itself.
    model.fit(X_train_fold, y_train_fold)

    # 4.4 Predict on both partitions of the fold.
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    # 4.5 Record the metrics (key order defines the table's column order).
    metrics = {
        "Fold": fold + 1,
        **_split_metrics("Train", y_train_fold, y_train_pred),
        **_split_metrics("Val", y_val_fold, y_val_pred),
    }
    fold_results.append(metrics)
    print(f"Fold {fold+1}/{n_folds} 完成 | Val R2: {metrics['Val R2']:.4f}")

# Per-fold results table with a trailing "Average" row.
df_cv_results = pd.DataFrame(fold_results)

# Column means as a one-row frame; the averaged "Fold" number is meaningless,
# so it is replaced by the label "Average".
column_means = df_cv_results.mean(numeric_only=True)
avg_row = column_means.to_frame().transpose()
avg_row = avg_row.assign(Fold="Average")

df_final_cv = pd.concat(objs=[df_cv_results, avg_row], ignore_index=True)

# Widen pandas' display limits so the table prints without column truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("\n>>> 十折交叉验证详细结果 (Per-Fold Results):")
print(df_final_cv.round(4))

# ==========================================
# 5. Stage 3: final evaluation on the independent test set
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

# Fit the tuned pipeline on the whole 80% training portion.
# NOTE(review): the search object may already have refit best_estimator_ on
# this same data (its refit default) -- confirm; if so this call only
# repeats that fit.
best_estimator.fit(X_train_full, y_train_full)

# Predict the 20% hold-out rows.
y_test_pred = best_estimator.predict(X_test)

# The last row of df_final_cv is the "Average" row of the CV table.
cv_average = df_final_cv.iloc[-1]

test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# One row per metric, one column per evaluation setting.
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Training (Avg)": [
        cv_average[key] for key in ("Train MAE", "Train RMSE", "Train R2")
    ],
    "CV Validation (Avg)": [
        cv_average[key] for key in ("Val MAE", "Val RMSE", "Val R2")
    ],
    "Test Set (Final)": [test_mae, test_rmse, test_r2],
}

print("\n>>> 模型性能最终汇总 (LightGBM + BayesOpt):")
print(pd.DataFrame(final_summary).round(4))

数据加载成功 (UTF-8)
原始数据总量: 816
用于调优和CV的训练集 (80%): 652
独立测试集 (20%): 164

>>> Stage 1: 开始 LightGBM 贝叶斯优化 (BayesSearchCV)...
贝叶斯优化耗时: 59.95 秒

>>> 最佳参数:
  regressor__learning_rate: 0.13254000461783513
  regressor__max_depth: 15
  regressor__min_child_samples: 5
  regressor__n_estimators: 100
  regressor__num_leaves: 46
  regressor__reg_alpha: 0.0
  regressor__reg_lambda: 1.0

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...
Fold 1/10 完成 | Val R2: 0.7496
Fold 2/10 完成 | Val R2: 0.7401
Fold 3/10 完成 | Val R2: 0.7998
Fold 4/10 完成 | Val R2: 0.8806
Fold 5/10 完成 | Val R2: 0.8647
Fold 6/10 完成 | Val R2: 0.9020
Fold 7/10 完成 | Val R2: 0.7736
Fold 8/10 完成 | Val R2: 0.7229
Fold 9/10 完成 | Val R2: 0.8820
Fold 10/10 完成 | Val R2: 0.7713

>>> 十折交叉验证详细结果 (Per-Fold Results):
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.9979      0.0334     0.0215  0.7496    0.3826   0.2154
1         2    0.9971      0.0407     0.0241  0.7401    0.2784   0.1380
2         3    0.9971      0.03

In [2]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb

# 引入贝叶斯优化库
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone

# ==========================================
# 1. Load the data and split it 80:20
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/通量标准化数据_无独热_Log变换12.18.csv'

# Same loading strategy as the separation-factor cell: try pandas' default
# decoding first and fall back to GBK, so a GBK-encoded export of this file
# does not abort the run with UnicodeDecodeError.
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")
else:
    print("数据加载成功 (UTF-8)")

# The target here is log_Flux. Identifier columns and the separation factor
# are removed from the features.
# NOTE(review): errors='ignore' silently skips names that are not in the
# file. If this file stores the separation factor under a different name
# (the sibling cell's data uses 'log_Separation factor'), that column would
# remain in X as a feature -- confirm the actual column names.
excluded_columns = ['log_Flux', 'polymer', 'DOI', 'Separation factor']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Flux']

# Hold out 20% of the rows as the independent test set; the remaining 80%
# is what the later tuning and cross-validation stages work on.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. Build the preprocessing + model pipeline
# ==========================================
categorical_features = ['Permeation type']
# Every remaining column of X is treated as a numerical feature.
numerical_features = [
    name for name in X.columns if name not in categorical_features
]

# Numerical columns are passed through untouched (the original author notes
# that they are already standardised); the categorical column is one-hot
# encoded, and categories unseen during fit are ignored at transform time.
preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numerical_features),
    (
        'cat',
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        categorical_features,
    ),
])

# Full pipeline: preprocessing followed by LightGBM's scikit-learn style
# regressor (LGBMRegressor).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(random_state=42, verbose=-1)),
])

# ==========================================
# 3. Stage 1: Bayesian hyper-parameter optimisation
# ==========================================
print("\n" + "="*50)
print(">>> Stage 1: 开始 LightGBM 贝叶斯优化 (BayesSearchCV)...")
print("="*50)

# Search space. The "regressor__" prefix routes each parameter to the
# 'regressor' step of the pipeline. num_leaves and max_depth are searched
# together because both govern tree complexity.
search_spaces = {
    # learning rate, sampled on a log scale
    'regressor__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    # number of boosted trees
    'regressor__n_estimators': Integer(100, 1000),
    # leaves per tree (controls complexity)
    'regressor__num_leaves': Integer(20, 100),
    # maximum tree depth
    'regressor__max_depth': Integer(3, 15),
    # minimum samples per leaf (guards against over-fitting)
    'regressor__min_child_samples': Integer(5, 50),
    # L1 regularisation strength
    'regressor__reg_alpha': Real(0.0, 1.0),
    # L2 regularisation strength
    'regressor__reg_lambda': Real(0.0, 1.0),
}

# 30 search iterations, each scored by 5-fold CV on negative MAE, run in
# parallel on all available cores.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_spaces,
    n_iter=30,
    cv=5,
    n_jobs=-1,
    random_state=42,
    scoring='neg_mean_absolute_error',
    verbose=0,
)

# Time the search on the 80% training portion only.
start_time = time.time()
opt.fit(X_train_full, y_train_full)
end_time = time.time()

# best_estimator_ is the complete pipeline (preprocessing included).
best_estimator = opt.best_estimator_
best_params = opt.best_params_

print(f"贝叶斯优化耗时: {end_time - start_time:.2f} 秒")
print("\n>>> 最佳参数:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# ==========================================
# 4. Stage 2: 10-fold cross-validation with the best parameters
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)


def _split_metrics(prefix, y_true, y_pred):
    """Return R2, RMSE and MAE for one data split.

    Args:
        prefix: Label prepended to every key, e.g. "Train" or "Val".
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys "<prefix> R2", "<prefix> RMSE" and "<prefix> MAE",
        in that order.
    """
    return {
        f"{prefix} R2": r2_score(y_true, y_pred),
        f"{prefix} RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        f"{prefix} MAE": mean_absolute_error(y_true, y_pred),
    }


kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Ask the splitter for the fold count so the progress message cannot drift
# from n_splits above.
n_folds = kf.get_n_splits()
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    # 4.1 Slice this fold's training / validation rows by position.
    X_train_fold = X_train_full.iloc[train_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_val_fold = y_train_full.iloc[val_idx]

    # 4.2 Clone the tuned pipeline so every fold starts from an unfitted
    # copy carrying the same hyper-parameters.
    model = clone(best_estimator)

    # 4.3 Fit; the pipeline applies the one-hot encoding itself.
    model.fit(X_train_fold, y_train_fold)

    # 4.4 Predict on both partitions of the fold.
    y_train_pred = model.predict(X_train_fold)
    y_val_pred = model.predict(X_val_fold)

    # 4.5 Record the metrics (key order defines the table's column order).
    metrics = {
        "Fold": fold + 1,
        **_split_metrics("Train", y_train_fold, y_train_pred),
        **_split_metrics("Val", y_val_fold, y_val_pred),
    }
    fold_results.append(metrics)
    print(f"Fold {fold+1}/{n_folds} 完成 | Val R2: {metrics['Val R2']:.4f}")

# Per-fold results table with a trailing "Average" row.
df_cv_results = pd.DataFrame(fold_results)

# Column means as a one-row frame; the averaged "Fold" number is meaningless,
# so it is replaced by the label "Average".
column_means = df_cv_results.mean(numeric_only=True)
avg_row = column_means.to_frame().transpose()
avg_row = avg_row.assign(Fold="Average")

df_final_cv = pd.concat(objs=[df_cv_results, avg_row], ignore_index=True)

# Widen pandas' display limits so the table prints without column truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("\n>>> 十折交叉验证详细结果 (Per-Fold Results):")
print(df_final_cv.round(4))

# ==========================================
# 5. Stage 3: final evaluation on the independent test set
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

# Fit the tuned pipeline on the whole 80% training portion.
# NOTE(review): the search object may already have refit best_estimator_ on
# this same data (its refit default) -- confirm; if so this call only
# repeats that fit.
best_estimator.fit(X_train_full, y_train_full)

# Predict the 20% hold-out rows.
y_test_pred = best_estimator.predict(X_test)

# The last row of df_final_cv is the "Average" row of the CV table.
cv_average = df_final_cv.iloc[-1]

test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# One row per metric, one column per evaluation setting.
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Training (Avg)": [
        cv_average[key] for key in ("Train MAE", "Train RMSE", "Train R2")
    ],
    "CV Validation (Avg)": [
        cv_average[key] for key in ("Val MAE", "Val RMSE", "Val R2")
    ],
    "Test Set (Final)": [test_mae, test_rmse, test_r2],
}

print("\n>>> 模型性能最终汇总 (LightGBM + BayesOpt):")
print(pd.DataFrame(final_summary).round(4))

原始数据总量: 791
用于调优和CV的训练集 (80%): 632
独立测试集 (20%): 159

>>> Stage 1: 开始 LightGBM 贝叶斯优化 (BayesSearchCV)...
贝叶斯优化耗时: 42.96 秒

>>> 最佳参数:
  regressor__learning_rate: 0.07743392536999648
  regressor__max_depth: 15
  regressor__min_child_samples: 5
  regressor__n_estimators: 1000
  regressor__num_leaves: 20
  regressor__reg_alpha: 0.0
  regressor__reg_lambda: 0.0

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...
Fold 1/10 完成 | Val R2: 0.5279
Fold 2/10 完成 | Val R2: 0.8556
Fold 3/10 完成 | Val R2: 0.5154
Fold 4/10 完成 | Val R2: 0.7129
Fold 5/10 完成 | Val R2: 0.6911
Fold 6/10 完成 | Val R2: 0.8458
Fold 7/10 完成 | Val R2: 0.7444
Fold 8/10 完成 | Val R2: 0.8097
Fold 9/10 完成 | Val R2: 0.7607
Fold 10/10 完成 | Val R2: 0.7462

>>> 十折交叉验证详细结果 (Per-Fold Results):
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.9974      0.0317     0.0085  0.5279    0.4090   0.2427
1         2    0.9975      0.0317     0.0085  0.8556    0.2191   0.1599
2         3    0.9975      0.0315     0.0080 