In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# 1. 加载数据 (增加编码容错)
file_path = 'C:/Users/tinid/polymer/major revision/标准化数据_无独热_Log变换12.18.csv'
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')

# 定义排除列和目标
excluded_columns = ['log_Separation factor', 'polymer', 'DOI', 'Flux']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Separation factor']

# 2. 【新增步骤】先划分出 20% 的独立测试集 (Test Set)
# 剩下的 80% (X_train_full) 将用于十折交叉验证
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"原始数据总量: {len(X)}")
print(f"用于交叉验证的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# 识别特征
categorical_features = ['Permeation type']
numerical_features = [col for col in X.columns if col not in categorical_features]

# 预处理管道
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # 必须设置 sparse_output=False 否则 Keras 会报错
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ],
    verbose_feature_names_out=False
)

# ANN 模型构建函数
def build_ann(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dense(32, activation='relu'),
        Dense(1, activation='linear')
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# 3. 设置十折交叉验证 (10-Fold CV)
# 注意：十折意味着每次训练用 90% 的 X_train_full，验证用 10% 的 X_train_full
kf = KFold(n_splits=10, shuffle=True, random_state=42)

train_mae, train_rmse, train_r2 = [], [], []
val_mae, val_rmse, val_r2 = [], [], []

print("\n开始十折交叉验证 (基于 80% 的训练数据)...")

# 注意：这里使用的是 X_train_full 和 y_train_full 进行循环
for fold, (train_index, val_index) in enumerate(kf.split(X_train_full)):
    # 切分数据 (基于 X_train_full 的索引)
    X_train_fold, X_val_fold = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train_fold, y_val_fold = y_train_full.iloc[train_index], y_train_full.iloc[val_index]
    
    # 预处理 (在 Fold 内部 fit，防止数据泄露)
    X_train_processed = preprocessor.fit_transform(X_train_fold)
    X_val_processed = preprocessor.transform(X_val_fold)
    
    # 转换格式
    if hasattr(X_train_processed, "toarray"):
        X_train_processed = X_train_processed.toarray()
        X_val_processed = X_val_processed.toarray()
    
    input_dim = X_train_processed.shape[1]
    
    # 构建并训练模型
    model = build_ann(input_dim)
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    model.fit(
        X_train_processed, y_train_fold,
        validation_data=(X_val_processed, y_val_fold),
        epochs=100, batch_size=16, callbacks=[early_stop], verbose=0
    )
    
    # 预测
    y_train_pred = model.predict(X_train_processed, verbose=0).flatten()
    y_val_pred = model.predict(X_val_processed, verbose=0).flatten()
    
    # 记录指标
    train_mae.append(mean_absolute_error(y_train_fold, y_train_pred))
    train_rmse.append(np.sqrt(mean_squared_error(y_train_fold, y_train_pred)))
    train_r2.append(r2_score(y_train_fold, y_train_pred))
    
    val_mae.append(mean_absolute_error(y_val_fold, y_val_pred))
    val_rmse.append(np.sqrt(mean_squared_error(y_val_fold, y_val_pred)))
    val_r2.append(r2_score(y_val_fold, y_val_pred))
    
    print(f"Fold {fold+1}/10: Val R2 = {r2_score(y_val_fold, y_val_pred):.4f}")

# 4. 交叉验证结果汇总
cv_results = {
    "CV Training MAE": np.mean(train_mae),
    "CV Training RMSE": np.mean(train_rmse),
    "CV Training R2": np.mean(train_r2),
    "CV Validation MAE": np.mean(val_mae),
    "CV Validation RMSE": np.mean(val_rmse),
    "CV Validation R2": np.mean(val_r2),
}

print("\n=== 十折交叉验证平均结果 ===")
print(pd.DataFrame([cv_results]).T)

# ==========================================
# 5. (可选) 在独立测试集上进行最终评估
# ==========================================
print("\n=== 正在独立测试集 (Hold-out Test Set) 上进行最终测试 ===")

# 使用全部的训练数据 (X_train_full) 重新 fit 预处理器和模型
X_train_final = preprocessor.fit_transform(X_train_full)
X_test_final = preprocessor.transform(X_test)

# 确保格式正确
if hasattr(X_train_final, "toarray"):
    X_train_final = X_train_final.toarray()
    X_test_final = X_test_final.toarray()

# 重新训练最终模型
final_model = build_ann(X_train_final.shape[1])
final_model.fit(X_train_final, y_train_full, epochs=100, batch_size=16, verbose=0)

# 在从未见过的 20% 数据上预测
y_test_pred = final_model.predict(X_test_final, verbose=0).flatten()

test_results = {
    "Test Set MAE": mean_absolute_error(y_test, y_test_pred),
    "Test Set RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
    "Test Set R2": r2_score(y_test, y_test_pred)
}

print(pd.DataFrame([test_results]).T)

原始数据总量: 816
用于交叉验证的训练集 (80%): 652
独立测试集 (20%): 164

开始十折交叉验证 (基于 80% 的训练数据)...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 1/10: Val R2 = 0.7505


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 2/10: Val R2 = 0.6895


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 3/10: Val R2 = 0.6962


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 4/10: Val R2 = 0.7979


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 5/10: Val R2 = 0.7286


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 6/10: Val R2 = 0.9307


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 7/10: Val R2 = 0.4110


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 8/10: Val R2 = 0.5597


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 9/10: Val R2 = 0.6604


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 10/10: Val R2 = 0.7556

=== 十折交叉验证平均结果 ===
                           0
CV Training MAE     0.211741
CV Training RMSE    0.309981
CV Training R2      0.813108
CV Validation MAE   0.258760
CV Validation RMSE  0.372548
CV Validation R2    0.698013

=== 正在独立测试集 (Hold-out Test Set) 上进行最终测试 ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                      0
Test Set MAE   0.273300
Test Set RMSE  0.364160
Test Set R2    0.700304


In [2]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.8-py3-none-any.whl.metadata (5.6 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.8-py3-none-any.whl (129 kB)
   ---------------------------------------- 0.0/129.4 kB ? eta -:--:--
   ------ -------------------------------- 20.5/129.4 kB 640.0 kB/s eta 0:00:01
   ------------ -------------------------- 41.0/129.4 kB 653.6 kB/s eta 0:00:01
   ------------ -------------------------- 41.0/129.4 kB 653.6 kB/s eta 0:00:01
   --------------------- ----------------- 71.7/129.4 kB 391.3 kB/s eta 0:00:01
   --------------------------- ----------- 92.2/129.4 kB 435.7 kB/s eta 0:00:01
   ------------------------------ ------- 102.4/129.4 kB 368.6 kB/s eta 0:00:01
   -------------------------------------- 129.4/129.4 kB 401.3 kB/s eta 0:00:00
Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Success

In [3]:
import pandas as pd
import numpy as np
import keras_tuner as kt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import shutil
import os

# ==========================================
# 1. 数据加载与分割 (保持与上一段代码一致的 80:20)
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/标准化数据_无独热_Log变换12.18.csv'
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')

excluded_columns = ['log_Separation factor', 'polymer', 'DOI', 'Flux']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Separation factor']

# 80% 用于调优 (Training)，20% 彻底封存 (Test)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================================
# 2. 预处理 (为调优准备数据)
# ==========================================
categorical_features = ['Permeation type']
numerical_features = [col for col in X.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ],
    verbose_feature_names_out=False
)

# 对调优用的训练集进行拟合和转换
X_train_processed = preprocessor.fit_transform(X_train_full)
input_dim = X_train_processed.shape[1]

# ==========================================
# 3. 定义超参数搜索空间 (The Search Space)
# ==========================================
def build_hypermodel(hp):
    model = Sequential()
    
    # 调优层数 (1 到 3 层隐藏层)
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(
            # 调优每层的神经元数量 (32 到 128，步长 32)
            units=hp.Int(f'units_{i}', min_value=32, max_value=128, step=32),
            activation='relu'
        ))
        # 可选：调优 Dropout 率 (防止过拟合)
        if hp.Boolean('dropout'):
            model.add(Dropout(hp.Float('dropout_rate', 0.0, 0.3)))
            
    model.add(Dense(1, activation='linear'))
    
    # 调优学习率
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 5e-4])
    
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae']
    )
    return model

# ==========================================
# 4. 设置调优器 (Tuner)
# ==========================================
# 清理旧的搜索记录 (可选，防止报错)
if os.path.exists('my_dir/polymer_tuning'):
    shutil.rmtree('my_dir/polymer_tuning')

tuner = kt.Hyperband(
    build_hypermodel,
    objective='val_mae',     # 目标是最小化验证集 MAE
    max_epochs=50,           # 每一轮最大的训练轮数
    factor=3,
    directory='my_dir',      # 临时文件存放目录
    project_name='polymer_tuning',
    seed=42
)

stop_early = EarlyStopping(monitor='val_loss', patience=5)

print("开始超参数搜索...")

# ==========================================
# 5. 开始搜索
# ==========================================
# Tuner 会自动从 X_train_processed 中再划分一部分做验证 (validation_split=0.2)
tuner.search(
    X_train_processed, y_train_full, 
    epochs=50, 
    validation_split=0.2, 
    callbacks=[stop_early],
    verbose=1
)

# ==========================================
# 6. 获取最佳结果
# ==========================================
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
搜索完成。最佳超参数如下:
- 隐藏层数量: {best_hps.get('num_layers')}
- 学习率: {best_hps.get('learning_rate')}
""")

for i in range(best_hps.get('num_layers')):
    print(f"- 第 {i+1} 层神经元: {best_hps.get(f'units_{i}')}")
if best_hps.get('dropout'):
    print(f"- Dropout 率: {best_hps.get('dropout_rate')}")

# ==========================================
# 7. (可选) 用最佳参数重新训练并保存
# ==========================================
print("\n正在使用最佳参数构建最终模型...")
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(
    X_train_processed, y_train_full,
    epochs=100,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=10)],
    verbose=0
)

# 在完全独立的测试集上评估
X_test_processed = preprocessor.transform(X_test)
test_loss, test_mae = best_model.evaluate(X_test_processed, y_test)
print(f"\n在独立测试集 (20%) 上的最终 MAE: {test_mae:.4f}")

Trial 90 Complete [00h 00m 03s]
val_mae: 0.26270124316215515

Best val_mae So Far: 0.21693353354930878
Total elapsed time: 00h 04m 19s

搜索完成。最佳超参数如下:
- 隐藏层数量: 3
- 学习率: 0.01

- 第 1 层神经元: 64
- 第 2 层神经元: 96
- 第 3 层神经元: 96
- Dropout 率: 0.007903014137857666

正在使用最佳参数构建最终模型...
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1154 - mae: 0.2590

在独立测试集 (20%) 上的最终 MAE: 0.2607


In [6]:
import os
# ==========================================
# 0. 【防闪退设置】强制使用 CPU
# ==========================================
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import pandas as pd
import numpy as np
import keras_tuner as kt
import tensorflow.keras.backend as K
import gc
import shutil
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# ==========================================
# 1. 数据加载与分割
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/标准化数据_无独热_Log变换12.18.csv'
try:
    data = pd.read_csv(file_path)
    print("数据加载成功 (UTF-8)")
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("数据加载成功 (GBK)")

excluded_columns = ['log_Separation factor', 'polymer', 'DOI', 'Flux']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Separation factor']

# 80% 用于 (调优 + 交叉验证)，20% 用于最终独立测试
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================================
# 2. 预处理
# ==========================================
categorical_features = ['Permeation type']
numerical_features = [col for col in X.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ],
    verbose_feature_names_out=False
)

# 先拟合一次以获取输入维度，用于构建模型
X_train_processed_sample = preprocessor.fit_transform(X_train_full)
input_dim = X_train_processed_sample.shape[1]

# ==========================================
# 3. Keras Tuner: 定义搜索空间
# ==========================================
def build_hypermodel(hp):
    model = Sequential()
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(
            units=hp.Int(f'units_{i}', min_value=32, max_value=128, step=32),
            activation='relu',
            input_shape=(input_dim,) if i == 0 else None
        ))
        if hp.Boolean('dropout'):
            model.add(Dropout(hp.Float('dropout_rate', 0.0, 0.3)))
    model.add(Dense(1, activation='linear'))
    model.compile(
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 5e-4])),
        loss='mse', metrics=['mae']
    )
    return model

# ==========================================
# 4. 执行超参数搜索
# ==========================================
if os.path.exists('my_dir/polymer_tuning'):
    shutil.rmtree('my_dir/polymer_tuning')

tuner = kt.Hyperband(
    build_hypermodel,
    objective='val_mae',
    max_epochs=30, # 稍微减少轮数以节省时间，可按需改回 50
    factor=3,
    directory='my_dir',
    project_name='polymer_tuning',
    seed=42
)

print("\n>>> 第一阶段：开始超参数搜索 (Hyperparameter Tuning)...")
stop_early = EarlyStopping(monitor='val_loss', patience=5)
tuner.search(X_train_processed_sample, y_train_full, epochs=30, validation_split=0.2, callbacks=[stop_early], verbose=1)

# 获取最佳参数
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("\n>>> 最佳超参数已找到:")
print(f"  - 隐藏层数: {best_hps.get('num_layers')}")
print(f"  - 学习率: {best_hps.get('learning_rate')}")

# ==========================================
# 5. 【核心部分】使用最佳参数进行十折交叉验证
# ==========================================
print("\n>>> 第二阶段：使用最佳参数进行十折交叉验证 (10-Fold CV)...")

kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_results = [] # 用于存储每一折的详细数据

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    # 5.1 内存清理
    K.clear_session()
    gc.collect()
    
    # 5.2 数据切分
    X_train_fold = X_train_full.iloc[train_idx]
    y_train_fold = y_train_full.iloc[train_idx]
    X_val_fold = X_train_full.iloc[val_idx]
    y_val_fold = y_train_full.iloc[val_idx]
    
    # 5.3 独立预处理 (防止数据泄露)
    X_train_fold_proc = preprocessor.fit_transform(X_train_fold)
    X_val_fold_proc = preprocessor.transform(X_val_fold)
    
    # 5.4 使用最佳参数构建模型
    model = tuner.hypermodel.build(best_hps)
    
    # 5.5 训练
    history = model.fit(
        X_train_fold_proc, y_train_fold,
        validation_data=(X_val_fold_proc, y_val_fold),
        epochs=100,
        batch_size=16,
        callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
        verbose=0
    )
    
    # 5.6 预测与评估
    # 训练集指标
    pred_train = model.predict(X_train_fold_proc, verbose=0).flatten()
    train_mae = mean_absolute_error(y_train_fold, pred_train)
    train_rmse = np.sqrt(mean_squared_error(y_train_fold, pred_train))
    train_r2 = r2_score(y_train_fold, pred_train)
    
    # 验证集指标
    pred_val = model.predict(X_val_fold_proc, verbose=0).flatten()
    val_mae = mean_absolute_error(y_val_fold, pred_val)
    val_rmse = np.sqrt(mean_squared_error(y_val_fold, pred_val))
    val_r2 = r2_score(y_val_fold, pred_val)
    
    # 存入列表
    fold_results.append({
        "Fold": fold + 1,
        "Train R2": train_r2, "Train RMSE": train_rmse, "Train MAE": train_mae,
        "Val R2": val_r2,     "Val RMSE": val_rmse,     "Val MAE": val_mae
    })
    
    print(f"Fold {fold+1}/10 完成 | Val R2: {val_r2:.4f}")

# ==========================================
# 6. 输出十折详细结果表
# ==========================================
df_cv_results = pd.DataFrame(fold_results)

# 计算平均值行
avg_row = df_cv_results.mean(numeric_only=True).to_frame().T
avg_row["Fold"] = "Average"
df_final_output = pd.concat([df_cv_results, avg_row], ignore_index=True)

print("\n" + "="*80)
print("             十折交叉验证详细结果 (Per-Fold Results)")
print("="*80)
# 设置显示格式，防止小数点太多
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
print(df_final_output)
print("="*80)

# ==========================================
# 7. 最终独立测试集评估
# ==========================================
print("\n>>> 第三阶段：在独立测试集 (Test Set) 上进行最终评估...")
K.clear_session()
gc.collect()

# 使用全部 80% 数据训练最终模型
X_train_final_proc = preprocessor.fit_transform(X_train_full)
X_test_final_proc = preprocessor.transform(X_test)

final_model = tuner.hypermodel.build(best_hps)
final_model.fit(
    X_train_final_proc, y_train_full,
    epochs=100,
    batch_size=16,
    verbose=0
)

y_test_pred = final_model.predict(X_test_final_proc, verbose=0).flatten()

print("\n" + "="*40)
print("   独立测试集最终表现 (Hold-out Test Set)")
print("="*40)
print(f"Test MAE : {mean_absolute_error(y_test, y_test_pred):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.4f}")
print(f"Test R2  : {r2_score(y_test, y_test_pred):.4f}")
print("="*40)

# 如果需要保存结果到 Excel/CSV，取消下面注释
# df_final_output.to_csv("cv_fold_results.csv", index=False)

Trial 90 Complete [00h 00m 06s]
val_mae: 0.26468396186828613

Best val_mae So Far: 0.22471702098846436
Total elapsed time: 00h 07m 05s

>>> 最佳超参数已找到:
  - 隐藏层数: 3
  - 学习率: 0.01

>>> 第二阶段：使用最佳参数进行十折交叉验证 (10-Fold CV)...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 1/10 完成 | Val R2: 0.7991


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 2/10 完成 | Val R2: 0.6951


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 3/10 完成 | Val R2: 0.6654


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 4/10 完成 | Val R2: 0.7153


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 5/10 完成 | Val R2: 0.7047


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 6/10 完成 | Val R2: 0.9298


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 7/10 完成 | Val R2: 0.3845


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 8/10 完成 | Val R2: 0.5951


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 9/10 完成 | Val R2: 0.7666


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 10/10 完成 | Val R2: 0.7845

             十折交叉验证详细结果 (Per-Fold Results)
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.8452      0.2890     0.2012  0.7991    0.3427   0.2358
1         2    0.8321      0.3101     0.2254  0.6951    0.3015   0.2194
2         3    0.8210      0.3136     0.2150  0.6654    0.4135   0.2917
3         4    0.6445      0.4467     0.2798  0.7153    0.3349   0.2355
4         5    0.7021      0.4091     0.2559  0.7047    0.3424   0.2501
5         6    0.8653      0.2521     0.1714  0.9298    0.2907   0.2176
6         7    0.6322      0.4517     0.3049  0.3845    0.5349   0.3759
7         8    0.8505      0.2916     0.2022  0.5951    0.3644   0.2456
8         9    0.8639      0.2707     0.1767  0.7666    0.3772   0.2551
9        10    0.8133      0.3169     0.2222  0.7845    0.3641   0.2528
10  Average    0.7870      0.3352     0.2255  0.7040    0.3666   0.2579

>>> 第三阶段：在独立测试集 (Test Set) 上进行最终评估...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



   独立测试集最终表现 (Hold-out Test Set)
Test MAE : 0.2510
Test RMSE: 0.3729
Test R2  : 0.6857


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load data
file_path = 'C:/Users/tinid/polymer/major revision/通量标准化数据_无独热_Log变换12.18.csv'
data = pd.read_csv(file_path)

# Define excluded columns (unchanged)
excluded_columns = ['log_Flux', 'polymer', 'DOI', 'Separation factor']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Flux']

# 2. 【新增步骤】先划分出 20% 的独立测试集 (Test Set)
# 剩下的 80% (X_train_full) 将用于十折交叉验证
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"原始数据总量: {len(X)}")
print(f"用于交叉验证的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# 识别特征
categorical_features = ['Permeation type']
numerical_features = [col for col in X.columns if col not in categorical_features]

# 预处理管道
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # 必须设置 sparse_output=False 否则 Keras 会报错
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ],
    verbose_feature_names_out=False
)

# ANN 模型构建函数
def build_ann(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dense(32, activation='relu'),
        Dense(1, activation='linear')
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# 3. 设置十折交叉验证 (10-Fold CV)
# 注意：十折意味着每次训练用 90% 的 X_train_full，验证用 10% 的 X_train_full
kf = KFold(n_splits=10, shuffle=True, random_state=42)

train_mae, train_rmse, train_r2 = [], [], []
val_mae, val_rmse, val_r2 = [], [], []

print("\n开始十折交叉验证 (基于 80% 的训练数据)...")

# 注意：这里使用的是 X_train_full 和 y_train_full 进行循环
for fold, (train_index, val_index) in enumerate(kf.split(X_train_full)):
    # 切分数据 (基于 X_train_full 的索引)
    X_train_fold, X_val_fold = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
    y_train_fold, y_val_fold = y_train_full.iloc[train_index], y_train_full.iloc[val_index]
    
    # 预处理 (在 Fold 内部 fit，防止数据泄露)
    X_train_processed = preprocessor.fit_transform(X_train_fold)
    X_val_processed = preprocessor.transform(X_val_fold)
    
    # 转换格式
    if hasattr(X_train_processed, "toarray"):
        X_train_processed = X_train_processed.toarray()
        X_val_processed = X_val_processed.toarray()
    
    input_dim = X_train_processed.shape[1]
    
    # 构建并训练模型
    model = build_ann(input_dim)
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    model.fit(
        X_train_processed, y_train_fold,
        validation_data=(X_val_processed, y_val_fold),
        epochs=100, batch_size=16, callbacks=[early_stop], verbose=0
    )
    
    # 预测
    y_train_pred = model.predict(X_train_processed, verbose=0).flatten()
    y_val_pred = model.predict(X_val_processed, verbose=0).flatten()
    
    # 记录指标
    train_mae.append(mean_absolute_error(y_train_fold, y_train_pred))
    train_rmse.append(np.sqrt(mean_squared_error(y_train_fold, y_train_pred)))
    train_r2.append(r2_score(y_train_fold, y_train_pred))
    
    val_mae.append(mean_absolute_error(y_val_fold, y_val_pred))
    val_rmse.append(np.sqrt(mean_squared_error(y_val_fold, y_val_pred)))
    val_r2.append(r2_score(y_val_fold, y_val_pred))
    
    print(f"Fold {fold+1}/10: Val R2 = {r2_score(y_val_fold, y_val_pred):.4f}")

# 4. 交叉验证结果汇总
cv_results = {
    "CV Training MAE": np.mean(train_mae),
    "CV Training RMSE": np.mean(train_rmse),
    "CV Training R2": np.mean(train_r2),
    "CV Validation MAE": np.mean(val_mae),
    "CV Validation RMSE": np.mean(val_rmse),
    "CV Validation R2": np.mean(val_r2),
}

print("\n=== 十折交叉验证平均结果 ===")
print(pd.DataFrame([cv_results]).T)

# ==========================================
# 5. (可选) 在独立测试集上进行最终评估
# ==========================================
print("\n=== 正在独立测试集 (Hold-out Test Set) 上进行最终测试 ===")

# 使用全部的训练数据 (X_train_full) 重新 fit 预处理器和模型
X_train_final = preprocessor.fit_transform(X_train_full)
X_test_final = preprocessor.transform(X_test)

# 确保格式正确
if hasattr(X_train_final, "toarray"):
    X_train_final = X_train_final.toarray()
    X_test_final = X_test_final.toarray()

# 重新训练最终模型
final_model = build_ann(X_train_final.shape[1])
final_model.fit(X_train_final, y_train_full, epochs=100, batch_size=16, verbose=0)

# 在从未见过的 20% 数据上预测
y_test_pred = final_model.predict(X_test_final, verbose=0).flatten()

test_results = {
    "Test Set MAE": mean_absolute_error(y_test, y_test_pred),
    "Test Set RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
    "Test Set R2": r2_score(y_test, y_test_pred)
}

print(pd.DataFrame([test_results]).T)

原始数据总量: 791
用于交叉验证的训练集 (80%): 632
独立测试集 (20%): 159

开始十折交叉验证 (基于 80% 的训练数据)...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 1/10: Val R2 = 0.2999


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 2/10: Val R2 = 0.5745


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 3/10: Val R2 = 0.7150


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 4/10: Val R2 = 0.5636


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 5/10: Val R2 = 0.4175


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 6/10: Val R2 = 0.7243


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 7/10: Val R2 = 0.5378


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 8/10: Val R2 = 0.5730


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 9/10: Val R2 = 0.4692


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 10/10: Val R2 = 0.3354

=== 十折交叉验证平均结果 ===
                        0
CV Training MAE    0.2391
CV Training RMSE   0.3355
CV Training R2     0.7087
CV Validation MAE  0.3002
CV Validation RMSE 0.4184
CV Validation R2   0.5210

=== 正在独立测试集 (Hold-out Test Set) 上进行最终测试 ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                   0
Test Set MAE  0.2915
Test Set RMSE 0.4305
Test Set R2   0.5240


In [8]:
import os
# ==========================================
# 0. 【防闪退设置】强制使用 CPU
# ==========================================
# 解决 Windows 下 TensorFlow 显存溢出或驱动冲突导致的闪退
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import pandas as pd
import numpy as np
import keras_tuner as kt
import tensorflow.keras.backend as K
import gc
import shutil
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# ==========================================
# 1. 数据加载与分割
# ==========================================
file_path = 'C:/Users/tinid/polymer/major revision/通量标准化数据_无独热_Log变换12.18.csv'

try:
    data = pd.read_csv(file_path)
    print("成功读取数据 (UTF-8)")
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding='gbk')
    print("成功读取数据 (GBK)")

# 定义排除列 (根据你的代码)
excluded_columns = ['log_Flux', 'polymer', 'DOI', 'Separation factor']
X = data.drop(columns=excluded_columns, errors='ignore')
y = data['log_Flux']

# 划分出 20% 的独立测试集 (Test Set)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"原始数据总量: {len(X)}")
print(f"用于调优和CV的训练集 (80%): {len(X_train_full)}")
print(f"独立测试集 (20%): {len(X_test)}")

# ==========================================
# 2. 预处理管道
# ==========================================
categorical_features = ['Permeation type']
numerical_features = [col for col in X.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ],
    verbose_feature_names_out=False
)

# 先拟合一次 X_train_full 以获取输入维度 (input_dim)
X_train_full_processed = preprocessor.fit_transform(X_train_full)
input_dim = X_train_full_processed.shape[1]

# ==========================================
# 3. 第一阶段：超参数搜索 (Hyperparameter Tuning)
# ==========================================
print("\n" + "="*50)
print(">>> Stage 1: 开始超参数自动搜索 (Keras Tuner)...")
print("="*50)

def build_hypermodel(hp):
    model = Sequential()
    
    # 搜索隐藏层数量 (1到3层)
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(
            # 搜索神经元数量
            units=hp.Int(f'units_{i}', min_value=32, max_value=128, step=32),
            activation='relu',
            input_shape=(input_dim,) if i == 0 else None
        ))
        # 搜索是否使用 Dropout
        if hp.Boolean('dropout'):
            model.add(Dropout(hp.Float('dropout_rate', 0.0, 0.3)))
            
    model.add(Dense(1, activation='linear'))
    
    # 搜索学习率
    lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 5e-4])
    
    model.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return model

# 清理旧的搜索记录
if os.path.exists('my_dir/flux_tuning'):
    shutil.rmtree('my_dir/flux_tuning')

tuner = kt.Hyperband(
    build_hypermodel,
    objective='val_mae',
    max_epochs=30, # 稍微降低轮数加快速度
    factor=3,
    directory='my_dir',
    project_name='flux_tuning',
    seed=42
)

stop_early = EarlyStopping(monitor='val_loss', patience=5)

# 开始搜索 (使用部分训练数据自动划分验证集)
tuner.search(
    X_train_full_processed, y_train_full, 
    epochs=30, 
    validation_split=0.2, 
    callbacks=[stop_early],
    verbose=1
)

# 获取最佳超参数
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("\n>>> 最佳超参数已找到:")
print(f"  - 隐藏层数量: {best_hps.get('num_layers')}")
print(f"  - 学习率: {best_hps.get('learning_rate')}")
for i in range(best_hps.get('num_layers')):
    print(f"  - 第 {i+1} 层神经元: {best_hps.get(f'units_{i}')}")

# ==========================================
# 4. 第二阶段：十折交叉验证 (使用最佳参数)
# ==========================================
print("\n" + "="*50)
print(">>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...")
print("="*50)

kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold_results = []

for fold, (train_index, val_index) in enumerate(kf.split(X_train_full)):
    # 内存清理
    K.clear_session()
    gc.collect()
    
    # 切分数据
    X_train_fold = X_train_full.iloc[train_index]
    y_train_fold = y_train_full.iloc[train_index]
    X_val_fold = X_train_full.iloc[val_index]
    y_val_fold = y_train_full.iloc[val_index]
    
    # 预处理 (防止数据泄露)
    X_train_fold_proc = preprocessor.fit_transform(X_train_fold)
    X_val_fold_proc = preprocessor.transform(X_val_fold)
    
    # 构建模型 (使用最佳参数)
    model = tuner.hypermodel.build(best_hps)
    
    # 训练
    history = model.fit(
        X_train_fold_proc, y_train_fold,
        validation_data=(X_val_fold_proc, y_val_fold),
        epochs=100,
        batch_size=16,
        callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
        verbose=0
    )
    
    # 预测
    y_train_pred = model.predict(X_train_fold_proc, verbose=0).flatten()
    y_val_pred = model.predict(X_val_fold_proc, verbose=0).flatten()
    
    # 计算指标
    metrics = {
        "Fold": fold + 1,
        "Train R2": r2_score(y_train_fold, y_train_pred),
        "Train RMSE": np.sqrt(mean_squared_error(y_train_fold, y_train_pred)),
        "Train MAE": mean_absolute_error(y_train_fold, y_train_pred),
        "Val R2": r2_score(y_val_fold, y_val_pred),
        "Val RMSE": np.sqrt(mean_squared_error(y_val_fold, y_val_pred)),
        "Val MAE": mean_absolute_error(y_val_fold, y_val_pred)
    }
    fold_results.append(metrics)
    print(f"Fold {fold+1}/10 完成 | Val R2: {metrics['Val R2']:.4f}")

# 输出每一折的详细表
df_cv_results = pd.DataFrame(fold_results)
avg_row = df_cv_results.mean(numeric_only=True).to_frame().T
avg_row["Fold"] = "Average"
df_final_cv = pd.concat([df_cv_results, avg_row], ignore_index=True)

print("\n>>> 十折交叉验证详细结果:")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(df_final_cv.round(4))

# ==========================================
# 5. 第三阶段：独立测试集最终评估
# ==========================================
print("\n" + "="*50)
print(">>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...")
print("="*50)

K.clear_session()
gc.collect()

# 使用全部 80% 数据训练最终模型
X_test_processed = preprocessor.transform(X_test) # 注意：这里用 fit 好的 preprocessor

final_model = tuner.hypermodel.build(best_hps)
final_model.fit(
    X_train_full_processed, y_train_full, # 使用全部 80% 数据
    epochs=100,
    batch_size=16,
    verbose=0
)

# 最终预测
y_test_pred = final_model.predict(X_test_processed, verbose=0).flatten()

# 汇总所有结果
final_summary = {
    "Metric": ["MAE", "RMSE", "R2"],
    "CV Training (Avg)": [
        df_final_cv.iloc[-1]["Train MAE"],
        df_final_cv.iloc[-1]["Train RMSE"],
        df_final_cv.iloc[-1]["Train R2"]
    ],
    "CV Validation (Avg)": [
        df_final_cv.iloc[-1]["Val MAE"],
        df_final_cv.iloc[-1]["Val RMSE"],
        df_final_cv.iloc[-1]["Val R2"]
    ],
    "Test Set (Final)": [
        mean_absolute_error(y_test, y_test_pred),
        np.sqrt(mean_squared_error(y_test, y_test_pred)),
        r2_score(y_test, y_test_pred)
    ]
}

print("\n>>> 所有数据集最终性能汇总 (Summary):")
print(pd.DataFrame(final_summary).round(4))

print("\n程序运行结束。")

Trial 90 Complete [00h 00m 07s]
val_mae: 0.318742036819458

Best val_mae So Far: 0.28738105297088623
Total elapsed time: 00h 09m 05s

>>> 最佳超参数已找到:
  - 隐藏层数量: 3
  - 学习率: 0.01
  - 第 1 层神经元: 64
  - 第 2 层神经元: 96
  - 第 3 层神经元: 96

>>> Stage 2: 使用最佳参数进行十折交叉验证 (10-Fold CV)...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 1/10 完成 | Val R2: 0.3015


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 2/10 完成 | Val R2: 0.5880


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 3/10 完成 | Val R2: 0.6314


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 4/10 完成 | Val R2: 0.6194


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 5/10 完成 | Val R2: 0.4032


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 6/10 完成 | Val R2: 0.7151


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 7/10 完成 | Val R2: 0.6993


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 8/10 完成 | Val R2: 0.6346


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 9/10 完成 | Val R2: 0.5255


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 10/10 完成 | Val R2: 0.4230

>>> 十折交叉验证详细结果:
       Fold  Train R2  Train RMSE  Train MAE  Val R2  Val RMSE  Val MAE
0         1    0.4840      0.4483     0.3462  0.3015    0.4975   0.3522
1         2    0.7057      0.3404     0.2422  0.5880    0.3701   0.2752
2         3    0.6473      0.3746     0.2788  0.6314    0.3292   0.2530
3         4    0.7137      0.3243     0.2403  0.6194    0.4649   0.3235
4         5    0.6783      0.3505     0.2465  0.4032    0.4945   0.3597
5         6    0.6891      0.3479     0.2442  0.7151    0.3267   0.2559
6         7    0.7538      0.3122     0.2093  0.6993    0.3085   0.2154
7         8    0.7002      0.3438     0.2422  0.6346    0.3404   0.2606
8         9    0.7697      0.2920     0.2014  0.5255    0.4963   0.3528
9        10    0.6547      0.3711     0.2665  0.4230    0.4077   0.2949
10  Average    0.6796      0.3505     0.2518  0.5541    0.4036   0.2943

>>> Stage 3: 独立测试集 (Hold-out Test Set) 最终评估...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



>>> 所有数据集最终性能汇总 (Summary):
  Metric  CV Training (Avg)  CV Validation (Avg)  Test Set (Final)
0    MAE             0.2518               0.2943            0.2996
1   RMSE             0.3505               0.4036            0.4187
2     R2             0.6796               0.5541            0.5498

程序运行结束。
