In [1]:
# 导入相关包
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pylab import *
from matplotlib.ticker import MultipleLocator
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import ensemble
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.ensemble import VotingRegressor
import shap
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.metrics import make_scorer
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import SVR
from sklearn.metrics import max_error
from sklearn.preprocessing import StandardScaler
# 设置警告过滤器
from sklearn.model_selection import LeaveOneOut, cross_val_score, cross_val_predict
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

def set_random_seeds(seed=24):
    random.seed(seed)          # Python内置随机种子
    np.random.seed(seed)       # NumPy随机种子
    tf.random.set_seed(seed)   # TensorFlow随机种子
    # 对于GPU使用的额外设置
    tf.config.experimental.enable_op_determinism()

set_random_seeds(42)  # 设置为任意固定值
# ====================================================
# 1. 加载已有模型和标准化器
# ====================================================

# 加载已保存的标准化器（如果已保存）
# 如果您之前已经保存了scaler，可以这样加载：
# import joblib
# scaler = joblib.load('fitted_scaler.pkl')

# 如果没有保存，需要重新创建并使用训练数据拟合
# 首先加载原始训练数据
train_excel_file = 'processed_data-selected-feature-16 -plus.xlsx'
train_df = pd.read_excel(train_excel_file)

# 提取训练数据的描述符（使用与训练时相同的列）
train_descriptors = train_df.iloc[:, 4:30]

# 创建并拟合标准化器（使用训练数据）
scaler = StandardScaler()
scaler.fit(train_descriptors)  # 只使用训练数据拟合

# 保存标准化器以备将来使用
import joblib
joblib.dump(scaler, 'fitted_scaler.pkl')
print("标准化器已保存到 'fitted_scaler.pkl'")

# ====================================================
# 2. 加载和处理新数据
# ====================================================

# 加载新数据
new_excel_file = 'dataset-留1燃料验证法.xlsx'
new_df = pd.read_excel(new_excel_file)

# 提取新数据的SMILES和描述符
new_smiles_column = new_df['SMILES'].astype(str).tolist()

# 确保新数据的描述符列与训练数据相同
new_descriptors = new_df.iloc[:, 4:20]  # 假设列索引相同

# 使用训练数据拟合的标准化器对新数据进行标准化
new_descriptors_scaled = scaler.transform(new_descriptors)
print(f"新数据标准化完成，形状: {new_descriptors_scaled.shape}")

# ====================================================
# 3. 对新数据的SMILES进行One-Hot编码
# ====================================================

# 加载已有的字符到索引映射
# 如果您之前保存了char_to_index和max_len
# 假设您已经从训练过程中获取了这些参数
# 这里需要从原始训练代码中获取或保存这些参数

# 方法1：从保存的文件中加载
try:
    import pickle
    with open('char_to_index.pkl', 'rb') as f:
        char_to_index = pickle.load(f)
    with open('max_len.pkl', 'rb') as f:
        max_len = pickle.load(f)
    print(f"加载字符映射: {len(char_to_index)}个字符, 最大长度: {max_len}")
except FileNotFoundError:
    # 方法2：重新计算（需要原始训练数据）
    print("未找到保存的映射文件，重新计算...")
    all_smiles = train_df['smiles'].astype(str).tolist() + new_smiles_column
    all_characters = set(''.join(all_smiles))
    char_to_index = {char: idx for idx, char in enumerate(sorted(all_characters))}
    max_len = max(len(smiles) for smiles in all_smiles)
    
    # 保存映射
    with open('char_to_index.pkl', 'wb') as f:
        pickle.dump(char_to_index, f)
    with open('max_len.pkl', 'wb') as f:
        pickle.dump(max_len, f)

def encode_smiles(smiles, char_to_index, max_len):
    """对单个SMILES进行One-Hot编码"""
    one_hot_matrix = np.zeros((max_len, len(char_to_index)), dtype=int)
    for i, char in enumerate(smiles):
        if char in char_to_index:
            one_hot_matrix[i, char_to_index[char]] = 1
        else:
            print(f"警告: 字符 '{char}' 不在训练词汇表中")
    return one_hot_matrix

# 对新数据的SMILES进行编码
new_smiles_encoded = np.array([
    encode_smiles(smile, char_to_index, max_len) 
    for smile in new_smiles_column
])
print(new_smiles_encoded)
print(f"新SMILES编码完成，形状: {new_smiles_encoded.shape}")



# 5. 构建确定性模型（关键修改）
# ====================================================
def create_deterministic_model(max_len, num_chars, descriptor_dim):
    smiles_input = Input(shape=(max_len, num_chars), name='smiles_input')
    descriptors_input = Input(shape=(descriptor_dim,), name='descriptors_input')
    
    # 使用固定初始化器
    x = Flatten()(smiles_input)
    x = Dense(16, 
              activation='relu',
              kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42))(x)
    
    merged = Concatenate()([x, descriptors_input])
    model = Model(inputs=[smiles_input, descriptors_input], outputs=merged)
    
    # 固定编译参数
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error'
    )
    return model

model = create_deterministic_model(
    max_len=max_len,
    num_chars=len(char_to_index),
    descriptor_dim=X_train_descriptors.shape[1]
)

# ====================================================
# 6. 获取确定性特征（关键修改）
# ====================================================
# 第一次运行会初始化权重，之后固定
X_new_smiles_train = model.predict([new_smiles_encoded, new_descriptors_scaled], verbose=0)
print(X_train.shape)
features_df = pd.DataFrame(X_new_smiles_train)
output_file = '留1燃料验证描述符.xlsx'
features_df.to_excel(output_file, index=False)
print(f"数据已保存到 '{output_file}'")

标准化器已保存到 'fitted_scaler.pkl'
新数据标准化完成，形状: (38, 16)
加载字符映射: 9个字符, 最大长度: 18
[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]
新SMILES编码完成，形状: (38, 18, 9)
(67, 32)
数据已保存到 '留1燃料验证描述符.xlsx'


In [21]:
# 加载保存的集成模型
ensemble_model = joblib.load('voting_regressor_model-LFS-3754.pkl')
print("集成模型加载成功")

# 标准化描述符数据
print(X_new_smiles_train.shape)
# 使用训练好的模型进行预测
y_pred_new = ensemble_model.predict(X_new_smiles_train)

# 输出预测结果
print(f'预测结果: {y_pred_new}')
# 将预测结果添加到 DataFrame 中，假设我们添加到 'Predicted_LFS' 列
print("逐行打印预测结果:")
for idx, prediction in enumerate(y_pred_new):
    print(f' {prediction}')

集成模型加载成功
(38, 32)
预测结果: [29.6468577  36.36429069 41.33291258 42.22561459 40.49532777 34.77717198
 28.46546979 21.11639527 15.04792168 12.96426063 34.96443729 42.5610286
 47.17113026 48.6727407  46.29867414 40.23310849 33.1211635  23.89275049
 16.74220693 12.93226094 29.2818993  36.06631168 41.08604918 42.01229055
 40.43820678 35.18838836 28.85470508 21.4667801  15.3445811  13.19488108
 37.50864    46.26168865 51.00131246 52.72007396 50.32091041 44.37432349
 37.48240977 27.09445431]
逐行打印预测结果:
 29.646857699838485
 36.364290691471574
 41.332912583041214
 42.22561458784122
 40.495327765203164
 34.77717198398362
 28.46546979303836
 21.11639526567103
 15.047921684182995
 12.964260625465172
 34.96443729130808
 42.5610286040313
 47.17113026081514
 48.67274070270621
 46.29867413821669
 40.23310849123063
 33.12116350397023
 23.892750493547922
 16.7422069292033
 12.93226094390981
 29.281899295610557
 36.06631168405235
 41.086049183440416
 42.01229054627568
 40.43820678457948
 35.18838836054741
 2

In [19]:
file_path = '预测数据集.xlsx'  # 请替换为新数据的文件路径
df_new = pd.read_excel(file_path)
# print(df_new)
X_new =df_new.iloc[:,9:41] 
# 标准化描述符数据
print(X_new.shape)
# 使用训练好的模型进行预测
y_pred_new = ensemble_model.predict(X_new)

# 输出预测结果
print(f'预测结果: {y_pred_new}')
# 将预测结果添加到 DataFrame 中，假设我们添加到 'Predicted_LFS' 列
print("逐行打印预测结果:")
for idx, prediction in enumerate(y_pred_new):
    print(f' {prediction}')

(27, 32)
预测结果: [34.9496563  42.58245144 40.02592303 42.55375274 40.20140694 59.04404589
 37.62697003 46.01739176 66.55143158 38.30855874 39.35595203 34.10507539
 35.99421407 43.5871489  53.70993342 40.88979897 37.73473497 43.62270909
 47.84616124 47.26714609 46.61179029 51.28056928 47.76439238 61.71573117
 41.24592732 43.55945077 51.82650964]
逐行打印预测结果:
 34.94965630061602
 42.582451437526984
 40.02592302524653
 42.55375274120963
 40.20140693956762
 59.04404588711128
 37.626970034953395
 46.01739175534136
 66.55143158205738
 38.30855874215168
 39.355952032341285
 34.10507539199374
 35.99421406613353
 43.58714889802576
 53.70993341751661
 40.88979896605902
 37.73473496920412
 43.622709086043514
 47.8461612355205
 47.26714609156808
 46.61179029025832
 51.280569284000485
 47.764392382716295
 61.71573116595642
 41.245927322387416
 43.559450771247164
 51.826509638713794
