In [1]:
# 系统库
import os
import subprocess
import time
import shutil
import json
import socket
from datetime import datetime, timedelta

# 第三方科学计算 & 可视化
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 设置中文字体，避免乱码
plt.rcParams['font.sans-serif'] = ['SimHei']        # 黑体
plt.rcParams['axes.unicode_minus'] = False          # 解决负号显示成方块的问题

# 机器学习 & 优化
import xgboost as xgb
import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer

# 化学信息学 (RDKit)
from rdkit import Chem, RDLogger
from rdkit.Chem import (
    Descriptors, Crippen, rdMolDescriptors,
    MACCSkeys, RDKFingerprint, rdFingerprintGenerator
)
from rdkit.Chem.AtomPairs import Pairs, Torsions

# 关闭 RDKit 的警告
RDLogger.DisableLog('rdApp.*')

# Avalon 指纹（可选）
try:
    from rdkit.Avalon import pyAvalonTools
    avalon_available = True
except ImportError:
    avalon_available = False
print(f"Avalon available: {avalon_available}")

# Kaggle API
from kaggle.api.kaggle_api_extended import KaggleApi


import plotly.io as pio
pio.renderers.default = "iframe_connected"







if socket.gethostname() == 'hao-2':
    dir = r'D:\数据\Kaggle\Thermophysical Property Melting Point'
else:
    dir = os.getcwd()


DIRS = {
    "dir":              dir,                                       
    "DATA_DIR000":      os.path.join(dir, "DATA_DIR000"),
    "HISTORY":          os.path.join(dir, "HISTORY"),
    "SUBMISSION":       os.path.join(dir, "SUBMISSION"),
}

# 自动创建目录
for key, path in DIRS.items():
    os.makedirs(path, exist_ok=True)

# 打印时一行一个地址
print("✅ 路径已创建：\n")
for key, path in DIRS.items():
    print(f"{key:<12} : {path}")


Avalon available: True
✅ 路径已创建：

dir          : D:\数据\Kaggle\Thermophysical Property Melting Point
DATA_DIR000  : D:\数据\Kaggle\Thermophysical Property Melting Point\DATA_DIR000
HISTORY      : D:\数据\Kaggle\Thermophysical Property Melting Point\HISTORY
SUBMISSION   : D:\数据\Kaggle\Thermophysical Property Melting Point\SUBMISSION


# 数据提取处理

In [None]:
# 加载Kaggle 训练集和 Bradley 熔点公开数据集

# Kaggle 提供的训练集和测试集
train_df = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "train.csv"))
test_df  = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "test.csv"))

# 外部 Bradley 熔点公开数据集
bradley_df = pd.read_excel(os.path.join(DIRS['DATA_DIR000'], "BradleyMeltingPointDataset.xlsx"))
bradleyplus_df = pd.read_excel(os.path.join(DIRS['DATA_DIR000'], "BradleyDoublePlusGoodMeltingPointDataset.xlsx"))

# 只保留需要的列
train_df = train_df[['SMILES', 'Tm']]
test_df  = test_df[['id', 'SMILES']]

# 输出数据集规模，确认加载成功
print("Train                        shape:", train_df.shape)
print("Test                         shape:", test_df.shape)
print("Bradley dataset              shape:", bradley_df.shape)
print("Bradley Plus Good dataset    shape:", bradleyplus_df.shape)

In [None]:
# 外部 Bradley 熔点数据集处理 & 合并Kaggle 训练集

# 1. 摄氏度 → 开尔文: T(K) = T(°C) + 273.15
bradley_df['Tm']     = bradley_df['mpC'] + 273.15
bradleyplus_df['Tm'] = bradleyplus_df['mpC'] + 273.15

# 2. 保留 [SMILES, Tm] 并统一列名
bradley_df     = bradley_df[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'})
bradleyplus_df = bradleyplus_df[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'})

# 打印原始信息
print(f"📊 Kaggle 训练集    shape    : {train_df.shape}")
print(f"📊 Bradley          shape    : {bradley_df.shape}")
print(f"📊 Bradley Plus     shape    : {bradleyplus_df.shape}")

# 3. 合并 Bradley & Bradley Plus
bradley_merge = pd.concat([bradley_df, bradleyplus_df], axis=0).reset_index(drop=True)
print(f"📊 Bradley 合并后   shape    : {bradley_merge.shape}")

# 4. 拼接到 Kaggle 训练集
merge_df = pd.concat([train_df, bradley_merge], axis=0).reset_index(drop=True)
print(f"📊 拼接后 merge_df  shape    : {merge_df.shape}")

# 5. 去重处理
dup_count = merge_df.duplicated(subset=['SMILES', 'Tm']).sum()
print(f"⚠️ 发现重复数据条数          : {dup_count}")

merge_df = merge_df.drop_duplicates(subset=['SMILES', 'Tm']).reset_index(drop=True)
print(f"✅ 去重后 merge_df  shape    : {merge_df.shape}")

# 6. 最终确认
print("🎯 数据合并 & 去重完成！")



In [None]:
# 提取所有分子描述符 (Descriptors)
def extract_all_descriptors(df, SMILES_col):
    """
    输入:
        df         : DataFrame，包含 SMILES 列
        SMILES_col : 字符串，SMILES 列的名称
    输出:
        DataFrame，原始数据 + 208 个分子描述符
    """

    # 1. 获取 RDKit 内置的分子描述符
    descriptor_list = Descriptors._descList   # [(name, func), ...]
    descriptors = [desc[0] for desc in descriptor_list]
    print(f"📊 一共存在 {len(descriptors)} 个分子描述符特征")

    # 2. 遍历每个分子，计算描述符
    results = []
    total = len(df)
    for idx, smi in enumerate(df[SMILES_col]):
        mol = Chem.MolFromSmiles(smi)

        if mol is None:
            row = {name: None for name, func in descriptor_list}   # 无效 SMILES
        else:
            row = {name: func(mol) for name, func in descriptor_list}  # 有效 SMILES

        results.append(row)

        # 打印进度条（覆盖式打印）
        print(f"🔄 处理进度: {idx+1:5d}/{total:5d}", end="\r", flush=True)
    print("\n✅ 描述符计算完成")

    # 3. 合并原始数据与新特征
    df_desc = pd.DataFrame(results)
    return pd.concat([df, df_desc], axis=1)


# ============ 应用函数 ============
merge_df = extract_all_descriptors(merge_df, "SMILES")
test_df  = extract_all_descriptors(test_df, "SMILES")

# 删除无效数据 (有 NaN 的行)
merge_df = merge_df.dropna().reset_index(drop=True)
test_df  = test_df.dropna().reset_index(drop=True)

print(f"✅ merge_df shape = {merge_df.shape}")
print(f"✅ test_df shape  = {test_df.shape}")


# # 保存到 CSV
# merge_path = os.path.join(DIRS['DATA_DIR000'], "merge_descriptors.csv")
# test_path  = os.path.join(DIRS['DATA_DIR000'], "test_descriptors.csv")
# merge_df.to_csv(merge_path, index=False)
# test_df.to_csv(test_path, index=False)

# print(f"✅ merge_df shape = {merge_df.shape}，已保存到 {merge_path}")
# print(f"✅ test_df shape  = {test_df.shape}，已保存到 {test_path}")


In [None]:
# 提取所有分子指纹 (Fingerprints)
def extract_all_fingerprint(df, SMILES_col, morgan_radius=2, morgan_nbits=1024):
    """
    输入参数:
        df            : DataFrame，包含 SMILES 的表格
        SMILES_col    : str，SMILES 所在列的列名
        morgan_radius : int，Morgan 指纹半径 (默认=2)
        morgan_nbits  : int，Morgan/FCFP/AtomPair 指纹长度 (默认=1024)

    返回:
        DataFrame，原始数据 + 多种分子指纹特征
    """

    fps_data = []   # 存储所有分子的指纹特征字典

    # 1. 定义指纹生成器
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(
        radius=morgan_radius, fpSize=morgan_nbits,
        countSimulation=True, includeChirality=False
    )
    fcfp = rdFingerprintGenerator.GetMorganFeatureAtomInvGen()
    fcfp_gen = rdFingerprintGenerator.GetMorganGenerator(
        radius=morgan_radius, fpSize=morgan_nbits,
        atomInvariantsGenerator=fcfp, countSimulation=True, includeChirality=False
    )
    atom_gen = rdFingerprintGenerator.GetAtomPairGenerator(
        fpSize=morgan_nbits, countSimulation=True, includeChirality=False
    )

    # 2. 遍历分子，提取指纹
    total = len(df)
    for idx, smi in enumerate(df[SMILES_col]):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            fps_data.append({})
            print(f"⚠ 无效 SMILES: {smi}")
            continue

        feature_row = {}

        # 2.1 Morgan 指纹 (ECFP)
        morgan_fp = morgan_gen.GetFingerprint(mol)
        for i in range(morgan_nbits):
            feature_row[f"Morgan_{i}"] = morgan_fp[i]

        # 2.2 功能类 Morgan (FCFP)
        fcfp_fp = fcfp_gen.GetFingerprint(mol)
        for i in range(morgan_nbits):
            feature_row[f"FCFP_{i}"] = fcfp_fp[i]

        # 2.3 MACCS Keys (固定 167 位)
        maccs_fp = MACCSkeys.GenMACCSKeys(mol)
        for i in range(len(maccs_fp)):
            feature_row[f"MACCS_{i}"] = int(maccs_fp[i])

        # 2.4 AtomPair 指纹
        atompair_fp = atom_gen.GetCountFingerprint(mol)
        for i in range(morgan_nbits):
            feature_row[f"AtomPair_{i}"] = atompair_fp[i]

        # 2.5 RDKit 内置指纹
        rdkit_fp = RDKFingerprint(mol)
        for i in range(len(rdkit_fp)):
            feature_row[f"RDKIT_{i}"] = int(rdkit_fp[i])

        # 2.6 Avalon 指纹 (若可用)
        if avalon_available:
            avalon_fp = pyAvalonTools.GetAvalonFP(mol, morgan_nbits)
            for i in range(len(avalon_fp)):
                feature_row[f"Avalon_{i}"] = int(avalon_fp[i])

        fps_data.append(feature_row)
        print(f"🔄 指纹提取进度: {idx+1:5d}/{total:5d}", end="\r", flush=True)
    print("\n✅ 分子指纹计算完成")

    # 3. 合并结果并返回
    fps_df = pd.DataFrame(fps_data)
    return pd.concat([df, fps_df], axis=1)


# ============ 应用函数 ============
merge_df = extract_all_fingerprint(merge_df, "SMILES")
test_df  = extract_all_fingerprint(test_df, "SMILES")

print(f"✅ merge_df shape = {merge_df.shape}")
print(f"✅ test_df shape  = {test_df.shape}")

# # 保存结果
# merge_fp_path = os.path.join(DIRS['DATA_DIR000'], "merge_fingerprints.csv")
# test_fp_path  = os.path.join(DIRS['DATA_DIR000'], "test_fingerprints.csv")
# merge_df.to_csv(merge_fp_path, index=False)
# test_df.to_csv(test_fp_path, index=False)

# print(f"✅ merge_df shape = {merge_df.shape}，已保存到 {merge_fp_path}")
# print(f"✅ test_df shape  = {test_df.shape}，已保存到 {test_fp_path}")


# 数据分析

In [67]:
# 加载数据
def loaddata(DIRS):
    # 定义路径
    merge_fp_path = os.path.join(DIRS['DATA_DIR000'], "merge_fingerprints.csv")
    test_fp_path  = os.path.join(DIRS['DATA_DIR000'], "test_fingerprints.csv")
    # 读取数据
    merge_df = pd.read_csv(merge_fp_path)
    test_df  = pd.read_csv(test_fp_path)

    # 打印信息
    print(f"✅ merge_df 加载完成，shape = {merge_df.shape}")
    print(f"✅ test_df  加载完成，shape = {test_df.shape}")

    print("特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024")
    print("合计特征总数 = 6528")

    return  merge_df, test_df



merge_df, test_df =  loaddata(DIRS)

✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528


In [26]:
# 打印清单
def config_to_str(config: dict, indent: int = 0) -> str:
    """递归生成配置字符串"""
    prefix = "     " * indent
    lines = []
    for key, value in config.items():
        if isinstance(value, dict):
            lines.append(f"{prefix}🔹 {key}:")
            lines.append(config_to_str(value, indent + 1))  # 递归拼接子字典
        else:
            lines.append(f"{prefix}- {key:<20}: {value}")
    return "\n".join(lines)



In [29]:
# 实验配置单
config = {
    # 固定开关
    "ISTEST"            : True,

    "remove_dup_smiles" : True, 
    "use_feature_gen"   : False,
    "use_pca"           : True,
    "pca_components"    : 100,

    # 特征选择 XGBoost 参数
    "xgb_selector_model_params": {
        "n_estimators"  : 500,
        "max_depth"     : 6,
        "learning_rate" : 0.05,
        "random_state"  : 2025,
        "device"        : "cpu",
        "objective"     : "reg:absoluteerror",
        "tree_method"   : "hist",
        "verbosity"     : 0
    },

    "selector_threshold"  : "mean",   

    # 训练设置
    "xgb_train_model_params": {
        'max_depth'   : 6,
        'eta'         : 0.1,
        'tree_method' : 'hist',
        'eval_metric' : 'mae',
    },
    "num_boost_round": 15000,
}

In [45]:
# 数据拆分 (特征矩阵 与 目标向量)
# ============================================
# 特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
# 合计特征总数 = 6528

import numpy as np
import pandas as pd

def prepare_features_and_target(merge_df: pd.DataFrame, test_df: pd.DataFrame, config: dict):
    """
    数据拆分函数：构造训练集和测试集的特征矩阵与目标向量
    """


    # 1. 检查并处理重复 SMILES
    if config["remove_dup_smiles"]:

        dup_smiles = set(merge_df['SMILES']) & set(test_df['SMILES'])
        print(f"⚠️ 检测到 {len(dup_smiles)} 个重复 SMILES")

        before_shape = merge_df.shape
        # 删除训练集中出现在测试集的 SMILES，避免数据泄漏
        merge_df = merge_df[~merge_df['SMILES'].isin(test_df['SMILES'])].reset_index(drop=True)
        after_shape = merge_df.shape

        print(f"✅ 删除完成: 从 {before_shape} → {after_shape}")




    # 2. 构造特征矩阵和目标向量
    features_train = merge_df.drop(columns=['SMILES', 'Tm'])   # 训练集特征 (X)
    target_train   = merge_df['Tm']                            # 训练集目标 (y, 熔点)
    features_test  = test_df.drop(columns=['SMILES', 'id'])    # 测试集特征 (无 Tm)


    # 随机选取部分特征（示例：50 个）
    if config["ISTEST"]:
        np.random.seed(42)
        selected_features = np.random.choice(
            merge_df.drop(columns=['SMILES', 'Tm']).columns,
            size=110,
            replace=False
        )
        sample_len = 100
        features_train = merge_df.iloc[:sample_len][selected_features]   # 训练特征 (前 1000 条)
        target_train = merge_df.iloc[:sample_len]['Tm']               # 训练目标
        features_test = test_df[selected_features]          # 测试特征 (同样的特征列)




    # 3. 打印维度信息
    print("📊 数据拆分完成")
    print(f"训练集特征 features_train  shape   : {features_train.shape}")
    print(f"训练集目标   target_train  shape   : {target_train.shape}")
    print(f"测试集特征  features_test  shape   : {features_test.shape}")
    print(f"           features_train  类型    : {type(features_train)}")

    return features_train, target_train, features_test

features_train, target_train, features_test = prepare_features_and_target(merge_df, test_df, config)


⚠️ 检测到 0 个重复 SMILES
✅ 删除完成: 从 (28405, 6530) → (28405, 6530)
📊 数据拆分完成
训练集特征 features_train  shape   : (100, 110)
训练集目标   target_train  shape   : (100,)
测试集特征  features_test  shape   : (666, 110)
           features_train  类型    : <class 'pandas.core.frame.DataFrame'>


### 非零占比分布的直方图

In [31]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def plot_nonzero_ratio_hist(features_train: pd.DataFrame, features_test: pd.DataFrame, bins_size: int = 10):
    """
    绘制并比较训练集和测试集每列非零占比分布的直方图

    参数:
        features_train : pd.DataFrame
            训练集特征矩阵
        features_test  : pd.DataFrame
            测试集特征矩阵
        bins_size : int, 默认=10
            分箱数量 (0%~100% 区间划分)

    返回:
        train_counts, test_counts : np.ndarray
            两个数据集在各区间内的列数
    """
    # 逐列非零占比（百分比形式）
    train_ratio = features_train.apply(lambda col: (col != 0).mean() * 100)
    test_ratio  = features_test.apply(lambda col: (col != 0).mean() * 100)

    plt.figure(figsize=(10, 5))

    # 绘制直方图 (density=True 表示频率形式)
    counts1, bins1, _ = plt.hist(train_ratio, bins=bins_size, alpha=0.6, 
                                 label="features_train", density=True)
    counts2, bins2, _ = plt.hist(test_ratio, bins=bins_size, alpha=0.6, 
                                 label="features_test", density=True)

    # 分别计算数量（而不是频率）
    train_counts, _ = np.histogram(train_ratio, bins=bins1)
    test_counts, _ = np.histogram(test_ratio, bins=bins2)

    # 打印结果
    print("features_train 各区间数量：")
    for i in range(len(bins1)-1):
        print(f"{bins1[i]:.0f}% - {bins1[i+1]:.0f}% : {train_counts[i]} 列")

    print("\nfeatures_test 各区间数量：")
    for i in range(len(bins2)-1):
        print(f"{bins2[i]:.0f}% - {bins2[i+1]:.0f}% : {test_counts[i]} 列")

    # 在柱子上标注数量
    for c, b in zip(train_counts, bins1[:-1]):
        if c > 0:
            plt.text(b + (bins1[1]-bins1[0])/2, 0.01, str(c), 
                     ha="center", va="bottom", fontsize=8, color="black", rotation=90)

    for c, b in zip(test_counts, bins2[:-1]):
        if c > 0:
            plt.text(b + (bins2[1]-bins2[0])/2, 0.03, str(c), 
                     ha="center", va="bottom", fontsize=8, color="blue", rotation=90)

    plt.xlabel("非零占比 (%)")
    plt.ylabel("频率 (Frequency, 0~1)")
    plt.title("各列非零占比分布直方图")
    plt.legend()
    plt.show()

    return train_counts, test_counts


In [32]:
# train_counts, test_counts = plot_nonzero_ratio_hist(features_train, features_test, bins_size=20)


### 特征生成

In [33]:
import pandas as pd

def add_chemical_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    基于分子描述符构造新的衍生特征
    输入:
        df : pd.DataFrame，必须包含以下列：
            ['NumHDonors', 'NumHAcceptors', 'MolLogP', 'TPSA',
            'NumRotatableBonds', 'MolWt', 'NumAromaticRings', 'BertzCT']
    输出:
        df_new : pd.DataFrame，包含新增特征
    """
    df = df.copy()

    df['HBond_Product']        = df['NumHDonors'] * df['NumHAcceptors']
    df['HBond_Sum']            = df['NumHDonors'] + df['NumHAcceptors']
    df['LogP_div_TPSA']        = df['MolLogP'] / (df['TPSA'] + 1)
    df['LogP_x_TPSA']          = df['MolLogP'] * df['TPSA']
    df['Flexibility_Score']    = df['NumRotatableBonds'] / (df['MolWt'] + 1)
    df['MolWt_x_AromaticRings']= df['MolWt'] * df['NumAromaticRings']
    df['Complexity_per_MW']    = df['BertzCT'] / (df['MolWt'] + 1)
    df['Rigidity_Score']       = df['NumAromaticRings'] / (df['NumRotatableBonds'] + 1)

    return df


In [34]:
if config["use_feature_gen"]:
    features_train = add_chemical_features(features_train)
    features_test  = add_chemical_features(features_test)

features_train.shape, features_train.shape

((100, 110), (100, 110))

### PCA降维

In [35]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy import sparse

def apply_truncated_svd(df: pd.DataFrame, n_components: int = 100, random_state: int = 42):
    """
    使用 TruncatedSVD 对 DataFrame 进行降维
    输入:
        df           : pd.DataFrame，特征矩阵（需去掉 ID / label 等非特征列）
        n_components : int，降维后的目标维度
        random_state : int，随机种子
    输出:
        reduced_df   : pd.DataFrame，降维后的结果，保持原行索引
    """
    # 转换为稀疏矩阵
    X_sparse = sparse.csr_matrix(df.values)

    # 初始化 SVD
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)

    # 训练并降维
    X_reduced_array = svd.fit_transform(X_sparse)

    # 包装为 DataFrame
    reduced_df = pd.DataFrame(
        X_reduced_array,
        index=df.index,
        columns=[f"SVD_{i+1}" for i in range(X_reduced_array.shape[1])]
    )
    # 方差解释率
    explained_var = svd.explained_variance_ratio_.sum()

    # 打印信息
    print( "原始维度         : ", df.shape)
    print( "降维后           : ", reduced_df.shape) 
    print(f"累计解释方差比   :  {explained_var:.2%}")

    return reduced_df


In [36]:
# 对数据降维
if config["use_pca"]:
    features_train_reduced = apply_truncated_svd(features_train, n_components = 100)
    features_test_reduced = apply_truncated_svd(features_test, n_components = 100)

    features_train = pd.concat([features_train, features_train_reduced], axis=1)
    features_test = pd.concat([features_test, features_test_reduced], axis=1)

features_train.shape, features_train.shape

原始维度         :  (100, 110)
降维后           :  (100, 100)
累计解释方差比   :  100.00%
原始维度         :  (666, 110)
降维后           :  (666, 100)
累计解释方差比   :  100.00%


((100, 210), (100, 210))

# 单次训练推导

In [43]:
# Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果
# ==============================================================
def run_kfold_xgb(features_train, target_train, features_test, config, DIRS, K_FOLDS=10, verbose=0):
    """
    使用 Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果

    参数:
        features_train, target_train        : 训练集特征和标签
        features_test      : 测试集特征
        params      : XGBoost 最优参数 (dict)
        DIRS        : 保存结果的目录字典
        K_FOLDS     : 折数 (默认=5)
        verbose     : 是否打印详细信息
    """

    
        
    config["X shape"] = features_train.shape
    config["y shape"] = target_train.shape
    config["X_test shape"] = features_test.shape


    # ---------- 创建目录 ----------
    for _, path in DIRS.items():
        os.makedirs(path, exist_ok=True)


    time_str = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    history_DIR = os.path.join(DIRS['HISTORY'], time_str)
    os.makedirs(history_DIR, exist_ok=True)



    print("——" * 20)
    print(f"✅ 当前结果将保存到: {time_str}")


    # ---------- 定义交叉验证 ----------
    skfold = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    yeo = PowerTransformer(method="yeo-johnson")                                # 定义 Yeo-Johnson 变换

    # ---------- 初始化存储 ----------
    oof_val = np.zeros(len(features_train))       # OOF 预测
    train_score, val_score = [], []  # 每折 MAE
    test_pred = []                   # 每折 test 预测
    fold_records = []                # 保存每折信息
    all_importances = []             # 特征重要性
    elapsed_list = []                # 耗时记录



    # 循环每一折
    # ==============================================================

    for i, (train_idx, val_idx) in enumerate(skfold.split(features_train, pd.qcut(target_train, q=10).cat.codes), 1):

        # ----- 打印时间信息 -----
        start_now = datetime.now()
        start_str = start_now.strftime("%H:%M:%S")

        if elapsed_list:
            avg_time = np.mean(elapsed_list)
            est_end = start_now + timedelta(seconds=avg_time)

            # 每 5 个一组输出耗时
            parts = [f"{t:6.1f}s" for t in elapsed_list]
            grouped = [" ".join(parts[j:j+5]) for j in range(0, len(parts), 5)]
            elapsed_str = " /// ".join(grouped)

            print(
                f"🔄{i:2d}/{K_FOLDS} ST {start_str}"
                f" ET {est_end.strftime('%H:%M:%S')}"
                f" avg {avg_time:.1f}s"
                f" [{elapsed_str}]",
                end="\r", flush=True
            )
        else:
            print(f"🔄{i:2d}/{K_FOLDS} ST {start_str} ET (暂无历史数据)", end="\r", flush=True)



        # ----- 开始训练 -----
        t0 = time.time()

        # 1. 数据集划分
        x_train, x_val = features_train.iloc[train_idx], features_train.iloc[val_idx]
        y_train, y_val = target_train[train_idx], target_train[val_idx]

        # 2. Yeo-Johnson 变换
        y_train = yeo.fit_transform(y_train.values.reshape(-1, 1)).squeeze()
        y_val   = yeo.transform(y_val.values.reshape(-1, 1)).squeeze()


        # 3. 特征选择（轻量级 XGBoost）
        # 使用
        selector_model = xgb.XGBRegressor(**config["xgb_selector_model_params"])
        # selector_model = xgb.XGBRegressor(
        #     n_estimators   = 500,
        #     max_depth      = 6,
        #     learning_rate  = 0.05,
        #     random_state   = 2025,
        #     device         = "cpu",
        #     objective      = "reg:absoluteerror",
        #     tree_method    = "hist",
        #     verbosity      = 0
        # )
        
        

        selector_model.fit(x_train, y_train)

        selector = SelectFromModel(selector_model, prefit=True, threshold=config["selector_threshold"])
        selected_features = x_train.columns[selector.get_support()].tolist()
        if verbose > 0:
            print(f"✅ 选择的特征数量: {len(selected_features)}")


        # 4. 保留重要特征
        x_train_new = x_train[selected_features]
        x_val_new   = x_val[selected_features]
        x_test_new  = features_test[selected_features]

        # 5. 转换为 DMatrix
        dtrain = xgb.DMatrix(x_train_new, y_train, feature_names=selected_features)
        dval   = xgb.DMatrix(x_val_new,   y_val,   feature_names=selected_features)
        dtest  = xgb.DMatrix(x_test_new,             feature_names=selected_features)


        # 6. XGBoost 训练
        xgb_model = xgb.train(
            params                 = config["xgb_train_model_params"],
            dtrain                 = dtrain,
            num_boost_round        = config["num_boost_round"],
            evals                  = [(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds  = 300,
            verbose_eval           = (1000 if verbose > 0 else False)
        )


        # 保存模型
        model_path = os.path.join(history_DIR, f"xgb_model_fold{i}.json")
        xgb_model.save_model(model_path)

        # 7. 获取特征重要性
        imp_dict = xgb_model.get_score(importance_type="gain")
        imp_df = pd.DataFrame(imp_dict.items(), columns=["Feature", "Importance"])
        imp_df["Fold"] = i
        all_importances.append(imp_df)


        # 8. 预测
        y_train_pred = xgb_model.predict(dtrain)
        y_val_pred   = xgb_model.predict(dval)
        y_test_pred  = xgb_model.predict(dtest)

        # 9. 逆变换
        y_train      = yeo.inverse_transform(y_train.reshape(-1, 1)).squeeze()
        y_val        = yeo.inverse_transform(y_val.reshape(-1, 1)).squeeze()
        y_train_pred = yeo.inverse_transform(y_train_pred.reshape(-1, 1)).squeeze()
        y_val_pred   = yeo.inverse_transform(y_val_pred.reshape(-1, 1)).squeeze()
        y_test_pred  = yeo.inverse_transform(y_test_pred.reshape(-1, 1)).squeeze()

        # 10. 计算 MAE
        train_mae = mean_absolute_error(y_train, y_train_pred)
        val_mae   = mean_absolute_error(y_val,   y_val_pred)
        if verbose > 0:
            print(f"Fold {i}: Train MAE={train_mae:.4f}, Val MAE={val_mae:.4f}，用时 {elapsed:.2f} 秒")


        # ----- 保存结果 -----
        train_score.append(train_mae)
        val_score.append(val_mae)
        oof_val[val_idx] = y_val_pred
        test_pred.append(y_test_pred)

        elapsed = time.time() - t0
        elapsed_list.append(elapsed)

        fold_records.append({
            "Fold": i,
            "Train_MAE": train_mae,
            "Val_MAE": val_mae,
            "Num_Features": len(selected_features),
            "Selected_Features": selected_features,
            "elapsed": elapsed
        })

    # 保存整体结果
    # ==============================================================
    if verbose > 0:
        print("\n")
        print(f"📊 Train MAE 平均值 : {np.mean(train_score):.4f}")
        print(f"📊 Val   MAE 平均值 : {np.mean(val_score):.4f}")
        print(f"📊 Train MAE 标准差 : {np.std(train_score, ddof=0):.4f}")
        print(f"📊 Val   MAE 标准差 : {np.std(val_score, ddof=0):.4f}")

    # 参数
    with open(os.path.join(history_DIR, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4, ensure_ascii=False)

    # 每折信息
    folds_df = pd.DataFrame(fold_records)
    folds_df.to_csv(os.path.join(history_DIR, "folds_info.csv"), index=False, encoding="utf-8-sig")


    # 特征重要性
    if all_importances:
        valid_imps = [df for df in all_importances if not df.empty]
        all_imp_df = pd.concat(valid_imps, axis=0) if valid_imps else pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    else:
        all_imp_df = pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    all_imp_df.to_csv(os.path.join(history_DIR, "feature_importance_all.csv"), index=False, encoding="utf-8-sig")


    # 测试集预测
    test_pred_array = np.vstack(test_pred).T
    test_pred_df = pd.DataFrame(test_pred_array, columns=[f"Fold_{j+1}" for j in range(test_pred_array.shape[1])])
    test_pred_df["Final_Pred"] = test_pred_df.mean(axis=1)
    test_pred_df.to_csv(os.path.join(history_DIR, "test_predictions.csv"), index=False, encoding="utf-8-sig")

    # 总结
    with open(os.path.join(history_DIR, "summary.txt"), "w", encoding="utf-8") as f:
        f.write(f"Train MAE Mean : {np.mean(train_score):.4f}\n")
        f.write(f"Val   MAE Mean : {np.mean(val_score):.4f}\n")
        f.write(f"Train MAE Std  : {np.std(train_score, ddof=0):.4f}\n")
        f.write(f"Val   MAE Std  : {np.std(val_score, ddof=0):.4f}\n")


    # 最终提交
    final_score = np.mean(val_score)
    submission = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "sample_submission.csv"))
    submission["Tm"] = test_pred_df["Final_Pred"]

    submission_path = os.path.join(history_DIR, f"sub_{time_str}_{final_score:.8f}.csv")
    submission.to_csv(submission_path, index=False)
    submission.to_csv(os.path.join(DIRS['SUBMISSION'], f"sub_{time_str}_{final_score:.8f}.csv"), index=False)

        
    config["time_str"] = time_str
    config["score"] = final_score


    # ---------- 返回结果 ----------
    return {
        "oof_val": oof_val,
        "train_score": train_score,
        "val_score": val_score,
        "test_pred": test_pred_df,
        "folds_info": folds_df,
        "feature_importance": all_imp_df,
        "submission_path": submission_path,
        "time": time_str,
        "final_score": final_score,
        "config": config
    }


In [44]:
# 执行一次

X = features_train
y = target_train
X_test = features_test
print(X.shape, X_test.shape)


results = run_kfold_xgb(X, y, X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
config = results['config']

print('\n',results['final_score'])

(100, 210) (666, 210)
————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-22 00-12-29
🔄 1/10 ST 00:12:29 ET (暂无历史数据)


The least populated class in y has only 9 members, which is less than n_splits=10.



🔄10/10 ST 00:12:39 ET 00:12:40 avg 1.1s [   1.1s    1.1s    1.2s    1.1s    1.1s ///    1.1s    1.1s    1.2s    1.1s]
 53.25167077636718


In [None]:
# 打印当前config
print(config_to_str(config))

- ISTEST              : True
- remove_dup_smiles   : True
- use_feature_gen     : False
- use_pca             : True
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 6
     - eta                 : 0.1
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
- X shape             : (100, 210)
- y shape             : (100,)
- X_test shape        : (666, 210)
- time_str            : 2025-10-22 00-07-23
- score               : 53.25167077636718


# 提交 kaggle 平台测试

In [40]:
# 根据 submission_time 定位文件路径 提交 kaggle 平台测试

import os
import itertools
import time
from kaggle.api.kaggle_api_extended import KaggleApi


def find_submission_file(submission_time, submission_dir):
    """
    在 submission_dir 下查找包含 submission_time 的文件
    一旦找到立刻返回完整路径；如果没找到则返回 None
    """
    for fname in os.listdir(submission_dir):
        if submission_time in fname:
            file_path = os.path.join(submission_dir, fname)
            print(f"✅ 找到目标文件: {fname}")
            return file_path
    
    print(f"⚠️ 未找到包含 {submission_time} 的文件")
    return None

def submit_and_get_score(file_path, competition_name, message="My submission"):
    """
    封装 Kaggle 提交并等待结果评分
    --------------------------------------
    file_path        : str  提交文件路径
    competition_name : str  Kaggle 比赛名称 (URL 最后一段)
    message          : str  提交备注
    """
    # 1. 配置 Kaggle API
    os.environ["KAGGLE_CONFIG_DIR"] = r"C:\Users\Admin\.kaggle"
    api = KaggleApi()
    api.authenticate()
    print("✅ Kaggle API 已经配置成功！")

    # 2. 提交文件
    api.competition_submit(
        file_name=file_path,
        competition=competition_name,
        message=message
    )
    print("✅ 提交完成！请等待评分...")

    # 3. 动态等待
    spinner = itertools.cycle(["|", "/", "-", "\\"])
    while True:
        submissions = api.competition_submissions(competition_name)
        latest = submissions[0]
        status_str = str(latest._status).lower()

        if "complete" in status_str and latest._public_score is not None:
            print("\n🎯 最终结果:")
            print(f"Public 分数 : {latest._public_score}")
            print(f"Private 分数: {latest._private_score}")
            print(f"提交 ID     : {latest._ref}")
            print(f"文件名      : {latest._file_name}")
            print(f"状态        : {latest._status}")
            print(f"提交时间    : {latest._date}")
            print(f"描述/备注   : {latest._description}")
            return latest

        spin_char = next(spinner)
        print(f"当前状态: {status_str} , 等待中 {spin_char}", end="\r", flush=True)
        time.sleep(0.2)  # 每 0.5 秒检查一次


### 不轻易运行，再三考虑

In [41]:
# submission_time 提交
submission_time = "2025-10-21 23-51-09"
competition_name = "melting-point"
message =  f"该提交文件的参数：\n{config_to_str(config)} "
print(message)

target_file = find_submission_file(submission_time, DIRS['SUBMISSION'] )

# submit_and_get_score(target_file, competition_name, message)

该提交文件的参数：
- ISTEST              : True
- remove_dup_smiles   : True
- use_feature_gen     : False
- use_pca             : True
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 6
     - eta                 : 0.1
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
- X shape             : (100, 210)
- y shape             : (100,)
- X_test shape        : (666, 210)
- time_str            : 2025-10-22 00-07-23
- score               : 53.25167077636718 
⚠️ 未找到包含 2025-10-21 23-51-09 的文件


# 参数优化

In [57]:
# 实验配置单
base_config  = {
    # 固定开关
    "ISTEST"            : False,

    "remove_dup_smiles" : True, 
    "use_feature_gen"   : False,
    "use_pca"           : True,
    "pca_components"    : 100,

    # 特征选择 XGBoost 参数
    "xgb_selector_model_params": {
        "n_estimators"  : 500,
        "max_depth"     : 6,
        "learning_rate" : 0.05,
        "random_state"  : 2025,
        "device"        : "cpu",
        "objective"     : "reg:absoluteerror",
        "tree_method"   : "hist",
        "verbosity"     : 0
    },

    "selector_threshold"  : "mean",   

    # 训练设置
    "xgb_train_model_params": {
        'max_depth'   : 6,
        'eta'         : 0.1,
        'tree_method' : 'hist',
        'eval_metric' : 'mae',
    },
    "num_boost_round": 15000,
}

In [74]:
# 定义优化任务  加入标识符 host: hao-2   ip: 192.168.40.1

import copy
import contextlib
import io

def objective(trial):
    """
    Optuna 的目标函数 (Objective Function)
    每次 trial 会生成一组超参数，用于训练 XGBoost 模型，
    并返回交叉验证的平均 RMSE 作为优化目标。
    """




    # 1. 定义 超参数 搜索空间
    # 拷贝一份 config，避免全局污染
    config = copy.deepcopy(base_config)

    # 只修改需要优化的参数
    config["remove_dup_smiles"] = trial.suggest_categorical("remove_dup_smiles", [True, False])
    config["use_feature_gen"]   = trial.suggest_categorical("use_feature_gen", [True, False])
    config["use_pca"]           = trial.suggest_categorical("use_pca", [True, False])

    # config["xgb_selector_model_params"]["random_state"] = trial.suggest_int("selector_random_state", 1, 9999)
    config["xgb_selector_model_params"]["random_state"] = trial.suggest_categorical("selector_random_state", [42, 2025])
    config["xgb_selector_model_params"]["device"]       = trial.suggest_categorical("selector_device", ["cpu", "cuda"])
    # config["xgb_selector_model_params"]["tree_method"]  = trial.suggest_categorical("selector_tree_method", ["hist", "approx"])

    config["selector_threshold"] = trial.suggest_categorical("selector_threshold", ["mean", "0.75*mean", "0.5*mean", "1.25*mean"])

    config["xgb_train_model_params"]["max_depth"] = trial.suggest_int("train_max_depth", 3, 12)
    config["xgb_train_model_params"]["eta"] = trial.suggest_float("train_eta", 0.01 , 0.3 , log=True)








    # 主流程---------------------------------------------------------------------------------------------------
    # 创建一个黑洞缓冲区
    f = io.StringIO()
    with contextlib.redirect_stdout(f):
        None

    # 打印当前config
    print(config_to_str(config))
    

    # 加载数据
    merge_df, test_df =  loaddata(DIRS)

    # 数据拆分
    print("数据拆分---------------------------")
    features_train, target_train, features_test = prepare_features_and_target(merge_df, test_df, config)

    # 特征生成
    if config["use_feature_gen"]:
        print("特征生成---------------------------")
        features_train = add_chemical_features(features_train)
        features_test  = add_chemical_features(features_test)
        print(features_train.shape, features_test.shape)

    # 数据降维
    if config["use_pca"]:
        print("数据降维---------------------------")
        features_train_reduced = apply_truncated_svd(features_train, n_components = 100)
        features_test_reduced = apply_truncated_svd(features_test, n_components = 100)

        features_train = pd.concat([features_train, features_train_reduced], axis=1)
        features_test = pd.concat([features_test, features_test_reduced], axis=1)
        print(features_train.shape, features_test.shape)

    X, y, X_test = features_train, target_train, features_test
    print("开始训练---------------------------")





    results = run_kfold_xgb(X, y, X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
    config = results['config']
    score = results['final_score']



    HOSTNAME = socket.gethostname()
    HOST_IP = socket.gethostbyname(HOSTNAME)
    trial.set_user_attr("host", HOSTNAME)        # 你自己定义主机 A/B
    trial.set_user_attr("ip", HOST_IP)        # 你自己定义角色 A/B

    # 4. 返回平均 MAE
    return score

In [59]:
STUDY_NAME = "test" if base_config["ISTEST"] else "optuna_task1"



In [78]:
# 开始优化

# 1. 定义 SQLite 数据库存储路径

storage_url = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

study = optuna.create_study(
    study_name = STUDY_NAME,
    # study_name="ghsdjsrtjrswtjhwrt",
    storage=storage_url,
    load_if_exists=True
)

# 自动获取当前主机名\当前主机的 IP 地址
HOSTNAME = socket.gethostname()
HOST_IP = socket.gethostbyname(HOSTNAME)
print("主机名:", HOSTNAME," 主机 IP:", HOST_IP)
time.sleep(1)

# 5. 启动超参数搜索
print("🔎 开始超参数搜索...")
if base_config["ISTEST"]:
    study.optimize(objective, n_trials = 3)
else:
    study.optimize(objective, n_trials = 100)


# 6. 打印最优结果
print("\n✅ 训练完成！")
print(f"📊 已完成试验次数 : {len(study.trials)}")
print(f"🏆 最优试验编号   : {study.best_trial.number}")
print(f"📉 最优 MAE       : {study.best_value}")
print(f"⚙️ 最优参数组合   : {study.best_trial.params}")


[I 2025-10-22 01:01:07,374] Using an existing study with name 'optuna_task1' instead of creating a new one.


主机名: hao-2  主机 IP: 192.168.40.1
🔎 开始超参数搜索...
- ISTEST              : False
- remove_dup_smiles   : True
- use_feature_gen     : True
- use_pca             : True
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 42
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 6
     - eta                 : 0.01368700824230773
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
⚠️ 检测到 276 个重复 SMILES
✅ 删除完成: 从 (28808,

[I 2025-10-22 02:45:55,299] Trial 8 finished with value: 17.849929328445675 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': True, 'selector_random_state': 42, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 6, 'train_eta': 0.01368700824230773}. Best is trial 8 with value: 17.849929328445675.


- ISTEST              : False
- remove_dup_smiles   : False
- use_feature_gen     : True
- use_pca             : True
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 42
     - device              : cuda
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : 0.75*mean
🔹 xgb_train_model_params:
     - max_depth           : 4
     - eta                 : 0.05053010910637573
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
📊 数据拆分完成
训练集特征 features_train  shape   : (28808, 6528)
训练集目标   target_train  

[I 2025-10-22 04:18:13,600] Trial 9 finished with value: 17.995321574148264 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': True, 'use_pca': True, 'selector_random_state': 42, 'selector_device': 'cuda', 'selector_threshold': '0.75*mean', 'train_max_depth': 4, 'train_eta': 0.05053010910637573}. Best is trial 8 with value: 17.849929328445675.


- ISTEST              : False
- remove_dup_smiles   : True
- use_feature_gen     : False
- use_pca             : True
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 42
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : 0.5*mean
🔹 xgb_train_model_params:
     - max_depth           : 4
     - eta                 : 0.026729074261343022
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
⚠️ 检测到 276 个重复 SMILES
✅ 删除完成: 从 (28808, 6530) → (28405, 6530)
📊 数据拆分完成
训练集特征 f

[I 2025-10-22 05:58:28,065] Trial 10 finished with value: 18.924810628669103 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': False, 'use_pca': True, 'selector_random_state': 42, 'selector_device': 'cpu', 'selector_threshold': '0.5*mean', 'train_max_depth': 4, 'train_eta': 0.026729074261343022}. Best is trial 8 with value: 17.849929328445675.


- ISTEST              : False
- remove_dup_smiles   : False
- use_feature_gen     : False
- use_pca             : False
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : 0.75*mean
🔹 xgb_train_model_params:
     - max_depth           : 12
     - eta                 : 0.0955937800637472
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
📊 数据拆分完成
训练集特征 features_train  shape   : (28808, 6528)
训练集目标   target_trai

[I 2025-10-22 06:18:13,573] Trial 11 finished with value: 18.474602442284255 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': False, 'use_pca': False, 'selector_random_state': 2025, 'selector_device': 'cpu', 'selector_threshold': '0.75*mean', 'train_max_depth': 12, 'train_eta': 0.0955937800637472}. Best is trial 8 with value: 17.849929328445675.


- ISTEST              : False
- remove_dup_smiles   : True
- use_feature_gen     : False
- use_pca             : False
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 5
     - eta                 : 0.01844230194954457
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
⚠️ 检测到 276 个重复 SMILES
✅ 删除完成: 从 (28808, 6530) → (28405, 6530)
📊 数据拆分完成
训练集特征 fea

[I 2025-10-22 07:45:30,510] Trial 12 finished with value: 18.843924579801143 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': False, 'use_pca': False, 'selector_random_state': 2025, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 5, 'train_eta': 0.01844230194954457}. Best is trial 8 with value: 17.849929328445675.


- ISTEST              : False
- remove_dup_smiles   : True
- use_feature_gen     : True
- use_pca             : True
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 42
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 11
     - eta                 : 0.013329388592334063
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
⚠️ 检测到 276 个重复 SMILES
✅ 删除完成: 从 (28808, 6530) → (28405, 6530)
📊 数据拆分完成
训练集特征 featu

[I 2025-10-22 09:13:17,184] Trial 13 finished with value: 18.582485674397393 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': True, 'selector_random_state': 42, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 11, 'train_eta': 0.013329388592334063}. Best is trial 8 with value: 17.849929328445675.


- ISTEST              : False
- remove_dup_smiles   : False
- use_feature_gen     : True
- use_pca             : False
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 8
     - eta                 : 0.013678962412610542
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
📊 数据拆分完成
训练集特征 features_train  shape   : (28808, 6528)
训练集目标   target_train  sh

[I 2025-10-22 11:06:58,744] Trial 14 finished with value: 17.556320884921817 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': True, 'use_pca': False, 'selector_random_state': 2025, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 8, 'train_eta': 0.013678962412610542}. Best is trial 14 with value: 17.556320884921817.


- ISTEST              : False
- remove_dup_smiles   : False
- use_feature_gen     : True
- use_pca             : True
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 42
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : 1.25*mean
🔹 xgb_train_model_params:
     - max_depth           : 9
     - eta                 : 0.045732428074001644
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
📊 数据拆分完成
训练集特征 features_train  shape   : (28808, 6528)
训练集目标   target_train  

[I 2025-10-22 11:41:15,531] Trial 19 finished with value: 18.085312780044305 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': True, 'use_pca': True, 'selector_random_state': 42, 'selector_device': 'cpu', 'selector_threshold': '1.25*mean', 'train_max_depth': 9, 'train_eta': 0.045732428074001644}. Best is trial 14 with value: 17.556320884921817.


- ISTEST              : False
- remove_dup_smiles   : False
- use_feature_gen     : False
- use_pca             : False
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : 0.75*mean
🔹 xgb_train_model_params:
     - max_depth           : 4
     - eta                 : 0.06743144217230011
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
📊 数据拆分完成
训练集特征 features_train  shape   : (28808, 6528)
训练集目标   target_trai

[I 2025-10-22 13:15:15,085] Trial 21 finished with value: 17.975167301336665 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': False, 'use_pca': False, 'selector_random_state': 2025, 'selector_device': 'cpu', 'selector_threshold': '0.75*mean', 'train_max_depth': 4, 'train_eta': 0.06743144217230011}. Best is trial 14 with value: 17.556320884921817.


- ISTEST              : False
- remove_dup_smiles   : True
- use_feature_gen     : True
- use_pca             : False
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 42
     - device              : cuda
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 7
     - eta                 : 0.01061463707842147
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
⚠️ 检测到 276 个重复 SMILES
✅ 删除完成: 从 (28808, 6530) → (28405, 6530)
📊 数据拆分完成
训练集特征 featu

[I 2025-10-22 14:56:17,858] Trial 23 finished with value: 18.02550230756041 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_random_state': 42, 'selector_device': 'cuda', 'selector_threshold': 'mean', 'train_max_depth': 7, 'train_eta': 0.01061463707842147}. Best is trial 14 with value: 17.556320884921817.


- ISTEST              : False
- remove_dup_smiles   : True
- use_feature_gen     : True
- use_pca             : True
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 42
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 7
     - eta                 : 0.02880197004042992
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
⚠️ 检测到 276 个重复 SMILES
✅ 删除完成: 从 (28808, 6530) → (28405, 6530)
📊 数据拆分完成
训练集特征 feature

[I 2025-10-22 16:18:13,719] Trial 25 finished with value: 17.762302445492622 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': True, 'selector_random_state': 42, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 7, 'train_eta': 0.02880197004042992}. Best is trial 14 with value: 17.556320884921817.


- ISTEST              : False
- remove_dup_smiles   : True
- use_feature_gen     : True
- use_pca             : False
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 9
     - eta                 : 0.027734152778904956
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
⚠️ 检测到 276 个重复 SMILES
✅ 删除完成: 从 (28808, 6530) → (28405, 6530)
📊 数据拆分完成
训练集特征 fea

[I 2025-10-22 17:27:23,095] Trial 27 finished with value: 17.725474441857436 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_random_state': 2025, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 9, 'train_eta': 0.027734152778904956}. Best is trial 14 with value: 17.556320884921817.


- ISTEST              : False
- remove_dup_smiles   : False
- use_feature_gen     : True
- use_pca             : False
- pca_components      : 100
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:absoluteerror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 10
     - eta                 : 0.020728885589285505
     - tree_method         : hist
     - eval_metric         : mae
- num_boost_round     : 15000
✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528
数据拆分---------------------------
📊 数据拆分完成
训练集特征 features_train  shape   : (28808, 6528)
训练集目标   target_train  s

[W 2025-10-22 17:58:59,082] Trial 29 failed with parameters: {'remove_dup_smiles': False, 'use_feature_gen': True, 'use_pca': False, 'selector_random_state': 2025, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 10, 'train_eta': 0.020728885589285505} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "d:\Software\conda\envs\py39_tf\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_24996\4259608936.py", line 84, in objective
    results = run_kfold_xgb(X, y, X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_24996\1853940361.py", line 130, in run_kfold_xgb
    xgb_model = xgb.train(
  File "d:\Software\conda\envs\py39_tf\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "d:\Software\conda\envs\py39_tf\lib\site-packages\xgboo

KeyboardInterrupt: 

# 管理数据库信息

In [79]:
# 查询数据库详细数据

storage_url = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

studies = optuna.study.get_all_study_summaries(storage=storage_url)

if not studies:
    print("❌ 当前数据库里无 study")
else:
    print("✅ 数据库中的 study 列表:")
    for s in studies:

        print("-", s.study_name)

        study = optuna.load_study(study_name=s.study_name, storage=storage_url)

        print("         Trials:")
        for trial in study.trials:
            host = trial.user_attrs.get("host") or "unknown"
            ip = trial.user_attrs.get("ip") or "unknown"
            value = f"{trial.value:.4f}" if trial.value is not None else "None"

            print(
                f"    Trial {trial.number:4d}: "
                f"host={host:<16}, ip={ip:<15}, "
                f"value={value:<10}, params={trial.params}"
            )

        print("    总 trial 数量:", len(study.trials))
        print("=" * 100)


✅ 数据库中的 study 列表:
- optuna_task1
         Trials:
    Trial    0: host=unknown         , ip=unknown        , value=None      , params={'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_random_state': 2025, 'selector_device': 'cpu', 'selector_threshold': '0.75*mean', 'train_max_depth': 6, 'train_eta': 0.14963770710824598}
    Trial    1: host=unknown         , ip=unknown        , value=None      , params={'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_random_state': 42, 'selector_device': 'cuda', 'selector_threshold': '1.25*mean', 'train_max_depth': 7, 'train_eta': 0.018021464633564754}
    Trial    2: host=unknown         , ip=unknown        , value=None      , params={'remove_dup_smiles': False, 'use_feature_gen': False, 'use_pca': False, 'selector_random_state': 42, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 8, 'train_eta': 0.021659776125338565}
    Trial    3: host=unknown         , ip=unk

In [51]:
# 清理前：先查看数据库里当前有哪些 study 存在，以及每个 study 里有多少个 trial

storage = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

studies = optuna.study.get_all_study_summaries(storage=storage)
print("现有 study：", [s.study_name for s in studies])

for s in studies:
    study = optuna.load_study(study_name=s.study_name, storage=storage)
    print(f"Study:   {s.study_name:30s}, Trials: {len(study.trials):4d}")

现有 study： ['test']
Study:   test                          , Trials:   12


In [63]:
# 清理中：删除指定 study
# 指定要删除的名称
to_delete = ["melting_point_study"]   # 可以写一个或多个

to_delete = [            ]

for s in studies:
    if s.study_name in to_delete:
        optuna.delete_study(study_name=s.study_name, storage=storage)
        print("已删除:", s.study_name)


In [62]:
# 清理后：再次检查
studies_after = optuna.study.get_all_study_summaries(storage=storage)
print("清理后 study：", [s.study_name for s in studies_after])


清理后 study： []
