In [1]:
# 系统库
import os
import subprocess
import time
import shutil
import json
import socket
from datetime import datetime, timedelta

# 第三方科学计算 & 可视化
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 设置中文字体，避免乱码
plt.rcParams['font.sans-serif'] = ['SimHei']        # 黑体
plt.rcParams['axes.unicode_minus'] = False          # 解决负号显示成方块的问题

# 机器学习 & 优化
import xgboost as xgb
import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer

# 化学信息学 (RDKit)
from rdkit import Chem, RDLogger
from rdkit.Chem import (
    Descriptors, Crippen, rdMolDescriptors,
    MACCSkeys, RDKFingerprint, rdFingerprintGenerator
)
from rdkit.Chem.AtomPairs import Pairs, Torsions

# 关闭 RDKit 的警告
RDLogger.DisableLog('rdApp.*')

# Avalon 指纹（可选）
try:
    from rdkit.Avalon import pyAvalonTools
    avalon_available = True
except ImportError:
    avalon_available = False
print(f"Avalon available: {avalon_available}")

# Kaggle API
from kaggle.api.kaggle_api_extended import KaggleApi



  from .autonotebook import tqdm as notebook_tqdm


Avalon available: True


In [2]:
# 初始化

host = "10.162.147.95"
user = "user1"
password = "123456"

database_name = 'thermophysical_property_melting_point'  # 数据库名称
competition = database_name  # 竞赛名称
kaggle_competition_name = "melting-point"
study_save_name = "XGBoost_Optuna_pure_data"  # 实验名称，用于保存 Optuna study 和结果文件夹命名





if socket.gethostname() == 'hao-2':
    dir = rf'D:\DATA_hao\Kaggle_\{competition}'
else:
    dir = os.getcwd()




DIRS = {
    "dir":              dir,                                       
    "DATA_DIR000":      os.path.join(dir, "DATA_DIR000"),
    "HISTORY":          os.path.join(dir, "HISTORY", f"{study_save_name}"),
    "SUBMISSION":       os.path.join(dir, "SUBMISSION", f"{study_save_name}"),
}


# 自动创建目录
for key, path in DIRS.items():
    os.makedirs(path, exist_ok=True)

# 打印时一行一个地址
print("✅ 路径已创建：\n")
for key, path in DIRS.items():
    print(f"{key:<12} : {path}")


✅ 路径已创建：

dir          : D:\DATA_hao\Kaggle_\thermophysical_property_melting_point
DATA_DIR000  : D:\DATA_hao\Kaggle_\thermophysical_property_melting_point\DATA_DIR000
HISTORY      : D:\DATA_hao\Kaggle_\thermophysical_property_melting_point\HISTORY\XGBoost_Optuna_pure_data
SUBMISSION   : D:\DATA_hao\Kaggle_\thermophysical_property_melting_point\SUBMISSION\XGBoost_Optuna_pure_data


# 数据提取处理

In [3]:
def show_df_info(df, name: str):
    """
    打印单个 DataFrame 的形状与列名信息。
    参数:
        df   : pandas.DataFrame
        name : 显示名称（字符串）
    """
    print(f"📊 {name:<16} shape: {str(df.shape):<16}  列名: {df.columns.tolist()}")


In [4]:
# 加载Kaggle 训练集和 Bradley 熔点公开数据集

# Kaggle 提供的训练集和测试集
train_df = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "train.csv"))
test_df  = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "test.csv"))
train_df["origin_info"] = 'Kaggle' 
test_df["origin_info"] = 'Kaggle' 
# 外部 Bradley 熔点公开数据集
bradley_df = pd.read_excel(os.path.join(DIRS['DATA_DIR000'], "BradleyMeltingPointDataset.xlsx"))
bradleyplus_df = pd.read_excel(os.path.join(DIRS['DATA_DIR000'], "BradleyDoublePlusGoodMeltingPointDataset.xlsx"))
bradley_df["origin_info"] = 'bradley' 
bradleyplus_df["origin_info"] = 'bradleyplus' 
# 只保留需要的列
train_df = train_df[['SMILES', 'Tm', 'origin_info']]
test_df  = test_df[['id', 'SMILES', 'origin_info']]

# 输出数据集规模，确认加载成功
show_df_info(train_df, "Train")
show_df_info(test_df, "Test")
show_df_info(bradley_df, "Bradley")
show_df_info(bradleyplus_df, "Bradley Plus")

📊 Train            shape: (2662, 3)         列名: ['SMILES', 'Tm', 'origin_info']
📊 Test             shape: (666, 3)          列名: ['id', 'SMILES', 'origin_info']
📊 Bradley          shape: (28645, 10)       列名: ['key', 'name', 'smiles', 'mpC', 'csid', 'link', 'source', 'donotuse', 'donotusebecause', 'origin_info']
📊 Bradley Plus     shape: (3041, 12)        列名: ['key', 'name', 'smiles', 'mpC', 'csid', 'link', 'source', 'count', 'min', 'max', 'range', 'origin_info']


In [5]:
# 外部 Bradley 熔点数据集处理
# 1. 摄氏度 → 开尔文: T(K) = T(°C) + 273.15
bradley_df      ['Tm']     = bradley_df      ['mpC'] + 273.15
bradleyplus_df  ['Tm']     = bradleyplus_df  ['mpC'] + 273.15

# 2. 保留 [SMILES, Tm] 并统一列名
bradley_df     = bradley_df     [['smiles', 'Tm', "origin_info"]].rename(columns={'smiles': 'SMILES'})
bradleyplus_df = bradleyplus_df [['smiles', 'Tm', "origin_info"]].rename(columns={'smiles': 'SMILES'})

show_df_info(bradley_df, "Bradley")
show_df_info(bradleyplus_df, "Bradley Plus")

# 打印原始信息
show_df_info(train_df, "Train")

# 拼接到 Kaggle 训练集
merge_df = pd.concat([train_df, bradley_df, bradleyplus_df], axis=0).reset_index(drop=True)

show_df_info(train_df, "train_df")


📊 Bradley          shape: (28645, 3)        列名: ['SMILES', 'Tm', 'origin_info']
📊 Bradley Plus     shape: (3041, 3)         列名: ['SMILES', 'Tm', 'origin_info']
📊 Train            shape: (2662, 3)         列名: ['SMILES', 'Tm', 'origin_info']
📊 train_df         shape: (2662, 3)         列名: ['SMILES', 'Tm', 'origin_info']


In [6]:
# 规范化 SMILES
def canonicalize(smile):
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except:
        return None
    


merge_df["SMILES_normalized"] = merge_df["SMILES"].apply(canonicalize)
test_df["SMILES_normalized"] = test_df["SMILES"].apply(canonicalize)

# 输出数据集规模，确认加载成功
show_df_info(merge_df, "merge_df")
show_df_info(test_df, "test_df")


📊 merge_df         shape: (34348, 4)        列名: ['SMILES', 'Tm', 'origin_info', 'SMILES_normalized']
📊 test_df          shape: (666, 4)          列名: ['id', 'SMILES', 'origin_info', 'SMILES_normalized']


In [7]:
# 按 SMILES_normalized 分组并根据 origin_info 条件进行筛选与处理。
def process_merge_df(df):
    """
    按 SMILES_normalized 分组并根据 origin_info 条件进行筛选与处理。
    
    规则：
      1. 如果组中存在 'Kaggle' → 删除 'bradleyplus' 和 'bradley'，
         同时 'Kaggle' 仅保留一个，Tm 取平均值。
         
      2. 否则如果存在 'bradleyplus' → 删除 'bradley'，
         同时 'bradleyplus' 仅保留一个，Tm 取平均值。
         
      3. 否则（仅有 bradley）：
           - 若有两行：保留一行，Tm 取平均值；
           - 若多于两行：保留 Tm 为中位数的那一行，删除其他。
    """
    processed_groups = []

    for smiles, group in df.groupby("SMILES_normalized", group_keys=False):
        origin_values = group["origin_info"].unique()

        # ✅ 规则 1：存在 Kaggle
        if "Kaggle" in origin_values:
            kaggle_rows = group[group["origin_info"] == "Kaggle"].copy()
            if len(kaggle_rows) > 1:
                avg_tm = kaggle_rows["Tm"].mean()
                kaggle_rows = kaggle_rows.iloc[[0]].copy()
                kaggle_rows["Tm"] = avg_tm
            group = kaggle_rows  # 删除其他来源（bradley, bradleyplus）

        # ✅ 规则 2：无 Kaggle，但存在 bradleyplus
        elif "bradleyplus" in origin_values:
            bp_rows = group[group["origin_info"] == "bradleyplus"].copy()
            if len(bp_rows) > 1:
                avg_tm = bp_rows["Tm"].mean()
                bp_rows = bp_rows.iloc[[0]].copy()
                bp_rows["Tm"] = avg_tm
            group = bp_rows  # 删除 bradley 行

        # ✅ 规则 3：仅 bradley
        else:
            if len(group) == 2:
                avg_tm = group["Tm"].mean()
                group = group.iloc[[0]].copy()
                group["Tm"] = avg_tm
            elif len(group) > 2:
                median_tm = group["Tm"].median()
                group = group.iloc[(group["Tm"] - median_tm).abs().argsort()[:1]]
            # 若只有一行，则保持不变

        processed_groups.append(group)

    result = pd.concat(processed_groups, ignore_index=True)
    return result


In [8]:
# 筛选与处理重复数据
duplicates = merge_df[
    merge_df.duplicated("SMILES_normalized", keep=False)
].copy()

# 排序让相同的值挨在一起
duplicates = duplicates.sort_values(by="SMILES_normalized").reset_index(drop=True)


merge_filtered_df = process_merge_df(merge_df)
show_df_info(merge_filtered_df, "merge_filtered_df")


duplicates2 = merge_filtered_df[
    merge_filtered_df.duplicated("SMILES_normalized", keep=False)
].copy()

# 排序让相同的值挨在一起
duplicates2 = duplicates2.sort_values(by="SMILES_normalized").reset_index(drop=True)


📊 merge_filtered_df shape: (20262, 4)        列名: ['SMILES', 'Tm', 'origin_info', 'SMILES_normalized']


In [9]:
# 提取所有分子描述符 (Descriptors)
def extract_all_descriptors(df, SMILES_col):
    """
    输入:
        df         : DataFrame，包含 SMILES 列
        SMILES_col : 字符串，SMILES 列的名称
    输出:
        DataFrame，原始数据 + 208 个分子描述符
    """

    # 1. 获取 RDKit 内置的分子描述符
    descriptor_list = Descriptors._descList   # [(name, func), ...]
    descriptors = [desc[0] for desc in descriptor_list]
    print(f"📊 一共存在 {len(descriptors)} 个分子描述符特征")

    # 2. 遍历每个分子，计算描述符
    results = []
    total = len(df)
    for idx, smi in enumerate(df[SMILES_col]):
        mol = Chem.MolFromSmiles(smi)

        if mol is None:
            row = {name: None for name, func in descriptor_list}   # 无效 SMILES
        else:
            row = {name: func(mol) for name, func in descriptor_list}  # 有效 SMILES

        results.append(row)

        # 打印进度条（覆盖式打印）
        print(f"🔄 处理进度: {idx+1:5d}/{total:5d}", end="\r", flush=True)
    print("\n✅ 描述符计算完成")

    # 3. 合并原始数据与新特征
    df_desc = pd.DataFrame(results)
    return pd.concat([df, df_desc], axis=1)








# # ============ 应用函数 ============

# show_df_info(merge_filtered_df, "filtered_df")
# show_df_info(test_df, "test_df")

# descriptors_train_df = extract_all_descriptors(merge_filtered_df, "SMILES_normalized")
# descriptors_test_df  = extract_all_descriptors(test_df, "SMILES_normalized")

# show_df_info(descriptors_train_df, "descriptors_train_df")
# show_df_info(descriptors_test_df, "descriptors_test_df")

# # 删除无效数据 (有 NaN 的行)
# descriptors_train_df = descriptors_train_df.dropna().reset_index(drop=True)
# descriptors_test_df  = descriptors_test_df.dropna().reset_index(drop=True)

# show_df_info(descriptors_train_df, "descriptors_train_df")
# show_df_info(descriptors_test_df, "descriptors_test_df")

# # 保存到 CSV
# descriptors_train_df_path = os.path.join(DIRS['DATA_DIR000'], "descriptors_train_df.csv")
# descriptors_test_df_path  = os.path.join(DIRS['DATA_DIR000'], "descriptors_test_df.csv")
# descriptors_train_df.to_csv(descriptors_train_df_path, index=False)
# descriptors_test_df.to_csv(descriptors_test_df_path, index=False)

# print(f"✅ descriptors_train_df 已保存到 {descriptors_train_df_path}")
# print(f"✅ descriptors_test_df 已保存到 {descriptors_test_df_path}")


In [10]:
# 提取所有分子指纹 (Fingerprints)
def extract_all_fingerprint(df, SMILES_col, morgan_radius=2, morgan_nbits=1024):
    """
    输入参数:
        df            : DataFrame，包含 SMILES 的表格
        SMILES_col    : str，SMILES 所在列的列名
        morgan_radius : int，Morgan 指纹半径 (默认=2)
        morgan_nbits  : int，Morgan/FCFP/AtomPair 指纹长度 (默认=1024)

    返回:
        DataFrame，原始数据 + 多种分子指纹特征
    """

    fps_data = []   # 存储所有分子的指纹特征字典

    # 1. 定义指纹生成器
    morgan_gen = rdFingerprintGenerator.GetMorganGenerator(
        radius=morgan_radius, fpSize=morgan_nbits,
        countSimulation=True, includeChirality=False
    )
    fcfp = rdFingerprintGenerator.GetMorganFeatureAtomInvGen()
    fcfp_gen = rdFingerprintGenerator.GetMorganGenerator(
        radius=morgan_radius, fpSize=morgan_nbits,
        atomInvariantsGenerator=fcfp, countSimulation=True, includeChirality=False
    )
    atom_gen = rdFingerprintGenerator.GetAtomPairGenerator(
        fpSize=morgan_nbits, countSimulation=True, includeChirality=False
    )

    # 2. 遍历分子，提取指纹
    total = len(df)
    for idx, smi in enumerate(df[SMILES_col]):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            fps_data.append({})
            print(f"⚠ 无效 SMILES: {smi}")
            continue

        feature_row = {}

        # 2.1 Morgan 指纹 (ECFP)
        morgan_fp = morgan_gen.GetFingerprint(mol)
        for i in range(morgan_nbits):
            feature_row[f"Morgan_{i}"] = morgan_fp[i]

        # 2.2 功能类 Morgan (FCFP)
        fcfp_fp = fcfp_gen.GetFingerprint(mol)
        for i in range(morgan_nbits):
            feature_row[f"FCFP_{i}"] = fcfp_fp[i]

        # 2.3 MACCS Keys (固定 167 位)
        maccs_fp = MACCSkeys.GenMACCSKeys(mol)
        for i in range(len(maccs_fp)):
            feature_row[f"MACCS_{i}"] = int(maccs_fp[i])

        # 2.4 AtomPair 指纹
        atompair_fp = atom_gen.GetCountFingerprint(mol)
        for i in range(morgan_nbits):
            feature_row[f"AtomPair_{i}"] = atompair_fp[i]

        # 2.5 RDKit 内置指纹
        rdkit_fp = RDKFingerprint(mol)
        for i in range(len(rdkit_fp)):
            feature_row[f"RDKIT_{i}"] = int(rdkit_fp[i])

        # 2.6 Avalon 指纹 (若可用)
        if avalon_available:
            avalon_fp = pyAvalonTools.GetAvalonFP(mol, morgan_nbits)
            for i in range(len(avalon_fp)):
                feature_row[f"Avalon_{i}"] = int(avalon_fp[i])

        fps_data.append(feature_row)
        print(f"🔄 指纹提取进度: {idx+1:5d}/{total:5d}", end="\r", flush=True)
    print("\n✅ 分子指纹计算完成")

    # 3. 合并结果并返回
    fps_df = pd.DataFrame(fps_data)
    return pd.concat([df, fps_df], axis=1)






# # ============ 应用函数 ============

# show_df_info(descriptors_train_df, "finger_train_df")
# show_df_info(descriptors_test_df, "finger_test_df")

# finger_train_df = extract_all_fingerprint(descriptors_train_df, "SMILES_normalized")
# finger_test_df  = extract_all_fingerprint(descriptors_test_df, "SMILES_normalized")

# show_df_info(finger_train_df, "finger_train_df")
# show_df_info(finger_test_df, "finger_test_df")


# # 保存结果
# finger_train_df_path = os.path.join(DIRS['DATA_DIR000'], "finger_train_df.csv")
# finger_test_df_path  = os.path.join(DIRS['DATA_DIR000'], "finger_test_df.csv")
# finger_train_df.to_csv(finger_train_df_path, index=False)
# finger_test_df.to_csv(finger_test_df_path, index=False)

# print(f"✅ finger_train_df 已保存到 {finger_train_df_path}")
# print(f"✅ finger_test_df 已保存到 {finger_test_df_path}")


# 数据分析

In [11]:
# 加载数据
def loaddata(DIRS):
    # 定义路径
    finger_train_df_path = os.path.join(DIRS['DATA_DIR000'], "finger_train_df.csv")
    finger_test_df_path  = os.path.join(DIRS['DATA_DIR000'], "finger_test_df.csv")
    # 读取数据
    train_df = pd.read_csv(finger_train_df_path)
    test_df  = pd.read_csv(finger_test_df_path)

    # 打印信息
    show_df_info(train_df, "train_df")
    show_df_info(test_df, "test_df")

    print("特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024")
    print("合计特征总数 = 6528")

    return  train_df, test_df



train_df, test_df =  loaddata(DIRS)

📊 train_df         shape: (20234, 6532)     列名: ['SMILES', 'Tm', 'origin_info', 'SMILES_normalized', 'MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SM

In [12]:
# 打印清单
def config_to_str(config: dict, indent: int = 0) -> str:
    """递归生成配置字符串"""
    prefix = "     " * indent
    lines = []
    for key, value in config.items():
        if isinstance(value, dict):
            lines.append(f"{prefix}🔹 {key}:")
            lines.append(config_to_str(value, indent + 1))  # 递归拼接子字典
        else:
            lines.append(f"{prefix}- {key:<20}: {value}")
    return "\n".join(lines)



In [13]:
# 实验配置单
config = {
    # 固定开关
    "ISTEST"            : True,

    "remove_dup_smiles" : False, 
    "use_feature_gen"   : False,
    "use_pca"           : False,
    "pca_components"    : 100,


    "study_save_name"    : study_save_name,



    # 特征选择 XGBoost 参数
    "xgb_selector_model_params": {
        "n_estimators"  : 500,
        "max_depth"     : 6,
        "learning_rate" : 0.05,
        "random_state"  : 2025,
        "device"        : "cuda",
        "objective"     : "reg:squarederror",
        "tree_method"   : "hist",
        "verbosity"     : 0
    },

    "selector_threshold"  : "mean",   


    # 训练设置
    "xgb_train_model_params": {
        'max_depth'         : 6,                     # 树的最大深度，控制模型复杂度（越大越易过拟合）
        'eta'               : 0.1,                   # 学习率（步长缩放），越小越稳健但训练轮数需增多
        'device'            : 'cuda',                # 计算设备：'cuda' 使用 GPU 加速训练
        'tree_method'       : 'hist',                # 使用基于 GPU 的直方图算法，速度快且节省内存
        'eval_metric'       : 'mae',                 # 模型评估指标：平均绝对误差（用于验证与早停）
        'booster'           : 'gbtree',              # 使用基于树的模型（常用：'gbtree' 或 'dart'）

        'subsample'         : 0.8,                   # 每棵树训练时随机采样 80% 的样本（防止过拟合）
        'colsample_bytree'  : 0.8,                   # 每棵树随机采样 80% 的特征（增加模型多样性）
        'min_child_weight'  : 1,                     # 叶节点最小样本权重和（较大值可防止过拟合）
        'lambda'            : 1.0,                   # L2 正则化系数（防止权重过大、提升泛化）
        'objective'         : 'reg:squarederror',    # 训练目标：最小化平方误差（标准回归任务）
    },







    "num_boost_round": 15000,
}

In [14]:
# 数据拆分 (特征矩阵 与 目标向量)
# ============================================
# 特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
# 合计特征总数 = 6528

import numpy as np
import pandas as pd

def prepare_features_and_target(train_df: pd.DataFrame, test_df: pd.DataFrame, config: dict):
    """
    数据拆分函数：构造训练集和测试集的特征矩阵与目标向量
    """


    # 1. 检查并处理重复 SMILES
    if config["remove_dup_smiles"]:

        dup_smiles = set(train_df['SMILES_normalized']) & set(test_df['SMILES_normalized'])
        print(f"⚠️ 检测到 {len(dup_smiles)} 个重复 SMILES_normalized")

        before_shape = train_df.shape
        # 删除训练集中出现在测试集的 SMILES，避免数据泄漏
        train_df = train_df[~train_df['SMILES_normalized'].isin(test_df['SMILES_normalized'])].reset_index(drop=True)
        after_shape = train_df.shape

        print(f"✅ 删除完成: 从 {before_shape} → {after_shape}")


    target_train = pd.DataFrame()  # 训练集目标向量初始化

    # 2. 构造特征矩阵和目标向量
    features_train = train_df.drop(columns=['SMILES', 'Tm', 'origin_info', 'SMILES_normalized'])   # 训练集特征 (X)
    target_train['Tm']    = train_df['Tm']                            # 训练集目标 (y, 熔点)
    features_test  = test_df.drop(columns=['SMILES', 'id', 'origin_info', 'SMILES_normalized'])    # 测试集特征 (无 Tm)


    # 随机选取部分特征（示例：50 个）
    if config["ISTEST"]:
        np.random.seed(42)
        selected_features = np.random.choice(
            train_df.drop(columns=['SMILES', 'Tm', 'origin_info', 'SMILES_normalized']).columns,
            size=110,
            replace=False
        )
        sample_len = 200
        features_train = train_df.iloc[:sample_len][selected_features]   # 训练特征 (前 1000 条)
        target_train  = train_df.iloc[:sample_len][['Tm']]               # 训练目标
        features_test = test_df[selected_features]          # 测试特征 (同样的特征列)




    # 3. 打印维度信息
    print("📊 数据拆分完成")    
    show_df_info(features_train, "features_train")
    show_df_info(target_train, "target_train")
    show_df_info(features_test, "features_test")

    return features_train, target_train, features_test

features_train, target_train, features_test = prepare_features_and_target(train_df, test_df, config)


📊 数据拆分完成
📊 features_train   shape: (200, 110)        列名: ['RDKIT_2008', 'RDKIT_375', 'Morgan_117', 'AtomPair_847', 'FCFP_129', 'AtomPair_700', 'RDKIT_195', 'AtomPair_1009', 'AtomPair_486', 'AtomPair_30', 'Morgan_184', 'AtomPair_94', 'FCFP_421', 'RDKIT_1144', 'AtomPair_926', 'Avalon_846', 'AtomPair_1022', 'AtomPair_687', 'RDKIT_166', 'FCFP_453', 'RDKIT_1561', 'Avalon_527', 'RDKIT_1697', 'Morgan_479', 'RDKIT_1218', 'Morgan_13', 'Avalon_809', 'MACCS_51', 'AtomPair_67', 'FCFP_171', 'Avalon_766', 'FCFP_883', 'FCFP_638', 'RDKIT_111', 'Morgan_107', 'AtomPair_970', 'Morgan_252', 'FCFP_603', 'Avalon_683', 'FCFP_431', 'Avalon_524', 'RDKIT_912', 'RDKIT_311', 'RDKIT_998', 'AtomPair_388', 'AtomPair_493', 'Avalon_250', 'FCFP_876', 'AtomPair_383', 'FCFP_31', 'RDKIT_1498', 'Morgan_16', 'Avalon_172', 'Morgan_394', 'RDKIT_1866', 'RDKIT_918', 'Morgan_30', 'FCFP_169', 'RDKIT_699', 'Avalon_162', 'FCFP_113', 'FCFP_640', 'FCFP_89', 'FCFP_480', 'MACCS_147', 'RDKIT_1817', 'RDKIT_1026', 'FCFP_790', 'Morgan_176'

### 特征生成

In [15]:
import pandas as pd

def add_chemical_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    基于分子描述符构造新的衍生特征
    输入:
        df : pd.DataFrame，必须包含以下列：
            ['NumHDonors', 'NumHAcceptors', 'MolLogP', 'TPSA',
            'NumRotatableBonds', 'MolWt', 'NumAromaticRings', 'BertzCT']
    输出:
        df_new : pd.DataFrame，包含新增特征
    """
    df = df.copy()

    df['HBond_Product']        = df['NumHDonors'] * df['NumHAcceptors']
    df['HBond_Sum']            = df['NumHDonors'] + df['NumHAcceptors']
    df['LogP_div_TPSA']        = df['MolLogP'] / (df['TPSA'] + 1)
    df['LogP_x_TPSA']          = df['MolLogP'] * df['TPSA']
    df['Flexibility_Score']    = df['NumRotatableBonds'] / (df['MolWt'] + 1)
    df['MolWt_x_AromaticRings']= df['MolWt'] * df['NumAromaticRings']
    df['Complexity_per_MW']    = df['BertzCT'] / (df['MolWt'] + 1)
    df['Rigidity_Score']       = df['NumAromaticRings'] / (df['NumRotatableBonds'] + 1)

    return df


if config["use_feature_gen"]:
    features_train = add_chemical_features(features_train)
    features_test  = add_chemical_features(features_test)

    
    show_df_info(features_train, "features_train")
    show_df_info(target_train, "target_train")

### PCA降维

In [16]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy import sparse

def apply_truncated_svd(df: pd.DataFrame, n_components: int = 100, random_state: int = 42):
    """
    使用 TruncatedSVD 对 DataFrame 进行降维
    输入:
        df           : pd.DataFrame，特征矩阵（需去掉 ID / label 等非特征列）
        n_components : int，降维后的目标维度
        random_state : int，随机种子
    输出:
        reduced_df   : pd.DataFrame，降维后的结果，保持原行索引
    """
    # 转换为稀疏矩阵
    X_sparse = sparse.csr_matrix(df.values)

    # 初始化 SVD
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)

    # 训练并降维
    X_reduced_array = svd.fit_transform(X_sparse)

    # 包装为 DataFrame
    reduced_df = pd.DataFrame(
        X_reduced_array,
        index=df.index,
        columns=[f"SVD_{i+1}" for i in range(X_reduced_array.shape[1])]
    )
    # 方差解释率
    explained_var = svd.explained_variance_ratio_.sum()

    # 打印信息
    print( "原始维度         : ", df.shape)
    print( "降维后           : ", reduced_df.shape) 
    print(f"累计解释方差比   :  {explained_var:.2%}")

    return reduced_df


# 对数据降维
if config["use_pca"]:
    features_train_reduced = apply_truncated_svd(features_train, n_components = 100)
    features_test_reduced = apply_truncated_svd(features_test, n_components = 100)

    features_train = pd.concat([features_train, features_train_reduced], axis=1)
    features_test = pd.concat([features_test, features_test_reduced], axis=1)

    
    show_df_info(features_train, "features_train")
    show_df_info(target_train, "target_train")
    show_df_info(features_test, "features_test")


# 单次训练推导

In [17]:
# Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果
# ==============================================================
def run_kfold_xgb(features_train, target_train, features_test, config, DIRS, K_FOLDS=10, verbose=0):
    """
    使用 Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果

    参数:
        features_train, target_train        : 训练集特征和标签
        features_test      : 测试集特征
        params      : XGBoost 最优参数 (dict)
        DIRS        : 保存结果的目录字典
        K_FOLDS     : 折数 (默认=5)
        verbose     : 是否打印详细信息
    """

    
        
    config["X shape"] = features_train.shape
    config["y shape"] = target_train.shape
    config["X_test shape"] = features_test.shape


    # ---------- 创建目录 ----------
    for _, path in DIRS.items():
        os.makedirs(path, exist_ok=True)


    time_str = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    history_DIR = os.path.join(DIRS['HISTORY'], time_str)
    os.makedirs(history_DIR, exist_ok=True)



    print("——" * 20)
    print(f"✅ 当前结果将保存到: {time_str}")


    # ---------- 定义交叉验证 ----------
    skfold = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    yeo = PowerTransformer(method="yeo-johnson")                                # 定义 Yeo-Johnson 变换

    # ---------- 初始化存储 ----------
    oof_val = np.zeros(len(features_train))       # OOF 预测
    train_score, val_score = [], []  # 每折 MAE
    test_pred = []                   # 每折 test 预测
    fold_records = []                # 保存每折信息
    all_importances = []             # 特征重要性
    elapsed_list = []                # 耗时记录



    # 循环每一折
    # ==============================================================

    for i, (train_idx, val_idx) in enumerate(skfold.split(features_train, pd.qcut(target_train, q=10).cat.codes), 1):

        # ----- 打印时间信息 -----
        start_now = datetime.now()
        start_str = start_now.strftime("%H:%M:%S")

        if elapsed_list:
            avg_time = np.mean(elapsed_list)
            est_end = start_now + timedelta(seconds=avg_time*(K_FOLDS - i + 1))

            # 每 5 个一组输出耗时
            parts = [f"{t:6.1f}s" for t in elapsed_list]
            grouped = [" ".join(parts[j:j+5]) for j in range(0, len(parts), 5)]
            elapsed_str = " /// ".join(grouped)

            print(
                f"🔄{i:2d}/{K_FOLDS} 开始 {start_str}"
                f" 结束 {est_end.strftime('%H:%M:%S')}"
                f" avg {avg_time:.1f}s"
                f" [{elapsed_str}]",
                end="\r", flush=True
            )
        else:
            print(f"🔄{i:2d}/{K_FOLDS} 开始 {start_str} 结束 (暂无历史数据)", end="\r", flush=True)



        # ----- 开始训练 -----
        t0 = time.time()

        # 1. 数据集划分
        x_train, x_val = features_train.iloc[train_idx], features_train.iloc[val_idx]
        y_train, y_val = target_train[train_idx], target_train[val_idx]

        # 2. Yeo-Johnson 变换
        y_train = yeo.fit_transform(y_train.values.reshape(-1, 1)).squeeze()
        y_val   = yeo.transform(y_val.values.reshape(-1, 1)).squeeze()


        # 3. 特征选择（轻量级 XGBoost）
        # 使用
        selector_model = xgb.XGBRegressor(**config["xgb_selector_model_params"])
        # selector_model = xgb.XGBRegressor(
        #     n_estimators   = 500,
        #     max_depth      = 6,
        #     learning_rate  = 0.05,
        #     random_state   = 2025,
        #     device         = "cpu",
        #     objective      = "reg:absoluteerror",
        #     tree_method    = "hist",
        #     verbosity      = 0
        # )
        
        

        selector_model.fit(x_train, y_train)

        selector = SelectFromModel(selector_model, prefit=True, threshold=config["selector_threshold"])
        selected_features = x_train.columns[selector.get_support()].tolist()
        if verbose > 0:
            print(f"✅ 选择的特征数量: {len(selected_features)}")


        # 4. 保留重要特征
        x_train_new = x_train[selected_features]
        x_val_new   = x_val[selected_features]
        x_test_new  = features_test[selected_features]

        # 5. 转换为 DMatrix
        dtrain = xgb.DMatrix(x_train_new, y_train, feature_names=selected_features)
        dval   = xgb.DMatrix(x_val_new,   y_val,   feature_names=selected_features)
        dtest  = xgb.DMatrix(x_test_new,             feature_names=selected_features)


        # 6. XGBoost 训练
        xgb_model = xgb.train(
            params                 = config["xgb_train_model_params"],
            dtrain                 = dtrain,
            num_boost_round        = config["num_boost_round"],
            evals                  = [(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds  = 300,
            verbose_eval           = (1000 if verbose > 0 else False)
        )


        # # 保存模型
        # model_path = os.path.join(history_DIR, f"xgb_model_fold{i}.json")
        # xgb_model.save_model(model_path)

        # 7. 获取特征重要性
        imp_dict = xgb_model.get_score(importance_type="gain")
        imp_df = pd.DataFrame(imp_dict.items(), columns=["Feature", "Importance"])
        imp_df["Fold"] = i
        all_importances.append(imp_df)


        # 8. 预测
        y_train_pred = xgb_model.predict(dtrain)
        y_val_pred   = xgb_model.predict(dval)
        y_test_pred  = xgb_model.predict(dtest)

        # 9. 逆变换
        y_train      = yeo.inverse_transform(y_train.reshape(-1, 1)).squeeze()
        y_val        = yeo.inverse_transform(y_val.reshape(-1, 1)).squeeze()
        y_train_pred = yeo.inverse_transform(y_train_pred.reshape(-1, 1)).squeeze()
        y_val_pred   = yeo.inverse_transform(y_val_pred.reshape(-1, 1)).squeeze()
        y_test_pred  = yeo.inverse_transform(y_test_pred.reshape(-1, 1)).squeeze()

        # 10. 计算 MAE
        train_mae = mean_absolute_error(y_train, y_train_pred)
        val_mae   = mean_absolute_error(y_val,   y_val_pred)
        if verbose > 0:
            print(f"Fold {i}: Train MAE={train_mae:.4f}, Val MAE={val_mae:.4f}，用时 {elapsed:.2f} 秒")


        # ----- 保存结果 -----
        train_score.append(train_mae)
        val_score.append(val_mae)
        oof_val[val_idx] = y_val_pred
        test_pred.append(y_test_pred)

        elapsed = time.time() - t0
        elapsed_list.append(elapsed)

        fold_records.append({
            "Fold": i,
            "Train_MAE": train_mae,
            "Val_MAE": val_mae,
            "Num_Features": len(selected_features),
            "Selected_Features": selected_features,
            "elapsed": elapsed
        })

    # 保存整体结果
    # ==============================================================
    if verbose > 0:
        print("\n")
        print(f"📊 Train MAE 平均值 : {np.mean(train_score):.4f}")
        print(f"📊 Val   MAE 平均值 : {np.mean(val_score):.4f}")
        print(f"📊 Train MAE 标准差 : {np.std(train_score, ddof=0):.4f}")
        print(f"📊 Val   MAE 标准差 : {np.std(val_score, ddof=0):.4f}")

    # 参数
    with open(os.path.join(history_DIR, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4, ensure_ascii=False)

    # 每折信息
    folds_df = pd.DataFrame(fold_records)
    folds_df.to_csv(os.path.join(history_DIR, "folds_info.csv"), index=False, encoding="utf-8-sig")


    # 特征重要性
    if all_importances:
        valid_imps = [df for df in all_importances if not df.empty]
        all_imp_df = pd.concat(valid_imps, axis=0) if valid_imps else pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    else:
        all_imp_df = pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    all_imp_df.to_csv(os.path.join(history_DIR, "feature_importance_all.csv"), index=False, encoding="utf-8-sig")


    # 测试集预测
    test_pred_array = np.vstack(test_pred).T
    test_pred_df = pd.DataFrame(test_pred_array, columns=[f"Fold_{j+1}" for j in range(test_pred_array.shape[1])])
    test_pred_df["Final_Pred"] = test_pred_df.mean(axis=1)
    test_pred_df.to_csv(os.path.join(history_DIR, "test_predictions.csv"), index=False, encoding="utf-8-sig")

    # 总结
    with open(os.path.join(history_DIR, "summary.txt"), "w", encoding="utf-8") as f:
        f.write(f"Train MAE Mean : {np.mean(train_score):.4f}\n")
        f.write(f"Val   MAE Mean : {np.mean(val_score):.4f}\n")
        f.write(f"Train MAE Std  : {np.std(train_score, ddof=0):.4f}\n")
        f.write(f"Val   MAE Std  : {np.std(val_score, ddof=0):.4f}\n")


    # 最终提交
    final_score = np.mean(val_score)
    submission = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "sample_submission.csv"))
    submission["Tm"] = test_pred_df["Final_Pred"]

    submission_path = os.path.join(history_DIR, f"sub_{time_str}_{final_score:.8f}.csv")
    submission.to_csv(submission_path, index=False)
    submission.to_csv(os.path.join(DIRS['SUBMISSION'], f"sub_{time_str}_{final_score:.8f}.csv"), index=False)

        
    config["time_str"] = time_str
    config["score"] = final_score


    # ---------- 返回结果 ----------
    return {
        "oof_val": oof_val,
        "train_score": train_score,
        "val_score": val_score,
        "test_pred": test_pred_df,
        "folds_info": folds_df,
        "feature_importance": all_imp_df,
        "submission_path": submission_path,
        "time": time_str,
        "final_score": final_score,
        "config": config
    }


In [18]:
# 执行一次

X = features_train
y = target_train
X_test = features_test
    
show_df_info(X, "X")
show_df_info(y, "y")
show_df_info(X_test, "X_test")


results = run_kfold_xgb(X, y['Tm'], X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
config = results['config']

print('\n',results['final_score'])

📊 X                shape: (200, 110)        列名: ['RDKIT_2008', 'RDKIT_375', 'Morgan_117', 'AtomPair_847', 'FCFP_129', 'AtomPair_700', 'RDKIT_195', 'AtomPair_1009', 'AtomPair_486', 'AtomPair_30', 'Morgan_184', 'AtomPair_94', 'FCFP_421', 'RDKIT_1144', 'AtomPair_926', 'Avalon_846', 'AtomPair_1022', 'AtomPair_687', 'RDKIT_166', 'FCFP_453', 'RDKIT_1561', 'Avalon_527', 'RDKIT_1697', 'Morgan_479', 'RDKIT_1218', 'Morgan_13', 'Avalon_809', 'MACCS_51', 'AtomPair_67', 'FCFP_171', 'Avalon_766', 'FCFP_883', 'FCFP_638', 'RDKIT_111', 'Morgan_107', 'AtomPair_970', 'Morgan_252', 'FCFP_603', 'Avalon_683', 'FCFP_431', 'Avalon_524', 'RDKIT_912', 'RDKIT_311', 'RDKIT_998', 'AtomPair_388', 'AtomPair_493', 'Avalon_250', 'FCFP_876', 'AtomPair_383', 'FCFP_31', 'RDKIT_1498', 'Morgan_16', 'Avalon_172', 'Morgan_394', 'RDKIT_1866', 'RDKIT_918', 'Morgan_30', 'FCFP_169', 'RDKIT_699', 'Avalon_162', 'FCFP_113', 'FCFP_640', 'FCFP_89', 'FCFP_480', 'MACCS_147', 'RDKIT_1817', 'RDKIT_1026', 'FCFP_790', 'Morgan_176', 'RDKIT_

In [19]:
# 打印当前config
print(config_to_str(config))

- ISTEST              : True
- remove_dup_smiles   : False
- use_feature_gen     : False
- use_pca             : False
- pca_components      : 100
- study_save_name     : XGBoost_Optuna_pure_data
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cuda
     - objective           : reg:squarederror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : mean
🔹 xgb_train_model_params:
     - max_depth           : 6
     - eta                 : 0.1
     - device              : cuda
     - tree_method         : hist
     - eval_metric         : mae
     - booster             : gbtree
     - subsample           : 0.8
     - colsample_bytree    : 0.8
     - min_child_weight    : 1
     - lambda              : 1.0
     - objective           : reg:squarederror
- num_boost_round     : 15000
- X shape             : (200, 110)
-

# 提交 kaggle 平台测试

In [20]:
# 根据 submission_time 定位文件路径 提交 kaggle 平台测试

import os
import itertools
import time
from kaggle.api.kaggle_api_extended import KaggleApi


def find_submission_file(submission_time, submission_dir):
    """
    在 submission_dir 下查找包含 submission_time 的文件
    一旦找到立刻返回完整路径；如果没找到则返回 None
    """
    for fname in os.listdir(submission_dir):
        if submission_time in fname:
            file_path = os.path.join(submission_dir, fname)
            print(f"✅ 找到目标文件: {fname}")
            return file_path
    
    print(f"⚠️ 未找到包含 {submission_time} 的文件")
    return None

def submit_and_get_score(file_path, competition_name, message="My submission"):
    """
    封装 Kaggle 提交并等待结果评分
    --------------------------------------
    file_path        : str  提交文件路径
    competition_name : str  Kaggle 比赛名称 (URL 最后一段)
    message          : str  提交备注
    """
    # 1. 配置 Kaggle API
    os.environ["KAGGLE_CONFIG_DIR"] = r"C:\Users\Admin\.kaggle"
    api = KaggleApi()
    api.authenticate()
    print("✅ Kaggle API 已经配置成功！")

    # 2. 提交文件
    api.competition_submit(
        file_name=file_path,
        competition=competition_name,
        message=message
    )
    print("✅ 提交完成！请等待评分...")

    # 3. 动态等待
    spinner = itertools.cycle(["|", "/", "-", "\\"])
    while True:
        submissions = api.competition_submissions(competition_name)
        latest = submissions[0]
        status_str = str(latest._status).lower()

        if "complete" in status_str and latest._public_score is not None:
            print("\n🎯 最终结果:")
            print(f"Public 分数 : {latest._public_score}")
            print(f"Private 分数: {latest._private_score}")
            print(f"提交 ID     : {latest._ref}")
            print(f"文件名      : {latest._file_name}")
            print(f"状态        : {latest._status}")
            print(f"提交时间    : {latest._date}")
            print(f"描述/备注   : {latest._description}")
            return latest

        spin_char = next(spinner)
        print(f"当前状态: {status_str} , 等待中 {spin_char}", end="\r", flush=True)
        time.sleep(0.2)  # 每 0.5 秒检查一次


### 不轻易运行，再三考虑

In [21]:
# submission_time 提交
submission_time = "2025-10-21 19-59-49"
competition_name = kaggle_competition_name
message =  f"该提交文件的参数：\n{config_to_str(config)} "



target_file = find_submission_file(submission_time, DIRS['SUBMISSION'] )

# submit_and_get_score(target_file, competition_name, message)

⚠️ 未找到包含 2025-10-21 19-59-49 的文件


# 参数优化

In [22]:
# 实验配置单
base_config = {
    # 固定开关
    "ISTEST"            : False,

    "remove_dup_smiles" : False, 
    "use_feature_gen"   : False,
    "use_pca"           : False,
    "pca_components"    : 100,


    "study_save_name"    : study_save_name,

    # 特征选择 XGBoost 参数
    "xgb_selector_model_params": {
        "n_estimators"  : 500,
        "max_depth"     : 6,
        "learning_rate" : 0.05,
        "random_state"  : 2025,
        "device"        : "cuda",
        "objective"     : "reg:squarederror",
        "tree_method"   : "hist",
        "verbosity"     : 0
    },

    "selector_threshold"  : "mean",   


    # 训练设置
    "xgb_train_model_params": {
        'max_depth'         : 6,                     # 树的最大深度，控制模型复杂度（越大越易过拟合）
        'eta'               : 0.1,                   # 学习率（步长缩放），越小越稳健但训练轮数需增多
        'device'            : 'cuda',                # 计算设备：'cuda' 使用 GPU 加速训练
        'tree_method'       : 'hist',                # 使用基于 GPU 的直方图算法，速度快且节省内存
        'eval_metric'       : 'mae',                 # 模型评估指标：平均绝对误差（用于验证与早停）
        'booster'           : 'gbtree',              # 使用基于树的模型（常用：'gbtree' 或 'dart'）

        'subsample'         : 0.8,                   # 每棵树训练时随机采样 80% 的样本（防止过拟合）
        'colsample_bytree'  : 0.8,                   # 每棵树随机采样 80% 的特征（增加模型多样性）
        'min_child_weight'  : 1,                     # 叶节点最小样本权重和（较大值可防止过拟合）
        'lambda'            : 1.0,                   # L2 正则化系数（防止权重过大、提升泛化）
        'objective'         : 'reg:squarederror',    # 训练目标：最小化平方误差（标准回归任务）
    },



    "num_boost_round": 15000,
}

In [23]:
# 定义优化任务  加入标识符 host: hao-2   ip: 192.168.40.1

import copy
import contextlib
import io

def objective(trial):
    """
    Optuna 的目标函数 (Objective Function)
    每次 trial 会生成一组超参数，用于训练 XGBoost 模型，
    并返回交叉验证的平均 RMSE 作为优化目标。
    """


    # 1. 定义 超参数 搜索空间
    # 拷贝一份 config，避免全局污染
    config = copy.deepcopy(base_config)

    # 只修改需要优化的参数
    config["remove_dup_smiles"] = trial.suggest_categorical("remove_dup_smiles", [True, False])
    config["use_feature_gen"]   = trial.suggest_categorical("use_feature_gen", [True, False])
    config["use_pca"]           = trial.suggest_categorical("use_pca", [True, False])

    # config["xgb_selector_model_params"]["device"]       = trial.suggest_categorical("selector_device", ["cpu", "cuda"])
    config["xgb_selector_model_params"]["n_estimators"] = trial.suggest_categorical("selector_n_estimators", [100, 300, 500, 700, 900, 1000])


    config["selector_threshold"] = trial.suggest_categorical("selector_threshold", [ "0*mean", "0.25*mean", "0.5*mean", "0.75*mean", "mean"])


    # Optuna 超参数搜索空间
    config["xgb_train_model_params"]["max_depth"]         = trial.suggest_int(  "train_max_depth",     3, 30)         # 树的最大深度
    config["xgb_train_model_params"]["eta"]               = trial.suggest_float("train_eta",           0.01, 0.3, log=True)  # 学习率
    
    config["xgb_train_model_params"]["subsample"]         = trial.suggest_float("subsample",           0.5, 1.0)      # 样本采样比例
    config["xgb_train_model_params"]["colsample_bytree"]  = trial.suggest_float("colsample_bytree",    0.5, 1.0)      # 特征采样比例
    config["xgb_train_model_params"]["min_child_weight"]  = trial.suggest_int(  "min_child_weight",    1, 10)         # 叶节点最小样本权重
    config["xgb_train_model_params"]["lambda"]            = trial.suggest_float("lambda",              1e-3, 10.0, log=True)  # L2 正则化



    # 主流程---------------------------------------------------------------------------------------------------
    # 创建一个黑洞缓冲区
    f = io.StringIO()
    with contextlib.redirect_stdout(f):
        None

        # 打印当前config
        print(config_to_str(config))
        

        # 加载数据
        train_df, test_df =  loaddata(DIRS)

        # 数据拆分
        print("数据拆分---------------------------")
        features_train, target_train, features_test = prepare_features_and_target(train_df, test_df, config)

        # 特征生成
        if config["use_feature_gen"]:
            print("特征生成---------------------------")
            features_train = add_chemical_features(features_train)
            features_test  = add_chemical_features(features_test)
            show_df_info(features_train, "features_train")
            show_df_info(target_train, "target_train")
            

        # 数据降维
        if config["use_pca"]:
            print("数据降维---------------------------")
            features_train_reduced = apply_truncated_svd(features_train, n_components = 100)
            features_test_reduced = apply_truncated_svd(features_test, n_components = 100)

            features_train = pd.concat([features_train, features_train_reduced], axis=1)
            features_test = pd.concat([features_test, features_test_reduced], axis=1)

            
            show_df_info(features_train, "features_train")
            show_df_info(target_train, "target_train")
            show_df_info(features_test, "features_test")



        X = features_train
        y = target_train
        X_test = features_test
            
        show_df_info(X, "X")
        show_df_info(y, "y")
        show_df_info(X_test, "X_test")

        print("开始训练---------------------------")





    results = run_kfold_xgb(X, y['Tm'], X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
    config = results['config']
    score = results['final_score']



    HOSTNAME = socket.gethostname()
    HOST_IP = socket.gethostbyname(HOSTNAME)
    trial.set_user_attr("host", HOSTNAME)        # 你自己定义主机 A/B
    trial.set_user_attr("ip", HOST_IP)        # 你自己定义角色 A/B

    # 4. 返回平均 MAE
    return score

In [None]:
# 开始优化

# 1. 定义 SQLite 数据库存储路径


storage_url = f"mysql+pymysql://{user}:{password}@{host}:3306/{database_name}"

STUDY_NAME = F"test_{study_save_name}" if base_config["ISTEST"] else study_save_name

study = optuna.create_study(
    study_name = STUDY_NAME,
    # study_name="ghsdjsrtjrswtjhwrt",
    storage=storage_url,
    load_if_exists=True
)

# 自动获取当前主机名\当前主机的 IP 地址
HOSTNAME = socket.gethostname()
HOST_IP = socket.gethostbyname(HOSTNAME)
print("主机名:", HOSTNAME," 主机 IP:", HOST_IP)
time.sleep(1)

# 5. 启动超参数搜索
print("🔎 开始超参数搜索...")
if base_config["ISTEST"]:
    study.optimize(objective, n_trials = 5)
else:
    study.optimize(objective, n_trials = 100)


# 6. 打印最优结果
print("\n✅ 训练完成！")
print(f"📊 已完成试验次数 : {len(study.trials)}")
print(f"🏆 最优试验编号   : {study.best_trial.number}")
print(f"📉 最优 MAE       : {study.best_value}")
print(f"⚙️ 最优参数组合   : {study.best_trial.params}")


[I 2025-10-30 13:53:26,751] Using an existing study with name 'XGBoost_Optuna_pure_data' instead of creating a new one.


主机名: hao-2  主机 IP: 192.168.40.1
🔎 开始超参数搜索...
————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-30 13-53-34
🔄10/10 开始 14:58:27 结束 15:05:40 avg 432.6s [ 295.3s  413.6s  425.5s  468.2s  437.1s ///  507.0s  364.6s  496.9s  484.8s]

[I 2025-10-30 15:04:18,714] Trial 38 finished with value: 25.549116408375987 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 7, 'train_eta': 0.012207555741170089, 'subsample': 0.8498420323708237, 'colsample_bytree': 0.8827464208443675, 'min_child_weight': 2, 'lambda': 0.08028674502258853}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-30 15-04-27
🔄10/10 开始 17:45:33 结束 18:03:27 avg 1074.0s [1099.5s 1096.6s 1054.9s 1079.9s 1050.1s /// 1068.9s 1038.9s 1116.6s 1060.7s]

[I 2025-10-30 18:02:17,747] Trial 39 finished with value: 28.10137875580181 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 30, 'train_eta': 0.012542133399786652, 'subsample': 0.8491141091671633, 'colsample_bytree': 0.9072177317353283, 'min_child_weight': 1, 'lambda': 0.06600887209445436}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-30 18-02-26
🔄10/10 开始 19:30:51 结束 19:40:41 avg 589.5s [ 640.4s  591.1s  609.7s  549.4s  495.6s ///  579.9s  621.2s  607.4s  610.7s]

[I 2025-10-30 19:41:30,070] Trial 40 finished with value: 28.161340417017147 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 24, 'train_eta': 0.01663559247813382, 'subsample': 0.8945545165119386, 'colsample_bytree': 0.9599256633267998, 'min_child_weight': 2, 'lambda': 0.10795614027739725}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-30 19-41-46
🔄10/10 开始 20:48:40 结束 20:56:07 avg 446.1s [ 301.4s  456.2s  530.2s  490.0s  411.0s ///  517.5s  371.4s  432.6s  504.5s]

[I 2025-10-30 20:54:52,217] Trial 41 finished with value: 26.203363516952514 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': True, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 8, 'train_eta': 0.011671246333806883, 'subsample': 0.9666823706483857, 'colsample_bytree': 0.882870640345817, 'min_child_weight': 2, 'lambda': 0.016247155281097735}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-30 20-55-01
🔄10/10 开始 21:47:33 结束 21:53:23 avg 350.2s [ 316.3s  375.5s  271.3s  435.6s  465.8s ///  379.4s  352.8s  327.1s  228.4s]

[I 2025-10-30 21:55:10,735] Trial 42 finished with value: 25.636057467866483 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 7, 'train_eta': 0.0143321819367958, 'subsample': 0.765051093699161, 'colsample_bytree': 0.8183851761432475, 'min_child_weight': 6, 'lambda': 0.401992117690359}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-30 21-55-19
🔄10/10 开始 22:52:45 结束 22:59:08 avg 382.9s [ 281.4s  417.7s  279.1s  397.2s  387.2s ///  386.7s  383.2s  397.8s  515.4s]

[I 2025-10-30 22:59:24,618] Trial 43 finished with value: 25.595953601107745 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 7, 'train_eta': 0.014133178068391247, 'subsample': 0.7997523978741268, 'colsample_bytree': 0.8081269927858359, 'min_child_weight': 3, 'lambda': 0.38174637036971304}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-30 22-59-34
🔄10/10 开始 00:08:30 结束 00:16:09 avg 459.5s [ 334.3s  400.9s  471.3s  404.8s  519.0s ///  527.3s  371.1s  505.0s  602.2s]

[I 2025-10-31 00:15:53,347] Trial 44 finished with value: 25.877826318745882 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 5, 'train_eta': 0.013595082159048692, 'subsample': 0.7890015798226265, 'colsample_bytree': 0.7966138887811003, 'min_child_weight': 1, 'lambda': 0.3434640231859275}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 00-16-02
🔄10/10 开始 01:15:05 结束 01:21:38 avg 393.6s [ 374.4s  391.5s  359.6s  497.6s  405.2s ///  406.7s  348.4s  388.7s  370.1s]

[I 2025-10-31 01:20:55,995] Trial 45 finished with value: 26.079689462583808 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 10, 'train_eta': 0.010133927144404862, 'subsample': 0.8803747103609486, 'colsample_bytree': 0.8169625125054324, 'min_child_weight': 3, 'lambda': 0.15179209713329578}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 01-21-04
🔄10/10 开始 02:38:32 结束 02:47:08 avg 516.4s [ 529.5s  509.6s  517.8s  514.5s  523.6s ///  514.4s  513.9s  512.5s  511.5s]

[I 2025-10-31 02:47:12,111] Trial 46 finished with value: 26.225760581544296 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 4, 'train_eta': 0.011745920554307716, 'subsample': 0.8330835888228978, 'colsample_bytree': 0.7710636178722733, 'min_child_weight': 2, 'lambda': 0.05576633844810634}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 02-47-20
🔄10/10 开始 03:00:04 结束 03:01:29 avg 84.9s [  98.7s   68.3s   88.5s   74.4s  115.7s ///   79.2s   69.0s   63.5s  106.4s]

[I 2025-10-31 03:01:38,017] Trial 47 finished with value: 26.447116485782136 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 9, 'train_eta': 0.06851741583822873, 'subsample': 0.7615090915602136, 'colsample_bytree': 0.8714169126465057, 'min_child_weight': 7, 'lambda': 0.2445735956456279}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 03-01-46
🔄10/10 开始 03:28:05 结束 03:31:00 avg 175.4s [ 139.5s  142.1s  149.6s  177.8s  207.8s ///  203.1s  172.8s  189.2s  197.3s]

[I 2025-10-31 03:30:55,726] Trial 48 finished with value: 26.947322975317626 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 900, 'selector_threshold': '0.25*mean', 'train_max_depth': 12, 'train_eta': 0.021454902025511903, 'subsample': 0.9155939811583268, 'colsample_bytree': 0.8393507842280963, 'min_child_weight': 2, 'lambda': 0.10778207524943302}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 03-31-11
🔄10/10 开始 04:12:54 结束 04:17:32 avg 278.1s [ 228.6s  257.1s  280.0s  286.6s  236.8s ///  325.5s  270.5s  280.4s  337.9s]

[I 2025-10-31 04:17:43,477] Trial 49 finished with value: 26.01800260267142 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': True, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 7, 'train_eta': 0.015594779974159508, 'subsample': 0.8046863105699409, 'colsample_bytree': 0.8264761293548999, 'min_child_weight': 3, 'lambda': 0.4670792794523051}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 04-17-52
🔄10/10 开始 05:25:09 结束 05:32:37 avg 448.5s [ 454.4s  349.9s  370.4s  406.3s  502.2s ///  496.5s  458.1s  504.0s  494.7s]

[I 2025-10-31 05:33:18,924] Trial 50 finished with value: 26.141891999832467 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 300, 'selector_threshold': '0*mean', 'train_max_depth': 4, 'train_eta': 0.019274399157875134, 'subsample': 0.7862623728509279, 'colsample_bytree': 0.7168387302047987, 'min_child_weight': 9, 'lambda': 0.00497050007358567}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 05-33-27
🔄10/10 开始 05:53:14 结束 05:55:26 avg 131.9s [ 106.0s  125.2s  166.1s  131.4s   91.1s ///  107.5s  134.4s  143.8s  181.5s]

[I 2025-10-31 05:55:19,176] Trial 51 finished with value: 26.04186861315311 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 1000, 'selector_threshold': 'mean', 'train_max_depth': 11, 'train_eta': 0.014158415511574674, 'subsample': 0.9483868603223934, 'colsample_bytree': 0.7722690912903211, 'min_child_weight': 10, 'lambda': 0.04417975047312301}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 05-55-28
🔄10/10 开始 06:38:08 结束 06:42:53 avg 284.5s [ 200.6s  384.3s  238.1s  263.5s  354.1s ///  307.6s  234.8s  253.3s  323.8s]

[I 2025-10-31 06:42:40,814] Trial 52 finished with value: 25.70282704192629 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 7, 'train_eta': 0.017783366729482726, 'subsample': 0.7422512120826874, 'colsample_bytree': 0.9288735186007289, 'min_child_weight': 5, 'lambda': 2.028490617469365}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 06-42-50
🔄10/10 开始 07:06:32 结束 07:09:11 avg 158.1s [ 150.2s  139.8s  171.1s  216.9s  150.5s ///  135.2s  131.2s  118.0s  209.9s]

[I 2025-10-31 07:08:47,588] Trial 53 finished with value: 25.71561751396382 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.25*mean', 'train_max_depth': 8, 'train_eta': 0.016931960539520362, 'subsample': 0.6861701998015376, 'colsample_bytree': 0.9984637174737623, 'min_child_weight': 6, 'lambda': 0.26295729730200074}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 07-08-56
🔄10/10 开始 08:09:02 结束 08:15:43 avg 400.6s [ 294.1s  479.2s  335.1s  459.5s  448.0s ///  428.0s  430.2s  312.7s  418.9s]

[I 2025-10-31 08:14:43,970] Trial 54 finished with value: 25.59850080022822 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 900, 'selector_threshold': '0*mean', 'train_max_depth': 7, 'train_eta': 0.011644973591505355, 'subsample': 0.649941376360532, 'colsample_bytree': 0.9037240574127389, 'min_child_weight': 4, 'lambda': 0.5036446305908805}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 08-14-53
🔄10/10 开始 09:18:03 结束 09:25:05 avg 421.2s [ 397.3s  420.1s  397.8s  512.7s  474.4s ///  348.4s  474.6s  333.3s  431.8s]

[I 2025-10-31 09:27:15,061] Trial 55 finished with value: 25.7012093230245 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 900, 'selector_threshold': '0*mean', 'train_max_depth': 6, 'train_eta': 0.011537340684609185, 'subsample': 0.6413610980890738, 'colsample_bytree': 0.913478290299395, 'min_child_weight': 4, 'lambda': 0.18036134789493524}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 09-27-24
🔄10/10 开始 09:35:58 结束 09:36:55 avg 57.1s [  58.9s   49.4s   48.2s   46.7s   73.9s ///   61.9s   56.1s   49.6s   69.5s]

[I 2025-10-31 09:36:55,343] Trial 56 finished with value: 28.641280030327096 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 900, 'selector_threshold': '0*mean', 'train_max_depth': 9, 'train_eta': 0.1651363412992976, 'subsample': 0.5448950021327601, 'colsample_bytree': 0.895516689537912, 'min_child_weight': 3, 'lambda': 0.01851880300943146}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 09-37-04
🔄10/10 开始 10:09:29 结束 10:13:05 avg 216.2s [ 178.7s  215.4s  183.6s  249.0s  248.3s ///  179.6s  249.9s  195.0s  245.8s]

[I 2025-10-31 10:13:39,860] Trial 57 finished with value: 25.818673908654535 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 900, 'selector_threshold': '0.5*mean', 'train_max_depth': 5, 'train_eta': 0.014627157712151881, 'subsample': 0.7724534640293977, 'colsample_bytree': 0.854568288790469, 'min_child_weight': 4, 'lambda': 0.0860944660957403}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 10-13-49
🔄10/10 开始 11:07:10 结束 11:13:05 avg 355.6s [ 261.4s  394.1s  352.5s  299.1s  410.9s ///  347.0s  357.6s  345.0s  433.2s]

[I 2025-10-31 11:13:03,356] Trial 58 finished with value: 26.2745815497705 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 300, 'selector_threshold': '0*mean', 'train_max_depth': 11, 'train_eta': 0.012271592490791187, 'subsample': 0.8020256927952796, 'colsample_bytree': 0.8094522832906024, 'min_child_weight': 1, 'lambda': 1.1753509134884985}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 11-13-13
🔄10/10 开始 12:38:13 结束 12:47:39 avg 566.7s [ 563.3s  533.2s  590.4s  572.5s  571.7s ///  575.0s  562.8s  566.7s  564.2s]

[I 2025-10-31 12:47:35,474] Trial 59 finished with value: 25.82397297536236 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0*mean', 'train_max_depth': 6, 'train_eta': 0.01011409840360613, 'subsample': 0.9914918402778365, 'colsample_bytree': 0.8310905663566275, 'min_child_weight': 2, 'lambda': 0.5025877921399508}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 12-47-51
🔄10/10 开始 13:00:12 结束 13:01:35 avg 82.3s [  65.7s   85.3s   83.6s   99.3s   88.5s ///   83.5s   86.9s   72.8s   75.5s]

[I 2025-10-31 13:01:20,522] Trial 60 finished with value: 26.342828355746256 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': True, 'selector_n_estimators': 900, 'selector_threshold': 'mean', 'train_max_depth': 9, 'train_eta': 0.02193262123088514, 'subsample': 0.5921760886457614, 'colsample_bytree': 0.8742317112061475, 'min_child_weight': 3, 'lambda': 0.13382087877211132}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 13-01-29
🔄10/10 开始 13:41:15 结束 13:45:40 avg 265.1s [ 288.0s  180.6s  255.5s  252.0s  360.3s ///  310.8s  226.4s  271.7s  240.4s]

[I 2025-10-31 13:45:16,167] Trial 61 finished with value: 26.823210797292017 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 100, 'selector_threshold': '0*mean', 'train_max_depth': 3, 'train_eta': 0.0561575294886214, 'subsample': 0.8367152849269772, 'colsample_bytree': 0.6646840471417124, 'min_child_weight': 4, 'lambda': 0.3251720581892867}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 13-45-24
🔄10/10 开始 14:26:43 结束 14:31:19 avg 275.4s [ 313.6s  227.4s  255.9s  247.3s  330.0s ///  215.3s  316.0s  230.8s  342.8s]

[I 2025-10-31 14:31:54,381] Trial 62 finished with value: 25.72942200508457 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': False, 'use_pca': False, 'selector_n_estimators': 1000, 'selector_threshold': '0*mean', 'train_max_depth': 8, 'train_eta': 0.01583004614355805, 'subsample': 0.6997623261395044, 'colsample_bytree': 0.950535136800002, 'min_child_weight': 5, 'lambda': 0.5760943285953898}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 14-32-03
🔄10/10 开始 14:56:07 结束 14:58:47 avg 160.4s [ 129.4s  153.7s  145.3s  172.1s  167.7s ///  196.8s  151.8s  145.4s  181.6s]

[I 2025-10-31 14:59:37,522] Trial 63 finished with value: 25.621406957417776 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': False, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 7, 'train_eta': 0.013470515022329975, 'subsample': 0.6562292296336911, 'colsample_bytree': 0.9188227417304929, 'min_child_weight': 6, 'lambda': 0.0021426504759826702}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 14-59-46
🔄10/10 开始 15:25:39 结束 15:28:32 avg 172.6s [ 129.4s  219.7s  141.3s  182.6s  224.3s ///  173.4s  157.2s  146.3s  178.7s]

[I 2025-10-31 15:28:21,333] Trial 64 finished with value: 25.646466821218485 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': False, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 7, 'train_eta': 0.013216605884241004, 'subsample': 0.6586813975664532, 'colsample_bytree': 0.9232149180008833, 'min_child_weight': 6, 'lambda': 0.0011934539246658139}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 15-28-31
🔄10/10 开始 15:35:13 结束 15:35:58 avg 44.7s [  36.7s   38.5s   46.9s   37.3s   50.7s ///   36.5s   48.4s   53.5s   53.6s]

[I 2025-10-31 15:36:07,721] Trial 65 finished with value: 26.73107339147155 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 7, 'train_eta': 0.08693724692876294, 'subsample': 0.6177921478923372, 'colsample_bytree': 0.9132653602830838, 'min_child_weight': 6, 'lambda': 0.0010813720893855742}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 15-36-16
🔄10/10 开始 15:59:03 结束 16:01:35 avg 151.9s [ 146.0s  127.3s  119.7s  163.3s  188.8s ///  142.6s  190.8s  143.5s  145.0s]

[I 2025-10-31 16:01:07,730] Trial 66 finished with value: 26.330848300689276 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': False, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 4, 'train_eta': 0.029078185524902787, 'subsample': 0.6607618862981154, 'colsample_bytree': 0.9226611418928772, 'min_child_weight': 7, 'lambda': 0.0030145162968460365}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 16-01-17
🔄10/10 开始 16:25:19 结束 16:27:59 avg 160.1s [ 149.2s  150.0s  149.1s  135.0s  168.2s ///  141.0s  180.9s  148.0s  219.9s]

[I 2025-10-31 16:28:05,251] Trial 67 finished with value: 25.92134541702985 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': False, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 10, 'train_eta': 0.014001705779354752, 'subsample': 0.6251402698974147, 'colsample_bytree': 0.9773355542123034, 'min_child_weight': 8, 'lambda': 0.001852368658845651}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 16-28-15
🔄10/10 开始 16:59:15 结束 17:02:41 avg 206.6s [ 193.9s  180.8s  145.7s  209.5s  252.0s ///  175.0s  220.4s  246.0s  236.4s]

[I 2025-10-31 17:02:06,652] Trial 68 finished with value: 26.3879814983882 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 14, 'train_eta': 0.011097139923856868, 'subsample': 0.6607703189584495, 'colsample_bytree': 0.8990891872859902, 'min_child_weight': 6, 'lambda': 0.0035816833655691826}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 17-02-16
🔄10/10 开始 17:25:22 结束 17:27:56 avg 154.0s [ 149.3s  117.7s  180.0s  152.6s  181.3s ///  141.8s  151.1s  117.8s  194.5s]

[I 2025-10-31 17:28:07,312] Trial 69 finished with value: 25.646937843018435 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': False, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 7, 'train_eta': 0.015276937864499462, 'subsample': 0.7171605827090144, 'colsample_bytree': 0.8635807411439165, 'min_child_weight': 7, 'lambda': 0.0017575014072510784}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 17-28-16
🔄10/10 开始 17:48:38 结束 17:50:53 avg 135.7s [ 155.9s  131.7s  153.9s  171.6s  100.7s ///  116.4s  115.7s  143.6s  131.4s]

[I 2025-10-31 17:50:19,163] Trial 70 finished with value: 25.652854433576135 and parameters: {'remove_dup_smiles': False, 'use_feature_gen': False, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 7, 'train_eta': 0.019434843365525964, 'subsample': 0.6043828493283542, 'colsample_bytree': 0.8844571701338577, 'min_child_weight': 7, 'lambda': 0.0019834918579679675}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 17-50-29
🔄10/10 开始 18:11:41 结束 18:14:03 avg 141.4s [ 127.9s  163.6s  132.3s  132.2s  120.3s ///  119.0s  152.9s   96.6s  227.6s]

[I 2025-10-31 18:14:24,873] Trial 71 finished with value: 25.705101288096774 and parameters: {'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_n_estimators': 500, 'selector_threshold': '0.5*mean', 'train_max_depth': 8, 'train_eta': 0.015061766954224181, 'subsample': 0.6553485045870235, 'colsample_bytree': 0.8650023583806064, 'min_child_weight': 7, 'lambda': 0.00763716951755416}. Best is trial 38 with value: 25.549116408375987.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-31 18-14-34
🔄 7/10 开始 18:27:42 结束 18:36:28 avg 131.5s [ 168.5s  119.1s  155.7s  116.4s   94.5s ///  134.4s]

Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x0000024C4029CD30>>
Traceback (most recent call last):
  File "d:\Software\conda\envs\kaggle\lib\site-packages\xgboost\core.py", line 637, in _next_wrapper
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "d:\Software\conda\envs\kaggle\lib\site-packages\xgboost\core.py", line 550, in _handle_exception
    return fn()
  File "d:\Software\conda\envs\kaggle\lib\site-packages\xgboost\core.py", line 637, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "d:\Software\conda\envs\kaggle\lib\site-packages\xgboost\data.py", line 1416, in next
    input_data(**self.kwargs)
  File "d:\Software\conda\envs\kaggle\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "d:\Software\conda\envs\kaggle\lib\site-packages\xgboost\core.py", line 625, in input_data
    d

# 管理数据库信息

In [None]:
# 查询数据库详细数据


storage_url = f"mysql+pymysql://{user}:{password}@{host}:3306/{database_name}"

studies = optuna.study.get_all_study_summaries(storage=storage_url)

if not studies:
    print("❌ 当前数据库里无 study")
else:
    print("✅ 数据库中的 study 列表:")
    for s in studies:

        print("-", s.study_name)

        study = optuna.load_study(study_name=s.study_name, storage=storage_url)

        print("         Trials:")
        for trial in study.trials[:10]:  # 仅显示前 10 个 trial
            host = trial.user_attrs.get("host") or "unknown"
            ip = trial.user_attrs.get("ip") or "unknown"
            value = f"{trial.value:.10f}" if trial.value is not None else "None"

            print(
                f"    Trial {trial.number:4d}: "
                f"host={host:<16}, ip={ip:<15}, "
                f"value={value:<15}, params={trial.params}"
            )

        print("    总 trial 数量:", len(study.trials))
        print("=" * 100)


In [None]:
# 查询当前最优结果
storage_url = f"mysql+pymysql://{user}:{password}@{host}:3306/{database_name}"

study_name = study_save_name
study = optuna.load_study(study_name=study_name, storage=storage_url)

# 查看最优 trial
best_trial = study.best_trial

print(f"✅ Study 名称: {study_name}")
print(f"最优目标值: {best_trial.value:.8f}")
print(f"最优参数: {best_trial.params}")
print(f"Trial 编号: {best_trial.number}")
print(f"Host: {best_trial.user_attrs.get('host', 'unknown')}")
print(f"IP: {best_trial.user_attrs.get('ip', 'unknown')}")

# print("\n🔍 详细优化过程（全部 trial）:")
# for trial in study.trials:
#     value = f"{trial.value:.8f}" if trial.value is not None else "None"
#     print(
#         f"Trial {trial.number:3d}: value={value:<15}, params={trial.params}"
#     )

print(f"\n📊 总 trial 数量: {len(study.trials)}")


In [None]:
# 清理前：先查看数据库里当前有哪些 study 存在，以及每个 study 里有多少个 trial

storage_url = f"mysql+pymysql://{user}:{password}@{host}:3306/{database_name}"

studies = optuna.study.get_all_study_summaries(storage=storage_url)
print("现有 study：", [s.study_name for s in studies])

for s in studies:
    study = optuna.load_study(study_name=s.study_name, storage=storage_url)
    print(f"Study:   {s.study_name:30s}, Trials: {len(study.trials):4d}")

In [None]:
# 清理中：删除指定 study
# 指定要删除的名称
to_delete = ["melting_point_study"]   # 可以写一个或多个

to_delete = [            ]

for s in studies:
    if s.study_name in to_delete:
        optuna.delete_study(study_name=s.study_name, storage=storage_url)
        print("已删除:", s.study_name)


In [None]:
# 清理后：再次检查
studies_after = optuna.study.get_all_study_summaries(storage=storage_url)
print("清理后 study：", [s.study_name for s in studies_after])


# 使用泄露数据完善

In [None]:
import pandas as pd

submission_df = pd.read_csv(target_file)
show_df_info(submission_df, "submission_df")
show_df_info(test_df, "test_df")



test_with_Tm_df = test_df[['id', 'SMILES', 'origin_info', 'SMILES_normalized']].copy()
test_with_Tm_df['Tm'] = submission_df['Tm']
show_df_info(test_with_Tm_df, "test_with_Tm_df_before_replace")


# =========================================================
# 🔁 3. 定义替换函数
# =========================================================
def replace_Tm_from_train(test_df, train_df, key_col="SMILES_normalized", target_col="Tm", verbose=True):
    """
    用训练集的 Tm 值替换测试集中相同分子的 Tm 值。
    
    参数:
        test_df    : DataFrame，包含预测的 Tm
        train_df   : DataFrame，包含真实的 Tm
        key_col    : 匹配的键列（默认为 'SMILES_normalized'）
        target_col : 要替换的目标列（默认为 'Tm'）
        verbose    : 是否打印替换统计信息
        
    返回:
        替换后的 test_df（原 DataFrame 的拷贝）
    """
    test_df = test_df.copy()

    # 1️⃣ 建立映射表：SMILES_normalized → Tm
    train_tm_map = train_df.set_index(key_col)[target_col]

    # 2️⃣ 找出匹配项
    matched_mask = test_df[key_col].isin(train_tm_map.index)
    matched_count = matched_mask.sum()
    total = len(test_df)

    # 3️⃣ 替换匹配到的值
    test_df[target_col] = test_df[key_col].map(train_tm_map).fillna(test_df[target_col])

    # 4️⃣ 打印替换信息
    if verbose:
        print(f"✅ 共 {total} 行测试样本，其中 {matched_count} 行匹配到训练集 ({matched_count/total:.2%}) 并替换 Tm。")

    return test_df


# =========================================================
# ⚙️ 4. 执行替换并显示结果
# =========================================================
test_with_Tm_df = replace_Tm_from_train(test_with_Tm_df, train_df)
show_df_info(test_with_Tm_df, "test_with_Tm_df_after_replace")


# 1️⃣ 把替换后的 Tm 写回 submission_df
submission_df["Tm"] = test_with_Tm_df["Tm"]

# 2️⃣ 生成新的输出路径
submission_final_path = target_file.replace(".csv", "_final.csv")

# 3️⃣ 保存到新文件
submission_df.to_csv(submission_final_path, index=False)

print(f"✅ 最终提交文件已保存到: {submission_final_path}")
