In [None]:
# pip install optuna pymysql sqlalchemy

In [11]:
ISTEST = True


# ISTEST = False

STUDY_NAME = "test" if ISTEST else "optuna_task1"

if ISTEST:
    status_msg = "🧪 当前状态：实验运行"
else:
    status_msg = "🚀 当前状态：正式运行"

print(status_msg)
print(f"📂 Study Name: {STUDY_NAME}")

🧪 当前状态：实验运行
📂 Study Name: test


In [12]:
# 系统库
import os
import subprocess
import time
import shutil
import json
import socket
from datetime import datetime, timedelta

# 第三方科学计算 & 可视化
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 设置中文字体，避免乱码
plt.rcParams['font.sans-serif'] = ['SimHei']        # 黑体
plt.rcParams['axes.unicode_minus'] = False          # 解决负号显示成方块的问题

# 机器学习 & 优化
import xgboost as xgb
import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer

# 化学信息学 (RDKit)
from rdkit import Chem, RDLogger
from rdkit.Chem import (
    Descriptors, Crippen, rdMolDescriptors,
    MACCSkeys, RDKFingerprint, rdFingerprintGenerator
)
from rdkit.Chem.AtomPairs import Pairs, Torsions

# 关闭 RDKit 的警告
RDLogger.DisableLog('rdApp.*')

# Avalon 指纹（可选）
try:
    from rdkit.Avalon import pyAvalonTools
    avalon_available = True
except ImportError:
    avalon_available = False
print(f"Avalon available: {avalon_available}")

# Kaggle API
from kaggle.api.kaggle_api_extended import KaggleApi


import plotly.io as pio
pio.renderers.default = "iframe_connected"







if socket.gethostname() == 'hao-2':
    dir = r'D:\数据\Kaggle\Thermophysical Property Melting Point'
else:
    dir = os.getcwd()


DIRS = {
    "dir":              dir,                                       
    "DATA_DIR000":      os.path.join(dir, "DATA_DIR000"),
    "HISTORY":          os.path.join(dir, "HISTORY"),
    "SUBMISSION":       os.path.join(dir, "SUBMISSION"),
}

# 自动创建目录
for key, path in DIRS.items():
    os.makedirs(path, exist_ok=True)

# 打印时一行一个地址
print("✅ 路径已创建：\n")
for key, path in DIRS.items():
    print(f"{key:<12} : {path}")


Avalon available: True
✅ 路径已创建：

dir          : D:\数据\Kaggle\Thermophysical Property Melting Point
DATA_DIR000  : D:\数据\Kaggle\Thermophysical Property Melting Point\DATA_DIR000
HISTORY      : D:\数据\Kaggle\Thermophysical Property Melting Point\HISTORY
SUBMISSION   : D:\数据\Kaggle\Thermophysical Property Melting Point\SUBMISSION


In [13]:
# 读取已处理好的最终特征数据

# 特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
# 合计特征总数 = 6528

# 定义路径
merge_fp_path = os.path.join(DIRS['DATA_DIR000'], "merge_fingerprints.csv")
test_fp_path  = os.path.join(DIRS['DATA_DIR000'], "test_fingerprints.csv")

# 读取数据
merge_df = pd.read_csv(merge_fp_path)
test_df  = pd.read_csv(test_fp_path)

# 打印信息
print(f"✅ merge_df 加载完成，shape = {merge_df.shape}")
print(f"✅ test_df  加载完成，shape = {test_df.shape}")

print("特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024")
print("合计特征总数 = 6528")


✅ merge_df 加载完成，shape = (28808, 6530)
✅ test_df  加载完成，shape = (666, 6530)
特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
合计特征总数 = 6528


In [14]:
# 数据拆分 (特征矩阵 与 目标向量)
# ============================================
# 特征字段: SMILES, Tm | 描述符: 217 | Morgan: 1024 | FCFP: 1024 | MACCS: 167 | AtomPair: 1024 | RDKit: 2048 | Avalon: 1024
# 合计特征总数 = 6528




# # 构建训练集与测试集
# # 1. 找到重复的 SMILES
# dup_smiles = set(merge_df['SMILES']) & set(test_df['SMILES'])
# print(f"⚠️ 检测到 {len(dup_smiles)} 个重复 SMILES")

# # 2. 删除 merge_df 里 SMILES 在 test_df 里的行
# before_shape = merge_df.shape
# merge_df = merge_df[~merge_df['SMILES'].isin(test_df['SMILES'])].reset_index(drop=True)
# after_shape = merge_df.shape

# print(f"✅ 删除完成: 从 {before_shape} → {after_shape}")




features_train = merge_df.drop(labels=['SMILES', 'Tm'], axis=1)      # 特征矩阵 X：去掉 SMILES 和目标值 Tm
target_train = merge_df['Tm']                                      # 目标向量 y：只保留 Tm (熔点，单位 K)
features_test = test_df.drop(labels=['SMILES', 'id'], axis=1)  # 测试集特征：去掉 SMILES 和 id (因为 test 没有 Tm)



# 随机选取部分特征（示例：50 个）
if ISTEST:
    np.random.seed(42)
    selected_features = np.random.choice(
        merge_df.drop(columns=['SMILES', 'Tm']).columns,
        size=20,
        replace=False
    )
    sample_len = 500
    features_train = merge_df.iloc[:sample_len][selected_features]   # 训练特征 (前 1000 条)
    target_train = merge_df.iloc[:sample_len]['Tm']               # 训练目标
    features_test = test_df[selected_features]          # 测试特征 (同样的特征列)



# 3. 打印维度信息
print("📊 数据拆分完成")
print(f"训练集特征 features_train  shape   : {features_train.shape}")
print(f"训练集目标   target_train  shape   : {target_train.shape}")
print(f"测试集特征  features_test  shape   : {features_test.shape}")
print(f"           features_train  类型    : {type(features_train)}")



📊 数据拆分完成
训练集特征 features_train  shape   : (500, 20)
训练集目标   target_train  shape   : (500,)
测试集特征  features_test  shape   : (666, 20)
           features_train  类型    : <class 'pandas.core.frame.DataFrame'>


In [15]:
# Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果
# ==============================================================
def run_kfold_xgb(features_train, target_train, features_test, params, DIRS, K_FOLDS=5, verbose=0):
    """
    使用 Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果

    参数:
        features_train, target_train        : 训练集特征和标签
        features_test      : 测试集特征
        params      : XGBoost 最优参数 (dict)
        DIRS        : 保存结果的目录字典
        K_FOLDS     : 折数 (默认=5)
        verbose     : 是否打印详细信息
    """
    # ---------- 创建目录 ----------
    for _, path in DIRS.items():
        os.makedirs(path, exist_ok=True)


    time_str = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    history_DIR = os.path.join(DIRS['HISTORY'], time_str)
    os.makedirs(history_DIR, exist_ok=True)



    print("——" * 20)
    print(f"✅ 当前结果将保存到: {time_str}")


    # ---------- 定义交叉验证 ----------
    skfold = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    yeo = PowerTransformer(method="yeo-johnson")                                # 定义 Yeo-Johnson 变换

    # ---------- 初始化存储 ----------
    oof_val = np.zeros(len(features_train))       # OOF 预测
    train_score, val_score = [], []  # 每折 MAE
    test_pred = []                   # 每折 test 预测
    fold_records = []                # 保存每折信息
    all_importances = []             # 特征重要性
    elapsed_list = []                # 耗时记录



    # 循环每一折
    # ==============================================================

    for i, (train_idx, val_idx) in enumerate(skfold.split(features_train, pd.qcut(target_train, q=10).cat.codes), 1):

        # ----- 打印时间信息 -----
        start_now = datetime.now()
        start_str = start_now.strftime("%H:%M:%S")

        if elapsed_list:
            avg_time = np.mean(elapsed_list)
            est_end = start_now + timedelta(seconds=avg_time)

            # 每 5 个一组输出耗时
            parts = [f"{t:6.1f}s" for t in elapsed_list]
            grouped = [" ".join(parts[j:j+5]) for j in range(0, len(parts), 5)]
            elapsed_str = " /// ".join(grouped)

            print(
                f"🔄{i:2d}/{K_FOLDS} ST {start_str}"
                f" ET {est_end.strftime('%H:%M:%S')}"
                f" avg {avg_time:.1f}s"
                f" [{elapsed_str}]",
                end="\r", flush=True
            )
        else:
            print(f"🔄{i:2d}/{K_FOLDS} ST {start_str} ET (暂无历史数据)", end="\r", flush=True)



        # ----- 开始训练 -----
        t0 = time.time()

        # 1. 数据集划分
        x_train, x_val = features_train.iloc[train_idx], features_train.iloc[val_idx]
        y_train, y_val = target_train[train_idx], target_train[val_idx]

        # 2. Yeo-Johnson 变换
        y_train = yeo.fit_transform(y_train.values.reshape(-1, 1)).squeeze()
        y_val   = yeo.transform(y_val.values.reshape(-1, 1)).squeeze()


        # 3. 特征选择（轻量级 XGBoost）
        selector_model = xgb.XGBRegressor(
            n_estimators=500, 
            max_depth=6, 
            learning_rate=0.05,
            random_state=42,
            device="cuda", 
            objective="reg:absoluteerror",
            tree_method="hist", 
            verbosity=0
        )
        selector_model.fit(x_train, y_train)

        selector = SelectFromModel(selector_model, prefit=True, threshold="mean")
        selected_features = x_train.columns[selector.get_support()].tolist()
        if verbose > 0:
            print(f"✅ 选择的特征数量: {len(selected_features)}")


        # 4. 保留重要特征
        x_train_new = x_train[selected_features]
        x_val_new   = x_val[selected_features]
        x_test_new  = features_test[selected_features]

        # 5. 转换为 DMatrix
        dtrain = xgb.DMatrix(x_train_new, y_train, feature_names=selected_features)
        dval   = xgb.DMatrix(x_val_new,   y_val,   feature_names=selected_features)
        dtest  = xgb.DMatrix(x_test_new,             feature_names=selected_features)


        # 6. XGBoost 训练
        xgb_model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=params["num_boost_round"],
            evals=[(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds=300,
            verbose_eval=(1000 if verbose > 0 else False)
        )


        # 保存模型
        model_path = os.path.join(history_DIR, f"xgb_model_fold{i}.json")
        xgb_model.save_model(model_path)

        # 7. 获取特征重要性
        imp_dict = xgb_model.get_score(importance_type="gain")
        imp_df = pd.DataFrame(imp_dict.items(), columns=["Feature", "Importance"])
        imp_df["Fold"] = i
        all_importances.append(imp_df)


        # 8. 预测
        y_train_pred = xgb_model.predict(dtrain)
        y_val_pred   = xgb_model.predict(dval)
        y_test_pred  = xgb_model.predict(dtest)

        # 9. 逆变换
        y_train      = yeo.inverse_transform(y_train.reshape(-1, 1)).squeeze()
        y_val        = yeo.inverse_transform(y_val.reshape(-1, 1)).squeeze()
        y_train_pred = yeo.inverse_transform(y_train_pred.reshape(-1, 1)).squeeze()
        y_val_pred   = yeo.inverse_transform(y_val_pred.reshape(-1, 1)).squeeze()
        y_test_pred  = yeo.inverse_transform(y_test_pred.reshape(-1, 1)).squeeze()

        # 10. 计算 MAE
        train_mae = mean_absolute_error(y_train, y_train_pred)
        val_mae   = mean_absolute_error(y_val,   y_val_pred)
        if verbose > 0:
            print(f"Fold {i}: Train MAE={train_mae:.4f}, Val MAE={val_mae:.4f}，用时 {elapsed:.2f} 秒")



        # ----- 保存结果 -----
        train_score.append(train_mae)
        val_score.append(val_mae)
        oof_val[val_idx] = y_val_pred
        test_pred.append(y_test_pred)

        elapsed = time.time() - t0
        elapsed_list.append(elapsed)

        fold_records.append({
            "Fold": i,
            "Train_MAE": train_mae,
            "Val_MAE": val_mae,
            "Num_Features": len(selected_features),
            "Selected_Features": selected_features,
            "elapsed": elapsed
        })

    # 保存整体结果
    # ==============================================================
    if verbose > 0:
        print("\n")
        print(f"📊 Train MAE 平均值 : {np.mean(train_score):.4f}")
        print(f"📊 Val   MAE 平均值 : {np.mean(val_score):.4f}")
        print(f"📊 Train MAE 标准差 : {np.std(train_score, ddof=0):.4f}")
        print(f"📊 Val   MAE 标准差 : {np.std(val_score, ddof=0):.4f}")

    # 参数
    with open(os.path.join(history_DIR, "params.json"), "w", encoding="utf-8") as f:
        json.dump(params, f, indent=4, ensure_ascii=False)

    # 每折信息
    folds_df = pd.DataFrame(fold_records)
    folds_df.to_csv(os.path.join(history_DIR, "folds_info.csv"), index=False, encoding="utf-8-sig")


    # 特征重要性
    if all_importances:
        valid_imps = [df for df in all_importances if not df.empty]
        all_imp_df = pd.concat(valid_imps, axis=0) if valid_imps else pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    else:
        all_imp_df = pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    all_imp_df.to_csv(os.path.join(history_DIR, "feature_importance_all.csv"), index=False, encoding="utf-8-sig")


    # 测试集预测
    test_pred_array = np.vstack(test_pred).T
    test_pred_df = pd.DataFrame(test_pred_array, columns=[f"Fold_{j+1}" for j in range(test_pred_array.shape[1])])
    test_pred_df["Final_Pred"] = test_pred_df.mean(axis=1)
    test_pred_df.to_csv(os.path.join(history_DIR, "test_predictions.csv"), index=False, encoding="utf-8-sig")

    # 总结
    with open(os.path.join(history_DIR, "summary.txt"), "w", encoding="utf-8") as f:
        f.write(f"Train MAE Mean : {np.mean(train_score):.4f}\n")
        f.write(f"Val   MAE Mean : {np.mean(val_score):.4f}\n")
        f.write(f"Train MAE Std  : {np.std(train_score, ddof=0):.4f}\n")
        f.write(f"Val   MAE Std  : {np.std(val_score, ddof=0):.4f}\n")


    # 最终提交
    final_score = np.mean(val_score)
    submission = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "sample_submission.csv"))
    submission["Tm"] = test_pred_df["Final_Pred"]

    submission_path = os.path.join(history_DIR, f"sub_{time_str}_{final_score:.8f}.csv")
    submission.to_csv(submission_path, index=False)
    submission.to_csv(os.path.join(DIRS['SUBMISSION'], f"sub_{time_str}_{final_score:.8f}.csv"), index=False)


    # ---------- 返回结果 ----------
    return {
        "oof_val": oof_val,
        "train_score": train_score,
        "val_score": val_score,
        "test_pred": test_pred_df,
        "folds_info": folds_df,
        "feature_importance": all_imp_df,
        "submission_path": submission_path,
        "time": time_str,
        "final_score": final_score
    }


In [16]:
# 定义优化任务  加入标识符 host: hao-2   ip: 192.168.40.1
def objective(trial):
    """
    Optuna 的目标函数 (Objective Function)
    每次 trial 会生成一组超参数，用于训练 XGBoost 模型，
    并返回交叉验证的平均 RMSE 作为优化目标。
    """

    # 1. 定义 XGBoost 超参数搜索空间
    xgb_params = {
        "verbosity"        : 0,                                   # 训练时日志输出级别 (0=静默)
        "objective"        : "reg:absoluteerror",              # 回归任务目标函数
        "tree_method"      : "gpu_hist",                          # 使用 GPU 加速的直方图算法
        "predictor"        : "gpu_predictor",                     # GPU 预测
        "device"           : "cuda",                              # 指定设备 (CUDA GPU)
        "eval_metric"      : "mae",                               # 评估指标：平均绝对误差
        "booster"          : "gbtree",                            # 基学习器：树模型
        "num_boost_round"     : 20000,                               # 如果用 sklearn API 才保留；xgb.train 用 num_boost_round

        # -------- 需要调优的超参数 --------
        "max_depth"        : trial.suggest_int  ("max_depth"       , 3    , 7),
        "learning_rate"    : trial.suggest_float("learning_rate"   , 0.01 , 0.3 , log=True),
        "min_child_weight" : trial.suggest_int  ("min_child_weight", 1    , 10),
        "subsample"        : trial.suggest_float("subsample"       , 0.5  , 1.0),
        "colsample_bytree" : trial.suggest_float("colsample_bytree", 0.5  , 1.0),
        "gamma"            : trial.suggest_float("gamma"           , 0.0  , 1.0),
        "reg_lambda"       : trial.suggest_float("reg_lambda"      , 0.1  , 5.0 , log=True),
        "reg_alpha"        : trial.suggest_float("reg_alpha"       , 0.1  , 1.0 , log=True),
    }


    results = run_kfold_xgb(features_train, target_train, features_test, xgb_params, DIRS, K_FOLDS = 10, verbose = 0)

    score = results['final_score']
    

    
    HOSTNAME = socket.gethostname()
    HOST_IP = socket.gethostbyname(HOSTNAME)
    trial.set_user_attr("host", HOSTNAME)        # 你自己定义主机 A/B
    trial.set_user_attr("ip", HOST_IP)        # 你自己定义角色 A/B

    
    # 4. 返回平均 MAE
    return score





In [17]:
# 开始优化

# 1. 定义 SQLite 数据库存储路径

storage_url = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

study = optuna.create_study(
    study_name = STUDY_NAME,
    # study_name="ghsdjsrtjrswtjhwrt",
    storage=storage_url,
    load_if_exists=True
)

# 自动获取当前主机名\当前主机的 IP 地址
HOSTNAME = socket.gethostname()
HOST_IP = socket.gethostbyname(HOSTNAME)
print("主机名:", HOSTNAME," 主机 IP:", HOST_IP)
time.sleep(1)

# 5. 启动超参数搜索
print("🔎 开始超参数搜索...")
if ISTEST:
    study.optimize(objective, n_trials = 3)
else:
    study.optimize(objective, n_trials = 100)


# 6. 打印最优结果
print("\n✅ 训练完成！")
print(f"📊 已完成试验次数 : {len(study.trials)}")
print(f"🏆 最优试验编号   : {study.best_trial.number}")
print(f"📉 最优 MAE       : {study.best_value}")
print(f"⚙️ 最优参数组合   : {study.best_trial.params}")


[I 2025-10-21 15:55:03,256] Using an existing study with name 'test' instead of creating a new one.


主机名: hao-2  主机 IP: 192.168.40.1
🔎 开始超参数搜索...
————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-21 15-55-04
🔄10/10 ST 15:55:24 ET 15:55:26 avg 2.2s [   2.0s    2.7s    1.5s    3.4s    1.6s ///    2.0s    2.5s    1.5s    2.8s]

[I 2025-10-21 15:55:26,125] Trial 6 finished with value: 63.064621230468745 and parameters: {'max_depth': 3, 'learning_rate': 0.019651386606002744, 'min_child_weight': 6, 'subsample': 0.5334783686346105, 'colsample_bytree': 0.5894583143905426, 'gamma': 0.49088975643532917, 'reg_lambda': 0.15242062420054192, 'reg_alpha': 0.21577721406330097}. Best is trial 6 with value: 63.064621230468745.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-21 15-55-26
🔄10/10 ST 15:55:41 ET 15:55:43 avg 1.7s [   1.5s    2.5s    1.4s    1.6s    1.5s ///    1.5s    1.5s    2.2s    1.7s]

[I 2025-10-21 15:55:44,048] Trial 7 finished with value: 63.42987898193358 and parameters: {'max_depth': 3, 'learning_rate': 0.16701877404534557, 'min_child_weight': 8, 'subsample': 0.6649407630812719, 'colsample_bytree': 0.8350044077322408, 'gamma': 0.5935872801065222, 'reg_lambda': 0.15156043255073376, 'reg_alpha': 0.30274397572605766}. Best is trial 6 with value: 63.064621230468745.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-21 15-55-44
🔄10/10 ST 15:56:04 ET 15:56:06 avg 2.3s [   1.9s    3.3s    1.5s    3.4s    1.5s ///    2.0s    1.7s    1.4s    3.9s]

[I 2025-10-21 15:56:06,540] Trial 8 finished with value: 63.14212542968748 and parameters: {'max_depth': 3, 'learning_rate': 0.013736348754728076, 'min_child_weight': 7, 'subsample': 0.5852735359947805, 'colsample_bytree': 0.68036314460227, 'gamma': 0.7599864574319253, 'reg_lambda': 0.2653823032465253, 'reg_alpha': 0.19019002717897487}. Best is trial 6 with value: 63.064621230468745.



✅ 训练完成！
📊 已完成试验次数 : 9
🏆 最优试验编号   : 6
📉 最优 MAE       : 63.064621230468745
⚙️ 最优参数组合   : {'max_depth': 3, 'learning_rate': 0.019651386606002744, 'min_child_weight': 6, 'subsample': 0.5334783686346105, 'colsample_bytree': 0.5894583143905426, 'gamma': 0.49088975643532917, 'reg_lambda': 0.15242062420054192, 'reg_alpha': 0.21577721406330097}


In [None]:
停止运行

# 管理数据库信息

In [18]:
# 查询数据库详细数据

storage_url = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

studies = optuna.study.get_all_study_summaries(storage=storage_url)

if not studies:
    print("❌ 当前数据库里无 study")
else:
    print("✅ 数据库中的 study 列表:")
    for s in studies:

        print("-", s.study_name)

        study = optuna.load_study(study_name=s.study_name, storage=storage_url)

        print("         Trials:")
        for trial in study.trials:
            host = trial.user_attrs.get("host") or "unknown"
            ip = trial.user_attrs.get("ip") or "unknown"
            value = f"{trial.value:.4f}" if trial.value is not None else "None"

            print(
                f"    Trial {trial.number:4d}: "
                f"host={host:<16}, ip={ip:<15}, "
                f"value={value:<10}, params={trial.params}"
            )

        print("    总 trial 数量:", len(study.trials))
        print("=" * 100)


✅ 数据库中的 study 列表:
- test
         Trials:
    Trial    0: host=hao-2           , ip=192.168.40.1   , value=63.3390   , params={'max_depth': 7, 'learning_rate': 0.014881350294625223, 'min_child_weight': 6, 'subsample': 0.7434737625708363, 'colsample_bytree': 0.9897332098694307, 'gamma': 0.2774819216237613, 'reg_lambda': 0.5956939885569088, 'reg_alpha': 0.6311524797185967}
    Trial    1: host=hao-2           , ip=192.168.40.1   , value=63.6148   , params={'max_depth': 3, 'learning_rate': 0.2713247898317377, 'min_child_weight': 7, 'subsample': 0.9849126281161902, 'colsample_bytree': 0.8309378401166326, 'gamma': 0.7814392281295133, 'reg_lambda': 3.2585889489649014, 'reg_alpha': 0.235425190902393}
    Trial    2: host=hao-2           , ip=192.168.40.1   , value=63.3587   , params={'max_depth': 5, 'learning_rate': 0.05739020873816809, 'min_child_weight': 10, 'subsample': 0.6583362500908256, 'colsample_bytree': 0.5101341233098649, 'gamma': 0.6972120855106135, 'reg_lambda': 3.618819426954319,

In [19]:
# 清理前：先查看数据库里当前有哪些 study 存在，以及每个 study 里有多少个 trial

storage = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

studies = optuna.study.get_all_study_summaries(storage=storage)
print("现有 study：", [s.study_name for s in studies])

for s in studies:
    study = optuna.load_study(study_name=s.study_name, storage=storage)
    print(f"Study:   {s.study_name:30s}, Trials: {len(study.trials):4d}")

现有 study： ['test']
Study:   test                          , Trials:    9


In [20]:
# 清理中：删除指定 study
# 指定要删除的名称
to_delete = ["melting_point_study"]   # 可以写一个或多个
to_delete = []   # 可以写一个或多个

for s in studies:
    if s.study_name in to_delete:
        optuna.delete_study(study_name=s.study_name, storage=storage)
        print("已删除:", s.study_name)


In [21]:
# 清理后：再次检查
studies_after = optuna.study.get_all_study_summaries(storage=storage)
print("清理后 study：", [s.study_name for s in studies_after])


清理后 study： ['test']


In [None]:
停止运行

# 单次训练推导

In [22]:
# 从 Optuna study 获取最优参数

storage = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

# 只加载指定 study
study = optuna.load_study(study_name=STUDY_NAME, storage=storage)

print(f"当前最优 MAE : {study.best_value} \n")
best_params = study.best_trial.params

# 构造最终训练用参数（对齐格式）
params = {
    "verbosity"        : 0,                                   # 日志静默
    "objective"        : "reg:absoluteerror",                 # 回归任务目标函数 (MAE)
    "tree_method"      : "gpu_hist",                          # GPU 加速直方图算法
    "predictor"        : "gpu_predictor",                     # GPU 预测
    "device"           : "cuda",                              # CUDA GPU
    "eval_metric"      : "mae",                               # 评估指标
    "booster"          : "gbtree",                            # 树模型
    "num_boost_round"  : 20_000,                              # 最大迭代次数

    # -------- 调优后的超参数 --------
    "max_depth"        : best_params.get("max_depth"       , 6   ),
    "learning_rate"    : best_params.get("learning_rate"   , 0.10),
    "min_child_weight" : best_params.get("min_child_weight", 6   ),
    "subsample"        : best_params.get("subsample"       , 0.60),
    "colsample_bytree" : best_params.get("colsample_bytree", 0.60),
    "gamma"            : best_params.get("gamma"           , 0.40),
    "reg_lambda"       : best_params.get("reg_lambda"      , 1.60), 
    "reg_alpha"        : best_params.get("reg_alpha"       , 0.40),
}


for k, v in params.items():
    print(f"{k:20s}: {str(v):20s}")



当前最优 MAE : 63.064621230468745 

verbosity           : 0                   
objective           : reg:absoluteerror   
tree_method         : gpu_hist            
predictor           : gpu_predictor       
device              : cuda                
eval_metric         : mae                 
booster             : gbtree              
num_boost_round     : 20000               
max_depth           : 3                   
learning_rate       : 0.019651386606002744
min_child_weight    : 6                   
subsample           : 0.5334783686346105  
colsample_bytree    : 0.5894583143905426  
gamma               : 0.49088975643532917 
reg_lambda          : 0.15242062420054192 
reg_alpha           : 0.21577721406330097 


In [23]:
best_params.get("lambda")

In [24]:
# 单一执行一次

results = run_kfold_xgb(features_train, target_train, features_test, params, DIRS, K_FOLDS = 10, verbose = 0)

score = results['final_score']

print('\n',score)

————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-21 15-58-09
🔄10/10 ST 15:58:29 ET 15:58:31 avg 2.2s [   2.1s    2.6s    1.5s    3.5s    1.6s ///    2.1s    2.4s    1.5s    2.8s]
 63.064621230468745


In [None]:
停止运行


# 提交 kaggle 平台测试

In [26]:
# 根据 submission_time 定位文件路径 提交 kaggle 平台测试

import os
import itertools
import time
from kaggle.api.kaggle_api_extended import KaggleApi


def find_submission_file(submission_time, submission_dir):
    """
    在 submission_dir 下查找包含 submission_time 的文件
    一旦找到立刻返回完整路径；如果没找到则返回 None
    """
    for fname in os.listdir(submission_dir):
        if submission_time in fname:
            file_path = os.path.join(submission_dir, fname)
            print(f"✅ 找到目标文件: {fname}")
            return file_path
    
    print(f"⚠️ 未找到包含 {submission_time} 的文件")
    return None

def submit_and_get_score(file_path, competition_name, message="My submission"):
    """
    封装 Kaggle 提交并等待结果评分
    --------------------------------------
    file_path        : str  提交文件路径
    competition_name : str  Kaggle 比赛名称 (URL 最后一段)
    message          : str  提交备注
    """
    # 1. 配置 Kaggle API
    os.environ["KAGGLE_CONFIG_DIR"] = r"C:\Users\Admin\.kaggle"
    api = KaggleApi()
    api.authenticate()
    print("✅ Kaggle API 已经配置成功！")

    # 2. 提交文件
    api.competition_submit(
        file_name=file_path,
        competition=competition_name,
        message=message
    )
    print("✅ 提交完成！请等待评分...")

    # 3. 动态等待
    spinner = itertools.cycle(["|", "/", "-", "\\"])
    while True:
        submissions = api.competition_submissions(competition_name)
        latest = submissions[0]
        status_str = str(latest._status).lower()

        if "complete" in status_str and latest._public_score is not None:
            print("\n🎯 最终结果:")
            print(f"Public 分数 : {latest._public_score}")
            print(f"Private 分数: {latest._private_score}")
            print(f"提交 ID     : {latest._ref}")
            print(f"文件名      : {latest._file_name}")
            print(f"状态        : {latest._status}")
            print(f"提交时间    : {latest._date}")
            print(f"描述/备注   : {latest._description}")
            return latest

        spin_char = next(spinner)
        print(f"当前状态: {status_str} , 等待中 {spin_char}", end="\r", flush=True)
        time.sleep(0.2)  # 每 0.5 秒检查一次


### 不轻易运行，再三考虑

In [28]:
# submission_time 提交
submission_time = "2025-10-21 15-58-09"   
competition_name = "melting-point"
message =  "本地提交测试"

target_file = find_submission_file(submission_time, DIRS['SUBMISSION'] )

# submit_and_get_score(target_file, competition_name, message)

✅ 找到目标文件: sub_2025-10-21 15-58-09_63.06462123.csv


In [None]:
停止运行