In [None]:
# 定义优化任务  加入标识符 host: hao-2   ip: 192.168.40.1

import copy
import contextlib
import io

def objective(trial):
    """Optuna objective: build a trial config, prepare features, run K-fold XGBoost.

    Each call deep-copies ``base_config``, overrides the tunable entries with
    values suggested by ``trial``, prepares the train/test feature frames
    (optional chemical feature generation and optional SVD-based reduction),
    and then hands everything to ``run_kfold_xgb``.

    The trial is tagged with the ``host`` and ``ip`` user attributes of the
    machine executing it, so runs coming from different workers can be told
    apart in the study.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to store
            the user attributes.

    Returns:
        ``results['final_score']`` as reported by ``run_kfold_xgb``.
        NOTE(review): presumably a cross-validated MAE, since the training
        params use ``eval_metric='mae'`` -- the previous docstring said RMSE;
        confirm against ``run_kfold_xgb``.
    """
    # Local import: the surrounding cell only imports copy/contextlib/io, so
    # relying on a module-level `socket` would raise NameError unless another
    # cell happened to import it.
    import socket

    # Tag the trial before the expensive work so that failed trials also carry
    # the worker identity. Resolving the hostname can fail (socket.gaierror is
    # an OSError subclass); that must not discard the trial.
    hostname = socket.gethostname()
    try:
        host_ip = socket.gethostbyname(hostname)
    except OSError:
        host_ip = "unknown"
    trial.set_user_attr("host", hostname)
    trial.set_user_attr("ip", host_ip)

    # 1. Hyperparameter search space.
    # Work on a deep copy so the shared base_config is never mutated.
    config = copy.deepcopy(base_config)

    # Pipeline switches.
    config["remove_dup_smiles"] = trial.suggest_categorical("remove_dup_smiles", [True, False])
    config["use_feature_gen"] = trial.suggest_categorical("use_feature_gen", [True, False])
    config["use_pca"] = trial.suggest_categorical("use_pca", [True, False])

    # Feature-selector model parameters.
    selector_params = config["xgb_selector_model_params"]
    selector_params["random_state"] = trial.suggest_categorical("selector_random_state", [42, 2025])
    selector_params["device"] = trial.suggest_categorical("selector_device", ["cpu", "cuda"])

    config["selector_threshold"] = trial.suggest_categorical(
        "selector_threshold", ["mean", "0.75*mean", "0.5*mean", "1.25*mean"]
    )

    # Training model parameters.
    train_params = config["xgb_train_model_params"]
    train_params["max_depth"] = trial.suggest_int("train_max_depth", 3, 12)
    train_params["eta"] = trial.suggest_float("train_eta", 0.01, 0.3, log=True)

    # 2. Data preparation. Everything printed in this section is captured by
    # an in-memory buffer and discarded, to keep the study output quiet.
    sink = io.StringIO()
    with contextlib.redirect_stdout(sink):
        # Dump the effective config for this trial.
        print(config_to_str(config))

        # Load the data.
        train_df, test_df = loaddata(DIRS)

        # Split into features / target.
        print("数据拆分---------------------------")
        features_train, target_train, features_test = prepare_features_and_target(
            train_df, test_df, config
        )

        # Optional feature generation.
        if config["use_feature_gen"]:
            print("特征生成---------------------------")
            features_train = add_chemical_features(features_train)
            features_test = add_chemical_features(features_test)
            show_df_info(features_train, "features_train")
            show_df_info(target_train, "target_train")

        # Optional dimensionality reduction; the reduced columns are appended
        # to the existing features rather than replacing them.
        if config["use_pca"]:
            print("数据降维---------------------------")
            # Honour the configured component count instead of a hard-coded
            # 100 (the default keeps the previous behaviour).
            n_components = config.get("pca_components", 100)
            # NOTE(review): the helper is invoked independently on train and
            # test. If it fits a fresh decomposition on every call, the two
            # sets of components are not expressed in the same basis --
            # confirm against apply_truncated_svd.
            features_train_reduced = apply_truncated_svd(features_train, n_components=n_components)
            features_test_reduced = apply_truncated_svd(features_test, n_components=n_components)

            features_train = pd.concat([features_train, features_train_reduced], axis=1)
            features_test = pd.concat([features_test, features_test_reduced], axis=1)

            show_df_info(features_train, "features_train")
            show_df_info(target_train, "target_train")
            show_df_info(features_test, "features_test")

        X = features_train
        y = target_train
        X_test = features_test

        show_df_info(X, "X")
        show_df_info(y, "y")
        show_df_info(X_test, "X_test")

        print("开始训练---------------------------")

    # 3. Cross-validated training (runs outside the stdout redirect).
    results = run_kfold_xgb(X, y["Tm"], X_test, config, DIRS, K_FOLDS=10, verbose=0)

    # 4. The final cross-validation score is the value Optuna optimizes.
    return results["final_score"]

In [None]:
# 实验配置单
base_config = {
    # Fixed switch.
    "ISTEST": True,

    # Pipeline switches and the reduction component count.
    "remove_dup_smiles": False,
    "use_feature_gen": False,
    "use_pca": False,
    "pca_components": 100,

    "study_save_name": study_save_name,

    # XGBoost parameters for the feature-selection model.
    "xgb_selector_model_params": {
        "n_estimators": 500,
        "max_depth": 6,
        "learning_rate": 0.05,
        "random_state": 2025,
        "device": "cpu",
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "verbosity": 0,
    },

    "selector_threshold": "mean",

    # Training settings.
    "xgb_train_model_params": {
        # Maximum tree depth; controls model complexity (deeper trees overfit
        # more easily).
        "max_depth": 6,
        # Learning rate (step-size shrinkage); smaller is more stable but
        # needs more boosting rounds.
        "eta": 0.1,
        # Compute device: "cuda" trains on the GPU.
        "device": "cuda",
        # Histogram-based tree construction (fast and memory-friendly).
        "tree_method": "hist",
        # Evaluation metric: mean absolute error (validation / early stopping).
        "eval_metric": "mae",
        # Tree-based booster (common choices: "gbtree" or "dart").
        "booster": "gbtree",

        # Each tree is trained on a random 80% sample of the rows.
        "subsample": 0.8,
        # Each tree is trained on a random 80% sample of the features.
        "colsample_bytree": 0.8,
        # Minimum sum of instance weight in a leaf (larger values regularize).
        "min_child_weight": 1,
        # L2 regularization coefficient.
        "lambda": 1.0,
        # Training objective: squared error (standard regression).
        "objective": "reg:squarederror",
    },

    "num_boost_round": 15000,
}