In [4]:
# 系统库
import os
import subprocess
import time
import shutil
import json
import socket
from datetime import datetime, timedelta
import lightgbm as lgb
# 第三方科学计算 & 可视化
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 设置中文字体，避免乱码
plt.rcParams['font.sans-serif'] = ['SimHei']        # 黑体
plt.rcParams['axes.unicode_minus'] = False          # 解决负号显示成方块的问题

# 机器学习 & 优化
import xgboost as xgb
import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer


# Kaggle API
from kaggle.api.kaggle_api_extended import KaggleApi








if socket.gethostname() == 'hao-2':
    dir = r'D:\数据\Kaggle\预测道路事故风险'
else:
    dir = os.getcwd()


DIRS = {
    "dir":              dir,                                       
    "DATA_DIR000":      os.path.join(dir, "DATA_DIR000"),
    "HISTORY":          os.path.join(dir, "HISTORY"),
    "SUBMISSION":       os.path.join(dir, "SUBMISSION"),
}

# 自动创建目录
for key, path in DIRS.items():
    os.makedirs(path, exist_ok=True)

# 打印时一行一个地址
print("✅ 路径已创建：\n")
for key, path in DIRS.items():
    print(f"{key:<12} : {path}")


✅ 路径已创建：

dir          : D:\数据\Kaggle\预测道路事故风险
DATA_DIR000  : D:\数据\Kaggle\预测道路事故风险\DATA_DIR000
HISTORY      : D:\数据\Kaggle\预测道路事故风险\HISTORY
SUBMISSION   : D:\数据\Kaggle\预测道路事故风险\SUBMISSION


# 数据提取处理

In [5]:
# 加载Kaggle 训练集和 Bradley 熔点公开数据集

# Kaggle 提供的训练集和测试集
train_df = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "train.csv"))
test_df  = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "test.csv"))



# 输出数据集规模，确认加载成功
print("Train                        shape:", train_df.shape)
print("Test                         shape:", test_df.shape)

Train                        shape: (517754, 14)
Test                         shape: (172585, 13)


In [6]:
# 检测 DataFrame 每列的数据类型、缺失值情况、唯一值数量，并给出部分示例值。
def dataframe_info(df: pd.DataFrame, n_sample: int = 6) -> pd.DataFrame:
    """
    检测 DataFrame 每列的数据类型、缺失值情况、唯一值数量，并给出部分示例值。
    
    参数:
        df       : 输入的 pandas DataFrame
        n_sample : 每列展示的样本值数量 (默认 3)
    
    返回:
        summary_df : 包含每列信息的 DataFrame
    """
    summary = pd.DataFrame({
        "数据类型": df.dtypes,
        "缺失值数量": df.isnull().sum(),
        "缺失值比例": df.isnull().mean(),
        "唯一值数量": df.nunique()
    })
    
    # 添加示例值（前 n_sample 个唯一值）
    summary["示例值 (samples)"] = df.apply(
        lambda col: col.dropna().unique()[:n_sample] if col.notnull().any() else []
    )
    
    # 按缺失比例排序
    summary = summary.sort_values("缺失值比例", ascending=False)
    print(df.columns.values)
    display(summary)
    # print(summary)


# 使用方法
dataframe_info(train_df, n_sample = 6)
dataframe_info(test_df, n_sample = 6)


['id' 'road_type' 'num_lanes' 'curvature' 'speed_limit' 'lighting'
 'weather' 'road_signs_present' 'public_road' 'time_of_day' 'holiday'
 'school_season' 'num_reported_accidents' 'accident_risk']


Unnamed: 0,数据类型,缺失值数量,缺失值比例,唯一值数量,示例值 (samples)
id,int64,0,0.0,517754,"[0, 1, 2, 3, 4, 5]"
road_type,object,0,0.0,3,"[urban, rural, highway]"
num_lanes,int64,0,0.0,4,"[2, 4, 1, 3]"
curvature,float64,0,0.0,261,"[0.06, 0.99, 0.63, 0.07, 0.58, 0.54]"
speed_limit,int64,0,0.0,5,"[35, 70, 60, 45, 25]"
lighting,object,0,0.0,3,"[daylight, dim, night]"
weather,object,0,0.0,3,"[rainy, clear, foggy]"
road_signs_present,bool,0,0.0,2,"[False, True]"
public_road,bool,0,0.0,2,"[True, False]"
time_of_day,object,0,0.0,3,"[afternoon, evening, morning]"


['id' 'road_type' 'num_lanes' 'curvature' 'speed_limit' 'lighting'
 'weather' 'road_signs_present' 'public_road' 'time_of_day' 'holiday'
 'school_season' 'num_reported_accidents']


Unnamed: 0,数据类型,缺失值数量,缺失值比例,唯一值数量,示例值 (samples)
id,int64,0,0.0,172585,"[517754, 517755, 517756, 517757, 517758, 517759]"
road_type,object,0,0.0,3,"[highway, urban, rural]"
num_lanes,int64,0,0.0,4,"[2, 3, 4, 1]"
curvature,float64,0,0.0,195,"[0.34, 0.04, 0.59, 0.95, 0.86, 0.52]"
speed_limit,int64,0,0.0,5,"[45, 35, 25, 70, 60]"
lighting,object,0,0.0,3,"[night, dim, daylight]"
weather,object,0,0.0,3,"[clear, foggy, rainy]"
road_signs_present,bool,0,0.0,2,"[True, False]"
public_road,bool,0,0.0,2,"[True, False]"
time_of_day,object,0,0.0,3,"[afternoon, evening, morning]"


# 数据分析

In [7]:
# 打印清单
def config_to_str(config: dict, indent: int = 0) -> str:
    """递归生成配置字符串"""
    prefix = "     " * indent
    lines = []
    for key, value in config.items():
        if isinstance(value, dict):
            lines.append(f"{prefix}🔹 {key}:")
            lines.append(config_to_str(value, indent + 1))  # 递归拼接子字典
        else:
            lines.append(f"{prefix}- {key:<20}: {value}")
    return "\n".join(lines)



In [8]:
# 实验配置单
config = {
    # 固定开关
    "ISTEST"            : False,

    "use_feature_gen"   : True,
    "use_pca"           : True,
    "pca_components"    : 10,

    # 特征选择 XGBoost 参数
    "xgb_selector_model_params": {
        "n_estimators"  : 500,
        "max_depth"     : 6,
        "learning_rate" : 0.05,
        "random_state"  : 2025,
        "device"        : "cpu",
        "objective"     : "reg:squarederror",
        "tree_method"   : "hist",
        "verbosity"     : 0
    },

    "selector_threshold"  : "0*mean",   

    # # 训练设置
    # "xgb_train_model_params": {
    #     'max_depth'   : 6,
    #     'eta'         : 0.1,
    #     'tree_method' : 'hist',
    #     'eval_metric' : 'rmse',
    # },
    # "num_boost_round": 15000,


    # 训练设置（LightGBM）
    "lgb_train_model_params": {
        "objective": "regression",        # 回归任务
        "metric": "rmse",                 # 使用 RMSE 作为评估指标
        "boosting_type": "gbdt",          # 梯度提升树（默认）
        "learning_rate": 0.05,            # 学习率（原来 XGB 的 eta）
        "num_leaves": 31,                 # 控制树的复杂度
        "max_depth": -1,                  # -1 表示不限制深度
        "min_data_in_leaf": 20,           # 每个叶子最少样本数
        "feature_fraction": 0.9,          # 每次建树时使用的特征比例
        "bagging_fraction": 0.8,          # 每次建树时使用的数据比例
        "bagging_freq": 5,                # 每 5 次迭代进行一次 bagging
        "lambda_l1": 0.1,                 # L1 正则化
        "lambda_l2": 0.1,                 # L2 正则化
        "verbosity": -1                   # 不输出详细日志
    },
    "num_boost_round": 15000,             # 与 XGB 相同，用于控制最大迭代次数




}

In [9]:
# 数据拆分 (特征矩阵 与 目标向量)
# ============================================

import numpy as np
import pandas as pd

def prepare_features_and_target(train_df: pd.DataFrame, test_df: pd.DataFrame, config: dict):
    """
    数据拆分函数：构造训练集和测试集的特征矩阵与目标向量
    """

    
    # 2. 构造特征矩阵和目标向量
    features_train = train_df.drop(columns=['id', 'accident_risk'])   # 训练集特征 (X)
    target_train   = train_df['accident_risk']                            # 训练集目标 (y, 熔点)
    features_test  = test_df.drop(columns=['id'])    # 测试集特征 (无 Tm)



    # 随机选取部分特征（示例：50 个）
    if config["ISTEST"]:

        sample_len = 100
        features_train = train_df.drop(columns=['id', 'accident_risk']).iloc[:sample_len]  # 训练特征 (前 1000 条)
        target_train = train_df.iloc[:sample_len]['accident_risk']               # 训练目标
        features_test = test_df.drop(columns=['id'])                # 测试特征 (同样的特征列)




    # 3. 打印维度信息
    print("📊 数据拆分完成")
    print(f"训练集特征 features_train  shape   : {features_train.shape}")
    print(f"训练集目标   target_train  shape   : {target_train.shape}")
    print(f"测试集特征  features_test  shape   : {features_test.shape}")
    print(f"           features_train  类型    : {type(features_train)}")

    return features_train, target_train, features_test



### 特征生成

In [10]:
def add_new_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    针对道路事故预测构造新的衍生特征
    输入:
        df : pd.DataFrame
    输出:
        df_new : pd.DataFrame，包含新增特征
    """
    df = df.copy()
    shape_before = df.shape  # 原始维度

    # 1. 交互项
    df['Speed_x_Curvature'] = df['speed_limit'] * df['curvature']
    df['Lanes_x_Speed']     = df['num_lanes'] * df['speed_limit']
    df['Accidents_x_Speed'] = df['num_reported_accidents'] * df['speed_limit']

    # 类别交互项（需要后续 OneHotEncoder）
    df['RoadType_Time']     = df['road_type'] + "_" + df['time_of_day']
    df['Lighting_Weather']  = df['lighting'] + "_" + df['weather']

    # 2. 非线性变换
    df['Log_Accidents']     = np.log1p(df['num_reported_accidents'])  # log(1+x)

    # 分箱（自定义阈值，可调节）
    df['Accident_Bins'] = pd.cut(df['num_reported_accidents'],
                                bins=[-1, 0, 2, 5, np.inf],
                                labels=['none', 'low', 'medium', 'high'])

    # 3. 比率特征
    df['Accidents_per_Lane']   = df['num_reported_accidents'] / (df['num_lanes'] + 1)
    df['Curvature_per_Lane']   = df['curvature'] / (df['num_lanes'] + 1)
    df['Speed_per_Lane']       = df['speed_limit'] / (df['num_lanes'] + 1)


    shape_after = df.shape  # 新的维度

    print(f"{shape_before[0]} × {shape_before[1]} --> {shape_after[0]} × {shape_after[1]}   新增 {shape_after[1] - shape_before[1]} 列")



    return df



### 手动编码 & 布尔转换

In [11]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def fit_ohe(train_df: pd.DataFrame):
    """
    在训练集上自动检测 object/category 列，并拟合 OneHotEncoder。
    返回 encoder 对象和类别列名。
    """
    # 检测类别列
    cat_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
    print(f"检测到{len(cat_cols)}类: {cat_cols}")

    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    ohe.fit(train_df[cat_cols])

    return ohe, cat_cols


def transform_ohe(df: pd.DataFrame, ohe: OneHotEncoder, cat_cols: list) -> pd.DataFrame:
    """
    使用已拟合的 OneHotEncoder 对 DataFrame 进行编码。
    自动处理 bool -> int，保持 train/test 一致。
    """
    df = df.copy()
    shape_before = df.shape

    # 布尔型转 int
    for col in df.select_dtypes('bool').columns:
        df[col] = df[col].astype(int)

    # OHE
    ohe_array = ohe.transform(df[cat_cols])
    ohe_cols = ohe.get_feature_names_out(cat_cols)
    ohe_df = pd.DataFrame(ohe_array, columns=ohe_cols, index=df.index)

    # 拼接
    df_enc = pd.concat([df.drop(columns=cat_cols), ohe_df], axis=1)

    shape_after = df_enc.shape
    print(f"{shape_before[0]} × {shape_before[1]} --> {shape_after[0]} × {shape_after[1]}   新增 {shape_after[1] - shape_before[1]} 列")

    return df_enc





### 特征生成

### PCA降维

In [12]:
# 对数据降维

import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy import sparse

def fit_svd(train_df: pd.DataFrame, n_components: int = 100, random_state: int = 42) -> TruncatedSVD:
    """
    在训练集上拟合 TruncatedSVD，并返回拟合好的模型。
    """
    X_sparse = sparse.csr_matrix(train_df.values)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(X_sparse)
    explained_var = svd.explained_variance_ratio_.sum()
    print(f"✅ SVD 已拟合完成 (n_components={n_components})，训练集累计解释方差比: {explained_var:.2%}")
    return svd

def transform_svd(df: pd.DataFrame, svd: TruncatedSVD) -> pd.DataFrame:
    """
    使用拟合好的 SVD 对数据集进行降维。
    """
    shape_before = df.shape
    X_sparse = sparse.csr_matrix(df.values)
    X_reduced_array = svd.transform(X_sparse)

    reduced_df = pd.DataFrame(
        X_reduced_array,
        index=df.index,
        columns=[f"SVD_{i+1}" for i in range(X_reduced_array.shape[1])]
    )

    shape_after = reduced_df.shape
    print(f"{shape_before[0]} × {shape_before[1]} --> {shape_after[0]} × {shape_after[1]}")
    return reduced_df







# 交叉训练验证

In [31]:
# Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果
# ==============================================================
def run_kfold_xgb(features_train, target_train, features_test, config, DIRS, K_FOLDS=10, verbose=0):
    """
    使用 Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果

    参数:
        features_train, target_train        : 训练集特征和标签
        features_test      : 测试集特征
        params      : XGBoost 最优参数 (dict)
        DIRS        : 保存结果的目录字典
        K_FOLDS     : 折数 (默认=5)
        verbose     : 是否打印详细信息
    """

    
        
    config["X shape"] = features_train.shape
    config["y shape"] = target_train.shape
    config["X_test shape"] = features_test.shape


    # ---------- 创建目录 ----------
    for _, path in DIRS.items():
        os.makedirs(path, exist_ok=True)


    time_str = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    history_DIR = os.path.join(DIRS['HISTORY'], time_str)
    os.makedirs(history_DIR, exist_ok=True)



    print("——" * 20)
    print(f"✅ 当前结果将保存到: {time_str}")


    # ---------- 定义交叉验证 ----------
    skfold = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    yeo = PowerTransformer(method="yeo-johnson")                                # 定义 Yeo-Johnson 变换

    # ---------- 初始化存储 ----------
    oof_val = np.zeros(len(features_train))       # OOF 预测
    train_score, val_score = [], []  # 每折 LOSS
    test_pred = []                   # 每折 test 预测
    fold_records = []                # 保存每折信息
    all_importances = []             # 特征重要性
    elapsed_list = []                # 耗时记录



    # 循环每一折
    # ==============================================================

    for i, (train_idx, val_idx) in enumerate(skfold.split(features_train, pd.qcut(target_train, q=10).cat.codes), 1):

        # ----- 打印时间信息 -----
        start_now = datetime.now()
        start_str = start_now.strftime("%H:%M:%S")

        if elapsed_list:
            avg_time = np.mean(elapsed_list)
            est_end = start_now + timedelta(seconds=avg_time)

            # 每 5 个一组输出耗时
            parts = [f"{t:6.1f}s" for t in elapsed_list]
            grouped = [" ".join(parts[j:j+5]) for j in range(0, len(parts), 5)]
            elapsed_str = " /// ".join(grouped)

            print(
                f"🔄{i:2d}/{K_FOLDS} ST {start_str}"
                f" ET {est_end.strftime('%H:%M:%S')}"
                f" avg {avg_time:.1f}s"
                f" [{elapsed_str}]",
                end="\r", flush=True
            )
        else:
            print(f"🔄{i:2d}/{K_FOLDS} ST {start_str} ET (暂无历史数据)", end="\r", flush=True)



        # ----- 开始训练 -----
        t0 = time.time()

        # 1. 数据集划分
        x_train, x_val = features_train.iloc[train_idx], features_train.iloc[val_idx]
        y_train, y_val = target_train[train_idx], target_train[val_idx]

        # 2. Yeo-Johnson 变换
        y_train = yeo.fit_transform(y_train.values.reshape(-1, 1)).squeeze()
        y_val   = yeo.transform(y_val.values.reshape(-1, 1)).squeeze()


        # 3. 特征选择（轻量级 XGBoost）
        # 使用
        selector_model = xgb.XGBRegressor(**config["xgb_selector_model_params"])
        # selector_model = xgb.XGBRegressor(
        #     n_estimators   = 500,
        #     max_depth      = 6,
        #     learning_rate  = 0.05,
        #     random_state   = 2025,
        #     device         = "cpu",
        #     objective      = "reg:absoluteerror",
        #     tree_method    = "hist",
        #     verbosity      = 0
        # )
        
        

        selector_model.fit(x_train, y_train)

        selector = SelectFromModel(selector_model, prefit=True, threshold=config["selector_threshold"])
        selected_features = x_train.columns[selector.get_support()].tolist()
        if verbose > 0:
            print(f"✅ 选择的特征数量: {len(selected_features)}")


        # 4. 保留重要特征
        x_train_new = x_train[selected_features]
        x_val_new   = x_val[selected_features]
        x_test_new  = features_test[selected_features]







        # 5. 构造 LightGBM 数据集
        dtrain = lgb.Dataset(x_train_new, label=y_train)
        dval   = lgb.Dataset(x_val_new,   label=y_val)
        # dtest  = lgb.Dataset(x_test_new,  label=y_test)

        early_stop_rounds = 800 if config["lgb_train_model_params"]["learning_rate"] < 0.03 else 300
        use_early_stopping = (config["lgb_train_model_params"]["boosting_type"] != "dart")

        callbacks = []
        if use_early_stopping:
            callbacks.append(lgb.early_stopping(stopping_rounds=300))
        callbacks.append(lgb.log_evaluation(period=1000))

        # 6. LightGBM 训练
        lgb_model = lgb.train(
            params=config["lgb_train_model_params"],
            train_set=dtrain,
            valid_sets=[dtrain, dval],
            valid_names=["train", "valid"],
            num_boost_round=config["num_boost_round"],
        callbacks=callbacks
        )

        # # 保存模型
        # model_path = os.path.join(history_DIR, f"lgb_model_fold{i}.txt")
        # lgb_model.save_model(model_path)



        # 7. 获取特征重要性
        # imp_dict = xgb_model.get_score(importance_type="gain")
        # imp_df = pd.DataFrame(imp_dict.items(), columns=["Feature", "Importance"])
        # imp_df["Fold"] = i
        # all_importances.append(imp_df)


        # 8. 预测
        y_train_pred = lgb_model.predict(x_train_new, num_iteration=lgb_model.best_iteration)
        y_val_pred   = lgb_model.predict(x_val_new,   num_iteration=lgb_model.best_iteration)
        y_test_pred  = lgb_model.predict(x_test_new,  num_iteration=lgb_model.best_iteration)










        # 9. 逆变换
        y_train      = yeo.inverse_transform(y_train.reshape(-1, 1)).squeeze()
        y_val        = yeo.inverse_transform(y_val.reshape(-1, 1)).squeeze()
        y_train_pred = yeo.inverse_transform(y_train_pred.reshape(-1, 1)).squeeze()
        y_val_pred   = yeo.inverse_transform(y_val_pred.reshape(-1, 1)).squeeze()
        y_test_pred  = yeo.inverse_transform(y_test_pred.reshape(-1, 1)).squeeze()

        # 10. 计算 LOSS
        train_loss = np.sqrt(np.mean((y_train - y_train_pred) ** 2))
        val_loss   = np.sqrt(np.mean((y_val   - y_val_pred) ** 2))


        if verbose > 0:
            print(f"Fold {i}: Train LOSS={train_loss:.4f}, Val LOSS={val_loss:.4f}，用时 {elapsed:.2f} 秒")


        # ----- 保存结果 -----
        train_score.append(train_loss)
        val_score.append(val_loss)
        oof_val[val_idx] = y_val_pred
        test_pred.append(y_test_pred)

        elapsed = time.time() - t0
        elapsed_list.append(elapsed)

        fold_records.append({
            "Fold": i,
            "Train_LOSS": train_loss,
            "Val_LOSS": val_loss,
            "Num_Features": len(selected_features),
            "Selected_Features": selected_features,
            "elapsed": elapsed
        })

    # 保存整体结果
    # ==============================================================
    if verbose > 0:
        print("\n")
        print(f"📊 Train LOSS 平均值 : {np.mean(train_score):.4f}")
        print(f"📊 Val   LOSS 平均值 : {np.mean(val_score):.4f}")
        print(f"📊 Train LOSS 标准差 : {np.std(train_score, ddof=0):.4f}")
        print(f"📊 Val   LOSS 标准差 : {np.std(val_score, ddof=0):.4f}")

    # 参数
    with open(os.path.join(history_DIR, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4, ensure_ascii=False)

    # 每折信息
    folds_df = pd.DataFrame(fold_records)
    folds_df.to_csv(os.path.join(history_DIR, "folds_info.csv"), index=False, encoding="utf-8-sig")


    # # 特征重要性
    # if all_importances:
    #     valid_imps = [df for df in all_importances if not df.empty]
    #     all_imp_df = pd.concat(valid_imps, axis=0) if valid_imps else pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    # else:
    #     all_imp_df = pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    # all_imp_df.to_csv(os.path.join(history_DIR, "feature_importance_all.csv"), index=False, encoding="utf-8-sig")


    # 测试集预测
    test_pred_array = np.vstack(test_pred).T
    test_pred_df = pd.DataFrame(test_pred_array, columns=[f"Fold_{j+1}" for j in range(test_pred_array.shape[1])])
    test_pred_df["Final_Pred"] = test_pred_df.mean(axis=1)
    test_pred_df.to_csv(os.path.join(history_DIR, "test_predictions.csv"), index=False, encoding="utf-8-sig")

    # 总结
    with open(os.path.join(history_DIR, "summary.txt"), "w", encoding="utf-8") as f:
        f.write(f"Train LOSS Mean : {np.mean(train_score):.4f}\n")
        f.write(f"Val   LOSS Mean : {np.mean(val_score):.4f}\n")
        f.write(f"Train LOSS Std  : {np.std(train_score, ddof=0):.4f}\n")
        f.write(f"Val   LOSS Std  : {np.std(val_score, ddof=0):.4f}\n")


    # 最终提交
    final_score = np.mean(val_score)
    submission = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "sample_submission.csv"))
    submission["accident_risk"] = test_pred_df["Final_Pred"]

    submission_path = os.path.join(history_DIR, f"sub_{time_str}_{final_score:.8f}.csv")
    submission.to_csv(submission_path, index=False)
    submission.to_csv(os.path.join(DIRS['SUBMISSION'], f"sub_{time_str}_{final_score:.8f}.csv"), index=False)

        
    config["time_str"] = time_str
    config["score"] = final_score


    # ---------- 返回结果 ----------
    return {
        "oof_val": oof_val,
        "train_score": train_score,
        "val_score": val_score,
        "test_pred": test_pred_df,
        "folds_info": folds_df,
        # "feature_importance": all_imp_df,
        "submission_path": submission_path,
        "time": time_str,
        "final_score": final_score,
        "config": config
    }


# 单次训练推导

In [24]:
# 执行一次

# 准备流程---------------------------------------------------------------------------------------------------
# 打印当前config
print(config_to_str(config))

# Kaggle 提供的训练集和测试集
train_df = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "train.csv"))
test_df  = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "test.csv"))

# 数据拆分
print("数据拆分---------------------------")
features_train, target_train, features_test = prepare_features_and_target(train_df, test_df, config)


dataframe_info(features_train, n_sample = 6)


# 特征生成
print("特征生成---------------------------")
if config["use_feature_gen"]:
    features_train = add_new_features(features_train)
    features_test  = add_new_features(features_test)


dataframe_info(features_train, n_sample = 6)


# 手动编码 & 布尔转换
print("手动编码 & 布尔转换----------------")
# 在训练集上拟合
ohe, cat_cols = fit_ohe(features_train)
# 分别编码 train/test，保证一致
features_train = transform_ohe(features_train, ohe, cat_cols)
features_test  = transform_ohe(features_test,  ohe, cat_cols)


# 数据降维
print("数据降维---------------------------")
if config["use_pca"]:
    # 1. 在训练集上拟合
    svd = fit_svd(features_train, n_components = config["pca_components"], random_state=42)

    # 2. 分别对 train/test transform
    features_train_reduced = transform_svd(features_train, svd)
    features_test_reduced  = transform_svd(features_test, svd)

    shape_before_train = features_train.shape
    shape_before_test = features_test.shape

    features_train = pd.concat([features_train, features_train_reduced], axis=1)
    features_test = pd.concat([features_test, features_test_reduced], axis=1)

    shape_after_train = features_train.shape
    shape_after_test = features_test.shape

    print(f"Train: {shape_before_train[0]} × {shape_before_train[1]}  -->  {shape_after_train[0]} × {shape_after_train[1]}   新增 {shape_after_train[1] - shape_before_train[1]} 列")
    print(f"Test : {shape_before_test[0]} × {shape_before_test[1]}  -->  {shape_after_test[0]} × {shape_after_test[1]}   新增 {shape_after_test[1] - shape_before_test[1]} 列")



X, y, X_test = features_train, target_train, features_test
print("开始训练---------------------------")

# 准备流程---------------------------------------------------------------------------------------------------

results = run_kfold_xgb(X, y, X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
config = results['config']
score = results['final_score']



print('\n',score)


- ISTEST              : False
- use_feature_gen     : True
- use_pca             : True
- pca_components      : 10
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:squarederror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : 0*mean
🔹 lgb_train_model_params:
     - objective           : regression
     - metric              : rmse
     - boosting_type       : gbdt
     - learning_rate       : 0.05
     - num_leaves          : 31
     - max_depth           : -1
     - min_data_in_leaf    : 20
     - feature_fraction    : 0.9
     - bagging_fraction    : 0.8
     - bagging_freq        : 5
     - lambda_l1           : 0.1
     - lambda_l2           : 0.1
     - verbosity           : -1
- num_boost_round     : 15000
- X shape             : (517754, 59)
- y shape             :

Unnamed: 0,数据类型,缺失值数量,缺失值比例,唯一值数量,示例值 (samples)
road_type,object,0,0.0,3,"[urban, rural, highway]"
num_lanes,int64,0,0.0,4,"[2, 4, 1, 3]"
curvature,float64,0,0.0,261,"[0.06, 0.99, 0.63, 0.07, 0.58, 0.54]"
speed_limit,int64,0,0.0,5,"[35, 70, 60, 45, 25]"
lighting,object,0,0.0,3,"[daylight, dim, night]"
weather,object,0,0.0,3,"[rainy, clear, foggy]"
road_signs_present,bool,0,0.0,2,"[False, True]"
public_road,bool,0,0.0,2,"[True, False]"
time_of_day,object,0,0.0,3,"[afternoon, evening, morning]"
holiday,bool,0,0.0,2,"[False, True]"


特征生成---------------------------
517754 × 12 --> 517754 × 22   新增 10 列
172585 × 12 --> 172585 × 22   新增 10 列
['road_type' 'num_lanes' 'curvature' 'speed_limit' 'lighting' 'weather'
 'road_signs_present' 'public_road' 'time_of_day' 'holiday'
 'school_season' 'num_reported_accidents' 'Speed_x_Curvature'
 'Lanes_x_Speed' 'Accidents_x_Speed' 'RoadType_Time' 'Lighting_Weather'
 'Log_Accidents' 'Accident_Bins' 'Accidents_per_Lane' 'Curvature_per_Lane'
 'Speed_per_Lane']


Unnamed: 0,数据类型,缺失值数量,缺失值比例,唯一值数量,示例值 (samples)
road_type,object,0,0.0,3,"[urban, rural, highway]"
num_lanes,int64,0,0.0,4,"[2, 4, 1, 3]"
Curvature_per_Lane,float64,0,0.0,546,"[0.02, 0.198, 0.126, 0.014000000000000002, 0.2..."
Accidents_per_Lane,float64,0,0.0,21,"[0.3333333333333333, 0.0, 0.4, 0.2, 0.5, 1.0]"
Accident_Bins,category,0,0.0,4,"['low', 'none', 'medium', 'high'] Categories (..."
Log_Accidents,float64,0,0.0,8,"[0.6931471805599453, 0.0, 1.0986122886681098, ..."
Lighting_Weather,object,0,0.0,9,"[daylight_rainy, daylight_clear, dim_clear, di..."
RoadType_Time,object,0,0.0,9,"[urban_afternoon, urban_evening, rural_morning..."
Accidents_x_Speed,int64,0,0.0,29,"[35, 0, 140, 60, 90, 50]"
Lanes_x_Speed,int64,0,0.0,17,"[70, 140, 280, 60, 210, 45]"


手动编码 & 布尔转换----------------
检测到7类: ['road_type', 'lighting', 'weather', 'time_of_day', 'RoadType_Time', 'Lighting_Weather', 'Accident_Bins']
517754 × 22 --> 517754 × 49   新增 27 列
172585 × 22 --> 172585 × 49   新增 27 列
数据降维---------------------------
✅ SVD 已拟合完成 (n_components=10)，训练集累计解释方差比: 99.95%
517754 × 49 --> 517754 × 10
172585 × 49 --> 172585 × 10
Train: 517754 × 49  -->  517754 × 59   新增 10 列
Test : 172585 × 49  -->  172585 × 59   新增 10 列
开始训练---------------------------
————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 02-08-53
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[785]	train's rmse: 0.33819	valid's rmse: 0.34435
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[611]	train's rmse: 0.339651	valid's rmse: 0.343783
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[480]	train's rmse: 0.340581	valid's rmse: 0.343426
Tr

# 提交 kaggle 平台测试

In [None]:
# 根据 submission_time 定位文件路径 提交 kaggle 平台测试

import os
import itertools
import time
from kaggle.api.kaggle_api_extended import KaggleApi


def find_submission_file(submission_time, submission_dir):
    """
    在 submission_dir 下查找包含 submission_time 的文件
    一旦找到立刻返回完整路径；如果没找到则返回 None
    """
    for fname in os.listdir(submission_dir):
        if submission_time in fname:
            file_path = os.path.join(submission_dir, fname)
            print(f"✅ 找到目标文件: {fname}")
            return file_path
    
    print(f"⚠️ 未找到包含 {submission_time} 的文件")
    return None

def submit_and_get_score(file_path, competition_name, message="My submission"):
    """
    封装 Kaggle 提交并等待结果评分
    --------------------------------------
    file_path        : str  提交文件路径
    competition_name : str  Kaggle 比赛名称 (URL 最后一段)
    message          : str  提交备注
    """
    # 1. 配置 Kaggle API
    os.environ["KAGGLE_CONFIG_DIR"] = r"C:\Users\Admin\.kaggle"
    api = KaggleApi()
    api.authenticate()
    print("✅ Kaggle API 已经配置成功！")

    # 2. 提交文件
    api.competition_submit(
        file_name=file_path,
        competition=competition_name,
        message=message
    )
    print("✅ 提交完成！请等待评分...")

    # 3. 动态等待
    spinner = itertools.cycle(["|", "/", "-", "\\"])
    while True:
        submissions = api.competition_submissions(competition_name)
        latest = submissions[0]
        status_str = str(latest._status).lower()

        if "complete" in status_str and latest._public_score is not None:
            print("\n🎯 最终结果:")
            print(f"Public 分数 : {latest._public_score}")
            print(f"Private 分数: {latest._private_score}")
            print(f"提交 ID     : {latest._ref}")
            print(f"文件名      : {latest._file_name}")
            print(f"状态        : {latest._status}")
            print(f"提交时间    : {latest._date}")
            print(f"描述/备注   : {latest._description}")
            return latest

        if "pending" in status_str:
            spin_char = next(spinner)
            print(f"当前状态: {status_str} , 等待中 {spin_char}", end="\r", flush=True)
            time.sleep(0.2)  # 每 0.5 秒检查一次
            continue

        else:
            print(f"\n报错")
            print(f"submissions")
            
            break

        


### 不轻易运行，再三考虑

In [None]:
# submission_time 提交
submission_time = "2025-10-24 01-23-41"
competition_name = "playground-series-s5e10"
message =  f"该提交文件的参数：\n{config_to_str(config)} "

target_file = find_submission_file(submission_time, DIRS['SUBMISSION'] )

# submit_and_get_score(target_file, competition_name, message)

# 参数优化

In [26]:
# 实验配置单
base_config = {
    # 固定开关
    "ISTEST"            : False,

    "use_feature_gen"   : False,
    "use_pca"           : True,
    "pca_components"    : 10,

    # 特征选择 XGBoost 参数
    "xgb_selector_model_params": {
        "n_estimators"  : 500,
        "max_depth"     : 6,
        "learning_rate" : 0.05,
        "random_state"  : 2025,
        "device"        : "cpu",
        "objective"     : "reg:squarederror",
        "tree_method"   : "hist",
        "verbosity"     : 0
    },

    "selector_threshold"  : "0*mean",   

    # # 训练设置
    # "xgb_train_model_params": {
    #     'max_depth'   : 6,
    #     'eta'         : 0.1,
    #     'tree_method' : 'hist',
    #     'eval_metric' : 'rmse',
    # },
    # "num_boost_round": 15000,


    # 训练设置（LightGBM）
    "lgb_train_model_params": {
        "objective": "regression",        # 回归任务
        "metric": "rmse",                 # 使用 RMSE 作为评估指标
        "boosting_type": "gbdt",          # 梯度提升树（默认）
        "learning_rate": 0.05,            # 学习率（原来 XGB 的 eta）
        "num_leaves": 31,                 # 控制树的复杂度
        "max_depth": -1,                  # -1 表示不限制深度
        "min_data_in_leaf": 20,           # 每个叶子最少样本数
        "feature_fraction": 0.9,          # 每次建树时使用的特征比例
        "bagging_fraction": 0.8,          # 每次建树时使用的数据比例
        "bagging_freq": 5,                # 每 5 次迭代进行一次 bagging
        "lambda_l1": 0.1,                 # L1 正则化
        "lambda_l2": 0.1,                 # L2 正则化
        "verbosity": -1                   # 不输出详细日志
    },
    "num_boost_round": 15000,             # 与 XGB 相同，用于控制最大迭代次数



}

In [28]:
# 定义优化任务  加入标识符 host: hao-2   ip: 192.168.40.1

import copy
import contextlib
import io

def objective(trial):
    """
    Optuna 的目标函数 (Objective Function)
    每次 trial 会生成一组超参数，用于训练 XGBoost 模型，
    并返回交叉验证的平均 RMSE 作为优化目标。
    """




    # 1. 定义 超参数 搜索空间
    # 拷贝一份 config，避免全局污染
    config = copy.deepcopy(base_config)

    # 只修改需要优化的参数
    config["use_feature_gen"]   = trial.suggest_categorical("use_feature_gen", [True, False])
    config["use_pca"]           = trial.suggest_categorical("use_pca", [True, False])
    config["pca_components"] = trial.suggest_categorical("pca_components", [5, 10, 15, 20])

    # config["xgb_selector_model_params"]["random_state"] = trial.suggest_categorical("selector_random_state", [2025])
    # config["xgb_selector_model_params"]["device"]       = trial.suggest_categorical("selector_device", ["cpu", "cuda"])

    # config["selector_threshold"] = trial.suggest_categorical("selector_threshold", ["0*mean", "0.25*mean", "0.5*mean", "0.75*mean", "mean"])

    # config["xgb_train_model_params"]["max_depth"] = trial.suggest_int("train_max_depth", 3, 12)
    # config["xgb_train_model_params"]["eta"] = trial.suggest_float("train_eta", 0.01 , 0.3 , log=True)
    config["lgb_train_model_params"]["learning_rate"] = trial.suggest_float(
        "learning_rate", 0.005, 0.1, log=True
    )
    config["lgb_train_model_params"]["num_leaves"] = trial.suggest_int(
        "num_leaves", 16, 256
    )
    # config["lgb_train_model_params"]["max_depth"] = trial.suggest_int(
    #     "max_depth", -1, 15
    # )
    config["lgb_train_model_params"]["min_data_in_leaf"] = trial.suggest_int(
        "min_data_in_leaf", 10, 100
    )
    config["lgb_train_model_params"]["feature_fraction"] = trial.suggest_float(
        "feature_fraction", 0.6, 1.0
    )
    config["lgb_train_model_params"]["bagging_fraction"] = trial.suggest_float(
        "bagging_fraction", 0.6, 1.0
    )
    config["lgb_train_model_params"]["bagging_freq"] = trial.suggest_int(
        "bagging_freq", 1, 10
    )
    config["lgb_train_model_params"]["lambda_l1"] = trial.suggest_float(
        "lambda_l1", 1e-3, 10.0, log=True
    )
    config["lgb_train_model_params"]["lambda_l2"] = trial.suggest_float(
        "lambda_l2", 1e-3, 10.0, log=True
    )
    config["lgb_train_model_params"]["boosting_type"] = trial.suggest_categorical(
        "boosting_type", ["gbdt", "dart"]
    )







    # 创建一个黑洞缓冲区
    f = io.StringIO()
    with contextlib.redirect_stdout(f):
        None




        # 准备流程---------------------------------------------------------------------------------------------------
        # 打印当前config
        print(config_to_str(config))

        # Kaggle 提供的训练集和测试集
        train_df = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "train.csv"))
        test_df  = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "test.csv"))

        # 数据拆分
        print("数据拆分---------------------------")
        features_train, target_train, features_test = prepare_features_and_target(train_df, test_df, config)

        # 特征生成
        print("特征生成---------------------------")
        if config["use_feature_gen"]:
            features_train = add_new_features(features_train)
            features_test  = add_new_features(features_test)


        # 手动编码 & 布尔转换
        print("手动编码 & 布尔转换----------------")
        # 在训练集上拟合
        ohe, cat_cols = fit_ohe(features_train)
        # 分别编码 train/test，保证一致
        features_train = transform_ohe(features_train, ohe, cat_cols)
        features_test  = transform_ohe(features_test,  ohe, cat_cols)


        # 数据降维
        print("数据降维---------------------------")
        if config["use_pca"]:
            # 1. 在训练集上拟合
            svd = fit_svd(features_train, n_components = config["pca_components"], random_state=42)

            # 2. 分别对 train/test transform
            features_train_reduced = transform_svd(features_train, svd)
            features_test_reduced  = transform_svd(features_test, svd)

            shape_before_train = features_train.shape
            shape_before_test = features_test.shape

            features_train = pd.concat([features_train, features_train_reduced], axis=1)
            features_test = pd.concat([features_test, features_test_reduced], axis=1)

            shape_after_train = features_train.shape
            shape_after_test = features_test.shape

            print(f"Train: {shape_before_train[0]} × {shape_before_train[1]}  -->  {shape_after_train[0]} × {shape_after_train[1]}   新增 {shape_after_train[1] - shape_before_train[1]} 列")
            print(f"Test : {shape_before_test[0]} × {shape_before_test[1]}  -->  {shape_after_test[0]} × {shape_after_test[1]}   新增 {shape_after_test[1] - shape_before_test[1]} 列")



        X, y, X_test = features_train, target_train, features_test
        print("开始训练---------------------------")

        # 准备流程---------------------------------------------------------------------------------------------------





    results = run_kfold_xgb(X, y, X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
    config = results['config']
    score = results['final_score']



    HOSTNAME = socket.gethostname()
    HOST_IP = socket.gethostbyname(HOSTNAME)
    trial.set_user_attr("host", HOSTNAME)        # 你自己定义主机 A/B
    trial.set_user_attr("ip", HOST_IP)        # 你自己定义角色 A/B

    # 4. 返回平均 LOSS
    return score




In [29]:
STUDY_NAME = "test2LightGBM" if base_config["ISTEST"] else "Predicting Road Accident Risk LightGBM"



In [42]:
# 开始优化

# 1. 定义 SQLite 数据库存储路径

storage_url = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

study = optuna.create_study(
    study_name = STUDY_NAME,
    # study_name="ghsdjsrtjrswtjhwrt",
    storage=storage_url,
    load_if_exists=True
)

# 自动获取当前主机名\当前主机的 IP 地址
HOSTNAME = socket.gethostname()
HOST_IP = socket.gethostbyname(HOSTNAME)
print("主机名:", HOSTNAME," 主机 IP:", HOST_IP)
time.sleep(1)

# 5. 启动超参数搜索
print("🔎 开始超参数搜索...")
if base_config["ISTEST"]:
    study.optimize(objective, n_trials = 3)
else:
    study.optimize(objective, n_trials = 200)


# 6. 打印最优结果
print("\n✅ 训练完成！")
print(f"📊 已完成试验次数 : {len(study.trials)}")
print(f"🏆 最优试验编号   : {study.best_trial.number}")
print(f"📉 最优 LOSS       : {study.best_value}")
print(f"⚙️ 最优参数组合   : {study.best_trial.params}")


[I 2025-10-24 02:40:10,326] A new study created in RDB with name: Predicting Road Accident Risk LightGBM


主机名: hao-2  主机 IP: 192.168.40.1
🔎 开始超参数搜索...
————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 02-40-16
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[118]	train's rmse: 0.337128	valid's rmse: 0.344459
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[134]	train's rmse: 0.336545	valid's rmse: 0.343626
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[118]	train's rmse: 0.337313	valid's rmse: 0.343178
Training until validation scores don't improve for 300 rounds17.7s]
Early stopping, best iteration is:
[110]	train's rmse: 0.337764	valid's rmse: 0.343029
Training until validation scores don't improve for 300 rounds17.7s   12.6s]
Early stopping, best iteration is:
[105]	train's rmse: 0.337821	valid's rmse: 0.344441
Training until validation scores don't improve for 300 rounds17.7s   12.6s   13.8s]
Early stopping, best iteration is

[I 2025-10-24 02:42:53,397] Trial 0 finished with value: 0.05623675145539836 and parameters: {'use_feature_gen': True, 'use_pca': True, 'pca_components': 20, 'learning_rate': 0.05855433126993213, 'num_leaves': 242, 'min_data_in_leaf': 77, 'feature_fraction': 0.8454640426355255, 'bagging_fraction': 0.7469011972885259, 'bagging_freq': 7, 'lambda_l1': 6.693584206135924, 'lambda_l2': 0.2765348113468092, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 0.05623675145539836.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 02-42-55
Training until validation scores don't improve for 300 rounds
[1000]	train's rmse: 0.34455	valid's rmse: 0.345857
[2000]	train's rmse: 0.343017	valid's rmse: 0.345098
[3000]	train's rmse: 0.342014	valid's rmse: 0.344851
[4000]	train's rmse: 0.341179	valid's rmse: 0.344766
[5000]	train's rmse: 0.340385	valid's rmse: 0.344683
[6000]	train's rmse: 0.339658	valid's rmse: 0.344658
Early stopping, best iteration is:
[5826]	train's rmse: 0.339781	valid's rmse: 0.344652
Training until validation scores don't improve for 300 rounds
[1000]	train's rmse: 0.344772	valid's rmse: 0.34461
[2000]	train's rmse: 0.343215	valid's rmse: 0.344021
[3000]	train's rmse: 0.342216	valid's rmse: 0.343871
[4000]	train's rmse: 0.341369	valid's rmse: 0.343796
Early stopping, best iteration is:
[4337]	train's rmse: 0.341088	valid's rmse: 0.343785
Training until validation scores don't improve for 300 rounds
[1000]	train's rmse: 0.344783	valid's

[I 2025-10-24 02:49:34,577] Trial 1 finished with value: 0.05628299929131834 and parameters: {'use_feature_gen': False, 'use_pca': True, 'pca_components': 10, 'learning_rate': 0.007963988791270001, 'num_leaves': 22, 'min_data_in_leaf': 67, 'feature_fraction': 0.7178025892239908, 'bagging_fraction': 0.7126133392111178, 'bagging_freq': 2, 'lambda_l1': 0.0021554917369354, 'lambda_l2': 0.0027367913297240017, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 0.05623675145539836.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 02-49-35
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[592]	train's rmse: 0.338283	valid's rmse: 0.34387
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[570]	train's rmse: 0.338669	valid's rmse: 0.34288
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[626]	train's rmse: 0.338177	valid's rmse: 0.342605
Training until validation scores don't improve for 300 rounds11.9s]
Early stopping, best iteration is:
[388]	train's rmse: 0.340093	valid's rmse: 0.34256
Training until validation scores don't improve for 300 rounds11.9s   10.2s]
Early stopping, best iteration is:
[455]	train's rmse: 0.339397	valid's rmse: 0.344012
Training until validation scores don't improve for 300 rounds11.9s   10.2s   10.6s]
Early stopping, best iteration is:
[372]	train's rmse: 0.340286	valid's rmse: 0.3

[I 2025-10-24 02:51:30,472] Trial 2 finished with value: 0.05614115586133135 and parameters: {'use_feature_gen': True, 'use_pca': False, 'pca_components': 5, 'learning_rate': 0.04396975969946361, 'num_leaves': 60, 'min_data_in_leaf': 39, 'feature_fraction': 0.838628315128256, 'bagging_fraction': 0.6832554536468092, 'bagging_freq': 2, 'lambda_l1': 0.817967898786827, 'lambda_l2': 0.0041471317233033505, 'boosting_type': 'gbdt'}. Best is trial 2 with value: 0.05614115586133135.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 02-51-31
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[695]	train's rmse: 0.337338	valid's rmse: 0.343557
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[690]	train's rmse: 0.337575	valid's rmse: 0.342647
Training until validation scores don't improve for 300 rounds
[1000]	train's rmse: 0.335335	valid's rmse: 0.342473
Early stopping, best iteration is:
[765]	train's rmse: 0.337056	valid's rmse: 0.342386
Training until validation scores don't improve for 300 rounds18.2s]
Early stopping, best iteration is:
[597]	train's rmse: 0.3383	valid's rmse: 0.342116
Training until validation scores don't improve for 300 rounds18.2s   16.1s]
Early stopping, best iteration is:
[559]	train's rmse: 0.338548	valid's rmse: 0.343776
Training until validation scores don't improve for 300 rounds18.2s   16.1s   15.7s]
Early stopping, best iterat

[I 2025-10-24 02:54:23,871] Trial 3 finished with value: 0.056091493835279595 and parameters: {'use_feature_gen': True, 'use_pca': False, 'pca_components': 5, 'learning_rate': 0.014070743748395762, 'num_leaves': 175, 'min_data_in_leaf': 46, 'feature_fraction': 0.7816630185138349, 'bagging_fraction': 0.6550649106172352, 'bagging_freq': 5, 'lambda_l1': 0.0013655435173871426, 'lambda_l2': 0.7392541005877692, 'boosting_type': 'gbdt'}. Best is trial 3 with value: 0.056091493835279595.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 02-54-24
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[141]	train's rmse: 0.338522	valid's rmse: 0.3433
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[109]	train's rmse: 0.339743	valid's rmse: 0.342429
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[168]	train's rmse: 0.33807	valid's rmse: 0.342023
Training until validation scores don't improve for 300 rounds7.7s]
Early stopping, best iteration is:
[123]	train's rmse: 0.339283	valid's rmse: 0.34194
Training until validation scores don't improve for 300 rounds7.7s    7.4s]
Early stopping, best iteration is:
[144]	train's rmse: 0.338642	valid's rmse: 0.343359
Training until validation scores don't improve for 300 rounds7.7s    7.4s    7.5s]
Early stopping, best iteration is:
[128]	train's rmse: 0.339254	valid's rmse: 0.34244

[I 2025-10-24 02:55:41,564] Trial 4 finished with value: 0.05604852309513626 and parameters: {'use_feature_gen': False, 'use_pca': False, 'pca_components': 15, 'learning_rate': 0.06625058653731754, 'num_leaves': 180, 'min_data_in_leaf': 83, 'feature_fraction': 0.9077185807172881, 'bagging_fraction': 0.9215159682485459, 'bagging_freq': 6, 'lambda_l1': 0.031125849539165376, 'lambda_l2': 0.5256515100596871, 'boosting_type': 'gbdt'}. Best is trial 4 with value: 0.05604852309513626.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 02-55-42
[1000]	train's rmse: 0.3641	valid's rmse: 0.365109
[2000]	train's rmse: 0.349092	valid's rmse: 0.350322
[3000]	train's rmse: 0.344892	valid's rmse: 0.34635
[4000]	train's rmse: 0.343237	valid's rmse: 0.344999
[5000]	train's rmse: 0.342349	valid's rmse: 0.344305
[6000]	train's rmse: 0.341871	valid's rmse: 0.344125
[7000]	train's rmse: 0.341176	valid's rmse: 0.34374
[8000]	train's rmse: 0.340691	valid's rmse: 0.343587
[9000]	train's rmse: 0.340268	valid's rmse: 0.34349
[10000]	train's rmse: 0.339864	valid's rmse: 0.343433
[11000]	train's rmse: 0.339399	valid's rmse: 0.343391
[12000]	train's rmse: 0.339025	valid's rmse: 0.343379
[13000]	train's rmse: 0.338609	valid's rmse: 0.343345
[14000]	train's rmse: 0.338223	valid's rmse: 0.343301
[15000]	train's rmse: 0.337885	valid's rmse: 0.343399
[1000]	train's rmse: 0.364287	valid's rmse: 0.363804]
[2000]	train's rmse: 0.349274	valid's rmse: 0.348991
[3000]	train's rmse: 0.3

[I 2025-10-24 06:04:36,061] Trial 5 finished with value: 0.05605546091310608 and parameters: {'use_feature_gen': True, 'use_pca': False, 'pca_components': 15, 'learning_rate': 0.013301417227872592, 'num_leaves': 36, 'min_data_in_leaf': 21, 'feature_fraction': 0.7930033350109744, 'bagging_fraction': 0.6307886626465097, 'bagging_freq': 3, 'lambda_l1': 1.482043761071851, 'lambda_l2': 1.3227988976005378, 'boosting_type': 'dart'}. Best is trial 4 with value: 0.05604852309513626.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 06-04-37
[1000]	train's rmse: 0.33542	valid's rmse: 0.344923
[2000]	train's rmse: 0.328255	valid's rmse: 0.345125
[3000]	train's rmse: 0.321304	valid's rmse: 0.346121
[4000]	train's rmse: 0.314476	valid's rmse: 0.347564
[5000]	train's rmse: 0.309159	valid's rmse: 0.348638
[6000]	train's rmse: 0.303782	valid's rmse: 0.349816
[7000]	train's rmse: 0.298632	valid's rmse: 0.351034
[8000]	train's rmse: 0.293945	valid's rmse: 0.352145
[9000]	train's rmse: 0.289591	valid's rmse: 0.353231
[10000]	train's rmse: 0.285221	valid's rmse: 0.35436
[11000]	train's rmse: 0.280515	valid's rmse: 0.355609
[12000]	train's rmse: 0.276699	valid's rmse: 0.356583
[13000]	train's rmse: 0.273053	valid's rmse: 0.357645
[14000]	train's rmse: 0.269569	valid's rmse: 0.358609
[15000]	train's rmse: 0.266056	valid's rmse: 0.359629
[1000]	train's rmse: 0.335685	valid's rmse: 0.343494]
[2000]	train's rmse: 0.328612	valid's rmse: 0.343591
[3000]	train's rmse: 

[I 2025-10-24 10:48:41,594] Trial 6 finished with value: 0.058651658849128395 and parameters: {'use_feature_gen': True, 'use_pca': False, 'pca_components': 20, 'learning_rate': 0.04945665442990347, 'num_leaves': 214, 'min_data_in_leaf': 28, 'feature_fraction': 0.8431561395248868, 'bagging_fraction': 0.6615101973044316, 'bagging_freq': 6, 'lambda_l1': 0.17106686742553048, 'lambda_l2': 0.1433718887906728, 'boosting_type': 'dart'}. Best is trial 4 with value: 0.05604852309513626.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 10-48-45
Training until validation scores don't improve for 300 rounds
[1000]	train's rmse: 0.338704	valid's rmse: 0.344104
Early stopping, best iteration is:
[1017]	train's rmse: 0.338633	valid's rmse: 0.344098
Training until validation scores don't improve for 300 rounds
[1000]	train's rmse: 0.33884	valid's rmse: 0.343199
Early stopping, best iteration is:
[962]	train's rmse: 0.339003	valid's rmse: 0.34318
Training until validation scores don't improve for 300 rounds
[1000]	train's rmse: 0.338928	valid's rmse: 0.342946
Early stopping, best iteration is:
[794]	train's rmse: 0.339856	valid's rmse: 0.342937
Training until validation scores don't improve for 300 rounds13.6s]
[1000]	train's rmse: 0.338854	valid's rmse: 0.342718
Early stopping, best iteration is:
[852]	train's rmse: 0.33955	valid's rmse: 0.342706
Training until validation scores don't improve for 300 rounds13.6s   14.1s]
[1000]	train's rmse: 0.338832	valid's r

[I 2025-10-24 10:51:18,563] Trial 7 finished with value: 0.05618495105605834 and parameters: {'use_feature_gen': True, 'use_pca': True, 'pca_components': 5, 'learning_rate': 0.022778075949933623, 'num_leaves': 58, 'min_data_in_leaf': 70, 'feature_fraction': 0.7927722979740537, 'bagging_fraction': 0.6505819421030906, 'bagging_freq': 9, 'lambda_l1': 0.0039441578429872965, 'lambda_l2': 0.6441734110193609, 'boosting_type': 'gbdt'}. Best is trial 4 with value: 0.05604852309513626.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-24 10-51-19
[1000]	train's rmse: 0.338594	valid's rmse: 0.344958
[2000]	train's rmse: 0.333928	valid's rmse: 0.344605
[3000]	train's rmse: 0.329626	valid's rmse: 0.345051
[4000]	train's rmse: 0.325388	valid's rmse: 0.345916
[5000]	train's rmse: 0.322063	valid's rmse: 0.346519
[6000]	train's rmse: 0.318667	valid's rmse: 0.347279
[7000]	train's rmse: 0.315391	valid's rmse: 0.348071
[8000]	train's rmse: 0.312375	valid's rmse: 0.348776
[9000]	train's rmse: 0.309541	valid's rmse: 0.349514
[10000]	train's rmse: 0.306525	valid's rmse: 0.350359
[11000]	train's rmse: 0.303228	valid's rmse: 0.351185
[12000]	train's rmse: 0.300596	valid's rmse: 0.35182
[13000]	train's rmse: 0.298053	valid's rmse: 0.352541
[14000]	train's rmse: 0.295637	valid's rmse: 0.353156
[15000]	train's rmse: 0.293184	valid's rmse: 0.353849
[1000]	train's rmse: 0.338899	valid's rmse: 0.343563]
[2000]	train's rmse: 0.33419	valid's rmse: 0.343211
[3000]	train's rmse: 

[W 2025-10-24 13:54:51,978] Trial 8 failed with parameters: {'use_feature_gen': True, 'use_pca': False, 'pca_components': 10, 'learning_rate': 0.04470271994566001, 'num_leaves': 192, 'min_data_in_leaf': 79, 'feature_fraction': 0.8102328267360597, 'bagging_fraction': 0.8212406982029004, 'bagging_freq': 2, 'lambda_l1': 1.0416596589636642, 'lambda_l2': 0.0010992384601723694, 'boosting_type': 'dart'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "d:\Software\conda\envs\py39_tf\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_43384\674254397.py", line 139, in objective
    results = run_kfold_xgb(X, y, X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_43384\2787167921.py", line 168, in run_kfold_xgb
    y_test_pred  = lgb_model.predict(x_test_new,  num_iteration=lgb_model.best_iteration)

KeyboardInterrupt: 

# 管理数据库信息

In [33]:
# 查询数据库详细数据

storage_url = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

studies = optuna.study.get_all_study_summaries(storage=storage_url)

if not studies:
    print("❌ 当前数据库里无 study")
else:
    print("✅ 数据库中的 study 列表:")
    for s in studies:

        print("-", s.study_name)

        study = optuna.load_study(study_name=s.study_name, storage=storage_url)

        print("         Trials:")
        for trial in study.trials:
            host = trial.user_attrs.get("host") or "unknown"
            ip = trial.user_attrs.get("ip") or "unknown"
            value = f"{trial.value:.10f}" if trial.value is not None else "None"

            print(
                f"    Trial {trial.number:4d}: "
                f"host={host:<16}, ip={ip:<15}, "
                f"value={value:<15}, params={trial.params}"
            )

        print("    总 trial 数量:", len(study.trials))
        print("=" * 100)


✅ 数据库中的 study 列表:
- optuna_task1
         Trials:
    Trial    0: host=unknown         , ip=unknown        , value=None           , params={'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_random_state': 2025, 'selector_device': 'cpu', 'selector_threshold': '0.75*mean', 'train_max_depth': 6, 'train_eta': 0.14963770710824598}
    Trial    1: host=unknown         , ip=unknown        , value=None           , params={'remove_dup_smiles': True, 'use_feature_gen': True, 'use_pca': False, 'selector_random_state': 42, 'selector_device': 'cuda', 'selector_threshold': '1.25*mean', 'train_max_depth': 7, 'train_eta': 0.018021464633564754}
    Trial    2: host=unknown         , ip=unknown        , value=None           , params={'remove_dup_smiles': False, 'use_feature_gen': False, 'use_pca': False, 'selector_random_state': 42, 'selector_device': 'cpu', 'selector_threshold': 'mean', 'train_max_depth': 8, 'train_eta': 0.021659776125338565}
    Trial    3: host=unknown  

In [37]:
# 清理前：先查看数据库里当前有哪些 study 存在，以及每个 study 里有多少个 trial

storage = "mysql+pymysql://user1:123456@10.162.147.95:3306/kaggle_melting_point_optuna"

studies = optuna.study.get_all_study_summaries(storage=storage)
print("现有 study：", [s.study_name for s in studies])

for s in studies:
    study = optuna.load_study(study_name=s.study_name, storage=storage)
    print(f"Study:   {s.study_name:30s}, Trials: {len(study.trials):4d}")

现有 study： ['optuna_task1', 'test', 'test2', 'Predicting Road Accident Risk']
Study:   optuna_task1                  , Trials:   33
Study:   test                          , Trials:    4
Study:   test2                         , Trials:   10
Study:   Predicting Road Accident Risk , Trials:  278


In [36]:
# 清理中：删除指定 study
# 指定要删除的名称
to_delete = ["melting_point_study"]   # 可以写一个或多个

to_delete = [            ]

for s in studies:
    if s.study_name in to_delete:
        optuna.delete_study(study_name=s.study_name, storage=storage)
        print("已删除:", s.study_name)


In [None]:
# 清理后：再次检查
studies_after = optuna.study.get_all_study_summaries(storage=storage)
print("清理后 study：", [s.study_name for s in studies_after])
