In [35]:
# 系统库
import os
import subprocess
import time
import shutil
import json
import socket
from datetime import datetime, timedelta

# 第三方科学计算 & 可视化
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 设置中文字体，避免乱码
plt.rcParams['font.sans-serif'] = ['SimHei']        # 黑体
plt.rcParams['axes.unicode_minus'] = False          # 解决负号显示成方块的问题

# 机器学习 & 优化
import xgboost as xgb
import lightgbm as lgb
import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer


# Kaggle API
from kaggle.api.kaggle_api_extended import KaggleApi


In [36]:
# 初始化




host = "10.162.147.95"
user = "user1"
password = "123456"

database_name = 'predicting_road_accident_risk'  # 数据库名称
competition = database_name  # 竞赛名称
kaggle_competition_name = "playground-series-s5e10"
study_save_name = "XGBoost_model2"



if socket.gethostname() == 'hao-2':
    dir = rf'D:\数据\Kaggle_\{competition}'
else:
    dir = os.getcwd()


DIRS = {
    "dir":              dir,                                       
    "DATA_DIR000":      os.path.join(dir, "DATA_DIR000"),
    "HISTORY":          os.path.join(dir, "HISTORY", f"{study_save_name}"),
    "SUBMISSION":       os.path.join(dir, "SUBMISSION", f"{study_save_name}"),
}

# 自动创建目录
for key, path in DIRS.items():
    os.makedirs(path, exist_ok=True)

# 打印时一行一个地址
print("✅ 路径已创建：\n")
for key, path in DIRS.items():
    print(f"{key:<12} : {path}")

✅ 路径已创建：

dir          : D:\数据\Kaggle_\predicting_road_accident_risk
DATA_DIR000  : D:\数据\Kaggle_\predicting_road_accident_risk\DATA_DIR000
HISTORY      : D:\数据\Kaggle_\predicting_road_accident_risk\HISTORY\XGBoost_model2
SUBMISSION   : D:\数据\Kaggle_\predicting_road_accident_risk\SUBMISSION\XGBoost_model2


# 数据提取处理

In [37]:
# 加载Kaggle 训练集和 Bradley 熔点公开数据集

# Kaggle 提供的训练集和测试集
train_df = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "train.csv"))
test_df  = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "test.csv"))



# 输出数据集规模，确认加载成功
print("Train                        shape:", train_df.shape)
print("Test                         shape:", test_df.shape)

Train                        shape: (517754, 14)
Test                         shape: (172585, 13)


In [38]:
# 检测 DataFrame 每列的数据类型、缺失值情况、唯一值数量，并给出部分示例值。
def dataframe_info(df: pd.DataFrame, n_sample: int = 6) -> pd.DataFrame:
    """
    检测 DataFrame 每列的数据类型、缺失值情况、唯一值数量，并给出部分示例值。
    
    参数:
        df       : 输入的 pandas DataFrame
        n_sample : 每列展示的样本值数量 (默认 3)
    
    返回:
        summary_df : 包含每列信息的 DataFrame
    """
    summary = pd.DataFrame({
        "数据类型": df.dtypes,
        "缺失值数量": df.isnull().sum(),
        "缺失值比例": df.isnull().mean(),
        "唯一值数量": df.nunique()
    })
    
    # 添加示例值（前 n_sample 个唯一值）
    summary["示例值 (samples)"] = df.apply(
        lambda col: col.dropna().unique()[:n_sample] if col.notnull().any() else []
    )
    
    # 按缺失比例排序
    summary = summary.sort_values("缺失值比例", ascending=False)
    print(df.columns.values)
    display(summary)
    # print(summary)


# 使用方法
dataframe_info(train_df, n_sample = 6)
dataframe_info(test_df, n_sample = 6)


['id' 'road_type' 'num_lanes' 'curvature' 'speed_limit' 'lighting'
 'weather' 'road_signs_present' 'public_road' 'time_of_day' 'holiday'
 'school_season' 'num_reported_accidents' 'accident_risk']


Unnamed: 0,数据类型,缺失值数量,缺失值比例,唯一值数量,示例值 (samples)
id,int64,0,0.0,517754,"[0, 1, 2, 3, 4, 5]"
road_type,object,0,0.0,3,"[urban, rural, highway]"
num_lanes,int64,0,0.0,4,"[2, 4, 1, 3]"
curvature,float64,0,0.0,261,"[0.06, 0.99, 0.63, 0.07, 0.58, 0.54]"
speed_limit,int64,0,0.0,5,"[35, 70, 60, 45, 25]"
lighting,object,0,0.0,3,"[daylight, dim, night]"
weather,object,0,0.0,3,"[rainy, clear, foggy]"
road_signs_present,bool,0,0.0,2,"[False, True]"
public_road,bool,0,0.0,2,"[True, False]"
time_of_day,object,0,0.0,3,"[afternoon, evening, morning]"


['id' 'road_type' 'num_lanes' 'curvature' 'speed_limit' 'lighting'
 'weather' 'road_signs_present' 'public_road' 'time_of_day' 'holiday'
 'school_season' 'num_reported_accidents']


Unnamed: 0,数据类型,缺失值数量,缺失值比例,唯一值数量,示例值 (samples)
id,int64,0,0.0,172585,"[517754, 517755, 517756, 517757, 517758, 517759]"
road_type,object,0,0.0,3,"[highway, urban, rural]"
num_lanes,int64,0,0.0,4,"[2, 3, 4, 1]"
curvature,float64,0,0.0,195,"[0.34, 0.04, 0.59, 0.95, 0.86, 0.52]"
speed_limit,int64,0,0.0,5,"[45, 35, 25, 70, 60]"
lighting,object,0,0.0,3,"[night, dim, daylight]"
weather,object,0,0.0,3,"[clear, foggy, rainy]"
road_signs_present,bool,0,0.0,2,"[True, False]"
public_road,bool,0,0.0,2,"[True, False]"
time_of_day,object,0,0.0,3,"[afternoon, evening, morning]"


# 数据分析

In [39]:
# 打印清单
def config_to_str(config: dict, indent: int = 0) -> str:
    """递归生成配置字符串"""
    prefix = "     " * indent
    lines = []
    for key, value in config.items():
        if isinstance(value, dict):
            lines.append(f"{prefix}🔹 {key}:")
            lines.append(config_to_str(value, indent + 1))  # 递归拼接子字典
        else:
            lines.append(f"{prefix}- {key:<20}: {value}")
    return "\n".join(lines)



In [40]:
# 实验配置单
config = {
    # 固定开关
    "ISTEST"            : False,

    "use_feature_gen"   : True,
    "use_pca"           : True,
    "pca_components"    : 10,



    "study_save_name"    : study_save_name,


    

    # 特征选择 XGBoost 参数
    "xgb_selector_model_params": {
        "n_estimators"  : 500,
        "max_depth"     : 6,
        "learning_rate" : 0.05,
        "random_state"  : 2025,
        "device"        : "cpu",
        "objective"     : "reg:squarederror",
        "tree_method"   : "hist",
        "verbosity"     : 0
    },

    "selector_threshold"  : "0*mean",   

    # 训练设置
    "xgb_train_model_params": {
        'max_depth'   : 6,
        'eta'         : 0.1,
        'tree_method' : 'hist',
        'eval_metric' : 'rmse',
    },
    "num_boost_round": 15000,
}

In [41]:
# 数据拆分 (特征矩阵 与 目标向量)
# ============================================

import numpy as np
import pandas as pd

def prepare_features_and_target(train_df: pd.DataFrame, test_df: pd.DataFrame, config: dict):
    """
    数据拆分函数：构造训练集和测试集的特征矩阵与目标向量
    """

    
    # 2. 构造特征矩阵和目标向量
    features_train = train_df.drop(columns=['id', 'accident_risk'])   # 训练集特征 (X)
    target_train   = train_df['accident_risk']                            # 训练集目标 (y, 熔点)
    features_test  = test_df.drop(columns=['id'])    # 测试集特征 (无 Tm)



    # 随机选取部分特征（示例：50 个）
    if config["ISTEST"]:

        sample_len = 100
        features_train = train_df.drop(columns=['id', 'accident_risk']).iloc[:sample_len]  # 训练特征 (前 1000 条)
        target_train = train_df.iloc[:sample_len]['accident_risk']               # 训练目标
        features_test = test_df.drop(columns=['id'])                # 测试特征 (同样的特征列)




    # 3. 打印维度信息
    print("📊 数据拆分完成")
    print(f"训练集特征 features_train  shape   : {features_train.shape}")
    print(f"训练集目标   target_train  shape   : {target_train.shape}")
    print(f"测试集特征  features_test  shape   : {features_test.shape}")
    print(f"           features_train  类型    : {type(features_train)}")

    return features_train, target_train, features_test



### 特征生成

In [42]:
def add_new_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    针对道路事故预测构造新的衍生特征
    输入:
        df : pd.DataFrame
    输出:
        df_new : pd.DataFrame，包含新增特征
    """
    df = df.copy()
    shape_before = df.shape  # 原始维度

    # 1. 交互项
    df['Speed_x_Curvature'] = df['speed_limit'] * df['curvature']
    df['Lanes_x_Speed']     = df['num_lanes'] * df['speed_limit']
    df['Accidents_x_Speed'] = df['num_reported_accidents'] * df['speed_limit']

    # 类别交互项（需要后续 OneHotEncoder）
    df['RoadType_Time']     = df['road_type'] + "_" + df['time_of_day']
    df['Lighting_Weather']  = df['lighting'] + "_" + df['weather']

    # 2. 非线性变换
    df['Log_Accidents']     = np.log1p(df['num_reported_accidents'])  # log(1+x)

    # 分箱（自定义阈值，可调节）
    df['Accident_Bins'] = pd.cut(df['num_reported_accidents'],
                                bins=[-1, 0, 2, 5, np.inf],
                                labels=['none', 'low', 'medium', 'high'])

    # 3. 比率特征
    df['Accidents_per_Lane']   = df['num_reported_accidents'] / (df['num_lanes'] + 1)
    df['Curvature_per_Lane']   = df['curvature'] / (df['num_lanes'] + 1)
    df['Speed_per_Lane']       = df['speed_limit'] / (df['num_lanes'] + 1)


    shape_after = df.shape  # 新的维度

    print(f"{shape_before[0]} × {shape_before[1]} --> {shape_after[0]} × {shape_after[1]}   新增 {shape_after[1] - shape_before[1]} 列")



    return df



### 手动编码 & 布尔转换

In [43]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def fit_ohe(train_df: pd.DataFrame):
    """
    在训练集上自动检测 object/category 列，并拟合 OneHotEncoder。
    返回 encoder 对象和类别列名。
    """
    # 检测类别列
    cat_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
    print(f"检测到{len(cat_cols)}类: {cat_cols}")

    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    ohe.fit(train_df[cat_cols])

    return ohe, cat_cols


def transform_ohe(df: pd.DataFrame, ohe: OneHotEncoder, cat_cols: list) -> pd.DataFrame:
    """
    使用已拟合的 OneHotEncoder 对 DataFrame 进行编码。
    自动处理 bool -> int，保持 train/test 一致。
    """
    df = df.copy()
    shape_before = df.shape

    # 布尔型转 int
    for col in df.select_dtypes('bool').columns:
        df[col] = df[col].astype(int)

    # OHE
    ohe_array = ohe.transform(df[cat_cols])
    ohe_cols = ohe.get_feature_names_out(cat_cols)
    ohe_df = pd.DataFrame(ohe_array, columns=ohe_cols, index=df.index)

    # 拼接
    df_enc = pd.concat([df.drop(columns=cat_cols), ohe_df], axis=1)

    shape_after = df_enc.shape
    print(f"{shape_before[0]} × {shape_before[1]} --> {shape_after[0]} × {shape_after[1]}   新增 {shape_after[1] - shape_before[1]} 列")

    return df_enc





### 特征生成

### PCA降维

In [44]:
# 对数据降维

import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy import sparse

def fit_svd(train_df: pd.DataFrame, n_components: int = 100, random_state: int = 42) -> TruncatedSVD:
    """
    在训练集上拟合 TruncatedSVD，并返回拟合好的模型。
    """
    X_sparse = sparse.csr_matrix(train_df.values)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(X_sparse)
    explained_var = svd.explained_variance_ratio_.sum()
    print(f"✅ SVD 已拟合完成 (n_components={n_components})，训练集累计解释方差比: {explained_var:.2%}")
    return svd

def transform_svd(df: pd.DataFrame, svd: TruncatedSVD) -> pd.DataFrame:
    """
    使用拟合好的 SVD 对数据集进行降维。
    """
    shape_before = df.shape
    X_sparse = sparse.csr_matrix(df.values)
    X_reduced_array = svd.transform(X_sparse)

    reduced_df = pd.DataFrame(
        X_reduced_array,
        index=df.index,
        columns=[f"SVD_{i+1}" for i in range(X_reduced_array.shape[1])]
    )

    shape_after = reduced_df.shape
    print(f"{shape_before[0]} × {shape_before[1]} --> {shape_after[0]} × {shape_after[1]}")
    return reduced_df







# 交叉训练验证

In [45]:
# Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果
# ==============================================================
def run_kfold_xgb(features_train, target_train, features_test, config, DIRS, K_FOLDS=10, verbose=0):
    """
    使用 Stratified K-Fold + XGBoost 进行训练验证，并保存实验结果

    参数:
        features_train, target_train        : 训练集特征和标签
        features_test      : 测试集特征
        params      : XGBoost 最优参数 (dict)
        DIRS        : 保存结果的目录字典
        K_FOLDS     : 折数 (默认=5)
        verbose     : 是否打印详细信息
    """

    
        
    config["X shape"] = features_train.shape
    config["y shape"] = target_train.shape
    config["X_test shape"] = features_test.shape


    # ---------- 创建目录 ----------
    for _, path in DIRS.items():
        os.makedirs(path, exist_ok=True)


    time_str = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    history_DIR = os.path.join(DIRS['HISTORY'], time_str)
    os.makedirs(history_DIR, exist_ok=True)



    print("——" * 20)
    print(f"✅ 当前结果将保存到: {time_str}")


    # ---------- 定义交叉验证 ----------
    skfold = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    yeo = PowerTransformer(method="yeo-johnson")                                # 定义 Yeo-Johnson 变换

    # ---------- 初始化存储 ----------
    oof_val = np.zeros(len(features_train))       # OOF 预测
    train_score, val_score = [], []  # 每折 LOSS
    test_pred = []                   # 每折 test 预测
    fold_records = []                # 保存每折信息
    all_importances = []             # 特征重要性
    elapsed_list = []                # 耗时记录



    # 循环每一折
    # ==============================================================

    for i, (train_idx, val_idx) in enumerate(skfold.split(features_train, pd.qcut(target_train, q=10).cat.codes), 1):

        # ----- 打印时间信息 -----
        start_now = datetime.now()
        start_str = start_now.strftime("%H:%M:%S")

        if elapsed_list:
            avg_time = np.mean(elapsed_list)
            est_end = start_now + timedelta(seconds=avg_time)

            # 每 5 个一组输出耗时
            parts = [f"{t:6.1f}s" for t in elapsed_list]
            grouped = [" ".join(parts[j:j+5]) for j in range(0, len(parts), 5)]
            elapsed_str = " /// ".join(grouped)

            print(
                f"🔄{i:2d}/{K_FOLDS} ST {start_str}"
                f" ET {est_end.strftime('%H:%M:%S')}"
                f" avg {avg_time:.1f}s"
                f" [{elapsed_str}]",
                end="\r", flush=True
            )
        else:
            print(f"🔄{i:2d}/{K_FOLDS} ST {start_str} ET (暂无历史数据)", end="\r", flush=True)



        # ----- 开始训练 -----
        t0 = time.time()

        # 1. 数据集划分
        x_train, x_val = features_train.iloc[train_idx], features_train.iloc[val_idx]
        y_train, y_val = target_train[train_idx], target_train[val_idx]

        # 2. Yeo-Johnson 变换
        y_train = yeo.fit_transform(y_train.values.reshape(-1, 1)).squeeze()
        y_val   = yeo.transform(y_val.values.reshape(-1, 1)).squeeze()


        # 3. 特征选择（轻量级 XGBoost）
        # 使用
        selector_model = xgb.XGBRegressor(**config["xgb_selector_model_params"])
        # selector_model = xgb.XGBRegressor(
        #     n_estimators   = 500,
        #     max_depth      = 6,
        #     learning_rate  = 0.05,
        #     random_state   = 2025,
        #     device         = "cpu",
        #     objective      = "reg:absoluteerror",
        #     tree_method    = "hist",
        #     verbosity      = 0
        # )
        
        

        selector_model.fit(x_train, y_train)

        selector = SelectFromModel(selector_model, prefit=True, threshold=config["selector_threshold"])
        selected_features = x_train.columns[selector.get_support()].tolist()
        if verbose > 0:
            print(f"✅ 选择的特征数量: {len(selected_features)}")


        # 4. 保留重要特征
        x_train_new = x_train[selected_features]
        x_val_new   = x_val[selected_features]
        x_test_new  = features_test[selected_features]

        # 5. 转换为 DMatrix
        dtrain = xgb.DMatrix(x_train_new, y_train, feature_names=selected_features)
        dval   = xgb.DMatrix(x_val_new,   y_val,   feature_names=selected_features)
        dtest  = xgb.DMatrix(x_test_new,             feature_names=selected_features)


        # 6. XGBoost 训练
        xgb_model = xgb.train(
            params                 = config["xgb_train_model_params"],
            dtrain                 = dtrain,
            num_boost_round        = config["num_boost_round"],
            evals                  = [(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds  = 300,
            verbose_eval           = (1000 if verbose > 0 else False)
        )


        # # 保存模型
        # model_path = os.path.join(history_DIR, f"xgb_model_fold{i}.json")
        # xgb_model.save_model(model_path)

        # 7. 获取特征重要性
        imp_dict = xgb_model.get_score(importance_type="gain")
        imp_df = pd.DataFrame(imp_dict.items(), columns=["Feature", "Importance"])
        imp_df["Fold"] = i
        all_importances.append(imp_df)


        # 8. 预测
        y_train_pred = xgb_model.predict(dtrain)
        y_val_pred   = xgb_model.predict(dval)
        y_test_pred  = xgb_model.predict(dtest)

        # 9. 逆变换
        y_train      = yeo.inverse_transform(y_train.reshape(-1, 1)).squeeze()
        y_val        = yeo.inverse_transform(y_val.reshape(-1, 1)).squeeze()
        y_train_pred = yeo.inverse_transform(y_train_pred.reshape(-1, 1)).squeeze()
        y_val_pred   = yeo.inverse_transform(y_val_pred.reshape(-1, 1)).squeeze()
        y_test_pred  = yeo.inverse_transform(y_test_pred.reshape(-1, 1)).squeeze()

        # 10. 计算 LOSS
        train_loss = np.sqrt(np.mean((y_train - y_train_pred) ** 2))
        val_loss   = np.sqrt(np.mean((y_val   - y_val_pred) ** 2))


        if verbose > 0:
            print(f"Fold {i}: Train LOSS={train_loss:.4f}, Val LOSS={val_loss:.4f}，用时 {elapsed:.2f} 秒")


        # ----- 保存结果 -----
        train_score.append(train_loss)
        val_score.append(val_loss)
        oof_val[val_idx] = y_val_pred
        test_pred.append(y_test_pred)

        elapsed = time.time() - t0
        elapsed_list.append(elapsed)

        fold_records.append({
            "Fold": i,
            "Train_LOSS": train_loss,
            "Val_LOSS": val_loss,
            "Num_Features": len(selected_features),
            "Selected_Features": selected_features,
            "elapsed": elapsed
        })

    # 保存整体结果
    # ==============================================================
    if verbose > 0:
        print("\n")
        print(f"📊 Train LOSS 平均值 : {np.mean(train_score):.4f}")
        print(f"📊 Val   LOSS 平均值 : {np.mean(val_score):.4f}")
        print(f"📊 Train LOSS 标准差 : {np.std(train_score, ddof=0):.4f}")
        print(f"📊 Val   LOSS 标准差 : {np.std(val_score, ddof=0):.4f}")

    # 参数
    with open(os.path.join(history_DIR, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4, ensure_ascii=False)

    # 每折信息
    folds_df = pd.DataFrame(fold_records)
    folds_df.to_csv(os.path.join(history_DIR, "folds_info.csv"), index=False, encoding="utf-8-sig")


    # 特征重要性
    if all_importances:
        valid_imps = [df for df in all_importances if not df.empty]
        all_imp_df = pd.concat(valid_imps, axis=0) if valid_imps else pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    else:
        all_imp_df = pd.DataFrame(columns=["Feature", "Importance", "Fold"])
    all_imp_df.to_csv(os.path.join(history_DIR, "feature_importance_all.csv"), index=False, encoding="utf-8-sig")


    # 测试集预测
    test_pred_array = np.vstack(test_pred).T
    test_pred_df = pd.DataFrame(test_pred_array, columns=[f"Fold_{j+1}" for j in range(test_pred_array.shape[1])])
    test_pred_df["Final_Pred"] = test_pred_df.mean(axis=1)
    test_pred_df.to_csv(os.path.join(history_DIR, "test_predictions.csv"), index=False, encoding="utf-8-sig")

    # 总结
    with open(os.path.join(history_DIR, "summary.txt"), "w", encoding="utf-8") as f:
        f.write(f"Train LOSS Mean : {np.mean(train_score):.4f}\n")
        f.write(f"Val   LOSS Mean : {np.mean(val_score):.4f}\n")
        f.write(f"Train LOSS Std  : {np.std(train_score, ddof=0):.4f}\n")
        f.write(f"Val   LOSS Std  : {np.std(val_score, ddof=0):.4f}\n")


    # 最终提交
    final_score = np.mean(val_score)
    submission = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "sample_submission.csv"))
    submission["accident_risk"] = test_pred_df["Final_Pred"]

    submission_path = os.path.join(history_DIR, f"sub_{time_str}_{final_score:.8f}.csv")
    submission.to_csv(submission_path, index=False)
    submission.to_csv(os.path.join(DIRS['SUBMISSION'], f"sub_{time_str}_{final_score:.8f}.csv"), index=False)

        
    config["time_str"] = time_str
    config["score"] = final_score


    # ---------- 返回结果 ----------
    return {
        "oof_val": oof_val,
        "train_score": train_score,
        "val_score": val_score,
        "test_pred": test_pred_df,
        "folds_info": folds_df,
        "feature_importance": all_imp_df,
        "submission_path": submission_path,
        "time": time_str,
        "final_score": final_score,
        "config": config
    }


# 单次训练推导

In [46]:
# 执行一次

# 准备流程---------------------------------------------------------------------------------------------------
# 打印当前config
print(config_to_str(config))

# Kaggle 提供的训练集和测试集
train_df = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "train.csv"))
test_df  = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "test.csv"))

# 数据拆分
print("数据拆分---------------------------")
features_train, target_train, features_test = prepare_features_and_target(train_df, test_df, config)


dataframe_info(features_train, n_sample = 6)


# 特征生成
print("特征生成---------------------------")
if config["use_feature_gen"]:
    features_train = add_new_features(features_train)
    features_test  = add_new_features(features_test)


dataframe_info(features_train, n_sample = 6)


# 手动编码 & 布尔转换
print("手动编码 & 布尔转换----------------")
# 在训练集上拟合
ohe, cat_cols = fit_ohe(features_train)
# 分别编码 train/test，保证一致
features_train = transform_ohe(features_train, ohe, cat_cols)
features_test  = transform_ohe(features_test,  ohe, cat_cols)


# 数据降维
print("数据降维---------------------------")
if config["use_pca"]:
    # 1. 在训练集上拟合
    svd = fit_svd(features_train, n_components = config["pca_components"], random_state=42)

    # 2. 分别对 train/test transform
    features_train_reduced = transform_svd(features_train, svd)
    features_test_reduced  = transform_svd(features_test, svd)

    shape_before_train = features_train.shape
    shape_before_test = features_test.shape

    features_train = pd.concat([features_train, features_train_reduced], axis=1)
    features_test = pd.concat([features_test, features_test_reduced], axis=1)

    shape_after_train = features_train.shape
    shape_after_test = features_test.shape

    print(f"Train: {shape_before_train[0]} × {shape_before_train[1]}  -->  {shape_after_train[0]} × {shape_after_train[1]}   新增 {shape_after_train[1] - shape_before_train[1]} 列")
    print(f"Test : {shape_before_test[0]} × {shape_before_test[1]}  -->  {shape_after_test[0]} × {shape_after_test[1]}   新增 {shape_after_test[1] - shape_before_test[1]} 列")



X, y, X_test = features_train, target_train, features_test
print("开始训练---------------------------")

# 准备流程---------------------------------------------------------------------------------------------------

results = run_kfold_xgb(X, y, X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
config = results['config']
score = results['final_score']



print('\n',score)


- ISTEST              : False
- use_feature_gen     : True
- use_pca             : True
- pca_components      : 10
- study_save_name     : XGBoost_model2
🔹 xgb_selector_model_params:
     - n_estimators        : 500
     - max_depth           : 6
     - learning_rate       : 0.05
     - random_state        : 2025
     - device              : cpu
     - objective           : reg:squarederror
     - tree_method         : hist
     - verbosity           : 0
- selector_threshold  : 0*mean
🔹 xgb_train_model_params:
     - max_depth           : 6
     - eta                 : 0.1
     - tree_method         : hist
     - eval_metric         : rmse
- num_boost_round     : 15000
数据拆分---------------------------
📊 数据拆分完成
训练集特征 features_train  shape   : (517754, 12)
训练集目标   target_train  shape   : (517754,)
测试集特征  features_test  shape   : (172585, 12)
           features_train  类型    : <class 'pandas.core.frame.DataFrame'>
['road_type' 'num_lanes' 'curvature' 'speed_limit' 'lighting' 'weather'
 'ro

Unnamed: 0,数据类型,缺失值数量,缺失值比例,唯一值数量,示例值 (samples)
road_type,object,0,0.0,3,"[urban, rural, highway]"
num_lanes,int64,0,0.0,4,"[2, 4, 1, 3]"
curvature,float64,0,0.0,261,"[0.06, 0.99, 0.63, 0.07, 0.58, 0.54]"
speed_limit,int64,0,0.0,5,"[35, 70, 60, 45, 25]"
lighting,object,0,0.0,3,"[daylight, dim, night]"
weather,object,0,0.0,3,"[rainy, clear, foggy]"
road_signs_present,bool,0,0.0,2,"[False, True]"
public_road,bool,0,0.0,2,"[True, False]"
time_of_day,object,0,0.0,3,"[afternoon, evening, morning]"
holiday,bool,0,0.0,2,"[False, True]"


特征生成---------------------------
517754 × 12 --> 517754 × 22   新增 10 列
172585 × 12 --> 172585 × 22   新增 10 列
['road_type' 'num_lanes' 'curvature' 'speed_limit' 'lighting' 'weather'
 'road_signs_present' 'public_road' 'time_of_day' 'holiday'
 'school_season' 'num_reported_accidents' 'Speed_x_Curvature'
 'Lanes_x_Speed' 'Accidents_x_Speed' 'RoadType_Time' 'Lighting_Weather'
 'Log_Accidents' 'Accident_Bins' 'Accidents_per_Lane' 'Curvature_per_Lane'
 'Speed_per_Lane']


Unnamed: 0,数据类型,缺失值数量,缺失值比例,唯一值数量,示例值 (samples)
road_type,object,0,0.0,3,"[urban, rural, highway]"
num_lanes,int64,0,0.0,4,"[2, 4, 1, 3]"
Curvature_per_Lane,float64,0,0.0,546,"[0.02, 0.198, 0.126, 0.014000000000000002, 0.2..."
Accidents_per_Lane,float64,0,0.0,21,"[0.3333333333333333, 0.0, 0.4, 0.2, 0.5, 1.0]"
Accident_Bins,category,0,0.0,4,"['low', 'none', 'medium', 'high'] Categories (..."
Log_Accidents,float64,0,0.0,8,"[0.6931471805599453, 0.0, 1.0986122886681098, ..."
Lighting_Weather,object,0,0.0,9,"[daylight_rainy, daylight_clear, dim_clear, di..."
RoadType_Time,object,0,0.0,9,"[urban_afternoon, urban_evening, rural_morning..."
Accidents_x_Speed,int64,0,0.0,29,"[35, 0, 140, 60, 90, 50]"
Lanes_x_Speed,int64,0,0.0,17,"[70, 140, 280, 60, 210, 45]"


手动编码 & 布尔转换----------------
检测到7类: ['road_type', 'lighting', 'weather', 'time_of_day', 'RoadType_Time', 'Lighting_Weather', 'Accident_Bins']
517754 × 22 --> 517754 × 49   新增 27 列
172585 × 22 --> 172585 × 49   新增 27 列
数据降维---------------------------
✅ SVD 已拟合完成 (n_components=10)，训练集累计解释方差比: 99.95%
517754 × 49 --> 517754 × 10
172585 × 49 --> 172585 × 10
Train: 517754 × 49  -->  517754 × 59   新增 10 列
Test : 172585 × 49  -->  172585 × 59   新增 10 列
开始训练---------------------------
————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-27 12-07-30
🔄10/10 ST 12:10:49 ET 12:11:11 avg 22.1s [  21.3s   22.0s   20.7s   22.2s   22.5s ///   21.0s   20.1s   24.2s   24.7s]
 0.05627230997776735


# 提交 kaggle 平台测试

In [47]:
# 根据 submission_time 定位文件路径 提交 kaggle 平台测试

import os
import itertools
import time
from kaggle.api.kaggle_api_extended import KaggleApi


def find_submission_file(submission_time, submission_dir):
    """
    在 submission_dir 下查找包含 submission_time 的文件
    一旦找到立刻返回完整路径；如果没找到则返回 None
    """
    for fname in os.listdir(submission_dir):
        if submission_time in fname:
            file_path = os.path.join(submission_dir, fname)
            print(f"✅ 找到目标文件: {fname}")
            return file_path
    
    print(f"⚠️ 未找到包含 {submission_time} 的文件")
    return None

def submit_and_get_score(file_path, competition_name, message="My submission"):
    """
    封装 Kaggle 提交并等待结果评分
    --------------------------------------
    file_path        : str  提交文件路径
    competition_name : str  Kaggle 比赛名称 (URL 最后一段)
    message          : str  提交备注
    """
    # 1. 配置 Kaggle API
    os.environ["KAGGLE_CONFIG_DIR"] = r"C:\Users\Admin\.kaggle"
    api = KaggleApi()
    api.authenticate()
    print("✅ Kaggle API 已经配置成功！")

    # 2. 提交文件
    api.competition_submit(
        file_name=file_path,
        competition=competition_name,
        message=message
    )
    print("✅ 提交完成！请等待评分...")

    # 3. 动态等待
    spinner = itertools.cycle(["|", "/", "-", "\\"])
    while True:
        submissions = api.competition_submissions(competition_name)
        latest = submissions[0]
        status_str = str(latest._status).lower()

        if "complete" in status_str and latest._public_score is not None:
            print("\n🎯 最终结果:")
            print(f"Public 分数 : {latest._public_score}")
            print(f"Private 分数: {latest._private_score}")
            print(f"提交 ID     : {latest._ref}")
            print(f"文件名      : {latest._file_name}")
            print(f"状态        : {latest._status}")
            print(f"提交时间    : {latest._date}")
            print(f"描述/备注   : {latest._description}")
            return latest

        if "pending" in status_str:
            spin_char = next(spinner)
            print(f"当前状态: {status_str} , 等待中 {spin_char}", end="\r", flush=True)
            time.sleep(0.2)  # 每 0.5 秒检查一次
            continue

        else:
            print(f"\n报错")
            print(f"submissions")
            
            break

        


### 不轻易运行，再三考虑

In [48]:
# submission_time 提交
submission_time = "2025-10-26 10-42-46"
competition_name = kaggle_competition_name
message =  f"该提交文件的参数：\n{config_to_str(config)} "

target_file = find_submission_file(submission_time, DIRS['SUBMISSION'] )

# submit_and_get_score(target_file, competition_name, message)

✅ 找到目标文件: sub_2025-10-26 10-42-46_0.05604900.csv


# 参数优化

In [49]:
# 实验配置单
base_config = {
    # 固定开关
    "ISTEST"            : False,

    "use_feature_gen"   : False,
    "use_pca"           : True,
    "pca_components"    : 10,





    "study_save_name"    : study_save_name,


    
    # 特征选择 XGBoost 参数
    "xgb_selector_model_params": {
        "n_estimators"  : 500,
        "max_depth"     : 6,
        "learning_rate" : 0.05,
        "random_state"  : 2025,
        "device"        : "cpu",
        "objective"     : "reg:squarederror",
        "tree_method"   : "hist",
        "verbosity"     : 0
    },

    "selector_threshold"  : "0*mean",   

    # 训练设置
    "xgb_train_model_params": {
        'max_depth'   : 6,
        'eta'         : 0.1,
        'tree_method' : 'hist',
        'eval_metric' : 'rmse',
    },
    "num_boost_round": 15000,
}

In [50]:
# 定义优化任务  加入标识符 host: hao-2   ip: 192.168.40.1

import copy
import contextlib
import io

def objective(trial):
    """
    Optuna 的目标函数 (Objective Function)
    每次 trial 会生成一组超参数，用于训练 XGBoost 模型，
    并返回交叉验证的平均 RMSE 作为优化目标。
    """




    # 1. 定义 超参数 搜索空间
    # 拷贝一份 config，避免全局污染
    config = copy.deepcopy(base_config)

    # 只修改需要优化的参数
    config["use_feature_gen"]   = trial.suggest_categorical("use_feature_gen", [True, False])
    config["use_pca"]           = trial.suggest_categorical("use_pca", [True, False])
    config["pca_components"] = trial.suggest_categorical("pca_components", [5, 10, 15, 20])

    # config["xgb_selector_model_params"]["random_state"] = trial.suggest_categorical("selector_random_state", [2025])
    # config["xgb_selector_model_params"]["device"]       = trial.suggest_categorical("selector_device", ["cpu", "cuda"])

    config["selector_threshold"] = trial.suggest_categorical("selector_threshold", ["0*mean", "0.25*mean", "0.5*mean", "0.75*mean", "mean"])

    config["xgb_train_model_params"]["max_depth"] = trial.suggest_int("train_max_depth", 3, 30)
    config["xgb_train_model_params"]["eta"] = trial.suggest_float("train_eta", 0.01 , 0.3 , log=True)








    # 创建一个黑洞缓冲区
    f = io.StringIO()
    with contextlib.redirect_stdout(f):
        None




        # 准备流程---------------------------------------------------------------------------------------------------
        # 打印当前config
        print(config_to_str(config))

        # Kaggle 提供的训练集和测试集
        train_df = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "train.csv"))
        test_df  = pd.read_csv(os.path.join(DIRS['DATA_DIR000'], "test.csv"))

        # 数据拆分
        print("数据拆分---------------------------")
        features_train, target_train, features_test = prepare_features_and_target(train_df, test_df, config)

        # 特征生成
        print("特征生成---------------------------")
        if config["use_feature_gen"]:
            features_train = add_new_features(features_train)
            features_test  = add_new_features(features_test)


        # 手动编码 & 布尔转换
        print("手动编码 & 布尔转换----------------")
        # 在训练集上拟合
        ohe, cat_cols = fit_ohe(features_train)
        # 分别编码 train/test，保证一致
        features_train = transform_ohe(features_train, ohe, cat_cols)
        features_test  = transform_ohe(features_test,  ohe, cat_cols)


        # 数据降维
        print("数据降维---------------------------")
        if config["use_pca"]:
            # 1. 在训练集上拟合
            svd = fit_svd(features_train, n_components = config["pca_components"], random_state=42)

            # 2. 分别对 train/test transform
            features_train_reduced = transform_svd(features_train, svd)
            features_test_reduced  = transform_svd(features_test, svd)

            shape_before_train = features_train.shape
            shape_before_test = features_test.shape

            features_train = pd.concat([features_train, features_train_reduced], axis=1)
            features_test = pd.concat([features_test, features_test_reduced], axis=1)

            shape_after_train = features_train.shape
            shape_after_test = features_test.shape

            print(f"Train: {shape_before_train[0]} × {shape_before_train[1]}  -->  {shape_after_train[0]} × {shape_after_train[1]}   新增 {shape_after_train[1] - shape_before_train[1]} 列")
            print(f"Test : {shape_before_test[0]} × {shape_before_test[1]}  -->  {shape_after_test[0]} × {shape_after_test[1]}   新增 {shape_after_test[1] - shape_before_test[1]} 列")



        X, y, X_test = features_train, target_train, features_test
        print("开始训练---------------------------")

        # 准备流程---------------------------------------------------------------------------------------------------





    results = run_kfold_xgb(X, y, X_test, config, DIRS, K_FOLDS = 10, verbose = 0)
    config = results['config']
    score = results['final_score']



    HOSTNAME = socket.gethostname()
    HOST_IP = socket.gethostbyname(HOSTNAME)
    trial.set_user_attr("host", HOSTNAME)        # 你自己定义主机 A/B
    trial.set_user_attr("ip", HOST_IP)        # 你自己定义角色 A/B

    # 4. 返回平均 LOSS
    return score




In [None]:
# 开始优化

# 1. 定义 SQLite 数据库存储路径

storage_url = f"mysql+pymysql://{user}:{password}@{host}:3306/{database_name}"

STUDY_NAME = F"test_{study_save_name}" if base_config["ISTEST"] else study_save_name

# 2. 创建或加载 Optuna Study
study = optuna.create_study(
    study_name = STUDY_NAME,
    # study_name="ghsdjsrtjrswtjhwrt",
    storage=storage_url,
    load_if_exists=True
)

# 自动获取当前主机名\当前主机的 IP 地址
HOSTNAME = socket.gethostname()
HOST_IP = socket.gethostbyname(HOSTNAME)
print("主机名:", HOSTNAME," 主机 IP:", HOST_IP)
time.sleep(1)

# 5. 启动超参数搜索
print("🔎 开始超参数搜索...")
if base_config["ISTEST"]:
    study.optimize(objective, n_trials = 3)
else:
    study.optimize(objective, n_trials = 300)


# 6. 打印最优结果
print("\n✅ 训练完成！")
print(f"📊 已完成试验次数 : {len(study.trials)}")
print(f"🏆 最优试验编号   : {study.best_trial.number}")
print(f"📉 最优 LOSS       : {study.best_value}")
print(f"⚙️ 最优参数组合   : {study.best_trial.params}")


[I 2025-10-27 12:11:11,731] Using an existing study with name 'XGBoost_model2' instead of creating a new one.


主机名: hao-2  主机 IP: 192.168.40.1
🔎 开始超参数搜索...
————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-27 12-11-14
🔄10/10 ST 12:15:41 ET 12:16:11 avg 29.7s [  26.3s   38.2s   22.4s   24.1s   34.6s ///   24.7s   35.8s   27.3s   34.1s]

[I 2025-10-27 12:16:07,973] Trial 215 finished with value: 0.05624293120277209 and parameters: {'use_feature_gen': False, 'use_pca': False, 'pca_components': 20, 'selector_threshold': '0.25*mean', 'train_max_depth': 8, 'train_eta': 0.010088339144940148}. Best is trial 210 with value: 0.05602793050530871.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-27 12-16-10
🔄10/10 ST 12:21:15 ET 12:21:49 avg 33.8s [  32.8s   27.8s   37.8s   27.7s   39.7s ///   29.3s   40.3s   28.5s   40.0s]

[I 2025-10-27 12:21:47,310] Trial 216 finished with value: 0.056147561076638805 and parameters: {'use_feature_gen': True, 'use_pca': False, 'pca_components': 20, 'selector_threshold': '0*mean', 'train_max_depth': 9, 'train_eta': 0.011563492906738344}. Best is trial 210 with value: 0.05602793050530871.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-27 12-21-48
🔄10/10 ST 12:25:41 ET 12:26:07 avg 25.9s [  24.4s   33.6s   22.4s   27.2s   25.4s ///   23.7s   25.4s   23.8s   27.0s]

[I 2025-10-27 12:26:09,875] Trial 217 finished with value: 0.05607968801464196 and parameters: {'use_feature_gen': False, 'use_pca': False, 'pca_components': 20, 'selector_threshold': '0*mean', 'train_max_depth': 9, 'train_eta': 0.010948363927709731}. Best is trial 210 with value: 0.05602793050530871.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-27 12-26-11
🔄10/10 ST 12:29:59 ET 12:30:25 avg 25.4s [  30.3s   27.8s   22.6s   26.2s   22.3s ///   27.0s   23.6s   22.5s   26.1s]

[I 2025-10-27 12:30:23,979] Trial 218 finished with value: 0.05603353785265811 and parameters: {'use_feature_gen': False, 'use_pca': False, 'pca_components': 20, 'selector_threshold': '0*mean', 'train_max_depth': 8, 'train_eta': 0.012640951412648016}. Best is trial 210 with value: 0.05602793050530871.


————————————————————————————————————————
✅ 当前结果将保存到: 2025-10-27 12-30-25
🔄 1/10 ST 12:30:25 ET (暂无历史数据)

# 管理数据库信息

In [None]:
# 查询数据库详细数据


storage_url = f"mysql+pymysql://{user}:{password}@{host}:3306/{database_name}"

studies = optuna.study.get_all_study_summaries(storage=storage_url)

if not studies:
    print("❌ 当前数据库里无 study")
else:
    print("✅ 数据库中的 study 列表:")
    for s in studies:

        print("-", s.study_name)

        study = optuna.load_study(study_name=s.study_name, storage=storage_url)

        print("         Trials:")
        for trial in study.trials[:10]:  # 仅显示前 10 个 trial
            host = trial.user_attrs.get("host") or "unknown"
            ip = trial.user_attrs.get("ip") or "unknown"
            value = f"{trial.value:.10f}" if trial.value is not None else "None"

            print(
                f"    Trial {trial.number:4d}: "
                f"host={host:<16}, ip={ip:<15}, "
                f"value={value:<15}, params={trial.params}"
            )

        print("    总 trial 数量:", len(study.trials))
        print("=" * 100)


In [None]:
import optuna


storage_url = f"mysql+pymysql://{user}:{password}@{host}:3306/{database_name}"

study_name = study_save_name
study = optuna.load_study(study_name=study_name, storage=storage_url)

# 查看最优 trial
best_trial = study.best_trial

print(f"✅ Study 名称: {study_name}")
print(f"最优目标值: {best_trial.value:.8f}")
print(f"最优参数: {best_trial.params}")
print(f"Trial 编号: {best_trial.number}")
print(f"Host: {best_trial.user_attrs.get('host', 'unknown')}")
print(f"IP: {best_trial.user_attrs.get('ip', 'unknown')}")

# print("\n🔍 详细优化过程（全部 trial）:")
# for trial in study.trials:
#     value = f"{trial.value:.8f}" if trial.value is not None else "None"
#     print(
#         f"Trial {trial.number:3d}: value={value:<15}, params={trial.params}"
#     )

print(f"\n📊 总 trial 数量: {len(study.trials)}")


In [None]:
# 清理前：先查看数据库里当前有哪些 study 存在，以及每个 study 里有多少个 trial

storage_url = f"mysql+pymysql://{user}:{password}@{host}:3306/{database_name}"

studies = optuna.study.get_all_study_summaries(storage=storage_url)
print("现有 study：", [s.study_name for s in studies])

for s in studies:
    study = optuna.load_study(study_name=s.study_name, storage=storage_url)
    print(f"Study:   {s.study_name:30s}, Trials: {len(study.trials):4d}")

In [None]:
# 清理中：删除指定 study
# 指定要删除的名称
to_delete = ["melting_point_study"]   # 可以写一个或多个

to_delete = [            ]

for s in studies:
    if s.study_name in to_delete:
        optuna.delete_study(study_name=s.study_name, storage=storage_url)
        print("已删除:", s.study_name)


In [None]:
# 清理后：再次检查
studies_after = optuna.study.get_all_study_summaries(storage=storage_url)
print("清理后 study：", [s.study_name for s in studies_after])
