## 导入模块

In [1]:
import datetime

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

In [None]:
# Directory holding the competition CSV files, relative to this notebook.
dataset_dir = "../dataset"

# Build both paths first, then read the two tables.
train_path = f"{dataset_dir}/train.csv"
test_path = f"{dataset_dir}/test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

## 特征工程

In [None]:
# Stack the train and test rows so that lag / rolling features for the test
# rows can be derived from the history of the same id.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# Sort descending by id, then dt, and rebuild a unique 0..n-1 index.
# NOTE(review): presumably a larger dt lies further in the past, so each id's
# rows end up in chronological order — confirm against the dataset description.
data = data.sort_values(["id", "dt"], ascending=False).reset_index(drop=True)


def _add_rolling_stats(frame, source, windows, stats):
    """Add per-id rolling statistics of ``frame[source]`` to ``frame`` in place.

    For every window size in ``windows`` and every statistic name in ``stats``
    a column ``f"{source}_win{win}_{stat}"`` is created.  Windows are computed
    separately for each ``id``, need at least 3 observations, and exclude the
    current row (``closed="left"``).

    Args:
        frame: DataFrame with an ``id`` column, the ``source`` column and a
            unique index.
        source: Name of the column to aggregate.
        windows: Iterable of integer window sizes.
        stats: Iterable of rolling method names, e.g. ``"mean"`` or ``"std"``.
    """
    for win in windows:
        roller = frame.groupby("id")[source].rolling(
            window=win, min_periods=3, closed="left"
        )
        for stat in stats:
            rolled = getattr(roller, stat)()
            # The groupby-rolling result is ordered by ascending group key,
            # while `frame` is sorted by id descending.  Assigning `.values`
            # positionally would therefore attach the statistics to the wrong
            # rows.  Drop the group level and let pandas align on the original
            # row index instead.
            frame[f"{source}_win{win}_{stat}"] = rolled.reset_index(
                level=0, drop=True
            )


# Lag features: target value 10..35 rows earlier within the same id.
for i in range(10, 36):
    data[f"target_shift{i}"] = data.groupby("id")["target"].shift(i)

# Differences of the 10-step lag over 1..3 rows within the same id.
for i in range(1, 4):
    data[f"target_shift10_diff{i}"] = data.groupby("id")["target_shift10"].diff(i)

# Rolling statistics of the raw target.
_add_rolling_stats(
    data,
    "target",
    windows=[15, 30, 50, 70],
    stats=("mean", "max", "min", "std"),
)

# Rolling statistics of the 10-step lag.  The std column is now named
# `target_shift10_win{win}_std`, consistent with its siblings (it used to be
# misspelled `target_shift710win{win}_std`).
_add_rolling_stats(
    data,
    "target_shift10",
    windows=[7, 14, 28, 35, 50, 70],
    stats=("mean", "max", "min", "sum", "std"),
)

In [None]:
# Split the combined frame back apart: rows with a known target are training
# rows, rows with a missing target are the ones to predict.
has_target = data["target"].notna()
train = data[has_target].reset_index(drop=True)
test = data[~has_target].reset_index(drop=True)

# Every column except the series identifier and the label is a model input.
excluded_cols = ["id", "target"]
train_cols = [name for name in data.columns if name not in excluded_cols]

## 模型训练

In [None]:
def time_model(lgb, train_df, test_df, cols):
    """Fit a LightGBM regressor with a dt-based hold-out and predict the test set.

    Rows of ``train_df`` with ``dt >= 31`` are used for fitting and rows with
    ``dt <= 30`` for validation.  Training runs for at most 50000 rounds and
    stops early after 500 rounds without improvement; progress is logged
    every 500 rounds.  The validation MSE is printed.

    Args:
        lgb: Object exposing ``Dataset``, ``train``, ``early_stopping`` and
            ``log_evaluation`` (the ``lightgbm`` module at the call site).
        train_df: Labelled frame containing ``dt``, ``target`` and ``cols``.
        test_df: Frame to predict; must contain ``cols``.
        cols: Feature column names fed to the model.

    Returns:
        Tuple ``(val_pred, test_pred)`` with the predictions for the
        validation rows and for ``test_df``.
    """
    # Hold-out split on dt.
    fit_rows = train_df[train_df.dt >= 31]
    holdout_rows = train_df[train_df.dt <= 30]
    trn_x, trn_y = fit_rows[cols], fit_rows["target"]
    val_x, val_y = holdout_rows[cols], holdout_rows["target"]

    train_matrix = lgb.Dataset(trn_x, label=trn_y)
    valid_matrix = lgb.Dataset(val_x, label=val_y)

    params = {
        "objective": "regression",
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 2**5,
        "min_child_weight": 5,
        "num_threads": -1,
        "device_type": "cpu",
        "seed": 2024,
        "min_data_in_leaf": 11,
        "min_sum_hessian_in_leaf": 9,
        "bagging_fraction": 0.8,
        "bagging_freq": 4,
        "feature_fraction": 0.8,
        "lambda_l2": 10,
        "metric": "mse",
    }

    # Stop after 500 stagnant rounds and report metrics every 500 rounds.
    training_callbacks = [lgb.early_stopping(500), lgb.log_evaluation(500)]
    model = lgb.train(
        params,
        train_matrix,
        num_boost_round=50000,
        valid_sets=[train_matrix, valid_matrix],
        categorical_feature=[],
        callbacks=training_callbacks,
    )

    # Predict with the best round found during training.
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_df[cols], num_iteration=model.best_iteration)

    score = mean_squared_error(val_pred, val_y)
    print(score)

    return val_pred, test_pred

## 结果预测

In [None]:
# Train on the labelled rows and collect the validation and test predictions.
lgb_oof, lgb_test = time_model(
    lgb=lgb,
    train_df=train,
    test_df=test,
    cols=train_cols,
)

In [None]:
# Attach the predictions to the test rows and export id / dt / target to a
# timestamped CSV file.
test["target"] = lgb_test
submit_stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
submit_path = "../submit/submit_" + submit_stamp + ".csv"
submission = test[["id", "dt", "target"]]
submission.to_csv(submit_path, index=False)