In [1]:
# ================================================================================
# NFL BIG DATA BOWL 2026 - 完整解决方案
# 使用时间序列特征预测传球战术中球员移动
# ================================================================================


In [2]:
# --- 标准库 ---
import gc
import math
import os
import hashlib
import numpy as np
import pickle
from pathlib import Path

import warnings
from datetime import datetime
from itertools import combinations
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from collections import defaultdict





# --- 第三方库 ---
import joblib
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

# --- 深度学习（PyTorch）---
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    LinearLR,
    OneCycleLR,
    SequentialLR,
)

# 忽略警告信息（可选）
warnings.filterwarnings("ignore")


In [3]:
class Config:
    """
    全局配置类
    用于控制模型训练、数据路径、设备参数等全局设置
    """

    # 📁 数据路径设置
    DATA_DIR            = Path(r"D:\数据\Kaggle\2026 年 NFL 大数据碗 - 预测\DATA_DIR000")
    NN_PRETRAIN_DIR     = Path(r"D:\数据\Kaggle\2026 年 NFL 大数据碗 - 预测\DATA_DIR000\bigru-public")

    # ⚙️ 通用参数
    SEED                = 42                               # 随机种子，保证结果可复现
    N_FOLDS             = 5                                # 交叉验证折数
    USE_PLAYERS_INTERACTIONS = True                        # 是否使用球员交互特征

    # 🏟️ 球场参数（单位：码）
    FIELD_X_MIN         = 0.0
    FIELD_X_MAX         = 120.0
    FIELD_Y_MIN         = 0.0
    FIELD_Y_MAX         = 53.3

    # 🧠 模型相关参数
    WINDOW_SIZE         = 12                               # 时序窗口长度
    DEVICE              = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    """设置随机种子以确保实验结果可复现。"""
    import random, os
    random.seed(SEED)                               # Python 内置随机数模块
    np.random.seed(SEED)                            # NumPy 随机数生成器
    torch.manual_seed(SEED)                         # PyTorch CPU 随机数
    torch.cuda.manual_seed_all(SEED)                # PyTorch GPU 随机数（全部 GPU）
    os.environ["PYTHONHASHSEED"] = str(SEED)        # 控制哈希随机化（字典等结构）
    torch.backends.cudnn.deterministic = True       # 使用确定性算法，保证结果一致
    torch.backends.cudnn.benchmark = False          # 关闭自动优化，避免结果不稳定

In [4]:
# 使用 Config 中定义的随机种子
def set_global_seeds(seed: int = 42):
    """设置随机种子以确保实验结果可复现。"""
    import random, os
    random.seed(seed)                               # Python 内置随机数模块
    np.random.seed(seed)                            # NumPy 随机数生成器
    torch.manual_seed(seed)                         # PyTorch CPU 随机数
    torch.cuda.manual_seed_all(seed)                # PyTorch GPU 随机数（全部 GPU）
    os.environ["PYTHONHASHSEED"] = str(seed)        # 控制哈希随机化（字典等结构）
    torch.backends.cudnn.deterministic = True       # 使用确定性算法，保证结果一致
    torch.backends.cudnn.benchmark = False          # 关闭自动优化，避免结果不稳定




In [5]:
# 加载数据
def load_data(debug_fraction: float = 1.0):
    """加载训练集与测试集数据，可按比例加载部分数据用于调试。"""
    print("开始加载数据...")

    # 构造训练数据文件路径（共 18 周）
    train_input_files  = [Config.DATA_DIR / f"train/input_2023_w{w:02d}.csv"  for w in range(1, 19)]
    train_output_files = [Config.DATA_DIR / f"train/output_2023_w{w:02d}.csv" for w in range(1, 19)]

    # 过滤掉不存在的文件
    train_input_files  = [f for f in train_input_files  if f.exists()]
    train_output_files = [f for f in train_output_files if f.exists()]
    print(f"检测到 {len(train_input_files)} 周的有效训练数据。")

    # 读取并合并训练数据，同时添加 week 列区分周次
    train_input = pd.concat(
        [pd.read_csv(f).assign(week=w) for w, f in enumerate(train_input_files, start=1)],
        ignore_index=True
    )
    train_output = pd.concat(
        [pd.read_csv(f).assign(week=w) for w, f in enumerate(train_output_files, start=1)],
        ignore_index=True
    )

    # 读取测试集数据
    test_input    = pd.read_csv(Config.DATA_DIR / "test_input.csv")
    test_template = pd.read_csv(Config.DATA_DIR / "test.csv")
    print(f"已加载 {len(train_input):,} 条输入记录，{len(train_output):,} 条输出记录。")

    # 若开启调试模式，仅使用部分比赛数据
    if debug_fraction < 1.0:
        unique_game_ids  = train_input["game_id"].unique()  # 所有比赛 ID
        sampled_game_ids = pd.Series(unique_game_ids).sample(frac=debug_fraction, random_state=42).values
        train_input  = train_input[train_input["game_id"].isin(sampled_game_ids)].reset_index(drop=True)
        train_output = train_output[train_output["game_id"].isin(sampled_game_ids)].reset_index(drop=True)
        print(f"调试模式：使用 {len(train_input):,} 条输入记录，共 {len(sampled_game_ids)} 场比赛。")

    return train_input, train_output, test_input, test_template



In [6]:
class ParticipantVisibleError(Exception):
    """自定义异常类：用于在评分时向参赛者提示错误信息。"""
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    计算 NFL 竞赛的预测得分（RMSE）。
    
    参数：
        solution:   官方真实数据（DataFrame）
        submission: 选手提交的预测结果（DataFrame）
        row_id_column_name: 唯一标识列名（通常为 'id'）

    返回：
        float 类型的 RMSE（Root Mean Squared Error，均方根误差）

    要求：
        solution 和 submission 必须都包含：
            - 'id'：每条样本唯一标识（由 game_id, play_id, nfl_id, frame_id 组成）
            - 'x' 和 'y'：球员在场上的位置坐标

    示例：
        >>> row_id_column_name = 'id'
        >>> solution = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [1,2,3], 'y':[4,2,3]})
        >>> submission = pd.DataFrame({'id': ['21_12_2_1', '21_12_2_2', '21_12_2_3'], 'x': [1.1,2,3], 'y':[4,2.2,3]})
        >>> round(score(solution, submission, row_id_column_name=row_id_column_name), 4)
        0.0913
    """

    TARGET = ['x', 'y']  # 需要计算误差的目标列

    # 检查唯一标识列是否存在
    if row_id_column_name not in solution.columns:
        raise ParticipantVisibleError(f"Solution 文件缺少必要列: '{row_id_column_name}'")
    if row_id_column_name not in submission.columns:
        raise ParticipantVisibleError(f"Submission 文件缺少必要列: '{row_id_column_name}'")

    # 检查预测与真实文件中是否包含目标列 'x'、'y'
    missing_in_solution = set(TARGET) - set(solution.columns)
    missing_in_submission = set(TARGET) - set(submission.columns)

    if missing_in_solution:
        raise ParticipantVisibleError(f"Solution 文件缺少列: {missing_in_solution}")
    if missing_in_submission:
        raise ParticipantVisibleError(f"Submission 文件缺少列: {missing_in_submission}")

    # 只保留 id、x、y 列（防止额外无关列干扰）
    submission = submission[['id'] + TARGET]

    # 按 id 合并真实值与预测值
    merged_df = pd.merge(solution, submission, on=row_id_column_name, suffixes=('_true', '_pred'))

    # 检查预测结果中是否存在 NaN
    nanx_in_pred = merged_df['x_pred'].isna().sum()
    nany_in_pred = merged_df['y_pred'].isna().sum()
    if nanx_in_pred > 0:
        print(f"警告：预测结果中 x_pred 存在 {nanx_in_pred} 个 NaN 值。")
    if nany_in_pred > 0:
        print(f"警告：预测结果中 y_pred 存在 {nany_in_pred} 个 NaN 值。")

    # 检查真实值中对应 NaN 预测的样本是否也有缺失
    nanx_in_true = merged_df[merged_df['x_pred'].isna() | merged_df['y_pred'].isna()]['x_true'].isna().sum()
    nany_in_true = merged_df[merged_df['x_pred'].isna() | merged_df['y_pred'].isna()]['y_true'].isna().sum()
    if nanx_in_true > 0:
        print(f"警告：真实值中 x_true 存在 {nanx_in_true} 个与 NaN 预测对应的缺失。")
    if nany_in_true > 0:
        print(f"警告：真实值中 y_true 存在 {nany_in_true} 个与 NaN 预测对应的缺失。")

    # 计算 RMSE（对 x 与 y 分别计算 MSE 后取平均再开方）
    rmse = np.sqrt(
        0.5 * (
            mean_squared_error(merged_df['x_true'], merged_df['x_pred']) +
            mean_squared_error(merged_df['y_true'], merged_df['y_pred'])
        )
    )

    return float(rmse)


In [7]:
def _compute_interactions_for_play_frames(df_play_frames: pd.DataFrame) -> pd.DataFrame:
    """
    计算每个 (game_id, play_id, frame_id, nfl_id) 的球员交互特征。

    输出特征包括：
      - 距离统计（进攻方/防守方的平均、最小、最大距离）
      - 相对速度统计（平均、最小、最大）
      - 角度统计（进攻方/防守方的平均、最小、最大角度，平均角度为圆均值）
      - 最近对手的距离、角度、相对速度

    注意：
      - 若存在列 'player_to_predict'，则只计算该列为 True 的球员；
      - 否则，对所有球员计算。
    """

    out_rows = []  # 存储每一帧的计算结果

    # 遍历每个 (比赛, 战术, 帧)
    for (g, p, f), grp in df_play_frames.groupby(['game_id', 'play_id', 'frame_id'], sort=False):
        n = len(grp)
        if n == 0:
            continue  # 跳过空帧

        # 提取基本信息
        nfl_ids = grp['nfl_id'].to_numpy()
        x  = grp['x'].to_numpy(dtype=np.float32)
        y  = grp['y'].to_numpy(dtype=np.float32)
        vx = grp['velocity_x'].to_numpy(dtype=np.float32)
        vy = grp['velocity_y'].to_numpy(dtype=np.float32)
        is_off = grp['is_offense'].to_numpy().astype(bool)

        # 若存在 player_to_predict，则仅计算标记为 True 的球员
        compute_mask = grp['player_to_predict'].to_numpy().astype(bool) if 'player_to_predict' in grp.columns else np.ones(n, dtype=bool)

        # 计算两两球员间的几何关系（n×n矩阵）
        dx = x[None, :] - x[:, None]
        dy = y[None, :] - y[:, None]
        dist = np.sqrt(dx * dx + dy * dy)                # 距离矩阵
        angle_mat = np.arctan2(-dy, -dx)                 # 从球员 i 指向 j 的角度
        dvx = vx[:, None] - vx[None, :]
        dvy = vy[:, None] - vy[None, :]
        rel_speed = np.sqrt(dvx * dvx + dvy * dvy)       # 相对速度矩阵

        # 掩码矩阵：进攻方、 防守方、 对手方
        opp_mask = (is_off[:, None] != is_off[None, :])  # True 表示对手关系
        np.fill_diagonal(opp_mask, False)                # 自身置 False

        mask_off = np.broadcast_to(is_off[None, :], (n, n)).copy()   # 进攻方矩阵
        mask_def = np.broadcast_to(~is_off[None, :], (n, n)).copy()  # 防守方矩阵
        np.fill_diagonal(mask_off, False)
        np.fill_diagonal(mask_def, False)

        # 最近对手（基于最小距离）
        dist_opp = np.where(opp_mask, dist, np.nan)
        nearest_dist = np.nanmin(dist_opp, axis=1)
        nearest_idx = np.nanargmin(dist_opp, axis=1)
        all_nan = ~np.isfinite(nearest_dist)
        nearest_idx_safe = nearest_idx.copy()
        nearest_idx_safe[all_nan] = 0  # 防止 nanargmin 报错
        nearest_angle = np.take_along_axis(angle_mat, nearest_idx_safe[:, None], axis=1).squeeze(1)
        nearest_rel   = np.take_along_axis(rel_speed, nearest_idx_safe[:, None], axis=1).squeeze(1)
        nearest_angle[all_nan] = np.nan
        nearest_rel[all_nan]   = np.nan

        # 计算距离的统计特征
        d_off = np.where(mask_off, dist, np.nan)
        d_def = np.where(mask_def, dist, np.nan)
        d_mean_o = np.nanmean(d_off, axis=1); d_min_o = np.nanmin(d_off, axis=1); d_max_o = np.nanmax(d_off, axis=1)
        d_mean_d = np.nanmean(d_def, axis=1); d_min_d = np.nanmin(d_def, axis=1); d_max_d = np.nanmax(d_def, axis=1)

        # 计算相对速度的统计特征
        v_off = np.where(mask_off, rel_speed, np.nan)
        v_def = np.where(mask_def, rel_speed, np.nan)
        v_mean_o = np.nanmean(v_off, axis=1); v_min_o = np.nanmin(v_off, axis=1); v_max_o = np.nanmax(v_off, axis=1)
        v_mean_d = np.nanmean(v_def, axis=1); v_min_d = np.nanmin(v_def, axis=1); v_max_d = np.nanmax(v_def, axis=1)

        # 角度的平均值（圆均值）与最小/最大角
        sinA = np.sin(angle_mat); cosA = np.cos(angle_mat)
        cnt_off = mask_off.sum(axis=1).astype(np.float32)
        cnt_def = mask_def.sum(axis=1).astype(np.float32)
        denom_off = np.where(cnt_off > 0, cnt_off, np.nan)
        denom_def = np.where(cnt_def > 0, cnt_def, np.nan)
        sin_sum_off = (sinA * mask_off).sum(axis=1)
        cos_sum_off = (cosA * mask_off).sum(axis=1)
        sin_sum_def = (sinA * mask_def).sum(axis=1)
        cos_sum_def = (cosA * mask_def).sum(axis=1)
        a_mean_o = np.arctan2(sin_sum_off / denom_off, cos_sum_off / denom_off)
        a_mean_d = np.arctan2(sin_sum_def / denom_def, cos_sum_def / denom_def)
        a_off = np.where(mask_off, angle_mat, np.nan)
        a_def = np.where(mask_def, angle_mat, np.nan)
        a_min_o = np.nanmin(a_off, axis=1); a_max_o = np.nanmax(a_off, axis=1)
        a_min_d = np.nanmin(a_def, axis=1); a_max_d = np.nanmax(a_def, axis=1)

        # 生成输出结果，仅对 compute_mask 为 True 的球员输出
        for idx, nid in enumerate(nfl_ids):
            if not compute_mask[idx]:
                continue
            out_rows.append({
                'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': int(nid),

                # 进攻方距离和速度特征
                'distance_to_player_mean_offense': d_mean_o[idx],
                'distance_to_player_min_offense':  d_min_o[idx],
                'distance_to_player_max_offense':  d_max_o[idx],
                'relative_velocity_magnitude_mean_offense': v_mean_o[idx],
                'relative_velocity_magnitude_min_offense':  v_min_o[idx],
                'relative_velocity_magnitude_max_offense':  v_max_o[idx],
                'angle_to_player_mean_offense': a_mean_o[idx],
                'angle_to_player_min_offense':  a_min_o[idx],
                'angle_to_player_max_offense':  a_max_o[idx],

                # 防守方距离和速度特征
                'distance_to_player_mean_defense': d_mean_d[idx],
                'distance_to_player_min_defense':  d_min_d[idx],
                'distance_to_player_max_defense':  d_max_d[idx],
                'relative_velocity_magnitude_mean_defense': v_mean_d[idx],
                'relative_velocity_magnitude_min_defense':  v_min_d[idx],
                'relative_velocity_magnitude_max_defense':  v_max_d[idx],
                'angle_to_player_mean_defense': a_mean_d[idx],
                'angle_to_player_min_defense':  a_min_d[idx],
                'angle_to_player_max_defense':  a_max_d[idx],

                # 最近对手的特征
                'nearest_opponent_dist': float(nearest_dist[idx]) if np.isfinite(nearest_dist[idx]) else np.nan,
                'nearest_opponent_angle': float(nearest_angle[idx]) if np.isfinite(nearest_angle[idx]) else np.nan,
                'nearest_opponent_rel_speed': float(nearest_rel[idx]) if np.isfinite(nearest_rel[idx]) else np.nan,
            })

    # 返回包含所有特征的数据表
    return pd.DataFrame(
        out_rows,
        columns=[
            'game_id', 'play_id', 'frame_id', 'nfl_id',

            # 进攻方距离特征
            'distance_to_player_mean_offense',
            'distance_to_player_min_offense',
            'distance_to_player_max_offense',

            # 进攻方相对速度特征
            'relative_velocity_magnitude_mean_offense',
            'relative_velocity_magnitude_min_offense',
            'relative_velocity_magnitude_max_offense',

            # 进攻方角度特征
            'angle_to_player_mean_offense',
            'angle_to_player_min_offense',
            'angle_to_player_max_offense',

            # 防守方距离特征
            'distance_to_player_mean_defense',
            'distance_to_player_min_defense',
            'distance_to_player_max_defense',

            # 防守方相对速度特征
            'relative_velocity_magnitude_mean_defense',
            'relative_velocity_magnitude_min_defense',
            'relative_velocity_magnitude_max_defense',

            # 防守方角度特征
            'angle_to_player_mean_defense',
            'angle_to_player_min_defense',
            'angle_to_player_max_defense',

            # 最近对手特征
            'nearest_opponent_dist',
            'nearest_opponent_angle',
            'nearest_opponent_rel_speed'
        ]
    )



In [8]:
from typing import Optional

def height_to_feet(height_str: str) -> Optional[float]:
    """将身高从 '英尺-英寸' 格式（如 '6-2'）转换为以英尺为单位的小数。"""
    try:
        ft, inches = map(int, height_str.split('-'))
        return ft + inches / 12
    except Exception:
        return None



In [9]:
def add_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
    """添加 30~40 个高级特征，以提升模型预测性能。"""
    print("正在添加高级特征...")

    df = df.copy()
    df = df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    gcols =             ['game_id', 'play_id', 'nfl_id']  # 分组键（每名球员的时间序列）

    # 1. 距离变化特征（Distance Rate Features）
    if 'distance_to_ball' in df.columns:
        df['distance_to_ball_change'] = df.groupby(gcols)['distance_to_ball'].diff().fillna(0)  # 距离变化
        df['distance_to_ball_accel']  = df.groupby(gcols)['distance_to_ball_change'].diff().fillna(0)  # 距离加速度
        df['time_to_intercept']       = (df['distance_to_ball'] /
                                        (np.abs(df['distance_to_ball_change']) + 0.1)).clip(0, 10)  # 估算到球时间

    # 2. 方向对齐特征（Target Alignment Features）
    if 'ball_direction_x' in df.columns:
        # 速度在球方向上的投影（对齐程度）
        df['velocity_alignment'] = (
            df['velocity_x'] * df['ball_direction_x'] +
            df['velocity_y'] * df['ball_direction_y']
        )
        # 垂直于球方向的速度分量
        df['velocity_perpendicular'] = (
            df['velocity_x'] * (-df['ball_direction_y']) +
            df['velocity_y'] * df['ball_direction_x']
        )
        # 加速度在球方向上的分量
        if 'acceleration_x' in df.columns:
            df['accel_alignment'] = (
                df['acceleration_x'] * df['ball_direction_x'] +
                df['acceleration_y'] * df['ball_direction_y']
            )

    # 3. 多窗口滚动特征（Multi-Window Rolling）
    for window in [3, 5, 10]:
        for col in ['velocity_x', 'velocity_y', 's', 'a']:
            if col in df.columns:
                # 滚动平均值（平滑趋势）
                df[f'{col}_roll{window}'] = df.groupby(gcols)[col].transform(
                    lambda x: x.rolling(window, min_periods=1).mean()
                )
                # 滚动标准差（短期波动性）
                df[f'{col}_std{window}'] = df.groupby(gcols)[col].transform(
                    lambda x: x.rolling(window, min_periods=1).std()
                ).fillna(0)

    # 4. 滞后特征（Extended Lag Features）
    for lag in [4, 5]:
        for col in ['x', 'y', 'velocity_x', 'velocity_y']:
            if col in df.columns:
                df[f'{col}_lag{lag}'] = df.groupby(gcols)[col].shift(lag).fillna(0)

    # 5. 速度变化特征（Velocity Change Features）
    if 'velocity_x' in df.columns:
        df['velocity_x_change'] = df.groupby(gcols)['velocity_x'].diff().fillna(0)  # X方向速度变化
        df['velocity_y_change'] = df.groupby(gcols)['velocity_y'].diff().fillna(0)  # Y方向速度变化
        df['speed_change']      = df.groupby(gcols)['s'].diff().fillna(0)           # 速度模变化
        df['direction_change']  = df.groupby(gcols)['dir'].diff().fillna(0)         # 方向变化（角度）
        # 修正角度跳变（如 -179° → 181°）
        df['direction_change']  = df['direction_change'].apply(
            lambda x: x if abs(x) < 180 else x - 360 * np.sign(x)
        )

    # 6. 场地位置特征（Field Position Features）
    df['dist_from_left']      = df['y']                         # 距离左边线
    df['dist_from_right']     = 53.3 - df['y']                  # 距离右边线
    df['dist_from_sideline']  = np.minimum(df['dist_from_left'], df['dist_from_right'])  # 距离最近边线
    df['dist_from_endzone']   = np.minimum(df['x'], 120 - df['x'])  # 距离端区（前后方向）

    # 7. 角色特征（Role-Specific Features）
    if 'is_receiver' in df.columns and 'velocity_alignment' in df.columns:
        df['receiver_optimality'] = df['is_receiver'] * df['velocity_alignment']              # 接球手运动方向匹配度
        df['receiver_deviation']  = df['is_receiver'] * np.abs(df.get('velocity_perpendicular', 0))  # 偏离球方向程度
    if 'is_coverage' in df.columns and 'closing_speed' in df.columns:
        df['defender_closing_speed'] = df['is_coverage'] * df['closing_speed']               # 防守球员逼近速度

    # 8. 时间特征（Time Features）
    df['frames_elapsed']  = df.groupby(gcols).cumcount()  # 当前帧序号
    df['normalized_time'] = df.groupby(gcols)['frames_elapsed'].transform(
        lambda x: x / (x.max() + 1)                       # 归一化时间（0~1）
    )

    print(f"特征增强后总列数: {len(df.columns)}")
    return df


In [10]:
import torch
import torch.nn as nn

class GaussianNoise(nn.Module):
    """
    高斯噪声层（Gaussian Noise Layer）

    作用：
        在训练阶段向输入张量添加随机高斯噪声，用于数据增强或正则化，防止过拟合。
        在推理（评估）阶段则不添加噪声。

    参数：
        stddev (float): 高斯噪声的标准差，控制噪声强度。
    """

    def __init__(self, stddev: float):
        super().__init__()
        self.stddev = stddev

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        前向传播：
            若处于训练模式，则向输入添加噪声；
            若处于评估模式（model.eval()），则直接返回原输入。
        """
        if self.training:
            noise = torch.randn_like(x) * self.stddev   # 生成与输入同形状的高斯噪声
            return x + noise                            # 输入加噪
        return x                                        # 测试时不加噪


In [11]:
import torch
import torch.nn as nn

class SeqModel(nn.Module):
    """
    时序预测模型（Sequence Model）
    ---------------------------------------------------------
    模型结构：
        [输入序列特征] → [双向GRU] → [注意力池化] → [全连接预测头]
                       → [累积输出（预测轨迹/位移）]

    参数说明：
        input_dim (int): 输入特征维度（每帧的特征数）
        horizon (int):   预测时间步长（输出长度，例如未来N帧）

    模块说明：
        - GRU层：提取时序动态特征（双向）
        - LayerNorm：规范化隐藏状态，提升数值稳定性
        - Multi-Head Attention：基于全局上下文的加权汇聚（池化层）
        - Linear Head：多层感知器输出预测值
        - torch.cumsum：对预测结果进行累积求和（例如预测位移序列）
    """

    def __init__(self, input_dim: int, horizon: int):
        super().__init__()

        # ① 双向 GRU 层：输入→隐藏→输出
        self.gru = nn.GRU(
            input_dim,              # 输入维度
            128,                    # 隐藏层维度
            num_layers=2,           # 堆叠层数
            batch_first=True,       # 输入格式 (B, T, D)
            dropout=0.1,            # 层间 dropout
            bidirectional=True      # 双向 GRU
        )

        # ② 层归一化 LayerNorm：稳定双向拼接后的输出
        self.pool_ln = nn.LayerNorm(256)  # 128×2 = 256

        # ③ 注意力池化 Multi-Head Attention
        self.pool_attn = nn.MultiheadAttention(
            embed_dim=256,          # 输入维度与 LayerNorm 输出相同
            num_heads=4,            # 多头注意力数
            batch_first=True        # 支持 (B, T, D) 格式
        )

        # 注意力查询向量（可学习参数）
        self.pool_query = nn.Parameter(torch.randn(1, 1, 256))

        # ④ 输出预测头（MLP）
        self.head = nn.Sequential(
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, horizon)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        前向传播过程
        参数：
            x: 输入张量 [B, T, input_dim]
        返回：
            y: 输出张量 [B, horizon]
        """
        # ① 提取时间特征
        h, _ = self.gru(x)                # h: [B, T, 256]
        B = h.size(0)

        # ② 复制查询向量（每个batch一份）
        q = self.pool_query.expand(B, -1, -1)  # [B, 1, 256]

        # ③ 注意力池化：让查询q关注整段序列h
        ctx, _ = self.pool_attn(q, self.pool_ln(h), self.pool_ln(h))  # ctx: [B, 1, 256]

        # ④ 通过预测头映射输出（逐步预测）
        out = self.head(ctx.squeeze(1))   # [B, horizon]

        # ⑤ 对预测结果进行累积求和，生成平滑轨迹
        return torch.cumsum(out, dim=1)


In [12]:
# =========================================================
# 模型构建、保存与加载工具函数
# =========================================================
from pathlib import Path
import torch
import joblib


def build_axis_model_from_config(cfg):
    """
    根据配置字典实例化 SeqModel 模型。

    参数:
        cfg (dict): 包含模型超参数的配置，例如：
            {
                'input_dim': 128,
                'horizon': 20
            }

    返回:
        model (SeqModel): 构建好的序列模型实例。
    """
    input_dim = cfg['input_dim']
    horizon = cfg['horizon']
    print(f"📦 从配置构建 SeqModel (input_dim={input_dim}, horizon={horizon})")
    return SeqModel(input_dim=input_dim, horizon=horizon)


def _model_tag_from_instance(model):
    """
    根据模型实例返回其类型标签（tag）。

    参数:
        model (nn.Module): 模型实例。

    返回:
        str: 模型类型标签，例如 'seq'。
    """
    if isinstance(model, SeqModel):
        return 'seq'
    return model.__class__.__name__.lower()


def create_model_save_config(model, input_dim, horizon):
    """
    生成一个最小化的模型配置，用于模型重建。

    参数:
        model (nn.Module): 模型实例（目前仅支持 SeqModel）
        input_dim (int): 输入维度
        horizon (int): 预测步长

    返回:
        dict: 可保存到 checkpoint 的配置字典
    """
    print(f"🧩 创建模型配置: input_dim={input_dim}, horizon={horizon}")
    return {
        'model': 'seq',
        'input_dim': int(input_dim),
        'horizon': int(horizon),
    }


def save_axis_checkpoint(model, cfg, fold_dir, axis_name='x'):
    """
    保存 SeqModel 的模型权重与配置到指定路径。

    参数:
        model (SeqModel): 模型实例
        cfg (dict): 模型配置
        fold_dir (str | Path): 保存目录
        axis_name (str): 坐标轴名称 ('x' 或 'y')

    保存内容:
        fold_dir/axis_x.pt
        fold_dir/axis_y.pt
    """
    cfg = dict(cfg or {})
    cfg['model'] = 'seq'
    path = Path(fold_dir) / f'axis_{axis_name}.pt'
    torch.save({'state_dict': model.state_dict(), 'config': cfg}, str(path))
    print(f"💾 模型已保存至: {path}")


def load_axis_checkpoint(fold_dir, axis_name='x', device=None):
    """
    从指定路径加载单个 SeqModel 检查点。

    参数:
        fold_dir (str | Path): 模型文件所在目录
        axis_name (str): 模型轴标签 ('x' 或 'y')
        device (torch.device | None): 加载目标设备

    返回:
        model (SeqModel): 已加载参数的模型实例
        cfg (dict): 模型配置
    """
    device = device or Config.DEVICE
    ckpt_path = Path(fold_dir) / f'axis_{axis_name}.pt'
    print(f"📂 正在加载模型 [{axis_name}] 来自 {ckpt_path}")

    ckpt = torch.load(str(ckpt_path), map_location=device)
    cfg = ckpt['config']
    state_dict = ckpt['state_dict']

    try:
        # 尝试严格匹配加载
        model = SeqModel(input_dim=cfg['input_dim'], horizon=cfg['horizon']).to(device)
        model.load_state_dict(state_dict, strict=True)
        model.eval()
        print(f"✅ [{axis_name}] 模型加载成功 (strict=True)")
        return model, cfg
    except Exception as e:
        print(f"⚠️ [{axis_name}] 严格加载失败: {e}，尝试使用 non-strict 模式...")

    # 宽松匹配加载（适用于参数名或结构略有差异的情况）
    model = SeqModel(input_dim=cfg['input_dim'], horizon=cfg['horizon']).to(device)
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    if missing or unexpected:
        print(f"⚠️ [{axis_name}] 非严格加载: 缺失参数={len(missing)}, 意外参数={len(unexpected)}")
    model.eval()
    print(f"✅ [{axis_name}] 模型加载成功 (strict=False)")
    return model, cfg


def load_folds_xy(num_folds, models_dir=None, device=None):
    """
    加载所有折次 (folds) 的 X/Y 方向模型及其对应特征标准化器。

    参数:
        num_folds (int): 交叉验证折数
        models_dir (str | Path | None): 模型文件目录
        device (torch.device | None): 加载目标设备

    返回:
        (models_x, models_y, scalers, cfgs): 四个列表，分别为：
            - models_x: 所有 x 方向模型
            - models_y: 所有 y 方向模型
            - scalers:  每折的特征标准化器（joblib 对象）
            - cfgs:     每折的配置字典
    """
    device = device or Config.DEVICE
    base = Path(models_dir) if models_dir else Path('.')

    models_x, models_y, scalers, cfgs = [], [], [], []

    print(f"📁 开始加载 {num_folds} 折模型 (路径: {base})")

    for fold in range(1, num_folds + 1):
        fold_dir = base / f'fold_{fold}'
        try:
            # 分别加载 X/Y 模型与 scaler
            mx, cfgx = load_axis_checkpoint(fold_dir, 'x', device=device)
            my, cfgy = load_axis_checkpoint(fold_dir, 'y', device=device)
            scaler = joblib.load(str(fold_dir / 'lstm_feature_scaler_fold.joblib'))

            models_x.append(mx)
            models_y.append(my)
            scalers.append(scaler)
            cfgs.append(cfgx)
            print(f"✅ Fold {fold} 加载完成")
        except Exception as e:
            print(f"❌ Fold {fold} 加载失败: {e}")

    print(f"📦 共加载 {len(models_x)} 个 fold 模型（X/Y 方向）")
    return models_x, models_y, scalers, cfgs


In [13]:
def create_ensemble_predictions_xy(
    models_x, models_y, scalers, X_test_unscaled, test_seq_ids, test_template, batch_size=1024
):
    """
    基于多个折（fold）的 X/Y 轴模型进行集成预测（Ensemble），输出最终坐标预测结果。

    参数说明：
        models_x, models_y : list
            不同折的 SeqModel 模型（x 轴与 y 轴预测模型）。
        scalers : list | None
            对应每个 fold 的 StandardScaler 对象（或 None）。
        X_test_unscaled : list[np.ndarray]
            未缩放的测试序列列表，每个元素形状为 (T, F)。
        test_seq_ids : list[dict]
            每个序列对应的元数据，包含：
            {'game_id', 'play_id', 'nfl_id', 'frame_id'（最后一帧）}
        test_template : pd.DataFrame
            官方提交格式的模板 DataFrame，包含所有 (game_id, play_id, nfl_id, frame_id)。
        batch_size : int
            批量大小，默认 1024。

    返回：
        submission : pd.DataFrame
            最终预测结果，包含列 ['id', 'x', 'y']。
    """

    # ============================
    # Step 0. 模型与数据检查
    # ============================
    if len(models_x) == 0 or len(models_x) != len(models_y):
        print(f"⚠️ 模型数量错误：len(models_x)={len(models_x)}, len(models_y)={len(models_y)}")
        print("❌ 没有可用的模型或折数不匹配。")
        return None

    if scalers is not None and len(scalers) != len(models_x):
        raise ValueError("❌ scalers 的数量必须与模型折数相同。")

    print(f"🧩 共检测到 {len(models_x)} 个折 (fold) 模型，将进行集成预测。")

    # ============================
    # Step 1. 数据预处理
    # ============================
    X_test_unscaled = np.array(X_test_unscaled, dtype=object)
    N = len(X_test_unscaled)
    print(f"📦 测试序列数量: {N}")

    # 获取每个序列最后一帧的原始坐标 (x, y)
    x_last = np.array([seq[-1, 0] for seq in X_test_unscaled], dtype=np.float32)
    y_last = np.array([seq[-1, 1] for seq in X_test_unscaled], dtype=np.float32)

    per_fold_dx, per_fold_dy = [], []

    # ============================
    # Step 2. 各折模型预测
    # ============================
    for i in range(len(models_x)):
        print(f"\n🚀 第 {i + 1}/{len(models_x)} 折模型预测中...")
        model_x = models_x[i]
        model_y = models_y[i]
        scaler = scalers[i] if scalers is not None else None

        # 如果存在标准化器，则逐序列进行缩放
        if scaler is not None:
            scaled = np.array([scaler.transform(s) for s in X_test_unscaled], dtype=object)
        else:
            scaled = X_test_unscaled

        # 拼接为 (N, T, F)
        X = np.stack(scaled.astype(np.float32))
        device = next(model_x.parameters()).device

        # 构建 DataLoader
        ds = TensorDataset(torch.from_numpy(X))
        dl = DataLoader(ds, batch_size=batch_size, shuffle=False)

        dx_list, dy_list = [], []
        model_x.eval()
        model_y.eval()

        with torch.no_grad():
            for (batch,) in dl:
                batch = batch.to(device)
                dx = model_x(batch)   # (B, H)
                dy = model_y(batch)   # (B, H)
                dx_list.append(dx.cpu().numpy())
                dy_list.append(dy.cpu().numpy())

        # 拼接所有 batch 输出
        dx_cum = np.vstack(dx_list)  # (N, H)
        dy_cum = np.vstack(dy_list)  # (N, H)

        per_fold_dx.append(dx_cum)
        per_fold_dy.append(dy_cum)
        print(f"✅ 第 {i + 1} 折预测完成 (形状: dx={dx_cum.shape}, dy={dy_cum.shape})")

    # ============================
    # Step 3. 模型集成 (取均值)
    # ============================
    print("\n🔗 开始模型集成 (平均融合各折预测)...")
    ens_dx = np.mean(np.stack(per_fold_dx, axis=0), axis=0)  # (N, H)
    ens_dy = np.mean(np.stack(per_fold_dy, axis=0), axis=0)  # (N, H)
    print(f"✅ 集成完成 (ens_dx={ens_dx.shape}, ens_dy={ens_dy.shape})")

    # ============================
    # Step 4. 构建提交结果
    # ============================
    test_meta = pd.DataFrame(test_seq_ids)
    out_rows = []
    H = ens_dx.shape[1]

    print("\n🧮 正在生成最终坐标预测结果...")

    for i, seq_info in test_meta.iterrows():
        game_id = int(seq_info['game_id'])
        play_id = int(seq_info['play_id'])
        nfl_id = int(seq_info['nfl_id'])

        # 找出该球员的预测帧序列
        frame_ids = (
            test_template[
                (test_template['game_id'] == game_id) &
                (test_template['play_id'] == play_id) &
                (test_template['nfl_id'] == nfl_id)
            ]['frame_id'].sort_values()
        )

        # 生成每帧预测坐标
        for t, frame_id in enumerate(frame_ids):
            tt = t if t < H else H - 1
            px = np.clip(x_last[i] + ens_dx[i, tt], Config.FIELD_X_MIN, Config.FIELD_X_MAX)
            py = np.clip(y_last[i] + ens_dy[i, tt], Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
            out_rows.append({
                'id': f"{game_id}_{play_id}_{nfl_id}_{frame_id}",
                'x': px,
                'y': py
            })

    submission = pd.DataFrame(out_rows)
    print(f"\n✅ 共生成 {len(submission)} 条预测结果。")

    return submission


In [14]:
def create_ensemble_val_predictions(
    models, scalers, X_val_unscaled, val_ids,
    y_val_dx_fold, y_val_dy_fold, val_data,
    exclude_fold=None
):
    """
    在验证集上生成集成预测（ensemble），并准备真实值与预测结果以供评分。

    功能：
        - 对每个验证序列，使用其他折（fold）的模型进行预测；
        - 防止数据泄露（排除当前fold对应的模型）；
        - 返回真实轨迹与预测轨迹的 DataFrame。

    参数：
        models : list[nn.Module]
            训练好的模型列表（通常为多个折的模型）。
        scalers : list[StandardScaler]
            与模型对应的特征标准化器。
        X_val_unscaled : list[np.ndarray]
            未缩放的验证序列，每个元素为 (T, F)。
        val_ids : list[dict]
            每条序列的元信息，包含 ['game_id', 'play_id', 'nfl_id']。
        y_val_dx_fold, y_val_dy_fold : list[np.ndarray]
            每条验证样本的真实位移（Δx, Δy）。
        val_data : pd.DataFrame
            包含验证集中每个样本的 x_last、y_last。
        exclude_fold : int | None
            需要排除的折号（0-based），用于避免模型泄露。

    返回：
        tuple(pd.DataFrame, pd.DataFrame)
            (ensemble_pred_df, ensemble_true_df)
            其中：
            - ensemble_pred_df：预测结果 DataFrame（id, x, y）
            - ensemble_true_df：真实结果 DataFrame（id, x, y）
    """

    pred_rows = []
    true_rows = []

    print("🔍 开始生成验证集集成预测...")
    if exclude_fold is not None:
        print(f"⚙️  当前排除第 {exclude_fold + 1} 折的模型，以防止数据泄露。")

    for i, seq_info in enumerate(val_ids):
        game_id = seq_info["game_id"]
        play_id = seq_info["play_id"]
        nfl_id = seq_info["nfl_id"]

        x_last = val_data.iloc[i]["x_last"]
        y_last = val_data.iloc[i]["y_last"]

        # ----------------------------
        # 真实轨迹 (Ground Truth)
        # ----------------------------
        dx_true = y_val_dx_fold[i]
        dy_true = y_val_dy_fold[i]
        for t in range(len(dx_true)):
            frame_rel = t + 1
            true_x = x_last + dx_true[t]
            true_y = y_last + dy_true[t]
            true_rows.append({
                "id": f"{game_id}_{play_id}_{nfl_id}_{frame_rel}",
                "x": true_x,
                "y": true_y
            })

        # ----------------------------
        # 模型预测（排除当前 fold）
        # ----------------------------
        per_model_dx = []
        per_model_dy = []

        for j, model in enumerate(models):
            if exclude_fold is not None and j == exclude_fold:
                continue  # 排除当前验证 fold 对应模型

            scaler = scalers[j]
            scaled_seq = scaler.transform(X_val_unscaled[i]).astype(np.float32)
            scaled_seq = torch.tensor(scaled_seq).unsqueeze(0).to(next(model.parameters()).device)

            model.eval()
            with torch.no_grad():
                output = model(scaled_seq).cpu().numpy()[0]  # (H, 2)
            per_model_dx.append(output[:, 0])
            per_model_dy.append(output[:, 1])

        # ----------------------------
        # 集成（取平均）
        # ----------------------------
        if per_model_dx:  # 至少一个模型可用
            ens_dx = np.mean(per_model_dx, axis=0)
            ens_dy = np.mean(per_model_dy, axis=0)
        else:
            # 理论上不会发生，除非只有一个 fold
            ens_dx = np.zeros(len(dx_true))
            ens_dy = np.zeros(len(dy_true))
            print(f"⚠️ 样本 {i} 没有可用模型，使用零预测。")

        # ----------------------------
        # 生成预测轨迹
        # ----------------------------
        for t in range(len(dx_true)):
            pred_x = x_last + ens_dx[t]
            pred_y = y_last + ens_dy[t]
            pred_rows.append({
                "id": f"{game_id}_{play_id}_{nfl_id}_{t + 1}",
                "x": np.clip(pred_x, Config.FIELD_X_MIN, Config.FIELD_X_MAX),
                "y": np.clip(pred_y, Config.FIELD_Y_MIN, Config.FIELD_Y_MAX)
            })

    print(f"✅ 集成预测完成，共生成 {len(pred_rows)} 条预测样本。")

    return pd.DataFrame(pred_rows), pd.DataFrame(true_rows)


In [15]:
set_global_seeds(Config.SEED)
# print(f"Loading pretrained models from {Config.NN_PRETRAIN_DIR}")
# models_x_nn, models_y_nn, scalers, cfgs = load_folds_xy(num_folds=Config.N_FOLDS, models_dir=Config.NN_PRETRAIN_DIR, device=Config.DEVICE)


train_input, train_output, test_input, test_template = load_data(debug_fraction=1.0)

开始加载数据...
检测到 18 周的有效训练数据。
已加载 4,880,579 条输入记录，562,936 条输出记录。


In [16]:
# # 生成时序样本（测试模式）
# test_sequences, test_seq_ids, feature_cols = prepare_sequences(
#     test_input, test_template=test_template, is_training=False, window_size=Config.WINDOW_SIZE
# )

# print(f"✅ 已准备好 {len(test_sequences)} 个测试序列，每个样本包含 {len(feature_cols)} 个特征。")
# print(f"📏 示例序列形状：{test_sequences[0].shape}")








训练数据若帧数不足 不填充

In [17]:
def prepare_sequences(input_df, output_df=None, test_template=None,
                      is_training=True, window_size=8,
                      use_players_interactions=Config.USE_PLAYERS_INTERACTIONS):
    """
    构建包含所有高级特征的时序样本序列（支持训练与测试）。
    input_df：输入特征数据
    output_df：训练标签（仅在 is_training=True 时使用）
    test_template：测试模板（仅在预测时使用）
    """
    print(f"\n{'=' * 80}")
    print("🚀 开始构建时序样本（包含高级特征）")
    print(f"{'=' * 80}")
    print(f"窗口大小（window_size）: {window_size}")

    input_df = input_df.copy()

    # Step 1：基础特征构建
    print("步骤 1/4 ▶ 添加基础特征...")

    # 将球员身高从“英尺-英寸”格式转换为英尺小数
    input_df['player_height_feet'] = input_df['player_height'].apply(height_to_feet)

    # 计算速度和加速度的 x、y 分量
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))
    delta_t = 0.1
    input_df['velocity_x']     = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.sin(dir_rad)
    input_df['velocity_y']     = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.cos(dir_rad)
    input_df['acceleration_x'] = input_df['a'] * np.sin(dir_rad)
    input_df['acceleration_y'] = input_df['a'] * np.cos(dir_rad)

    # 角度特征：将朝向角(o)与运动方向(dir)编码为正弦/余弦形式
    input_df['o_sin']  = np.sin(np.deg2rad(input_df['o'].fillna(0)))
    input_df['o_cos']  = np.cos(np.deg2rad(input_df['o'].fillna(0)))
    input_df['dir_sin'] = np.sin(np.deg2rad(input_df['dir'].fillna(0)))
    input_df['dir_cos'] = np.cos(np.deg2rad(input_df['dir'].fillna(0)))

    # 角色特征：进攻/防守/传球/接球/防守覆盖
    input_df['is_offense']  = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense']  = (input_df['player_side'] == 'Defense').astype(int)
    input_df['is_receiver'] = (input_df['player_role'] == 'Targeted Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer']   = (input_df['player_role'] == 'Passer').astype(int)

    # 物理特征：动量与动能
    mass_kg = input_df['player_weight'].fillna(200.0) / 2.20462  # 磅→千克
    input_df['momentum_x']     = input_df['velocity_x'] * mass_kg
    input_df['momentum_y']     = input_df['velocity_y'] * mass_kg
    input_df['kinetic_energy'] = 0.5 * mass_kg * (input_df['s'] ** 2)

    # 球与球员之间的空间特征
    if 'ball_land_x' in input_df.columns:
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']
        input_df['distance_to_ball']   = np.sqrt(ball_dx**2 + ball_dy**2)
        input_df['angle_to_ball']      = np.arctan2(ball_dy, ball_dx)
        input_df['ball_direction_x']   = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y']   = ball_dy / (input_df['distance_to_ball'] + 1e-6)
        input_df['closing_speed']      = (
            input_df['velocity_x'] * input_df['ball_direction_x'] +
            input_df['velocity_y'] * input_df['ball_direction_y']
        )

    # 时间排序（确保帧序一致）
    input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    gcols = ['game_id', 'play_id', 'nfl_id']

    # 添加滞后特征（历史 1~3 帧）
    for lag in [1, 2, 3]:
        input_df[f'x_lag{lag}']          = input_df.groupby(gcols)['x'].shift(lag)
        input_df[f'y_lag{lag}']          = input_df.groupby(gcols)['y'].shift(lag)
        input_df[f'velocity_x_lag{lag}'] = input_df.groupby(gcols)['velocity_x'].shift(lag)
        input_df[f'velocity_y_lag{lag}'] = input_df.groupby(gcols)['velocity_y'].shift(lag)

    # EMA（指数滑动平均）平滑速度变化
    input_df['velocity_x_ema'] = input_df.groupby(gcols)['velocity_x'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['velocity_y_ema'] = input_df.groupby(gcols)['velocity_y'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['speed_ema'] = input_df.groupby(gcols)['s'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )

    # Step 2：高级特征
    print("步骤 2/4 ▶ 添加高级特征...")
    input_df = add_advanced_features(input_df)

    # Step 3：球员交互特征
    print("步骤 3/4 ▶ 添加球员交互特征...")
    if use_players_interactions:
        print("✅ 已启用球员交互特征计算（use_players_interactions=True）")

        agg_rows = []  # 用于保存每一帧的交互统计结果

        # 按比赛 (game_id)、回合 (play_id)、帧 (frame_id) 分组
        # 每一组包含同一帧中所有球员的空间状态
        for (g, p, f), grp in input_df.groupby(['game_id', 'play_id', 'frame_id'], sort=False):
            n = len(grp)
            nfl_ids = grp['nfl_id'].to_numpy()

            # 如果存在 player_to_predict，则只对需要预测的球员计算交互特征
            compute_mask = (
                grp['player_to_predict'].to_numpy().astype(bool)
                if 'player_to_predict' in grp.columns
                else np.ones(n, dtype=bool)
            )

            # 若该帧球员不足 2 人，则构造空记录（NaN）
            if n < 2:
                for nid in nfl_ids[compute_mask]:
                    agg_rows.append({
                        'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                        'distance_to_player_mean_offense': np.nan,
                        'distance_to_player_min_offense': np.nan,
                        'distance_to_player_max_offense': np.nan,
                        'relative_velocity_magnitude_mean_offense': np.nan,
                        'relative_velocity_magnitude_min_offense': np.nan,
                        'relative_velocity_magnitude_max_offense': np.nan,
                        'angle_to_player_mean_offense': np.nan,
                        'angle_to_player_min_offense': np.nan,
                        'angle_to_player_max_offense': np.nan,
                        'distance_to_player_mean_defense': np.nan,
                        'distance_to_player_min_defense': np.nan,
                        'distance_to_player_max_defense': np.nan,
                        'relative_velocity_magnitude_mean_defense': np.nan,
                        'relative_velocity_magnitude_min_defense': np.nan,
                        'relative_velocity_magnitude_max_defense': np.nan,
                        'angle_to_player_mean_defense': np.nan,
                        'angle_to_player_min_defense': np.nan,
                        'angle_to_player_max_defense': np.nan,
                        'nearest_opponent_dist': np.nan,
                        'nearest_opponent_angle': np.nan,
                        'nearest_opponent_rel_speed': np.nan,
                    })
                continue

            # 获取球员位置与速度信息
            x  = grp['x'].to_numpy(dtype=np.float32)
            y  = grp['y'].to_numpy(dtype=np.float32)
            vx = grp['velocity_x'].to_numpy(dtype=np.float32)
            vy = grp['velocity_y'].to_numpy(dtype=np.float32)
            is_offense = grp['is_offense'].to_numpy()
            is_defense = grp['is_defense'].to_numpy()

            # --- 计算两两间的几何关系矩阵 ---
            dx = x[None, :] - x[:, None]      # X方向差值矩阵
            dy = y[None, :] - y[:, None]      # Y方向差值矩阵
            dist = np.sqrt(dx ** 2 + dy ** 2) # 欧氏距离矩阵 (n×n)
            angle_mat = np.arctan2(-dy, -dx)  # 从球员 i 指向 j 的角度

            # --- 相对速度矩阵 ---
            dvx = vx[:, None] - vx[None, :]
            dvy = vy[:, None] - vy[None, :]
            rel_speed = np.sqrt(dvx ** 2 + dvy ** 2)

            # --- 各类掩码 ---
            offense_mask = (is_offense[:, None] == is_offense[None, :])
            np.fill_diagonal(offense_mask, False)  # 自身不参与计算

            defense_mask = (is_defense[:, None] == is_defense[None, :])
            np.fill_diagonal(defense_mask, False)

            opp_mask = (is_offense[:, None] != is_offense[None, :])  # 对手阵营
            np.fill_diagonal(opp_mask, False)

            # --- 将自身的值置为 NaN，避免干扰统计 ---
            dist_diag_nan  = dist.copy();      np.fill_diagonal(dist_diag_nan,  np.nan)
            rel_diag_nan   = rel_speed.copy(); np.fill_diagonal(rel_diag_nan, np.nan)
            angle_diag_nan = angle_mat.copy(); np.fill_diagonal(angle_diag_nan, np.nan)

            # --- 定义统计函数：计算均值、最小值、最大值 ---
            def masked_stats(mat, mask):
                masked = np.where(mask, mat, np.nan)
                cnt  = mask.sum(axis=1)
                mean = np.nanmean(masked, axis=1)
                amin = np.nanmin(masked, axis=1)
                amax = np.nanmax(masked, axis=1)
                zero = cnt == 0  # 若无有效数据则置 NaN
                mean[zero] = np.nan; amin[zero] = np.nan; amax[zero] = np.nan
                return mean, amin, amax

            # --- 计算进攻方之间的距离、相对速度、角度统计 ---
            d_mean_o, d_min_o, d_max_o = masked_stats(dist_diag_nan, offense_mask)
            v_mean_o, v_min_o, v_max_o = masked_stats(rel_diag_nan, offense_mask)
            a_mean_o, a_min_o, a_max_o = masked_stats(angle_diag_nan, offense_mask)

            # --- 计算防守方之间的统计 ---
            d_mean_d, d_min_d, d_max_d = masked_stats(dist_diag_nan, defense_mask)
            v_mean_d, v_min_d, v_max_d = masked_stats(rel_diag_nan, defense_mask)
            a_mean_d, a_min_d, a_max_d = masked_stats(angle_diag_nan, defense_mask)

            # --- 计算最近对手距离/角度/相对速度 ---
            masked_dist_opp = np.where(opp_mask, dist_diag_nan, np.nan)
            nearest_dist = np.nanmin(masked_dist_opp, axis=1)
            nearest_idx  = np.nanargmin(masked_dist_opp, axis=1)
            all_nan = ~np.isfinite(nearest_dist)
            nearest_idx_safe = nearest_idx.copy()
            nearest_idx_safe[all_nan] = 0

            nearest_angle = np.take_along_axis(angle_diag_nan, nearest_idx_safe[:, None], axis=1).squeeze(1)
            nearest_rel   = np.take_along_axis(rel_diag_nan, nearest_idx_safe[:, None], axis=1).squeeze(1)
            nearest_angle[all_nan] = np.nan
            nearest_rel[all_nan]   = np.nan

            # --- 汇总每位球员的交互特征 ---
            for idx, nid in enumerate(nfl_ids):
                if not compute_mask[idx]:
                    continue
                agg_rows.append({
                    'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                    'distance_to_player_mean_offense': d_mean_o[idx],
                    'distance_to_player_min_offense':  d_min_o[idx],
                    'distance_to_player_max_offense':  d_max_o[idx],
                    'relative_velocity_magnitude_mean_offense': v_mean_o[idx],
                    'relative_velocity_magnitude_min_offense':  v_min_o[idx],
                    'relative_velocity_magnitude_max_offense':  v_max_o[idx],
                    'angle_to_player_mean_offense': a_mean_o[idx],
                    'angle_to_player_min_offense':  a_min_o[idx],
                    'angle_to_player_max_offense':  a_max_o[idx],

                    'distance_to_player_mean_defense': d_mean_d[idx],
                    'distance_to_player_min_defense':  d_min_d[idx],
                    'distance_to_player_max_defense':  d_max_d[idx],
                    'relative_velocity_magnitude_mean_defense': v_mean_d[idx],
                    'relative_velocity_magnitude_min_defense':  v_min_d[idx],
                    'relative_velocity_magnitude_max_defense':  v_max_d[idx],
                    'angle_to_player_mean_defense': a_mean_d[idx],
                    'angle_to_player_min_defense':  a_min_d[idx],
                    'angle_to_player_max_defense':  a_max_d[idx],

                    'nearest_opponent_dist':      nearest_dist[idx],
                    'nearest_opponent_angle':     nearest_angle[idx],
                    'nearest_opponent_rel_speed': nearest_rel[idx],
                })

        # 合并交互特征回主表
        interaction_agg = pd.DataFrame(agg_rows)
        input_df = input_df.merge(
            interaction_agg,
            on=['game_id', 'play_id', 'frame_id', 'nfl_id'],
            how='left'
        )

        print("✅ 球员交互特征添加完成。")

    else:
        print("⚠️ 跳过球员交互特征计算（use_players_interactions=False）。")


    # Step 4：构建输入序列
    print("步骤 4/4 ▶ 构建输入序列样本...")


    feature_cols = [
        # —— 基础核心特征（Core, 6）——
        'x', 'y', 's', 'a', 'ball_land_x', 'ball_land_y',

        # —— 角度编码特征（Angles encoded, 4）——
        'o_sin', 'o_cos', 'dir_sin', 'dir_cos',

        # —— 球员静态特征（Player, 2）——
        'player_height_feet', 'player_weight',

        # —— 动态运动特征（Motion, 7）——
        'velocity_x', 'velocity_y', 'acceleration_x', 'acceleration_y',
        'momentum_x', 'momentum_y', 'kinetic_energy',

        # —— 角色身份特征（Roles, 5）——
        'is_offense', 'is_defense', 'is_receiver', 'is_coverage', 'is_passer',

        # —— 球与球员空间特征（Ball relation, 5）——
        'distance_to_ball', 'angle_to_ball',
        'ball_direction_x', 'ball_direction_y', 'closing_speed',

        # —— 原始时序滞后特征（Original temporal lags, 15）——
        'x_lag1', 'y_lag1', 'velocity_x_lag1', 'velocity_y_lag1',
        'x_lag2', 'y_lag2', 'velocity_x_lag2', 'velocity_y_lag2',
        'x_lag3', 'y_lag3', 'velocity_x_lag3', 'velocity_y_lag3',
        'velocity_x_ema', 'velocity_y_ema', 'speed_ema',

        # —— 距离变化速率特征（Distance rate, 3）——
        'distance_to_ball_change', 'distance_to_ball_accel', 'time_to_intercept',

        # —— 目标方向对齐特征（Target alignment, 3）——
        'velocity_alignment', 'velocity_perpendicular', 'accel_alignment',

        # —— 多窗口滚动特征（Multi-window rolling, 24）——
        'velocity_x_roll3', 'velocity_x_std3', 'velocity_y_roll3', 'velocity_y_std3',
        's_roll3', 's_std3', 'a_roll3', 'a_std3',
        'velocity_x_roll5', 'velocity_x_std5', 'velocity_y_roll5', 'velocity_y_std5',
        's_roll5', 's_std5', 'a_roll5', 'a_std5',
        'velocity_x_roll10', 'velocity_x_std10', 'velocity_y_roll10', 'velocity_y_std10',
        's_roll10', 's_std10', 'a_roll10', 'a_std10',

        # —— 扩展时序滞后特征（Extended lags, 8）——
        'x_lag4', 'y_lag4', 'velocity_x_lag4', 'velocity_y_lag4',
        'x_lag5', 'y_lag5', 'velocity_x_lag5', 'velocity_y_lag5',

        # —— 速度变化特征（Velocity change, 4）——
        'velocity_x_change', 'velocity_y_change', 'speed_change', 'direction_change',

        # —— 场地位置特征（Field position, 2）——
        'dist_from_sideline', 'dist_from_endzone',

        # —— 角色相关特征（Role-specific, 3）——
        'receiver_optimality', 'receiver_deviation', 'defender_closing_speed',

        # —— 时间进程特征（Time, 2）——
        'frames_elapsed', 'normalized_time',

        # —— 球员交互特征（Player interactions, 21）——
        'distance_to_player_mean_offense', 'distance_to_player_min_offense', 'distance_to_player_max_offense',
        'relative_velocity_magnitude_mean_offense', 'relative_velocity_magnitude_min_offense', 'relative_velocity_magnitude_max_offense',
        'angle_to_player_mean_offense', 'angle_to_player_min_offense', 'angle_to_player_max_offense',
        'distance_to_player_mean_defense', 'distance_to_player_min_defense', 'distance_to_player_max_defense',
        'relative_velocity_magnitude_mean_defense', 'relative_velocity_magnitude_min_defense', 'relative_velocity_magnitude_max_defense',
        'angle_to_player_mean_defense', 'angle_to_player_min_defense', 'angle_to_player_max_defense',
        'nearest_opponent_dist', 'nearest_opponent_angle', 'nearest_opponent_rel_speed',
    ]

    # 保留当前数据集中确实存在的特征列（避免 KeyError）
    feature_cols = [c for c in feature_cols if c in input_df.columns]

    # 输出特征数量信息
    print(f"✅ 使用的特征列数量: {len(feature_cols)} 个")


    # CREATE SEQUENCES

    # 设置索引并按球员分组
    input_df.set_index(['game_id', 'play_id', 'nfl_id'], inplace=True)
    grouped = input_df.groupby(level=['game_id', 'play_id', 'nfl_id'])

    # 选择目标数据源（训练或测试）
    target_rows = output_df if is_training else test_template
    target_groups = target_rows[['game_id', 'play_id', 'nfl_id']].drop_duplicates()

    # 存储容器
    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = [], [], [], [], []

    # 遍历每个球员的时间序列
    for _, row in tqdm(target_groups.iterrows(), total=len(target_groups), desc="⏳ 正在创建序列"):
        key = (row['game_id'], row['play_id'], row['nfl_id'])

        try:
            group_df = grouped.get_group(key)
        except KeyError:
            continue

        # 提取时间窗口（最后 window_size 帧）
        input_window = group_df.tail(window_size)

        # 若帧数不足则进行填充
        if len(input_window) < window_size:
            if is_training:
                print(f"Skipping sequence with insufficient history for {key}")
                continue
            print(f"⚠️ 序列不足 {window_size} 帧，自动填充：{key}")
            pad_len = window_size - len(input_window)
            pad_df  = pd.concat([input_window.iloc[0:1]] * pad_len, ignore_index=True)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)

        # 缺失值前向/后向填充
        input_window = input_window.ffill().bfill().fillna(0.0)

        seq = input_window[feature_cols].values
        if np.isnan(seq).any():
            print(f"⚠️ 在 {key} 的序列中发现 NaN，已用 0.0 替换。")
            seq = np.nan_to_num(seq, nan=0.0)

        sequences.append(seq)

        # 若为训练模式，则计算预测目标（Δx, Δy）
        if is_training:
            out_grp = output_df[
                (output_df['game_id'] == row['game_id']) &
                (output_df['play_id'] == row['play_id']) &
                (output_df['nfl_id']  == row['nfl_id'])
            ].sort_values('frame_id')

            last_x = input_window.iloc[-1]['x']
            last_y = input_window.iloc[-1]['y']

            dx = out_grp['x'].values - last_x
            dy = out_grp['y'].values - last_y

            targets_dx.append(dx)
            targets_dy.append(dy)
            targets_frame_ids.append(out_grp['frame_id'].values)

        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'nfl_id': key[2],
            'frame_id': input_window.iloc[-1]['frame_id']
        })

    print(f"\n✅ 共生成 {len(sequences)} 个序列，每个序列包含 {len(feature_cols)} 个特征。")

    if is_training:
        return sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids, feature_cols
    return sequences, sequence_ids, feature_cols


In [18]:
# 生成时序样本（训练模式）
sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids, feature_cols = prepare_sequences(
    train_input,
    output_df=train_output,
    test_template=test_template,
    is_training=True,
    window_size=Config.WINDOW_SIZE
)

# 输出结果信息
print(f"✅ 已准备好 {len(sequences)} 个时序样本，每个样本包含 {len(feature_cols)} 个特征。")
print(f"📏 每个样本的窗口长度为 {Config.WINDOW_SIZE} 帧。")
print(f"📊 示例样本形状：{sequences[0].shape}")
print(f"🎯 训练目标示例：dx={targets_dx[0].shape}, dy={targets_dy[0].shape}")
print(f"🆔 样本索引数量：{len(sequence_ids)}")



🚀 开始构建时序样本（包含高级特征）
窗口大小（window_size）: 12
步骤 1/4 ▶ 添加基础特征...
步骤 2/4 ▶ 添加高级特征...
正在添加高级特征...
特征增强后总列数: 112
步骤 3/4 ▶ 添加球员交互特征...
✅ 已启用球员交互特征计算（use_players_interactions=True）
✅ 球员交互特征添加完成。
步骤 4/4 ▶ 构建输入序列样本...
✅ 使用的特征列数量: 114 个


⏳ 正在创建序列:   0%|          | 0/46045 [00:00<?, ?it/s]

Skipping sequence with insufficient history for (2023091004, 1594, 52453)
Skipping sequence with insufficient history for (2023091004, 1594, 52430)
Skipping sequence with insufficient history for (2023091004, 4122, 41233)
Skipping sequence with insufficient history for (2023091008, 1292, 48516)
Skipping sequence with insufficient history for (2023091008, 1292, 55921)
Skipping sequence with insufficient history for (2023091008, 1292, 54597)
Skipping sequence with insufficient history for (2023091400, 318, 52430)
Skipping sequence with insufficient history for (2023091700, 1728, 46087)
Skipping sequence with insufficient history for (2023091700, 1728, 54473)
Skipping sequence with insufficient history for (2023091709, 493, 53531)
Skipping sequence with insufficient history for (2023091709, 493, 53601)
Skipping sequence with insufficient history for (2023091709, 493, 56042)
Skipping sequence with insufficient history for (2023091712, 81, 43327)
Skipping sequence with insufficient history 

In [19]:
# === 保存 pkl 文件 ===
save_path = Config.DATA_DIR / "train_data_cache_unpad.pkl"

# 保存（保留 numpy 对象类型）
with open(save_path, "wb") as f:
    pickle.dump({
        "sequences": sequences,
        "targets_dx": targets_dx,
        "targets_dy": targets_dy,
        "targets_frame_ids": targets_frame_ids,
        "sequence_ids": sequence_ids,
        "feature_cols": feature_cols
    }, f)

print(f"✅ 已使用 pickle 保存对象到 {save_path}")


✅ 已使用 pickle 保存对象到 D:\数据\Kaggle\2026 年 NFL 大数据碗 - 预测\DATA_DIR000\train_data_cache_unpad.pkl


In [20]:
# === 加载 pkl 文件 ===
data_path = Config.DATA_DIR / "train_data_cache_unpad.pkl"

with open(save_path, "rb") as f:
    data = pickle.load(f)

sequences_2 = data["sequences"]
targets_dx_2 = data["targets_dx"]
targets_dy_2 = data["targets_dy"]
targets_frame_ids_2 = data["targets_frame_ids"]
sequence_ids_2 = data["sequence_ids"]
feature_cols_2 = data["feature_cols"]

print("✅ 数据加载成功到 *_2 变量！")



✅ 数据加载成功到 *_2 变量！


In [21]:
# 哈希验证
def get_obj_type(obj):
    """识别对象类型（支持嵌套结构）"""
    if isinstance(obj, np.ndarray):
        return f"numpy.ndarray shape={obj.shape} dtype={obj.dtype}"
    elif isinstance(obj, list):
        if len(obj) == 0:
            return "list (empty)"
        inner_type = get_obj_type(obj[0])
        return f"list[{inner_type}] len={len(obj)}"
    elif isinstance(obj, dict):
        return f"dict (keys={len(obj.keys())})"
    elif isinstance(obj, (int, float, str, bool, type(None))):
        return type(obj).__name__
    else:
        return f"object ({type(obj).__name__})"


def hash_object(obj) -> str:
    """生成对象的稳定哈希签名（支持 list / dict / ndarray / 标量）"""
    m = hashlib.sha256()

    def _update(o):
        if isinstance(o, np.ndarray):
            m.update(o.tobytes())
            m.update(str(o.shape).encode())
            m.update(str(o.dtype).encode())
        elif isinstance(o, (list, tuple)):
            m.update(f"len={len(o)}".encode())
            for item in o:
                _update(item)
        elif isinstance(o, dict):
            for k in sorted(o.keys()):
                m.update(str(k).encode())
                _update(o[k])
        elif isinstance(o, (str, int, float, bool, type(None))):
            m.update(str(o).encode())
        else:
            m.update(repr(o).encode())

    _update(obj)
    return m.hexdigest()


def verify_objects(obj1, obj2, name="变量"):
    """
    比较两个对象内容是否一致，并输出简洁单行信息。
    支持：list / dict / numpy.ndarray / 标量
    """
    type1 = get_obj_type(obj1)
    type2 = get_obj_type(obj2)
    hash1 = hash_object(obj1)
    hash2 = hash_object(obj2)
    same = hash1 == hash2

    print(f"[{name}]")
    print(f"{type1}")
    print(f"{type2}")
    print(f"{hash1[:24]}")
    print(f"{hash2[:24]}")
    print(f"{'✅一致' if same else '❌不同'}")
    return same


In [22]:
verify_objects(sequences, sequences_2, "sequences")
verify_objects(targets_dx, targets_dx_2, "targets_dx")
verify_objects(targets_dy, targets_dy_2, "targets_dy")
verify_objects(targets_frame_ids, targets_frame_ids_2, "targets_frame_ids")
verify_objects(sequence_ids, sequence_ids_2, "sequence_ids")
verify_objects(feature_cols, feature_cols_2, "feature_cols")


[sequences]
list[numpy.ndarray shape=(12, 114) dtype=float64] len=45956
list[numpy.ndarray shape=(12, 114) dtype=float64] len=45956
f6eb874159c60cf6c9905132
f6eb874159c60cf6c9905132
✅一致
[targets_dx]
list[numpy.ndarray shape=(21,) dtype=float64] len=45956
list[numpy.ndarray shape=(21,) dtype=float64] len=45956
2fcee115f7b268256baf2a46
2fcee115f7b268256baf2a46
✅一致
[targets_dy]
list[numpy.ndarray shape=(21,) dtype=float64] len=45956
list[numpy.ndarray shape=(21,) dtype=float64] len=45956
1aaf324c0fe11c32418be172
1aaf324c0fe11c32418be172
✅一致
[targets_frame_ids]
list[numpy.ndarray shape=(21,) dtype=int64] len=45956
list[numpy.ndarray shape=(21,) dtype=int64] len=45956
22c0bb381dffffe48d1cb725
22c0bb381dffffe48d1cb725
✅一致
[sequence_ids]
list[dict (keys=4)] len=45956
list[dict (keys=4)] len=45956
e39ef11edaf968ab84a15ffe
e39ef11edaf968ab84a15ffe
✅一致
[feature_cols]
list[str] len=114
list[str] len=114
44cf06a02b1e47da769fea6a
44cf06a02b1e47da769fea6a
✅一致


True

训练数据若帧数不足 填充

In [23]:
def prepare_sequences(input_df, output_df=None, test_template=None,
                      is_training=True, window_size=8,
                      use_players_interactions=Config.USE_PLAYERS_INTERACTIONS):
    """
    构建包含所有高级特征的时序样本序列（支持训练与测试）。
    input_df：输入特征数据
    output_df：训练标签（仅在 is_training=True 时使用）
    test_template：测试模板（仅在预测时使用）
    """
    print(f"\n{'=' * 80}")
    print("🚀 开始构建时序样本（包含高级特征）")
    print(f"{'=' * 80}")
    print(f"窗口大小（window_size）: {window_size}")

    input_df = input_df.copy()

    # Step 1：基础特征构建
    print("步骤 1/4 ▶ 添加基础特征...")

    # 将球员身高从“英尺-英寸”格式转换为英尺小数
    input_df['player_height_feet'] = input_df['player_height'].apply(height_to_feet)

    # 计算速度和加速度的 x、y 分量
    dir_rad = np.deg2rad(input_df['dir'].fillna(0))
    delta_t = 0.1
    input_df['velocity_x']     = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.sin(dir_rad)
    input_df['velocity_y']     = (input_df['s'] + 0.5 * input_df['a'] * delta_t) * np.cos(dir_rad)
    input_df['acceleration_x'] = input_df['a'] * np.sin(dir_rad)
    input_df['acceleration_y'] = input_df['a'] * np.cos(dir_rad)

    # 角度特征：将朝向角(o)与运动方向(dir)编码为正弦/余弦形式
    input_df['o_sin']  = np.sin(np.deg2rad(input_df['o'].fillna(0)))
    input_df['o_cos']  = np.cos(np.deg2rad(input_df['o'].fillna(0)))
    input_df['dir_sin'] = np.sin(np.deg2rad(input_df['dir'].fillna(0)))
    input_df['dir_cos'] = np.cos(np.deg2rad(input_df['dir'].fillna(0)))

    # 角色特征：进攻/防守/传球/接球/防守覆盖
    input_df['is_offense']  = (input_df['player_side'] == 'Offense').astype(int)
    input_df['is_defense']  = (input_df['player_side'] == 'Defense').astype(int)
    input_df['is_receiver'] = (input_df['player_role'] == 'Targeted Receiver').astype(int)
    input_df['is_coverage'] = (input_df['player_role'] == 'Defensive Coverage').astype(int)
    input_df['is_passer']   = (input_df['player_role'] == 'Passer').astype(int)

    # 物理特征：动量与动能
    mass_kg = input_df['player_weight'].fillna(200.0) / 2.20462  # 磅→千克
    input_df['momentum_x']     = input_df['velocity_x'] * mass_kg
    input_df['momentum_y']     = input_df['velocity_y'] * mass_kg
    input_df['kinetic_energy'] = 0.5 * mass_kg * (input_df['s'] ** 2)

    # 球与球员之间的空间特征
    if 'ball_land_x' in input_df.columns:
        ball_dx = input_df['ball_land_x'] - input_df['x']
        ball_dy = input_df['ball_land_y'] - input_df['y']
        input_df['distance_to_ball']   = np.sqrt(ball_dx**2 + ball_dy**2)
        input_df['angle_to_ball']      = np.arctan2(ball_dy, ball_dx)
        input_df['ball_direction_x']   = ball_dx / (input_df['distance_to_ball'] + 1e-6)
        input_df['ball_direction_y']   = ball_dy / (input_df['distance_to_ball'] + 1e-6)
        input_df['closing_speed']      = (
            input_df['velocity_x'] * input_df['ball_direction_x'] +
            input_df['velocity_y'] * input_df['ball_direction_y']
        )

    # 时间排序（确保帧序一致）
    input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id'])
    gcols = ['game_id', 'play_id', 'nfl_id']

    # 添加滞后特征（历史 1~3 帧）
    for lag in [1, 2, 3]:
        input_df[f'x_lag{lag}']          = input_df.groupby(gcols)['x'].shift(lag)
        input_df[f'y_lag{lag}']          = input_df.groupby(gcols)['y'].shift(lag)
        input_df[f'velocity_x_lag{lag}'] = input_df.groupby(gcols)['velocity_x'].shift(lag)
        input_df[f'velocity_y_lag{lag}'] = input_df.groupby(gcols)['velocity_y'].shift(lag)

    # EMA（指数滑动平均）平滑速度变化
    input_df['velocity_x_ema'] = input_df.groupby(gcols)['velocity_x'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['velocity_y_ema'] = input_df.groupby(gcols)['velocity_y'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )
    input_df['speed_ema'] = input_df.groupby(gcols)['s'].transform(
        lambda x: x.ewm(alpha=0.3, adjust=False).mean()
    )

    # Step 2：高级特征
    print("步骤 2/4 ▶ 添加高级特征...")
    input_df = add_advanced_features(input_df)

    # Step 3：球员交互特征
    print("步骤 3/4 ▶ 添加球员交互特征...")
    if use_players_interactions:
        print("✅ 已启用球员交互特征计算（use_players_interactions=True）")

        agg_rows = []  # 用于保存每一帧的交互统计结果

        # 按比赛 (game_id)、回合 (play_id)、帧 (frame_id) 分组
        # 每一组包含同一帧中所有球员的空间状态
        for (g, p, f), grp in input_df.groupby(['game_id', 'play_id', 'frame_id'], sort=False):
            n = len(grp)
            nfl_ids = grp['nfl_id'].to_numpy()

            # 如果存在 player_to_predict，则只对需要预测的球员计算交互特征
            compute_mask = (
                grp['player_to_predict'].to_numpy().astype(bool)
                if 'player_to_predict' in grp.columns
                else np.ones(n, dtype=bool)
            )

            # 若该帧球员不足 2 人，则构造空记录（NaN）
            if n < 2:
                for nid in nfl_ids[compute_mask]:
                    agg_rows.append({
                        'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                        'distance_to_player_mean_offense': np.nan,
                        'distance_to_player_min_offense': np.nan,
                        'distance_to_player_max_offense': np.nan,
                        'relative_velocity_magnitude_mean_offense': np.nan,
                        'relative_velocity_magnitude_min_offense': np.nan,
                        'relative_velocity_magnitude_max_offense': np.nan,
                        'angle_to_player_mean_offense': np.nan,
                        'angle_to_player_min_offense': np.nan,
                        'angle_to_player_max_offense': np.nan,
                        'distance_to_player_mean_defense': np.nan,
                        'distance_to_player_min_defense': np.nan,
                        'distance_to_player_max_defense': np.nan,
                        'relative_velocity_magnitude_mean_defense': np.nan,
                        'relative_velocity_magnitude_min_defense': np.nan,
                        'relative_velocity_magnitude_max_defense': np.nan,
                        'angle_to_player_mean_defense': np.nan,
                        'angle_to_player_min_defense': np.nan,
                        'angle_to_player_max_defense': np.nan,
                        'nearest_opponent_dist': np.nan,
                        'nearest_opponent_angle': np.nan,
                        'nearest_opponent_rel_speed': np.nan,
                    })
                continue

            # 获取球员位置与速度信息
            x  = grp['x'].to_numpy(dtype=np.float32)
            y  = grp['y'].to_numpy(dtype=np.float32)
            vx = grp['velocity_x'].to_numpy(dtype=np.float32)
            vy = grp['velocity_y'].to_numpy(dtype=np.float32)
            is_offense = grp['is_offense'].to_numpy()
            is_defense = grp['is_defense'].to_numpy()

            # --- 计算两两间的几何关系矩阵 ---
            dx = x[None, :] - x[:, None]      # X方向差值矩阵
            dy = y[None, :] - y[:, None]      # Y方向差值矩阵
            dist = np.sqrt(dx ** 2 + dy ** 2) # 欧氏距离矩阵 (n×n)
            angle_mat = np.arctan2(-dy, -dx)  # 从球员 i 指向 j 的角度

            # --- 相对速度矩阵 ---
            dvx = vx[:, None] - vx[None, :]
            dvy = vy[:, None] - vy[None, :]
            rel_speed = np.sqrt(dvx ** 2 + dvy ** 2)

            # --- 各类掩码 ---
            offense_mask = (is_offense[:, None] == is_offense[None, :])
            np.fill_diagonal(offense_mask, False)  # 自身不参与计算

            defense_mask = (is_defense[:, None] == is_defense[None, :])
            np.fill_diagonal(defense_mask, False)

            opp_mask = (is_offense[:, None] != is_offense[None, :])  # 对手阵营
            np.fill_diagonal(opp_mask, False)

            # --- 将自身的值置为 NaN，避免干扰统计 ---
            dist_diag_nan  = dist.copy();      np.fill_diagonal(dist_diag_nan,  np.nan)
            rel_diag_nan   = rel_speed.copy(); np.fill_diagonal(rel_diag_nan, np.nan)
            angle_diag_nan = angle_mat.copy(); np.fill_diagonal(angle_diag_nan, np.nan)

            # --- 定义统计函数：计算均值、最小值、最大值 ---
            def masked_stats(mat, mask):
                masked = np.where(mask, mat, np.nan)
                cnt  = mask.sum(axis=1)
                mean = np.nanmean(masked, axis=1)
                amin = np.nanmin(masked, axis=1)
                amax = np.nanmax(masked, axis=1)
                zero = cnt == 0  # 若无有效数据则置 NaN
                mean[zero] = np.nan; amin[zero] = np.nan; amax[zero] = np.nan
                return mean, amin, amax

            # --- 计算进攻方之间的距离、相对速度、角度统计 ---
            d_mean_o, d_min_o, d_max_o = masked_stats(dist_diag_nan, offense_mask)
            v_mean_o, v_min_o, v_max_o = masked_stats(rel_diag_nan, offense_mask)
            a_mean_o, a_min_o, a_max_o = masked_stats(angle_diag_nan, offense_mask)

            # --- 计算防守方之间的统计 ---
            d_mean_d, d_min_d, d_max_d = masked_stats(dist_diag_nan, defense_mask)
            v_mean_d, v_min_d, v_max_d = masked_stats(rel_diag_nan, defense_mask)
            a_mean_d, a_min_d, a_max_d = masked_stats(angle_diag_nan, defense_mask)

            # --- 计算最近对手距离/角度/相对速度 ---
            masked_dist_opp = np.where(opp_mask, dist_diag_nan, np.nan)
            nearest_dist = np.nanmin(masked_dist_opp, axis=1)
            nearest_idx  = np.nanargmin(masked_dist_opp, axis=1)
            all_nan = ~np.isfinite(nearest_dist)
            nearest_idx_safe = nearest_idx.copy()
            nearest_idx_safe[all_nan] = 0

            nearest_angle = np.take_along_axis(angle_diag_nan, nearest_idx_safe[:, None], axis=1).squeeze(1)
            nearest_rel   = np.take_along_axis(rel_diag_nan, nearest_idx_safe[:, None], axis=1).squeeze(1)
            nearest_angle[all_nan] = np.nan
            nearest_rel[all_nan]   = np.nan

            # --- 汇总每位球员的交互特征 ---
            for idx, nid in enumerate(nfl_ids):
                if not compute_mask[idx]:
                    continue
                agg_rows.append({
                    'game_id': g, 'play_id': p, 'frame_id': f, 'nfl_id': nid,
                    'distance_to_player_mean_offense': d_mean_o[idx],
                    'distance_to_player_min_offense':  d_min_o[idx],
                    'distance_to_player_max_offense':  d_max_o[idx],
                    'relative_velocity_magnitude_mean_offense': v_mean_o[idx],
                    'relative_velocity_magnitude_min_offense':  v_min_o[idx],
                    'relative_velocity_magnitude_max_offense':  v_max_o[idx],
                    'angle_to_player_mean_offense': a_mean_o[idx],
                    'angle_to_player_min_offense':  a_min_o[idx],
                    'angle_to_player_max_offense':  a_max_o[idx],

                    'distance_to_player_mean_defense': d_mean_d[idx],
                    'distance_to_player_min_defense':  d_min_d[idx],
                    'distance_to_player_max_defense':  d_max_d[idx],
                    'relative_velocity_magnitude_mean_defense': v_mean_d[idx],
                    'relative_velocity_magnitude_min_defense':  v_min_d[idx],
                    'relative_velocity_magnitude_max_defense':  v_max_d[idx],
                    'angle_to_player_mean_defense': a_mean_d[idx],
                    'angle_to_player_min_defense':  a_min_d[idx],
                    'angle_to_player_max_defense':  a_max_d[idx],

                    'nearest_opponent_dist':      nearest_dist[idx],
                    'nearest_opponent_angle':     nearest_angle[idx],
                    'nearest_opponent_rel_speed': nearest_rel[idx],
                })

        # 合并交互特征回主表
        interaction_agg = pd.DataFrame(agg_rows)
        input_df = input_df.merge(
            interaction_agg,
            on=['game_id', 'play_id', 'frame_id', 'nfl_id'],
            how='left'
        )

        print("✅ 球员交互特征添加完成。")

    else:
        print("⚠️ 跳过球员交互特征计算（use_players_interactions=False）。")


    # Step 4：构建输入序列
    print("步骤 4/4 ▶ 构建输入序列样本...")


    feature_cols = [
        # —— 基础核心特征（Core, 6）——
        'x', 'y', 's', 'a', 'ball_land_x', 'ball_land_y',

        # —— 角度编码特征（Angles encoded, 4）——
        'o_sin', 'o_cos', 'dir_sin', 'dir_cos',

        # —— 球员静态特征（Player, 2）——
        'player_height_feet', 'player_weight',

        # —— 动态运动特征（Motion, 7）——
        'velocity_x', 'velocity_y', 'acceleration_x', 'acceleration_y',
        'momentum_x', 'momentum_y', 'kinetic_energy',

        # —— 角色身份特征（Roles, 5）——
        'is_offense', 'is_defense', 'is_receiver', 'is_coverage', 'is_passer',

        # —— 球与球员空间特征（Ball relation, 5）——
        'distance_to_ball', 'angle_to_ball',
        'ball_direction_x', 'ball_direction_y', 'closing_speed',

        # —— 原始时序滞后特征（Original temporal lags, 15）——
        'x_lag1', 'y_lag1', 'velocity_x_lag1', 'velocity_y_lag1',
        'x_lag2', 'y_lag2', 'velocity_x_lag2', 'velocity_y_lag2',
        'x_lag3', 'y_lag3', 'velocity_x_lag3', 'velocity_y_lag3',
        'velocity_x_ema', 'velocity_y_ema', 'speed_ema',

        # —— 距离变化速率特征（Distance rate, 3）——
        'distance_to_ball_change', 'distance_to_ball_accel', 'time_to_intercept',

        # —— 目标方向对齐特征（Target alignment, 3）——
        'velocity_alignment', 'velocity_perpendicular', 'accel_alignment',

        # —— 多窗口滚动特征（Multi-window rolling, 24）——
        'velocity_x_roll3', 'velocity_x_std3', 'velocity_y_roll3', 'velocity_y_std3',
        's_roll3', 's_std3', 'a_roll3', 'a_std3',
        'velocity_x_roll5', 'velocity_x_std5', 'velocity_y_roll5', 'velocity_y_std5',
        's_roll5', 's_std5', 'a_roll5', 'a_std5',
        'velocity_x_roll10', 'velocity_x_std10', 'velocity_y_roll10', 'velocity_y_std10',
        's_roll10', 's_std10', 'a_roll10', 'a_std10',

        # —— 扩展时序滞后特征（Extended lags, 8）——
        'x_lag4', 'y_lag4', 'velocity_x_lag4', 'velocity_y_lag4',
        'x_lag5', 'y_lag5', 'velocity_x_lag5', 'velocity_y_lag5',

        # —— 速度变化特征（Velocity change, 4）——
        'velocity_x_change', 'velocity_y_change', 'speed_change', 'direction_change',

        # —— 场地位置特征（Field position, 2）——
        'dist_from_sideline', 'dist_from_endzone',

        # —— 角色相关特征（Role-specific, 3）——
        'receiver_optimality', 'receiver_deviation', 'defender_closing_speed',

        # —— 时间进程特征（Time, 2）——
        'frames_elapsed', 'normalized_time',

        # —— 球员交互特征（Player interactions, 21）——
        'distance_to_player_mean_offense', 'distance_to_player_min_offense', 'distance_to_player_max_offense',
        'relative_velocity_magnitude_mean_offense', 'relative_velocity_magnitude_min_offense', 'relative_velocity_magnitude_max_offense',
        'angle_to_player_mean_offense', 'angle_to_player_min_offense', 'angle_to_player_max_offense',
        'distance_to_player_mean_defense', 'distance_to_player_min_defense', 'distance_to_player_max_defense',
        'relative_velocity_magnitude_mean_defense', 'relative_velocity_magnitude_min_defense', 'relative_velocity_magnitude_max_defense',
        'angle_to_player_mean_defense', 'angle_to_player_min_defense', 'angle_to_player_max_defense',
        'nearest_opponent_dist', 'nearest_opponent_angle', 'nearest_opponent_rel_speed',
    ]

    # 保留当前数据集中确实存在的特征列（避免 KeyError）
    feature_cols = [c for c in feature_cols if c in input_df.columns]

    # 输出特征数量信息
    print(f"✅ 使用的特征列数量: {len(feature_cols)} 个")


    # CREATE SEQUENCES

    # 设置索引并按球员分组
    input_df.set_index(['game_id', 'play_id', 'nfl_id'], inplace=True)
    grouped = input_df.groupby(level=['game_id', 'play_id', 'nfl_id'])

    # 选择目标数据源（训练或测试）
    target_rows = output_df if is_training else test_template
    target_groups = target_rows[['game_id', 'play_id', 'nfl_id']].drop_duplicates()

    # 存储容器
    sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids = [], [], [], [], []

    # 遍历每个球员的时间序列
    for _, row in tqdm(target_groups.iterrows(), total=len(target_groups), desc="⏳ 正在创建序列"):
        key = (row['game_id'], row['play_id'], row['nfl_id'])

        try:
            group_df = grouped.get_group(key)
        except KeyError:
            continue

        # 提取时间窗口（最后 window_size 帧）
        input_window = group_df.tail(window_size)

        # 若帧数不足则进行填充
        if len(input_window) < window_size:
            # if is_training:
            #     print(f"Skipping sequence with insufficient history for {key}")
            #     continue
            print(f"⚠️ 序列不足 {window_size} 帧，自动填充：{key}")
            pad_len = window_size - len(input_window)
            pad_df  = pd.concat([input_window.iloc[0:1]] * pad_len, ignore_index=True)
            input_window = pd.concat([pad_df, input_window], ignore_index=True)

        # 缺失值前向/后向填充
        input_window = input_window.ffill().bfill().fillna(0.0)

        seq = input_window[feature_cols].values
        if np.isnan(seq).any():
            print(f"⚠️ 在 {key} 的序列中发现 NaN，已用 0.0 替换。")
            seq = np.nan_to_num(seq, nan=0.0)

        sequences.append(seq)

        # 若为训练模式，则计算预测目标（Δx, Δy）
        if is_training:
            out_grp = output_df[
                (output_df['game_id'] == row['game_id']) &
                (output_df['play_id'] == row['play_id']) &
                (output_df['nfl_id']  == row['nfl_id'])
            ].sort_values('frame_id')

            last_x = input_window.iloc[-1]['x']
            last_y = input_window.iloc[-1]['y']

            dx = out_grp['x'].values - last_x
            dy = out_grp['y'].values - last_y

            targets_dx.append(dx)
            targets_dy.append(dy)
            targets_frame_ids.append(out_grp['frame_id'].values)

        sequence_ids.append({
            'game_id': key[0],
            'play_id': key[1],
            'nfl_id': key[2],
            'frame_id': input_window.iloc[-1]['frame_id']
        })

    print(f"\n✅ 共生成 {len(sequences)} 个序列，每个序列包含 {len(feature_cols)} 个特征。")

    if is_training:
        return sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids, feature_cols
    return sequences, sequence_ids, feature_cols


In [24]:
# 生成时序样本（训练模式）
sequences, targets_dx, targets_dy, targets_frame_ids, sequence_ids, feature_cols = prepare_sequences(
    train_input,
    output_df=train_output,
    test_template=test_template,
    is_training=True,
    window_size=Config.WINDOW_SIZE
)

# 输出结果信息
print(f"✅ 已准备好 {len(sequences)} 个时序样本，每个样本包含 {len(feature_cols)} 个特征。")
print(f"📏 每个样本的窗口长度为 {Config.WINDOW_SIZE} 帧。")
print(f"📊 示例样本形状：{sequences[0].shape}")
print(f"🎯 训练目标示例：dx={targets_dx[0].shape}, dy={targets_dy[0].shape}")
print(f"🆔 样本索引数量：{len(sequence_ids)}")



🚀 开始构建时序样本（包含高级特征）
窗口大小（window_size）: 12
步骤 1/4 ▶ 添加基础特征...
步骤 2/4 ▶ 添加高级特征...
正在添加高级特征...
特征增强后总列数: 112
步骤 3/4 ▶ 添加球员交互特征...
✅ 已启用球员交互特征计算（use_players_interactions=True）
✅ 球员交互特征添加完成。
步骤 4/4 ▶ 构建输入序列样本...
✅ 使用的特征列数量: 114 个


⏳ 正在创建序列:   0%|          | 0/46045 [00:00<?, ?it/s]

⚠️ 序列不足 12 帧，自动填充：(2023091004, 1594, 52453)
⚠️ 序列不足 12 帧，自动填充：(2023091004, 1594, 52430)
⚠️ 序列不足 12 帧，自动填充：(2023091004, 4122, 41233)
⚠️ 序列不足 12 帧，自动填充：(2023091008, 1292, 48516)
⚠️ 序列不足 12 帧，自动填充：(2023091008, 1292, 55921)
⚠️ 序列不足 12 帧，自动填充：(2023091008, 1292, 54597)
⚠️ 序列不足 12 帧，自动填充：(2023091400, 318, 52430)
⚠️ 序列不足 12 帧，自动填充：(2023091700, 1728, 46087)
⚠️ 序列不足 12 帧，自动填充：(2023091700, 1728, 54473)
⚠️ 序列不足 12 帧，自动填充：(2023091709, 493, 53531)
⚠️ 序列不足 12 帧，自动填充：(2023091709, 493, 53601)
⚠️ 序列不足 12 帧，自动填充：(2023091709, 493, 56042)
⚠️ 序列不足 12 帧，自动填充：(2023091712, 81, 43327)
⚠️ 序列不足 12 帧，自动填充：(2023091712, 81, 42357)
⚠️ 序列不足 12 帧，自动填充：(2023091712, 2480, 43299)
⚠️ 序列不足 12 帧，自动填充：(2023091712, 2480, 42357)
⚠️ 序列不足 12 帧，自动填充：(2023092406, 3048, 44819)
⚠️ 序列不足 12 帧，自动填充：(2023092410, 1352, 46150)
⚠️ 序列不足 12 帧，自动填充：(2023092410, 2619, 53565)
⚠️ 序列不足 12 帧，自动填充：(2023092410, 2619, 52425)
⚠️ 序列不足 12 帧，自动填充：(2023100110, 2747, 53476)
⚠️ 序列不足 12 帧，自动填充：(2023100110, 2747, 54679)
⚠️ 序列不足 12 帧，自动填充：(2023100110, 2747, 559

In [25]:
# === 保存 pkl 文件 ===
save_path = Config.DATA_DIR / "train_data_cache_pad.pkl"

# 保存（保留 numpy 对象类型）
with open(save_path, "wb") as f:
    pickle.dump({
        "sequences": sequences,
        "targets_dx": targets_dx,
        "targets_dy": targets_dy,
        "targets_frame_ids": targets_frame_ids,
        "sequence_ids": sequence_ids,
        "feature_cols": feature_cols
    }, f)

print(f"✅ 已使用 pickle 保存对象到 {save_path}")


✅ 已使用 pickle 保存对象到 D:\数据\Kaggle\2026 年 NFL 大数据碗 - 预测\DATA_DIR000\train_data_cache_pad.pkl


In [26]:
# === 加载 pkl 文件 ===
data_path = Config.DATA_DIR / "train_data_cache_pad.pkl"

with open(save_path, "rb") as f:
    data = pickle.load(f)

sequences_2 = data["sequences"]
targets_dx_2 = data["targets_dx"]
targets_dy_2 = data["targets_dy"]
targets_frame_ids_2 = data["targets_frame_ids"]
sequence_ids_2 = data["sequence_ids"]
feature_cols_2 = data["feature_cols"]

print("✅ 数据加载成功到 *_2 变量！")



✅ 数据加载成功到 *_2 变量！


In [27]:
verify_objects(sequences, sequences_2, "sequences")
verify_objects(targets_dx, targets_dx_2, "targets_dx")
verify_objects(targets_dy, targets_dy_2, "targets_dy")
verify_objects(targets_frame_ids, targets_frame_ids_2, "targets_frame_ids")
verify_objects(sequence_ids, sequence_ids_2, "sequence_ids")
verify_objects(feature_cols, feature_cols_2, "feature_cols")

[sequences]
list[numpy.ndarray shape=(12, 114) dtype=float64] len=46045
list[numpy.ndarray shape=(12, 114) dtype=float64] len=46045
5d2684f49c9ca71f64c41636
5d2684f49c9ca71f64c41636
✅一致
[targets_dx]
list[numpy.ndarray shape=(21,) dtype=float64] len=46045
list[numpy.ndarray shape=(21,) dtype=float64] len=46045
d99185f0ebe88b2786faf5c8
d99185f0ebe88b2786faf5c8
✅一致
[targets_dy]
list[numpy.ndarray shape=(21,) dtype=float64] len=46045
list[numpy.ndarray shape=(21,) dtype=float64] len=46045
73183fc597e548240f481540
73183fc597e548240f481540
✅一致
[targets_frame_ids]
list[numpy.ndarray shape=(21,) dtype=int64] len=46045
list[numpy.ndarray shape=(21,) dtype=int64] len=46045
2b505c057b7da08aed772945
2b505c057b7da08aed772945
✅一致
[sequence_ids]
list[dict (keys=4)] len=46045
list[dict (keys=4)] len=46045
9a1568866c31a8c79c11dfc4
9a1568866c31a8c79c11dfc4
✅一致
[feature_cols]
list[str] len=114
list[str] len=114
44cf06a02b1e47da769fea6a
44cf06a02b1e47da769fea6a
✅一致


True

In [None]:

print(f"Loading pretrained models from {Config.NN_PRETRAIN_DIR}")
models_x_nn, models_y_nn, scalers, cfgs = load_folds_xy(num_folds=Config.N_FOLDS, models_dir=Config.NN_PRETRAIN_DIR, device=Config.DEVICE)
