In [1]:
from utils.testing.seed_everything import set_seed
# Seed the run through the project's set_seed helper.
# (Which random number generators it covers is not visible from this notebook.)
SEED = 42
set_seed(SEED)

✅ Seed set: 42


In [2]:
import pandas as pd
import torch
from utils.batching.batch_races import batch_races
from modeling.inputs_config import FLOAT_FEATURES, IDX_FEATURES, NLP_VECTOR_SIZE
from modeling.transformer_model import RaceTransformer

In [3]:
# ✅ Load model-ready data
# NOTE: unpickling can execute arbitrary code; only read files from a trusted source.
MODEL_READY_PATH = "data/processed/2025/03/model_ready_march-2025.pkl"
df = pd.read_pickle(MODEL_READY_PATH)

In [4]:
# 🧪 Setup: isolate a single clean trial race for the forward pass.
#
# Per the original notes, get_clean_trial_race is meant to guarantee that:
# - the data feed isn't corrupted,
# - the trial race is consistently scoped,
# - the input tensor shape is what we intended.
# (Its implementation lives in utils.testing.testing_utils and is not shown
# in this notebook.)

from utils.testing.testing_utils import get_clean_trial_race

# `df` is already loaded by the "Load model-ready data" cell above; reading
# the same pickle a second time here only repeated the disk I/O.
trial_df = get_clean_trial_race(df)


🎯 Trial race_id: 891228
✅ Cleaned trial race shape: (11, 102)


In [5]:
# ✅ Columns handed to batch_races as `nlp_cols`
NLP_COLS = [
    "comment_vector",
    "spotlight_vector",
]

In [6]:
# ✅ Batch the trial race (test run) and keep the first batch for the
# forward-pass check below.
batching_options = {
    "float_cols": FLOAT_FEATURES,
    "idx_cols": IDX_FEATURES,
    "nlp_cols": NLP_COLS,
    "batch_size": 1,       # only one race for demo
    "max_runners": None,   # dynamic padding
    "shuffle": False,
}
batches = batch_races(trial_df, **batching_options)
batch = batches[0]

In [7]:
# ✅ Extract dimensions for model instantiation
float_dim = len(FLOAT_FEATURES)
# One vocabulary size per index feature: the largest index present in `df`
# plus one (assumes indices are 0-based). `.max()` returns a NumPy scalar
# (or a float if the column was upcast), so cast to a built-in int before
# the sizes are handed to the model.
idx_vocab_sizes = [int(df[col].max()) + 1 for col in IDX_FEATURES]
nlp_dim = NLP_VECTOR_SIZE

In [8]:
# ✅ Convert batch to PyTorch tensors
# Everything is float32 except the embedding indices, which are int64 (long).
_F32 = torch.float32
float_feats = torch.tensor(batch["float_features"], dtype=_F32)
idx_feats = torch.tensor(batch["embedding_indices"], dtype=torch.long)
comment_vecs, spotlight_vecs = (
    torch.tensor(batch[field], dtype=_F32)
    for field in ("comment_vector", "spotlight_vector")
)
mask = torch.tensor(batch["mask"], dtype=_F32)

In [9]:
# ✅ Instantiate model from the dimensions computed in the previous cells
model_config = {
    "idx_vocab_sizes": idx_vocab_sizes,
    "float_dim": float_dim,
    "nlp_dim": nlp_dim,
}
model = RaceTransformer(**model_config)

In [10]:
# ✅ Run forward pass: eval mode, gradient tracking disabled
model_inputs = (float_feats, idx_feats, comment_vecs, spotlight_vecs, mask)

model.eval()
with torch.no_grad():
    out = model(*model_inputs)

print("✅ Output shape:", out.shape)  # (B, R)
print("🔢 Sample output:", out[0])

✅ Output shape: torch.Size([1, 11])
🔢 Sample output: tensor([0.5401, 0.5755, 0.6598, 0.7424, 0.5578, 0.5471, 0.5668, 0.5722, 0.4755,
        0.5619, 0.5415])


  output = torch._nested_tensor_from_mask(


In [None]:
# Reference output from the forward pass above (pasted text, not code):
# 🔢 Sample output: tensor([0.5401, 0.5755, 0.6598, 0.7424, 0.5578, 0.5471, 0.5668, 0.5722, 0.4755,
#         0.5619, 0.5415])

In [20]:
# Pick the first race_id that appears in df and count its rows.
# NOTE(review): no non-runner filter is applied in this cell; the label below
# presumably refers to filtering done upstream — confirm.
unique_race_ids = df["race_id"].unique()
race_id = unique_race_ids[0]
print("🎯 Trial race_id:", race_id)

is_trial_race = df["race_id"].eq(race_id)
race_df = df.loc[is_trial_race]
print("🏇 Runners in race (after non-runner filter):", len(race_df))


🎯 Trial race_id: 891228
🏇 Runners in race (after non-runner filter): 11


In [21]:
# Validate that the float features contain no NaNs.
import pandas as pd
import torch
from utils.batching.batch_races import batch_races
from modeling.inputs_config import FLOAT_FEATURES, IDX_FEATURES, NLP_VECTOR_SIZE

# Set up schema
NLP_COLS = ["comment_vector", "spotlight_vector"]

# ✅ Load post-fix model-ready dataset
df = pd.read_pickle("data/processed/2025/03/model_ready_march-2025.pkl")

# ✅ Run batching over the whole frame (batch_size=1, original order)
nan_check_batches = batch_races(
    df,
    float_cols=FLOAT_FEATURES,
    idx_cols=IDX_FEATURES,
    nlp_cols=NLP_COLS,
    batch_size=1,
    max_runners=None,
    shuffle=False
)

# ✅ Check for NaNs in float features of the first batch.
# `batch` and `float_feats_new` keep their original names and meaning so any
# later cell that reads them is unaffected.
batch = nan_check_batches[0]
float_feats_new = torch.tensor(batch["float_features"], dtype=torch.float32)
print("❓ NaNs in float_feats:", float_feats_new.isnan().sum().item())

# The first batch is only a slice of the data, so also count NaNs across every
# batch; otherwise the check says nothing about the rest of the dataset.
total_float_nans = sum(
    int(torch.tensor(b["float_features"], dtype=torch.float32).isnan().sum())
    for b in nan_check_batches
)
print("❓ NaNs in float_feats (all batches):", total_float_nans)


❓ NaNs in float_feats: 0
