In [26]:
import torch
import pandas as pd
from src.process.stats_encoder import StatsEncoder
from transformers import GPT2Tokenizer
import src.models.encoder as encoder_module 
import matplotlib.pyplot as plt

In [27]:
# Load the dataset: per-game box-score columns plus question/answer text.
# The CSV's first column is a saved row index (it appears as "Unnamed: 0"
# when parsed as data), so read it as the index rather than as a feature column.
df = pd.read_csv("data/dataset.csv", index_col=0)
df.head()


Unnamed: 0,date,MP,PTS,FG%,TRB,AST,STL,BLK,TOV,PF,Result,question,answer
0,2007-05-21,45:22,10.0,0.333,10.0,9.0,4.0,1.0,2.0,1.0,0,"I know you say that you're a football player, ...",(Laughing) It was definitely a physical game t...
1,2007-05-21,45:22,10.0,0.333,10.0,9.0,4.0,1.0,2.0,1.0,0,"The last play there, Coach said that was kind ...","No, I go for the winning play. If two guys co..."
2,2007-05-21,45:22,10.0,0.333,10.0,9.0,4.0,1.0,2.0,1.0,0,"As a franchise player, how do you justify only...","No, you've just got to take what's there. It'..."
3,2007-05-21,45:22,10.0,0.333,10.0,9.0,4.0,1.0,2.0,1.0,0,Coach said that a couple of adjustments need t...,We definitely played pretty well. Both teams ...
4,2007-05-21,45:22,10.0,0.333,10.0,9.0,4.0,1.0,2.0,1.0,0,"Third quarter, again, you guys fell behind 7-0...",Not sure. It's just something that we've got ...


In [28]:
# Numeric per-game columns used as encoder input (box-score stats plus 'Result').
numeric_columns = ['PTS', 'AST', 'TRB', 'STL', 'BLK', 'FG%', 'TOV', 'PF', 'Result']
stats_data = df.loc[:, numeric_columns]

# Convert the selected columns to a float32 tensor: one row per DataFrame row,
# one column per entry in numeric_columns.
stats_tensor = torch.tensor(stats_data.to_numpy(), dtype=torch.float32)

print(stats_tensor.shape)


torch.Size([1193, 9])


In [29]:
# Size the encoder's input from the feature axis (dim 1) of stats_tensor.
input_dim = stats_tensor.size(1)
encoder = StatsEncoder(input_dim=input_dim)


In [30]:
# Push every row of stats_tensor through the encoder in one call.
output = encoder(stats_tensor)

# Report the shapes before and after encoding, then show the first row's embedding.
print(f"Input shape: {stats_tensor.shape}")
print(f"Output shape: {output.shape}")
print(f"Sample output embedding: {output[0]}")


Input shape: torch.Size([1193, 9])
Output shape: torch.Size([1193, 32])
Sample output embedding: tensor([-0.2570,  1.0996,  0.4883, -0.7144, -0.4034, -0.1482, -0.0683, -0.3039,
        -0.4909, -0.4835,  0.5232, -0.9005, -0.2558,  0.3948, -0.1877,  0.0289,
         0.9870,  0.0267,  0.0456, -0.1573,  0.9555,  0.9101,  0.0060,  0.6319,
         0.6138,  0.4949,  0.4973, -0.0089, -0.5625, -0.3327, -0.0397, -0.5056],
       grad_fn=<SelectBackward0>)


In [31]:
# Short alias for the fusion model class defined in src.models.encoder.
GameStats2TextModel = encoder_module.GameStats2TextModel

# Draw a small, reproducible batch (fixed random_state) from the DataFrame that
# was already loaded above. None of the intervening cells modify `df`, so
# re-reading the CSV here would only repeat the I/O and rebind the same name.
batch = df.sample(4, random_state=0)

# Box-score columns fed to the model's stats input; 'Result' is not included here.
stat_cols = ['PTS','AST','TRB','STL','BLK','FG%','TOV','PF']
stats = torch.tensor(batch[stat_cols].values, dtype=torch.float32)

In [32]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the end-of-sequence token as the padding token for the padded batch below.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the sampled questions as one batch: pad to the longest question and
# truncate anything beyond 64 tokens, returning PyTorch tensors.
questions = batch['question'].tolist()
encoded = tokenizer(
    questions,
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors='pt',
)

input_ids = encoded['input_ids']
attention_mask = encoded['attention_mask']

In [33]:
# Build the stats+text model. The stats input width is taken from the batch
# tensor so it always matches the number of columns in stat_cols.
model = GameStats2TextModel(
    stats_input_dim=stats.shape[1],
    stats_hidden_dims=[128, 64],
    stats_output_dim=32,
    gpt_model_name='gpt2',
    fusion_method='concat',
)

In [34]:
# Forward pass for a shape check only; no_grad() disables gradient tracking.
with torch.no_grad():
    out = model(stats, input_ids, attention_mask)

# Print the shapes of both inputs and of the model output.
print(f" stats shape: {stats.shape}")
print(f" input_ids shape: {input_ids.shape}")
print(f" output shape: {out.shape}")

 stats shape: torch.Size([4, 8])
 input_ids shape: torch.Size([4, 49])
 output shape: torch.Size([4, 49, 768])
