## Import Data

In [None]:
# helper.py
from helper import *

# pytorch
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

# set up GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [21]:
# load data
df = pd.read_csv('./data/clean/features.csv')

# drop all aggregated cols
df = df.drop(columns=[col for col in df.columns if 'mean' in col or 'std' in col])

# check
df.shape, len(set(df.player))

((1099, 66), 237)

In [54]:
def create_player_histories(df):
    """
    Returns:
      X_pad: FloatTensor, shape (n_players, max_seasons, n_features).
      y:      FloatTensor, shape (n_players,).
      lengths: IntTensor, shape (n_players,).
      mask:   BoolTensor, shape (n_players, max_seasons).
      players: list of player names.
    """

    # non-feature columns
    non_feat_cols = ['player', 'team_name', 'year', 'target']

    # init lists
    sequences, labels, players = [], [], []

    # iterate through each player
    for player, g in df.groupby('player'):
        # sort
        g = g.sort_values('year').reset_index(drop=True)

        # cache the feature matrix once
        feat_mat = g.drop(columns=non_feat_cols).values

        # iterate through each season
        for i in range(len(g)):
            # seasons 0 through i
            seq = torch.tensor(feat_mat[:i+1], dtype=torch.float32)
            
            # target for season i
            lbl = torch.tensor(g['target'].iloc[i], dtype=torch.float32)

            # append to lists
            sequences.append(seq)
            labels.append(lbl)
            players.append(player)

    # pad to longest sequence
    X_pad = pad_sequence(sequences, batch_first=True)  

    # build mask so model knows which timesteps are real
    lengths = torch.tensor([seq.size(0) for seq in sequences])
    max_len = X_pad.size(1)
    mask = torch.arange(max_len)[None, :] < lengths[:, None]

    # create labels
    y = torch.stack(labels)

    return X_pad, y, lengths,  mask, players

In [55]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# non-feature columns
non_feat_cols = ['player', 'team_name', 'year', 'target']
feat_cols = [c for c in df.columns if c not in non_feat_cols]

# split df into train/val by player to avoid leakage
players = df['player'].unique()
train_players, val_players = train_test_split(players, test_size=0.2, random_state=SEED)
df_train = df[df['player'].isin(train_players)].copy()
df_val   = df[df['player'].isin(val_players)].copy()

# scale features
scaler = StandardScaler()
df_train[feat_cols] = scaler.fit_transform(df_train[feat_cols])
df_val  [feat_cols] = scaler.transform(df_val[feat_cols])

# recombine so our create_player_histories sees scaled data
df_scaled = pd.concat([df_train, df_val], ignore_index=True)

# build sequences
X_pad, y, lengths, mask, players = create_player_histories(df_scaled)

# split train/val
idx = torch.arange(len(y))
train_idx = idx[[p in train_players for p in players]]
val_idx = idx[[p in val_players for p in players]]
X_train, X_val = X_pad[train_idx], X_pad[val_idx]
len_train, len_val = lengths[train_idx], lengths[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

# torch dataset
class SeqDataset(Dataset):
    def __init__(self, X, lengths, y):
        self.X, self.lengths, self.y = X, lengths, y

    def __len__(self): return len(self.y)
    def __getitem__(self, i): return self.X[i], self.lengths[i], self.y[i]

# create dataset and dataloader
train_ds = SeqDataset(X_train, len_train, y_train)
val_ds = SeqDataset(X_val, len_val, y_val)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32)

# simple LSTM
class LSTMRegressor(nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(in_dim, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim, 1)

    def forward(self, x, lengths):
        packed, _ = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        out = hn[-1]                    
        return self.head(out).squeeze(1)

# instantiate
model = LSTMRegressor(in_dim=X_pad.size(2), hidden_dim=64).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [56]:
# training loop
for epoch in range(10):
    model.train()
    train_loss = 0
    for Xb, lb, yb in train_dl:
        Xb = Xb.to(device)
        yb = yb.to(device)
        opt.zero_grad()
        preds = model(Xb, lb)
        loss  = criterion(preds, yb)
        loss.backward()
        opt.step()
        train_loss += loss.item()*Xb.size(0)
    train_loss /= len(train_ds)

    model.eval()
    val_preds, val_trues = [], []
    with torch.no_grad():
        for Xb, lb, yb in val_dl:
            Xb = Xb.to(device)
            yb = yb.to(device)
            vp = model(Xb, lb)
            val_preds.append(vp.cpu())
            val_trues.append(yb.cpu())
    val_preds = torch.cat(val_preds).numpy()
    val_trues = torch.cat(val_trues).numpy()

    rmse = mean_squared_error(val_trues, val_preds, squared=False)
    r2   = r2_score(val_trues, val_preds)
    print(f"Epoch {epoch+1} → train MSE {train_loss:.4f}, val RMSE {rmse:.4f}, R² {r2:.3f}")

ValueError: too many values to unpack (expected 2)

In [67]:
def objective_function(hidden_dim, num_layers, dropout, batch_size):
    # cast continuous values to int
    hidden_dim = int(hidden_dim)
    num_layers = int(num_layers)
    batch_size = int(batch_size)
    
    # create sequences
    X, y = create_seq(feature_subset=all_feats, seq_len=2, df=df)
    
    # create dataloaders
    train_loader, val_loader = create_loaders(X, y, test_size=0.1, batch_size=batch_size)

    # create model
    model = RNN(input_dim=66, hidden_dim=hidden_dim, num_layers=num_layers, dropout=dropout).to(device)
        
    # create optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    
    # train model
    train_rmse, train_r2, val_rmse, val_r2, num_epochs, patience = train_val(model, optimizer, train_loader, val_loader)
    
    # return the negative of the validation metric (since Bayesian optimization minimizes)
    return -val_rmse

In [68]:
# define bounds of each hyperparameter
pbounds = {
    'hidden_dim': (4, 1025),
    'num_layers': (2, 17),
    'dropout': (0, 0.9),
    'batch_size': (1, 129)
}

# create the Bayesian optimizer
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=pbounds,
    random_state=random_state,
    allow_duplicate_points=True
)

In [69]:
# iterate through feature space
optimizer.maximize(init_points=10, n_iter=100)

|   iter    |  target   | batch_... |  dropout  | hidden... | num_la... |
-------------------------------------------------------------------------
Early stopping on Epoch 20.
| [0m1        [0m | [0m-13.31   [0m | [0m2.328    [0m | [0m0.4517   [0m | [0m510.2    [0m | [0m4.007    [0m |
Early stopping on Epoch 93.
| [95m2        [0m | [95m-12.43   [0m | [95m19.19    [0m | [95m0.1967   [0m | [95m431.3    [0m | [95m5.722    [0m |
Early stopping on Epoch 54.
| [0m3        [0m | [0m-13.34   [0m | [0m11.76    [0m | [0m0.3109   [0m | [0m174.3    [0m | [0m15.18    [0m |
Early stopping on Epoch 21.
| [0m4        [0m | [0m-72.67   [0m | [0m122.7    [0m | [0m0.03487  [0m | [0m717.8    [0m | [0m10.59    [0m |
Early stopping on Epoch 73.
| [0m5        [0m | [0m-13.32   [0m | [0m115.9    [0m | [0m0.6002   [0m | [0m563.3    [0m | [0m12.54    [0m |
Early stopping on Epoch 39.
| [0m6        [0m | [0m-14.19   [0m | [0m50.47    [0m | [0m0.

Early stopping on Epoch 208.
| [0m56       [0m | [0m-13.33   [0m | [0m90.87    [0m | [0m0.2738   [0m | [0m134.4    [0m | [0m16.37    [0m |
Early stopping on Epoch 63.
| [0m57       [0m | [0m-12.27   [0m | [0m26.34    [0m | [0m0.9      [0m | [0m209.5    [0m | [0m2.0      [0m |
Early stopping on Epoch 32.
| [0m58       [0m | [0m-13.28   [0m | [0m41.07    [0m | [0m0.9      [0m | [0m749.1    [0m | [0m17.0     [0m |
Early stopping on Epoch 75.
| [0m59       [0m | [0m-12.38   [0m | [0m76.71    [0m | [0m0.9      [0m | [0m581.6    [0m | [0m2.0      [0m |
Early stopping on Epoch 469.
| [0m60       [0m | [0m-13.34   [0m | [0m128.5    [0m | [0m0.8691   [0m | [0m60.56    [0m | [0m13.48    [0m |
Early stopping on Epoch 42.
| [0m61       [0m | [0m-12.95   [0m | [0m1.0      [0m | [0m0.0      [0m | [0m346.2    [0m | [0m2.0      [0m |
Early stopping on Epoch 16.
| [0m62       [0m | [0m-67.17   [0m | [0m88.12    [0m | [0m0.0 

In [70]:
# look at params that gave lowest validation RMSE
best_hyperparams = optimizer.max['params']
best_hyperparams

{'batch_size': 32.03638856422199,
 'dropout': 0.9,
 'hidden_dim': 100.74795411866992,
 'num_layers': 2.0}

- Best val_rmse = __11.87__ with a sequence length of 2.

# 2023 predictions
- This model will be trained on the entire train/val data, and then will predict 2023 offensive grade from the 2022 holdout set.
- Since we are using a sequence length of 2 seasons to predict the third, we will use 2021 & 2022 seasons (to predict 2022 target) as the test set.
- This means that players with under 3 seasons played can't be predicted on. I will use the best model (Random Forest) from [this notebook](./models_1.ipynb) to make these predictions.

In [97]:
# get the names of the 48 players that have 2023 targets
player_names_2023 = players_2022.player.values

# master_df includes 2022 rows. create a subset for these players, get players with at least 2 seasons
players_subset = master_df[master_df['player'].isin(player_names_2023)]
players_subset = players_subset.groupby('player').filter(lambda x: len(x) >= 2)

# get last two rows for each player
seq_test = players_subset.groupby('player').apply(lambda x: x.tail(2)).reset_index(drop=True)

In [98]:
# train sequences
X_train, y_train = create_seq(feature_subset=all_feats, seq_len=2, df=df)

# test sequences
X_test, y_test = create_seq(feature_subset=all_feats, seq_len=2, df=seq_test)

In [99]:
# 752 total examples to train on, 42 QBs to predict on
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((752, 2, 66), (752,), (42, 2, 66), (42,))

In [100]:
# create dataloaders
train_loader = create_loaders(X_train, y_train, test_size=0, batch_size=32)
test_loader = create_loaders(X_test, y_test, test_size=0, batch_size=len(X_test))

In [169]:
# best sequence model
best_seq = RNN(input_dim=len(all_feats), hidden_dim=98, num_layers=2, dropout=0.9).to(device)

# create optimizer
optimizer = torch.optim.AdamW(best_seq.parameters(), lr=0.001)

In [170]:
# loss function
criterion = nn.MSELoss()

# 100 epochs
num_epochs = 100

# training mode
best_seq.train()

# training loop
for epoch in range(num_epochs):  
    # train batches
    for x, y in train_loader:
        # put x and y on gpu
        x = x.to(device)
        y = y.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward pass
        train_preds = best_seq(x)
        # calc loss
        train_loss = criterion(train_preds, y)
        # backward pass
        train_loss.backward()
        # optimize
        optimizer.step()

In [171]:
# test set
best_seq.eval()

with torch.inference_mode():
    for x, y in test_loader:
        # put x and y on gpu
        x = x.to(device)
        y = y.to(device)

        # forward pass
        test_preds = best_seq(x)
        # calc loss
        test_loss = criterion(test_preds, y)

        # performance metrics
        rmse = np.sqrt(((test_preds - y) ** 2).sum().item() / y.shape[0])
        r2 = r2_score(y.numpy(force=True), test_preds.numpy(force=True))

print(f'RMSE: {rmse:.3f}')
print(f'R^2: {r2:.3f}')

RMSE: 12.895
R^2: 0.317


- On the 42/48 QBs who have 3+ seasons played, our model predicts their 2023 offensive grade with an RMSE of 12.895.

In [172]:
# get 2022 players who can't be predicted on with sequence model (players with less than 2 seasons)
players_subset = master_df[master_df['player'].isin(player_names_2023)]
players_subset = players_subset.groupby('player').filter(lambda x: len(x) < 2)

# get last row for each player
non_seq_test = players_subset.groupby('player').apply(lambda x: x.tail(1)).reset_index(drop=True)
non_seq_test.shape

(6, 70)

- 6 QBs in 2023 with less than 3 seasons played.

In [173]:
# best random forest
best_rf = RandomForestRegressor(random_state=random_state, min_samples_split=112)

# features and target
X_train = df[all_feats]
y_train = df.target
X_test = non_seq_test[all_feats]
y_test = non_seq_test.target

# create pieline
pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('rf', best_rf)
    ])

# train on entire dataset
pipeline.fit(X_train, y_train)

# predict
preds = pipeline.predict(X_test)

rmse = mean_squared_error(y_test, preds, squared=False)
r2 = r2_score(y_test, preds)

print(f'RMSE: {rmse:.3f}')
print(f'R^2: {r2:.3f}')

RMSE: 11.658
R^2: 0.245


In [194]:
# combine preds from the two models
y_pred = np.concatenate([test_preds.squeeze(-1).cpu().numpy(), preds])

# get true values
y_true = np.concatenate([y.squeeze(-1).cpu().numpy(), y_test])

# look at overall performance
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)
print(f'RMSE: {rmse:.3f}')
print(f'R^2: {r2:.3f}')

RMSE: 12.747
R^2: 0.338


- Using an RNN with a sequence length of 2 (paired with best Random Forest), we achieve a RMSE of __12.75__.
- This is worse performance than just the Random Forest from [models_1](./models_1.ipynb).

In [291]:
# player names
player_names = seq_test.player.unique().tolist() + non_seq_test.player.unique().tolist()

# teams
team_names = []
for _, group in seq_test.groupby('player'):
    team_names.append(group.iloc[-1].team_name)

team_names.extend(non_seq_test.team_name.tolist())