# Imports, Load data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Subset

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score

import joblib
from tqdm import tqdm

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

sns.set_theme(context='talk', style='darkgrid', palette='magma')

# global random_state
random_state = 9

In [2]:
# set up GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [5]:
df = pd.read_csv('../train_data/non_seq.csv').drop('Unnamed: 0', axis=1)
df.shape, len(set(df.player))

((969, 66), 218)

- 969 QB seasons.
- 64 features + year and target.
- 218 unique QBs.

# Prepare data

In [6]:
# look at all 66 cols
df.columns

Index(['player', 'team_name', 'player_game_count', 'pass_accuracy_percent',
       'pass_aimed_passes', 'pass_attempts', 'pass_avg_depth_of_target',
       'pass_avg_time_to_throw', 'pass_bats', 'pass_big_time_throws',
       'pass_btt_rate', 'pass_completion_percent', 'pass_completions',
       'pass_def_gen_pressures', 'pass_drop_rate', 'pass_dropbacks',
       'pass_drops', 'pass_first_downs', 'pass_grades_hands_fumble',
       'pass_grades_offense', 'pass_grades_pass', 'pass_hit_as_threw',
       'pass_interceptions', 'pass_passing_snaps', 'pass_penalties',
       'pass_pressure_to_sack_rate', 'pass_qb_rating', 'pass_sack_percent',
       'pass_sacks', 'pass_scrambles', 'pass_spikes', 'pass_thrown_aways',
       'pass_touchdowns', 'pass_turnover_worthy_plays', 'pass_twp_rate',
       'pass_yards', 'pass_ypa', 'year', 'rush_attempts',
       'rush_avoided_tackles', 'rush_breakaway_attempts',
       'rush_breakaway_percent', 'rush_breakaway_yards', 'rush_designed_yards',
       'rush

In [7]:
# normal numerical (non-percentage) stats
numerical_features = ['player_game_count', 'pass_aimed_passes', 'pass_avg_depth_of_target', 'pass_avg_time_to_throw', 
                      'pass_big_time_throws', 'pass_completions', 'pass_def_gen_pressures', 'pass_dropbacks', 'pass_drops', 
                      'pass_first_downs', 'pass_grades_offense', 'pass_grades_pass', 'pass_hit_as_threw', 
                      'pass_passing_snaps', 'pass_sacks', 'pass_scrambles', 'pass_spikes', 'pass_thrown_aways', 
                      'pass_touchdowns', 'pass_turnover_worthy_plays', 'pass_yards', 'rush_attempts', 'rush_avoided_tackles', 
                      'rush_designed_yards', 'rush_explosive', 'rush_first_downs', 'rush_fumbles', 'rush_grades_run', 
                      'rush_scramble_yards', 'rush_total_touches', 'rush_touchdowns', 'rush_yards', 'rush_yards_after_contact', 
                      'exp']

# using many of the same features as above, but percentage stats where possible
percent_features = ['player_game_count', 'pass_accuracy_percent', 'pass_avg_depth_of_target', 'pass_avg_time_to_throw', 
                    'pass_btt_rate', 'pass_drop_rate', 'pass_dropbacks', 'pass_first_downs', 'pass_grades_offense', 
                    'pass_grades_pass', 'pass_hit_as_threw', 'pass_passing_snaps', 'pass_pressure_to_sack_rate', 
                    'pass_sack_percent', 'pass_scrambles', 'pass_spikes', 'pass_thrown_aways', 'pass_touchdowns', 
                    'pass_twp_rate', 'pass_ypa', 'rush_attempts', 'rush_avoided_tackles', 'rush_designed_yards', 
                    'rush_explosive', 'rush_first_downs', 'rush_fumbles', 'rush_grades_run', 'rush_scramble_yards', 
                    'rush_total_touches', 'rush_touchdowns', 'rush_ypa', 'rush_yco_attempt', 'exp']

len(numerical_features), len(percent_features)

(34, 33)

- 34 features of pure numerical stats.
- 33 features of percentage stats, stats normalized by attempt, as well as a few pure numerical stats.

In [95]:
# features and target
X = df[numerical_features]
y = df.target

# train/test split, 90/10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

# train/val split, 80/20
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=random_state)

In [96]:
# create scaler
scaler = StandardScaler()

# scale data
train_scaled = scaler.fit_transform(X_train)
val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(X_test)

# save the scaler
joblib.dump(scaler, '../model/dense_scaler.pkl')

['../model/dense_scaler.pkl']

In [97]:
# convert data to tensors, specify 'device' to use gpu
X_train = torch.tensor(train_scaled, dtype=torch.float32, device=device)
y_train = torch.tensor(y_train.values, dtype=torch.float32, device=device)

X_val = torch.tensor(val_scaled, dtype=torch.float32, device=device)
y_val = torch.tensor(y_val.values, dtype=torch.float32, device=device)

X_test = torch.tensor(test_scaled, dtype=torch.float32, device=device)
y_test = torch.tensor(y_test.values, dtype=torch.float32, device=device)

In [98]:
# create datasets
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
test_data = TensorDataset(X_test, y_test)

# batch size
batch_size = 8

# create dataloaders
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=X_val.shape[0], shuffle=False)
test_loader = DataLoader(dataset=test_data, batch_size=X_test.shape[0], shuffle=False)

# shapes of inputs, targets
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

(torch.Size([697, 34]),
 torch.Size([175, 34]),
 torch.Size([97, 34]),
 torch.Size([697]),
 torch.Size([175]),
 torch.Size([97]))

# Dense NN

In [111]:
# simple NN with 1 hidden layer and relu activation
class dense(nn.Module):
    def __init__(self, input_dim, hidden_nodes):
        super(dense, self).__init__()
        
        # Linear layers
        self.fc1 = nn.Linear(input_dim, hidden_nodes)
        self.output = nn.Linear(hidden_nodes, 1)
        
        # batch norm
        self.bn1 = nn.BatchNorm1d(hidden_nodes)
        
        
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.output(x)
        return x

In [112]:
# 5 layers
# class dense(nn.Module):
#     def __init__(self, input_dim, hidden_nodes):
#         super(dense, self).__init__()
        
#         # Linear layers
#         self.fc1 = nn.Linear(input_dim, hidden_nodes)
#         self.fc2 = nn.Linear(hidden_nodes, int(hidden_nodes/2))
#         self.fc3 = nn.Linear(int(hidden_nodes/2), int(hidden_nodes/4))
#         self.fc4 = nn.Linear(int(hidden_nodes/4), int(hidden_nodes/8))
#         self.output = nn.Linear(int(hidden_nodes/8), 1)
        
#         # batch norm
#         self.bn1 = nn.BatchNorm1d(hidden_nodes)
#         self.bn2 = nn.BatchNorm1d(int(hidden_nodes/2))
#         self.bn3 = nn.BatchNorm1d(int(hidden_nodes/4))
#         self.bn4 = nn.BatchNorm1d(int(hidden_nodes/8))
        
        
        
#     def forward(self, x):
#         x = torch.relu(self.bn1(self.fc1(x)))
#         x = torch.relu(self.bn2(self.fc2(x)))
#         x = torch.relu(self.bn3(self.fc3(x)))
#         x = torch.relu(self.bn4(self.fc4(x)))
#         x = self.output(x)
#         return x

In [113]:
# instantiate model with 64 hidden neurons
model = dense(input_dim=X.shape[1], hidden_nodes=int(64))

# move params to gpu
model.to(device)

# loss, optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Train loop

In [114]:
# epochs
num_epochs = int(1000)

# early stopping
patience = 20
best_val_loss = float('inf')
epochs_without_improvement = 0

# train loop
for epoch in range(num_epochs):
    # set model to train mode
    model.train()

    # sum of rmse, preds, and y. reset at the start of every epoch
    sum_rmse = 0
    epoch_preds = []
    epoch_y = []

    for x, y in train_loader:
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward pass
        train_preds = model(x).squeeze(-1)
        # calc loss
        train_loss = criterion(train_preds, y)
        # backward pass
        train_loss.backward()
        # optimize
        optimizer.step()

        # get rmse for current batch, add to sum rmse
        sum_rmse += np.sqrt(train_loss.item())

        # get preds and y from batch to calc r2
        epoch_preds.extend(train_preds.numpy(force=True))
        epoch_y.extend(y.numpy(force=True))

    # train metrics
    train_rmse = sum_rmse / len(train_data)
    train_r2 = r2_score(epoch_y, epoch_preds)



    # validation
    model.eval()

    with torch.inference_mode():
        for x, y in val_loader:
            # forward pass
            val_preds = model(x).squeeze(-1)
            # calc loss
            val_loss = criterion(val_preds, y)

        # val metrics
        val_rmse = np.sqrt(val_loss.item())
        val_r2 = r2_score(y.numpy(force=True), val_preds.numpy(force=True))



    # print metrics
    if (epoch+1) % 1e1 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Train RMSE: {train_rmse:.4f}')
        print(f'Train R2: {train_r2:.4f}')
        print(f'Val RMSE: {val_rmse:.4f}')
        print(f'Val R2: {val_r2:.4f}')
        print()

    # early stopping
    if val_rmse < best_val_loss:
        best_val_loss = val_rmse
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

        # break out of train loop if we reach patience value
        if epochs_without_improvement == patience:
            print(f'Early stopping on Epoch {epoch+1}.')
            break



print(f'\nFinal validation RMSE: {val_rmse}')
print(f'Best validation RMSE: {best_val_loss}\n')

Epoch [10/1000]
Train RMSE: 2.8439
Train R2: -1.4865
Val RMSE: 20.4310
Val R2: -0.9504

Epoch [20/1000]
Train RMSE: 2.0294
Train R2: -0.2929
Val RMSE: 15.1025
Val R2: -0.0657

Epoch [30/1000]
Train RMSE: 1.6362
Train R2: 0.1652
Val RMSE: 12.8606
Val R2: 0.2272

Epoch [40/1000]
Train RMSE: 1.5055
Train R2: 0.3067
Val RMSE: 12.6840
Val R2: 0.2483

Epoch [50/1000]
Train RMSE: 1.4439
Train R2: 0.3528
Val RMSE: 12.1429
Val R2: 0.3110

Epoch [60/1000]
Train RMSE: 1.4454
Train R2: 0.3705
Val RMSE: 12.3569
Val R2: 0.2865

Epoch [70/1000]
Train RMSE: 1.4384
Train R2: 0.3983
Val RMSE: 12.0598
Val R2: 0.3204

Epoch [80/1000]
Train RMSE: 1.4052
Train R2: 0.4062
Val RMSE: 12.2283
Val R2: 0.3013

Epoch [90/1000]
Train RMSE: 1.3800
Train R2: 0.4134
Val RMSE: 11.9252
Val R2: 0.3355

Epoch [100/1000]
Train RMSE: 1.3518
Train R2: 0.4305
Val RMSE: 12.0544
Val R2: 0.3210

Epoch [110/1000]
Train RMSE: 1.3709
Train R2: 0.4356
Val RMSE: 11.9998
Val R2: 0.3272

Epoch [120/1000]
Train RMSE: 1.3555
Train R2: 0.

- So far, best val RMSE is 11.8 and best val R2 is 0.33

# KFolds Validation

In [58]:
# use KFold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# indices to split on
splits = kf.split(range(len(train_data)))

In [69]:
# list to store each set of (train, val) dataloaders
dataloaders = []

for fold, (train_idx, val_idx) in enumerate(splits):
    # subset data to create a train and val set for each fold
    train_subset = Subset(train_data, train_idx)
    val_subset = Subset(train_data, val_idx)
    
    # create dataLoaders for training and validation sets
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=len(val_subset), shuffle=False)
    
    # store in list
    dataloaders.append((train_loader, val_loader))

In [70]:
# epochs
num_epochs = int(100)

# store val loss and r2 for each fold
kfold_loss = []
kfold_r2 = []

# early stopping
patience = 20
best_val_loss = float('inf')
epochs_without_improvement = 0

# KFold loop, 5 iterations for 5 splits
for fold, (train_loader, val_loader) in enumerate(dataloaders):
    print(f"Training fold {fold+1}:")
    
    # initialize model, move params to gpu
    model = dense(input_dim=X_train.shape[1], hidden_nodes=int(64))
    model.to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # train loop
    for epoch in range(num_epochs):
        model.train()
        
        # sum of rmse, preds, and y. reset at the start of every epoch
        sum_rmse = 0
        epoch_preds = []
        epoch_y = []

        for x, y in train_loader:
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward pass
            train_preds = model(x).squeeze(-1)
            # calc loss
            train_loss = criterion(train_preds, y)
            # backward pass
            train_loss.backward()
            # optimize
            optimizer.step()

            # get rmse for current batch, add to sum rmse
            sum_rmse += np.sqrt(train_loss.item())

            # get preds and y from batch to calc r2
            epoch_preds.extend(train_preds.numpy(force=True))
            epoch_y.extend(y.numpy(force=True))
            
        # train metrics
        train_rmse = sum_rmse / len(train_data)
        train_r2 = r2_score(epoch_y, epoch_preds)
        
        
        
        # validation
        model.eval()
        
        with torch.inference_mode():
            for x, y in val_loader:
                # forward pass
                val_preds = model(x).squeeze(-1)
                # calc loss
                val_loss = criterion(val_preds, y)

            # val metrics
            val_rmse = np.sqrt(val_loss.item())
            val_r2 = r2_score(y.numpy(force=True), val_preds.numpy(force=True))

            
            
        # print metrics
#         if (epoch+1) % 1e1 == 0:
#             print(f'Epoch [{epoch+1}/{num_epochs}]')
#             print(f'Train RMSE: {train_rmse:.4f}')
#             print(f'Train R2: {train_r2:.4f}')
#             print(f'Val RMSE: {val_rmse:.4f}')
#             print(f'Val R2: {val_r2:.4f}')
#             print()

        # early stopping
        if val_loss.item() < best_val_loss:
            best_val_loss = val_rmse
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

            # break out of train loop if we reach patience value
            if epochs_without_improvement == patience:
                print(f'Early stopping on Epoch {epoch+1}.')
                break

                
    
    print(f'Final validation RMSE: {val_rmse}\n')
    print(f'Best validation RMSE: {best_val_loss}\n')
    
    # append validation metrics at the end of a fold
    kfold_loss.append(val_rmse)
    kfold_loss.append(val_r2)

In [29]:
np.mean(kfold_loss), np.std(kfold_loss)

(7.1108801163769115, 10.908309078255208)

# Results df
- Load in dataframe from dense.csv to store all results from dense models.
- Stores loss, r2, hyperparameters, and features used.

In [57]:
# load in dense.csv ()
dense_df = pd.read_csv('../model/dense.csv')
dense_df

Unnamed: 0,train_rmse,train_r2,'test_rmse,test_r2,num_layers,shape,optim,epochs,batch_size,feats


In [None]:
Subset()

In [None]:
dense_df

In [None]:
def fit_eval(layers, df=dense_df, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):

    '''
    Fit model, append performance metrics and hyperparameters to df.
    
    Parameters:
    layers - a list containing node counts for hidden layers

    Returns:
    df - dense_df passed in with the results appended from the pipeline passed in, params of nn also included
    
    '''
    


    # get number of layers
    

    # get shape of nn
    nn_shape = []
    for i, layer in enumerate():
        if i % 2 == 0:
            nn_shape.append(layer.units)

    # get optimizer
    optim = str(nn.optimizer).split()[0].split('.')[-1]
    
    
    
    # append results to last row of the df
    df.loc[len(df.index)] = [train_loss, test_loss, num_layers, nn_shape, optimizer, num_epochs, batch_size]
    
    
    
    return df

# Test set

In [116]:
# set model to eval mode
model.eval()

with torch.inference_mode():
    # test loop (single batch)
    for i, (inputs, targets) in enumerate(test_loader):
        # forward pass
        test_preds = model(inputs).squeeze()
        # calc loss
        test_loss = criterion(test_preds, targets)

        # test metrics
        test_loss = np.sqrt(test_loss.item())
        test_r2 = r2_score(targets.numpy(force=True), test_preds.numpy(force=True))

In [117]:
print(f'Test RMSE: {test_loss:.4f}')
print(f'Test R2: {test_r2:.4f}')

Test RMSE: 13.2927
Test R2: 0.2553


# 2022 Rookie QBs
- Making projections for the 2023 season.
- Using the first 10/18 2023 weeks as the target.

In [None]:
pd.read_csv('../train_data/rookies_2022')