<a href="https://colab.research.google.com/github/jskaza/nfl-big-data-bowl-2023/blob/master/sack_probability_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using RePP to Predict Pass Rusher Impact as Plays Develop
*RePP: **Re**current **P**ressure **P**robabilities*

**Jon Skaza & Matt Guthrie**

In [24]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, precision_recall_curve, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
import pickle
# Detect whether the notebook is running on Google Colab: the `google.colab`
# package is only importable there.
try:
  from google.colab import drive
  in_colab = True
except ImportError:
  # Only a missing module means "not on Colab"; anything else (including
  # KeyboardInterrupt) should surface instead of being silently swallowed.
  in_colab = False

seed = 314 # for reproducibility, used in various places
np.random.seed(seed)
random.seed(seed)

## Dataset Preprocessing

In [25]:
# Resolve the project root: the mounted Google Drive folder on Colab, otherwise
# the directory named by the BIG_DATA_BOWL environment variable.
if in_colab:
  drive.mount("/content/drive")
  path = "/content/drive/MyDrive/nfl-big-data-bowl-2023"
else:
  path = os.environ.get("BIG_DATA_BOWL")
  if not path:
    # Without this check an unset variable yields the path "None/data/dataset.csv"
    # and a confusing FileNotFoundError further down.
    raise EnvironmentError(
      "Set the BIG_DATA_BOWL environment variable to the project directory "
      "(the folder that contains data/dataset.csv)."
    )

# The first CSV column holds the row index.
df = pd.read_csv(f"{path}/data/dataset.csv", index_col=0)
df.head()

Unnamed: 0,game_id,play_id,nfl_id,speed,pressure,x,y,dist_from_qb,qb_in_tackle_box,n_blockers,...,speed_qb,x_qb,y_qb,x_ball,y_ball,quarter,down,yards_to_go,absolute_yardline_number,score_delta
1,2021090900,97,41263,0.96,1,1.74,-5.03,7.802083,1.0,5,...,0.35,-3.92,0.34,0.0,0.0,1,3,2,43.0,0
2,2021090900,97,41263,1.08,1,1.63,-5.01,7.766557,1.0,5,...,0.54,-4.0,0.34,-0.49,0.02,1,3,2,43.0,0
3,2021090900,97,41263,1.3,1,1.47,-4.99,7.695193,1.0,5,...,0.8,-4.09,0.33,-0.85,0.03,1,3,2,43.0,0
4,2021090900,97,41263,1.48,1,1.31,-4.94,7.603138,1.0,5,...,0.99,-4.18,0.32,-1.29,0.05,1,3,2,43.0,0
5,2021090900,97,41263,2.16,1,1.04,-4.83,7.404627,1.0,5,...,1.19,-4.29,0.31,-1.77,0.06,1,3,2,43.0,0


In [26]:
# Examine missingness: df.info() prints the non-null count of every column.
# Per the original note, the models will need balanced sequences, so rows with
# missing values have to be dealt with before the datasets are built.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1077605 entries, 1 to 1077605
Data columns (total 30 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   game_id                   1077605 non-null  int64  
 1   play_id                   1077605 non-null  int64  
 2   nfl_id                    1077605 non-null  int64  
 3   speed                     1077605 non-null  float64
 4   pressure                  1077605 non-null  int64  
 5   x                         1077605 non-null  float64
 6   y                         1077605 non-null  float64
 7   dist_from_qb              1077605 non-null  float64
 8   qb_in_tackle_box          1077333 non-null  float64
 9   n_blockers                1077605 non-null  int64  
 10  x_C                       1077605 non-null  float64
 11  x_LT                      1077605 non-null  float64
 12  x_LG                      1077605 non-null  float64
 13  x_RG                      1

In [27]:
# Drop every row that has at least one missing value. The result is rebound to
# `df` instead of using inplace=True, which is the recommended pandas idiom and
# yields the same frame.
df = df.dropna(axis = 0, how = "any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1077203 entries, 1 to 1077605
Data columns (total 30 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   game_id                   1077203 non-null  int64  
 1   play_id                   1077203 non-null  int64  
 2   nfl_id                    1077203 non-null  int64  
 3   speed                     1077203 non-null  float64
 4   pressure                  1077203 non-null  int64  
 5   x                         1077203 non-null  float64
 6   y                         1077203 non-null  float64
 7   dist_from_qb              1077203 non-null  float64
 8   qb_in_tackle_box          1077203 non-null  float64
 9   n_blockers                1077203 non-null  int64  
 10  x_C                       1077203 non-null  float64
 11  x_LT                      1077203 non-null  float64
 12  x_LG                      1077203 non-null  float64
 13  x_RG                      1

In [28]:
def oversampler(df: pd.DataFrame, group_by: list, feats: list, outcome: str) -> pd.DataFrame:
    """Balance a binary outcome by duplicating randomly chosen minority-class groups.

    Each group (one combination of the ``group_by`` columns) is labelled with the
    first value of ``outcome`` in that group. Minority-class groups are drawn with
    replacement (via ``random.choice``) until both classes have the same number of
    groups. Every duplicated group has all of its ``group_by`` columns overwritten
    with a fresh negative integer so it forms a distinct group.

    Parameters
    ----------
    df : frame holding the ``group_by`` columns and the ``outcome`` column.
    group_by : column names that together identify one group.
    feats : unused; kept so the signature stays compatible with callers.
    outcome : name of the label column.

    Returns
    -------
    A new frame: the original rows followed by the duplicated groups. If fewer
    than two classes are present there is nothing to balance and a copy of the
    input rows is returned.
    """
    grouped_df = df.groupby(group_by)
    group_labels = grouped_df.first()[outcome]

    # value_counts() sorts by frequency (descending), so position 0 is the
    # majority class and position 1 the minority class. Positional access keeps
    # this correct whichever label happens to be the majority.
    vals = group_labels.value_counts()
    if len(vals) < 2:
        return pd.concat([df], axis = 0)
    n_to_add = int(vals.iloc[0] - vals.iloc[1])
    minority_lab = vals.index[1]

    groups_to_sample = list(group_labels[group_labels == minority_lab].index)
    choices = [random.choice(groups_to_sample) for _ in range(n_to_add)]

    unique_key_counter = -1 # new "keys" will be negative so as not to duplicate groups which are all positive
    samples = []
    for key in choices:
        to_add = grouped_df.get_group(key).copy()
        for col in group_by:
            to_add[col] = unique_key_counter
        samples.append(to_add)
        unique_key_counter -= 1
    return pd.concat([df] + samples, axis = 0)

In [19]:
def make_datasets(df: pd.DataFrame, group_by: list, feats: list, outcome: str, sequences: bool, 
samples: int, test_size: float, oversample = False):
    """Turn the frame into train/test arrays for the row-level or sequence models.

    Modes
    -----
    * ``sequences=False, samples=0``: every row is one example.
    * ``sequences=False, samples>0``: for each group, ``samples`` random prefixes
      of the group's rows are drawn and their rows are stacked as examples.
    * ``sequences=True, samples=0``: each group is one sequence.
    * ``sequences=True, samples>0``: each group contributes ``samples`` random
      prefixes, each one a sequence.

    Sequences are pre-padded with -99 to a common length. Every example is
    labelled with the first ``outcome`` value of its group.

    Parameters
    ----------
    df : source frame.
    group_by : columns identifying one group (one sequence).
    feats : feature column names.
    outcome : label column name.
    sequences : build padded 3-D sequence arrays instead of 2-D row arrays.
    samples : number of random prefixes per group (0 disables sampling).
    test_size : forwarded to ``train_test_split``.
    oversample : balance the classes with ``oversampler`` first.

    Returns
    -------
    list ``[X_train, X_test, y_train, y_test, outcome]``.
    """
    if oversample:
        df = oversampler(df, group_by, feats, outcome)

    if sequences or samples > 0:
        X, y = [], []
        for _, group_df in df.groupby(group_by):
            f = group_df[feats].to_numpy()
            label = group_df[outcome].to_numpy()[0]

            if samples > 0:
                # Prefix lengths start at 1: a length of 0 would create an empty
                # (fully padded, fully masked) sequence that still carries a label.
                # The upper bound stays exclusive of the full length, except for
                # single-row groups where the only non-empty prefix is the row itself.
                sub_seq_lens = np.random.randint(1, max(len(f), 2), samples)
                X += [f[:n] for n in sub_seq_lens]
            else: # sequences but no sampling
                X.append(f)

            if sequences:
                # one label of shape (1,) per sequence emitted for this group
                y += [np.array([label]) for _ in range(max(samples, 1))]
            else: # no sequences but multiple samples
                # one label per stacked row
                y.append(np.repeat(label, sub_seq_lens.sum()))

        if sequences:
            X = tf.keras.utils.pad_sequences(X, dtype="float", padding="pre", value = -99)
            y = np.stack(y)
        else: # no sequences but multiple samples
            X = np.concatenate(X)
            y = np.concatenate(y).ravel()
    else:
        X = df[feats].to_numpy()
        y = df[outcome].to_numpy()

    # NOTE(review): the split is over individual examples, so prefixes of the same
    # group (and oversampled duplicates) can land in both train and test; consider
    # a group-aware split if test metrics are meant to reflect unseen plays.
    return train_test_split(X, y, test_size = test_size, random_state = seed) + [outcome]

In [20]:
# Registry of evaluation results: model_metrics[outcome_name][model_name] -> dict of metrics.
model_metrics = {}
def add_metrics(model, outcome_name: str, model_name: str, X_test: np.ndarray, y_test: np.ndarray):
    """Evaluate ``model`` on the test split and store the result in ``model_metrics``.

    Models exposing ``decision_function`` (scikit-learn classifiers such as
    ``LogisticRegression``) are scored with ROC AUC and get ROC / precision-recall
    display objects. Any other model is expected to provide a Keras-style
    ``evaluate(X, y)`` that returns ``[loss, auc, accuracy]``, matching the metric
    order used when the models in this notebook are compiled.

    Parameters
    ----------
    model : fitted estimator.
    outcome_name : first-level key in ``model_metrics``.
    model_name : second-level key in ``model_metrics``.
    X_test, y_test : held-out features and labels.
    """
    # setdefault keeps metrics already stored for this outcome; assigning a fresh
    # dict here would drop every previously registered model.
    outcome_metrics = model_metrics.setdefault(outcome_name, {})

    if hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_score, pos_label = 1)
        roc = RocCurveDisplay(fpr = fpr, tpr = tpr)
        prec, recall, _ = precision_recall_curve(y_test, y_score, pos_label= 1)
        pr = PrecisionRecallDisplay(precision=prec, recall=recall)
        auc = roc_auc_score(y_test, y_score)
        outcome_metrics[model_name] = {"auc": auc, "roc_curve": roc, "pr_curve": pr}
    else:
        evaluation = model.evaluate(X_test, y_test)
        # index 0 is the loss; 1 and 2 follow the compile-time metrics order
        auc = evaluation[1]
        acc = evaluation[2]
        outcome_metrics[model_name] = {"auc": auc, "acc": acc}

    print(f"Added metric for model {model_name} with outcome {outcome_name} to the model_metrics dictionary.")
    for k, v in outcome_metrics[model_name].items():
        print(f"{k}: {v}")

In [21]:
# Label column and the keys that identify one sequence.
outcome = "pressure"
group_by = ["game_id", "play_id", "nfl_id"]
# Every column that is neither a grouping key nor the label is used as a feature.
feats = [col for col in df.columns if col not in (*group_by, outcome)]

## RePP: **Re**current **P**ressure **P**robabilities

### "Naive" Logistic Model

In [22]:
def model_logistic(data: list, max_iter: int, file: str):
    """Fit the logistic-regression baseline, record its metrics and pickle it.

    Parameters
    ----------
    data : ``[X_train, X_test, y_train, y_test, outcome]``; indices 0-4 are read.
    max_iter : iteration cap passed to ``LogisticRegression``.
    file : path the fitted model is pickled to.

    Returns
    -------
    The fitted ``LogisticRegression``.
    """
    mod = LogisticRegression(max_iter = max_iter)
    mod.fit(data[0], data[2])
    add_metrics(mod, data[4], "logistic", data[1], data[3])
    # Context manager guarantees the file is flushed and closed, even on error.
    with open(file, "wb") as fh:
        pickle.dump(mod, fh)
    return mod

In [23]:
# Row-level (non-sequence) dataset for the logistic baseline: 3 random
# prefixes per group, 20% held out, classes balanced by oversampling.
data = make_datasets(
    df,
    group_by,
    feats,
    outcome,
    sequences = False,
    samples = 3,
    test_size = 0.2,
    oversample = True,
)
# mod1 = model_logistic(data, 1000, "mod1.sav")

KeyboardInterrupt: 

### LSTM

In [None]:
def model_lstm(data: list, units: int, num_epochs: int, val: float, folder: str):
    """Train a masked single-layer LSTM classifier, save it and record its metrics.

    Parameters
    ----------
    data : ``[X_train, X_test, y_train, y_test, outcome]``; indices 0-4 are read.
    units : number of LSTM units.
    num_epochs : training epochs.
    val : fraction of the training data used as validation split.
    folder : location passed to ``mod.save``.

    Returns
    -------
    The trained Keras model.
    """
    tf.random.set_seed(seed)
    np.random.seed(seed)

    layers = tf.keras.layers
    step_shape = data[0].shape[1:]

    # Timesteps equal to -99. are masked out before reaching the LSTM.
    mod = tf.keras.Sequential([
        layers.Masking(mask_value = -99., input_shape = step_shape),
        layers.LSTM(units, input_shape = step_shape),
        layers.Dense(data[2].shape[1], activation = "sigmoid"),
    ])

    mod.compile(
        loss = "binary_crossentropy",
        optimizer = "adam",
        metrics = [tf.keras.metrics.AUC(), tf.keras.metrics.BinaryAccuracy()],
    )

    # Early stopping is currently disabled:
    # callbacks = [tf.keras.callbacks.EarlyStopping(patience = 5, restore_best_weights=True)]

    mod.fit(data[0], data[2], epochs = num_epochs, validation_split = val)
    mod.save(folder)
    add_metrics(mod, data[4], "lstm", data[1], data[3])
    return mod

In [None]:
# Sequence dataset (3 random prefixes per group, oversampled, 20% held out),
# then a 64-unit LSTM trained for 50 epochs with a 10% validation split.
data = make_datasets(
    df,
    group_by,
    feats,
    outcome,
    sequences = True,
    samples = 3,
    test_size = 0.2,
    oversample = True,
)
mod2 = model_lstm(data, units = 64, num_epochs = 50, val = 0.1, folder = "mod2")

### LSTM + Transformer

In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one pre-norm transformer encoder block to ``inputs``.

    The block is layer norm -> multi-head self-attention -> dropout -> residual,
    followed by layer norm -> two kernel-size-1 convolutions (the first with ReLU
    and ``ff_dim`` filters, the second projecting back to the input width) ->
    residual.

    Parameters
    ----------
    inputs : Keras tensor fed to the block.
    head_size : ``key_dim`` of the attention layer.
    num_heads : number of attention heads.
    ff_dim : filters of the first feed-forward convolution.
    dropout : dropout rate used in attention and after each sub-layer.
    """
    layers = tf.keras.layers

    # Self-attention sub-block with residual connection
    normed = layers.LayerNormalization(epsilon=1e-6)(inputs)
    attended = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(normed, normed)
    attended = layers.Dropout(dropout)(attended)
    res = attended + inputs

    # Position-wise feed-forward sub-block with residual connection
    hidden = layers.LayerNormalization(epsilon=1e-6)(res)
    hidden = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(hidden)
    return projected + res

In [None]:
def model_transformer(data: list, head_size: int, num_heads: int, ff_dim: int, num_transformer_blocks: int, mlp_units: list,
 mlp_dropout: float, dropout: float, lstm_units: int, num_epochs: int, val: float, folder: str):
    """Train the LSTM + transformer classifier, save it and record its metrics.

    Architecture: masking (-99.) -> LSTM returning the full sequence ->
    ``num_transformer_blocks`` encoder blocks -> global average pooling ->
    dropout -> dense/dropout MLP head -> sigmoid output.

    Parameters
    ----------
    data : ``[X_train, X_test, y_train, y_test, outcome]``; indices 0-4 are read.
    head_size, num_heads, ff_dim, dropout : forwarded to ``transformer_encoder``.
    num_transformer_blocks : number of stacked encoder blocks.
    mlp_units : width of each dense layer in the head.
    mlp_dropout : dropout rate after each dense layer in the head.
    lstm_units : number of LSTM units.
    num_epochs : training epochs.
    val : fraction of the training data used as validation split.
    folder : location passed to ``mod.save``.

    Returns
    -------
    The trained Keras model.
    """
    # Seed as model_lstm does, so weight initialisation and training are repeatable.
    tf.random.set_seed(seed)
    np.random.seed(seed)

    inputs = tf.keras.Input(shape=data[0].shape[1:])
    x = inputs
    x = tf.keras.layers.Masking(mask_value = -99.,input_shape = data[0].shape[1:])(x)
    x = tf.keras.layers.LSTM(lstm_units, input_shape = data[0].shape[1:], return_sequences = True)(x)

    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    x = tf.keras.layers.GlobalAveragePooling1D(data_format = "channels_last")(x)
    x = tf.keras.layers.Dropout(0.1)(x)  # fixed rate, independent of `dropout` / `mlp_dropout`

    for dim in mlp_units:
        x = tf.keras.layers.Dense(dim, activation="relu")(x)
        x = tf.keras.layers.Dropout(mlp_dropout)(x)

    outputs = tf.keras.layers.Dense(data[2].shape[1], activation = "sigmoid")(x)
    mod = tf.keras.Model(inputs, outputs)

    mod.compile(
        loss = "binary_crossentropy",
        optimizer = "adam",
        metrics = [tf.keras.metrics.AUC(), tf.keras.metrics.BinaryAccuracy()]
    )

    # Early stopping is currently disabled:
    # callbacks = [tf.keras.callbacks.EarlyStopping(min_delta=0.01, patience=3, restore_best_weights=True)]

    mod.fit(data[0], data[2], epochs= num_epochs, validation_split = val)

    mod.save(folder)
    # Register under its own name; the previous "lstm" key clobbered the plain
    # LSTM's metrics in model_metrics.
    add_metrics(mod, data[4], "lstm_transformer", data[1], data[3])
    return mod

In [None]:
# Reuses the sequence dataset built for the LSTM above.
mod3 = model_transformer(
    data,
    head_size = 128,
    num_heads = 4,
    ff_dim = 4,
    num_transformer_blocks = 1,
    mlp_units = [128],
    mlp_dropout = 0.2,
    dropout = 0.25,
    lstm_units = 64,
    num_epochs = 50,
    val = 0.1,
    folder = "mod3",
)