<a href="https://colab.research.google.com/github/jskaza/nfl-big-data-bowl-2023/blob/master/sack_probability_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using ReSaP and ReHaP to Predict Pass Rusher Impact as Plays Develop
*ReSaP: **Re**current **Sa**ck **P**robabilities*

*ReHaP: **Re**current **Ha**voc **P**robabilities*

**Jon Skaza & Matt Guthrie**

In [324]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, precision_recall_curve, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
try:
  from google.colab import drive
  in_colab = True
except:
  in_colab = False
seed = 314 # for reproducibility, used in various places
np.random.seed(seed)

## Dataset Preprocessing

In [125]:
if in_colab:
  drive.mount("/content/drive")
  path = "/content/drive/MyDrive/nfl-big-data-bowl-2023"
else:
  path = os.environ.get("BIG_DATA_BOWL")

df = pd.read_csv(f"{path}/data/dataset.csv", index_col=0).head(20000)
df.head()

Unnamed: 0,game_id,play_id,nfl_id,speed,pff_sack,havoc,x,y,dist_from_qb,qb_in_tackle_box,...,speed_qb,x_qb,y_qb,x_ball,y_ball,quarter,down,yards_to_go,absolute_yardline_number,score_delta
1,2021090900,97,41263,0.96,0,1,1.74,-5.03,7.802083,1.0,...,0.35,-3.92,0.34,0.0,0.0,1,3,2,43.0,0
2,2021090900,97,41263,1.08,0,1,1.63,-5.01,7.766557,1.0,...,0.54,-4.0,0.34,-0.49,0.02,1,3,2,43.0,0
3,2021090900,97,41263,1.3,0,1,1.47,-4.99,7.695193,1.0,...,0.8,-4.09,0.33,-0.85,0.03,1,3,2,43.0,0
4,2021090900,97,41263,1.48,0,1,1.31,-4.94,7.603138,1.0,...,0.99,-4.18,0.32,-1.29,0.05,1,3,2,43.0,0
5,2021090900,97,41263,2.16,0,1,1.04,-4.83,7.404627,1.0,...,1.19,-4.29,0.31,-1.77,0.06,1,3,2,43.0,0


In [126]:
# examine missingness, models will need balanced sequences
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 1 to 20000
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   game_id                   20000 non-null  int64  
 1   play_id                   20000 non-null  int64  
 2   nfl_id                    20000 non-null  int64  
 3   speed                     20000 non-null  float64
 4   pff_sack                  20000 non-null  int64  
 5   havoc                     20000 non-null  int64  
 6   x                         20000 non-null  float64
 7   y                         20000 non-null  float64
 8   dist_from_qb              20000 non-null  float64
 9   qb_in_tackle_box          20000 non-null  float64
 10  n_blockers                20000 non-null  int64  
 11  x_C                       20000 non-null  float64
 12  x_LT                      20000 non-null  float64
 13  x_LG                      20000 non-null  float64
 14  x_RG  

In [325]:
def make_datasets(df: pd.DataFrame, group_by: list, feats: list, outcome: str, sequences: bool, samples: int, test_size: float, smote = False):
  if sequences or samples > 0:
    X, y  = [], []
    grouped_df = df.groupby(group_by)
    for _, group_df in grouped_df:
      f = group_df[feats].to_numpy()
      if samples > 0:
        sub_seq_lens = np.random.randint(0, len(f), samples)
        sub_seq = [f[:n] for n in sub_seq_lens]
        X += sub_seq
      else: # sequences but no sampling
        X.append(group_df[feats].to_numpy())
      if sequences:
        if samples == 0:
          y.append(group_df[[outcome]].to_numpy()[0])
        else:
          for s in range(samples):
            y.append(group_df[[outcome]].to_numpy()[0])
      else: # no sequences but multiple samples
        y.append(np.repeat(group_df[outcome].values[0], sum(sub_seq_lens)))
    if sequences:
      X = tf.keras.utils.pad_sequences(X, dtype="float", padding="pre", value = -99)
      y = np.stack(y)
    else: # no sequences but multiple samples
      X = np.concatenate(X)
      y = np.concatenate(y).ravel()
  else:
      X = df[feats].to_numpy()
      y = df[outcome].to_numpy()

  if smote:
    sm = SMOTE(random_state = seed)
    X, y = sm.fit_resample(X, y) 
  return train_test_split(X, y, test_size = test_size, random_state = seed) + [outcome]

In [326]:
model_metrics = {}
def add_metrics(model, outcome_name: str, model_name: str, X_test: np.ndarray, y_test: np.ndarray):
    global model_metrics
    model_metrics[outcome_name] = {}
    y_pred = model.predict(X_test)
    if type(model) == LogisticRegression:
        y_score = model.decision_function(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_score, pos_label = 1)
        roc = RocCurveDisplay(fpr = fpr, tpr = tpr)
        prec, recall, _ = precision_recall_curve(y_test, y_score, pos_label= 1)
        pr = PrecisionRecallDisplay(precision=prec, recall=recall)
        auc = roc_auc_score(y_test, y_score)
        model_metrics[outcome_name][model_name] = {"auc": auc, "roc_curve": roc, "pr_curve": pr}
    else:
        evaluation = model.evaluate(X_test, y_test)
        auc = evaluation[1]
        acc = evaluation[2]
        model_metrics[outcome_name][model_name] = {"auc": auc, "acc": acc}
    print(f"Added metric for model {model_name} with outcome {outcome_name} to the model_metrics dictionary.")
    print(model_metrics)

## ReSaP: **Re**current **Sa**ck **P**robabilities

### "Naive" Logistic Model

In [327]:
def model_logistic(data: list, max_iter: int):
    mod = LogisticRegression(max_iter = max_iter)
    mod.fit(data[0], data[2])
    add_metrics(mod, data[4], "logistic", data[1], data[3])

In [330]:
outcome = "havoc"
group_by = ["game_id", "play_id", "nfl_id"]
feats = [x for x in list(df.columns) if x not in group_by + ["havoc","pff_sack"]]

data = make_datasets(df, group_by, feats, outcome, sequences = False, samples = 0, test_size = 0.2, smote = True)

model_logistic(data, 1000)

(28444, 26)

In [None]:
outcome = "havoc"
group_by = ["game_id", "play_id", "nfl_id"]
feats = [x for x in list(df.columns) if x not in group_by + ["havoc","pff_sack"]]

data = make_datasets(df, group_by, feats, outcome, sequences = False, samples = 10, test_size = 0.2, smote = True)
model_logistic(data, 1000)

### LSTM

In [322]:
def model_lstm(data: list, channels: int, num_epochs: int, val: float):
    tf.random.set_seed(seed)
    np.random.seed(seed)
    
    mod = tf.keras.Sequential()
    mod.add(tf.keras.layers.Masking(mask_value= -99., input_shape= data[0].shape[1:]))
    mod.add(tf.keras.layers.LSTM(channels, input_shape = data[0].shape[1:]))
    mod.add(tf.keras.layers.Dense(data[2].shape[1], activation="sigmoid"))

    mod.compile(loss = "binary_crossentropy", optimizer="adam",
     metrics = [tf.keras.metrics.AUC(), tf.keras.metrics.BinaryAccuracy()])

    # callbacks = [tf.keras.callbacks.EarlyStopping(patience = 5, restore_best_weights=True)]

    mod.fit(data[0], data[2], epochs = num_epochs, validation_split = val)
    add_metrics(mod, data[4], "lstm", data[1], data[3])

In [323]:
outcome = "havoc"
group_by = ["game_id", "play_id", "nfl_id"]
feats = [x for x in list(df.columns) if x not in group_by + ["havoc","pff_sack"]]

data = make_datasets(df, group_by, feats, outcome, sequences = True, samples = 0, test_size = 0.2, smote = True)
model_lstm(data, 64, 50, 0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Added metric for model lstm with outcome havoc to the model_metrics dictionary.
{'havoc': {'lstm': {'auc': 0.7214783430099487, 'acc': 0.8996282815933228}}}


In [None]:
outcome = "havoc"
group_by = ["game_id", "play_id", "nfl_id"]
feats = [x for x in list(df.columns) if x not in group_by + ["havoc","pff_sack"]]

data = make_datasets(df, group_by, feats, outcome, sequences = True, samples = 10, test_size = 0.2, smote = True)
model_lstm(data, 64, 50, 0.1)

In [24]:
# roc curve
# accuracy: predict no sack
# logistic 
# lstm
# break test into chunks
model_metrics

{'pff_sack': {'lstm': {'auc': 0.14070352911949158, 'acc': 0.9950000047683716}},
 'havoc': {'lstm': {'auc': 0.44570034742355347, 'acc': 0.3050000071525574}}}

### Transformer

In [25]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs)
    x = tf.keras.layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, x)
    x = tf.keras.layers.Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(res)
    x = tf.keras.layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    x = tf.keras.layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    return x + res

In [26]:
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    lstm_units,
    dropout=0,
    mlp_dropout=0,
):
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs
    x = tf.keras.layers.Masking(mask_value=-99.,input_shape= input_shape)(x)
    x = tf.keras.layers.LSTM(lstm_units, input_shape = input_shape, return_sequences=True)(x)

    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_last")(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    
    for dim in mlp_units:
        x = tf.keras.layers.Dense(dim, activation="relu")(x)
        x = tf.keras.layers.Dropout(mlp_dropout)(x)
    
    outputs = tf.keras.layers.Dense(y.shape[1], activation="sigmoid")(x)
    return tf.keras.Model(inputs, outputs)

In [27]:
outcomes = ["pff_sack", "havoc"]
datasets = {}
for o in outcomes:
    group_by = ["game_id", "play_id", "nfl_id"]
    outcome = [o]
    feats = [x for x in list(df.columns) if x not in group_by + outcomes]

    X, y = make_features(df, group_by, feats, outcome)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

    weight_0 = (1 / sum(y_train == [0])) * (len(y_train) / 2.0)
    weight_1 = (1 / sum(y_train == [1])) * (len(y_train) / 2.0)
    class_weight = {0: weight_0, 1: weight_1}

    num_epochs = 1
    val = 0.2 

    datasets[o] = {"X_train": X_train, "X_test": X_test,
    "y_train": y_train, "y_test": y_test, "class_weight": class_weight,
    "num_epochs": num_epochs, "val": val}

In [None]:
for k, v in datasets.items():
    input_shape = v["X_train"].shape[1:]

    model = build_model(
        input_shape,
        head_size=128,
        num_heads=4,
        ff_dim=4,
        num_transformer_blocks=1,
        mlp_units=[128],
        mlp_dropout=0.2,
        dropout=0.25,
        lstm_units=32
    )

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        #optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        metrics = [tf.keras.metrics.AUC()]
    )
    #model.summary()

    callbacks = [tf.keras.callbacks.EarlyStopping(min_delta=0.01, patience=3, restore_best_weights=True)]

    model.fit(
        v["X_train"],
        v["y_train"],
        validation_split=v["val"],
        epochs=v["num_epochs"],
        #batch_size=64,
        callbacks=callbacks,
        class_weight = v["class_weight"]
    )

    model.evaluate(v["X_test"], v["y_test"], verbose=1)

: 

: 

: 

: 