<a href="https://colab.research.google.com/github/jskaza/nfl-big-data-bowl-2023/blob/master/sack_probability_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using ReSaP and ReHaP to Predict Pass Rusher Impact as Plays Develop
*ReSaP: **Re**current **Sa**ck **P**robabilities*

*ReHaP: **Re**current **Ha**voc **P**robabilities*

### Jon Skaza & Matt Guthrie

In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
drive.mount("/content/drive")
seed = 314 # for reproducibility, used in various places

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Dataset Preprocessing

In [18]:
df = pd.read_csv("/content/drive/MyDrive/nfl-big-data-bowl-2023/data/dataset.csv", index_col=0)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,game_id,play_id,nfl_id,speed,pff_sack,x,y,dist_from_qb,qb_in_tackle_box,x_blocker_1,...,receiver_sep_1,receiver_sep_2,receiver_sep_3,receiver_sep_4,receiver_sep_5,quarter,down,yards_to_go,absolute_yardline_number,score_delta
1,2021090900,97,41263,0.96,0,1.74,-5.03,7.802083,True,0.54,...,3.030017,2.706917,6.139422,1.377679,4.278247,1,3,2,43.0,0
2,2021090900,97,41263,1.08,0,1.63,-5.01,7.766557,True,0.47,...,2.961689,2.659568,6.040149,1.369708,4.222345,1,3,2,43.0,0
3,2021090900,97,41263,1.3,0,1.47,-4.99,7.695193,True,0.39,...,2.859266,2.607221,5.928642,1.388416,3.898166,1,3,2,43.0,0
4,2021090900,97,41263,1.48,0,1.31,-4.94,7.603138,True,0.3,...,2.719577,2.452305,5.756813,1.42443,3.516049,1,3,2,43.0,0
5,2021090900,97,41263,2.16,0,1.04,-4.83,7.404627,True,0.18,...,2.612279,2.297325,5.472961,1.480034,3.040066,1,3,2,43.0,0


In [19]:
# examine missingness, models will need balanced sequences
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1077605 entries, 1 to 1077605
Data columns (total 42 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   game_id                   1077605 non-null  int64  
 1   play_id                   1077605 non-null  int64  
 2   nfl_id                    1077605 non-null  int64  
 3   speed                     1077605 non-null  float64
 4   pff_sack                  1077605 non-null  int64  
 5   x                         1077605 non-null  float64
 6   y                         1077605 non-null  float64
 7   dist_from_qb              1077605 non-null  float64
 8   qb_in_tackle_box          1077333 non-null  object 
 9   x_blocker_1               1077605 non-null  float64
 10  x_blocker_2               1077605 non-null  float64
 11  x_blocker_3               1077605 non-null  float64
 12  x_blocker_4               1077605 non-null  float64
 13  x_blocker_5               1

In [20]:
def make_features(df: pd.DataFrame, group_by: list, feats: list, outcomes: list, naive = False):
  X, y  = [], []
  grouped_df = df.groupby(group_by)
  for group_name, group_df in grouped_df:
    if naive:
      play_start = group_df[np.abs(group_df["x_qb"] - group_df["x_ball"]) < 1].head(1)
      if len(play_start) == 1:
        X.append(group_df[np.abs(group_df["x_qb"] - group_df["x_ball"]) < 1].head(1)[feats].to_numpy())
        y.append(group_df[outcomes].values[0])
    else:
      X.append(group_df[feats].fillna(-99.).to_numpy())
      y.append(group_df[outcomes].to_numpy()[0])

  if naive:
    X = np.concatenate(X)
    indices = pd.isnull(X).any(axis=0)
    X = np.delete(X, indices, axis=1)
    y = np.array(y).flatten()
  else:
    X = tf.keras.utils.pad_sequences(X, dtype="float", padding="pre", value = -99)
    y = tf.keras.utils.pad_sequences(y, dtype="float", padding="pre", value= -99)

  return X, y

In [21]:
group_by = ["game_id", "play_id", "nfl_id"]
outcomes = ["pff_sack"]       
feats = [x for x in list(df.columns) if x not in group_by + outcomes]

X_naive, y_naive = make_features(df, group_by, feats, outcomes, naive = True)
X_net, y_net = make_features(df, group_by, feats, outcomes)

# shuffle?
X_train_naive, X_test_naive, y_train_naive, y_test_naive = train_test_split(X_naive, y_naive, test_size = 0.2, random_state = seed)
X_train_net, X_test_net, y_train_net, y_test_net = train_test_split(X_net, y_net, test_size = 0.2, random_state = seed)

weight_for_0 = (1 / sum(y_train_net==[0])) * (len(y_train_net) / 2.0)
weight_for_1 = (1 / sum(y_train_net==[1])) * (len(y_train_net) / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

In [23]:
del df, X_naive, y_naive, X_net, y_net # free up some mem

# "Naive" Logistic Model

In [24]:
log_reg = LogisticRegression(class_weight = class_weight, max_iter = 1000)

log_reg.fit(X_train_naive, y_train_naive)

preds = log_reg.predict_proba(X_test_naive)[:,1]

fpr, tpr, thresholds = metrics.roc_curve(y_test_naive, preds, pos_label=1)
metrics.auc(fpr, tpr)

0.6194667260843731

# LSTM Approach

In [25]:
tf.random.set_seed(seed)
np.random.seed(seed)
num_epochs = 50

# create the model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Masking(mask_value= -99.,
                                  input_shape= X_train_net.shape[1:]))
model.add(tf.keras.layers.LSTM(128, input_shape = X_train_net.shape[1:]))
model.add(tf.keras.layers.Dense(y_train_net.shape[1], activation="sigmoid"))

# compile and fit the model
model.compile(loss= "binary_crossentropy", optimizer="adam", metrics = [tf.keras.metrics.AUC()])

callbacks = [tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)]

model.fit(X_train_net, y_train_net, epochs = num_epochs, validation_split = 0.2, class_weight = class_weight, callbacks = callbacks)

model.evaluate(X_test_net, y_test_net)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


[0.07005374133586884, 0.9398061633110046]

# Transformer Approach

In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs)
    x = tf.keras.layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(x, x)
    x = tf.keras.layers.Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(res)
    x = tf.keras.layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    x = tf.keras.layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    return x + res

In [None]:
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
):
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs
    x = tf.keras.layers.Masking(mask_value=-99.,input_shape= X.shape[1:])(x)
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    # for dim in mlp_units:
    #     x = tf.keras.layers.Dense(dim, activation="relu")(x)
    #     x = tf.keras.layers.Dropout(mlp_dropout)(x)
    outputs = tf.keras.layers.Dense(y.shape[1], activation="sigmoid")(x)
    return tf.keras.Model(inputs, outputs)

In [None]:
input_shape = X_train.shape[1:]

model = build_model(
    input_shape,
    head_size=256,
    num_heads=4,
    ff_dim=4,
    num_transformer_blocks=1,
    mlp_units=[64],
    mlp_dropout=0.4,
    dropout=0.25,
)

model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics = [tf.keras.metrics.AUC()]
)
#model.summary()

callbacks = [tf.keras.callbacks.EarlyStopping(patience=10, \
    restore_best_weights=True)]

model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=5,
    #batch_size=64,
    callbacks=callbacks,
)

model.evaluate(X_test, y_test, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.2248472422361374, 0.6851325035095215]