# Using ReSaP and ReHaP to Predict Pass Rusher Impact as Plays Develop
*ReSaP: **Re**current **Sa**ck **P**robabilities*

*ReHaP: **Re**current **Ha**voc **P**robabilities*

### Jon Skaza & Matt Guthrie

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Mount Google Drive so the dataset under /content/drive can be read.
drive.mount(
    "/content/drive",
    force_remount=True,
)
# Single seed reused by the train/test splits and the TensorFlow/NumPy seeders.
seed = 314

Mounted at /content/drive


# Dataset Preprocessing

In [15]:
# Load the frame-level dataset from Drive, using the first CSV column as the index.
df = pd.read_csv(
    "/content/drive/MyDrive/nfl-big-data-bowl-2023/data/dataset_with_frame.csv",
    index_col=0,
)
df.head()

Unnamed: 0,game_id,play_id,nfl_id,frame_id,speed,pff_sack,havoc,x,y,dist_from_qb,...,receiver_sep_1,receiver_sep_2,receiver_sep_3,receiver_sep_4,receiver_sep_5,quarter,down,yards_to_go,absolute_yardline_number,score_delta
1,2021090900,97,41263,6,0.96,0,1,1.74,-5.03,7.802083,...,3.030017,2.706917,6.139422,1.377679,4.278247,1,3,2,43.0,0
2,2021090900,97,41263,7,1.08,0,1,1.63,-5.01,7.766557,...,2.961689,2.659568,6.040149,1.369708,4.222345,1,3,2,43.0,0
3,2021090900,97,41263,8,1.3,0,1,1.47,-4.99,7.695193,...,2.859266,2.607221,5.928642,1.388416,3.898166,1,3,2,43.0,0
4,2021090900,97,41263,9,1.48,0,1,1.31,-4.94,7.603138,...,2.719577,2.452305,5.756813,1.42443,3.516049,1,3,2,43.0,0
5,2021090900,97,41263,10,2.16,0,1,1.04,-4.83,7.404627,...,2.612279,2.297325,5.472961,1.480034,3.040066,1,3,2,43.0,0


In [3]:
# Examine missingness through the per-column non-null counts and dtypes
# (original note: models will need balanced sequences).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1077605 entries, 1 to 1077605
Data columns (total 44 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   game_id                   1077605 non-null  int64  
 1   play_id                   1077605 non-null  int64  
 2   nfl_id                    1077605 non-null  int64  
 3   frame_id                  1077605 non-null  int64  
 4   speed                     1077605 non-null  float64
 5   pff_sack                  1077605 non-null  int64  
 6   havoc                     1077605 non-null  int64  
 7   x                         1077605 non-null  float64
 8   y                         1077605 non-null  float64
 9   dist_from_qb              1077605 non-null  float64
 10  qb_in_tackle_box          1077333 non-null  float64
 11  x_blocker_1               1077605 non-null  float64
 12  x_blocker_2               1077605 non-null  float64
 13  x_blocker_3               1

In [17]:
def make_features(df: pd.DataFrame, group_by: list, feats: list, outcomes: list, naive = False, pad_value: float = -99.):
  """Build model inputs and targets from frame-level data.

  Rows of ``df`` are grouped by ``group_by``; rows keep their existing order
  inside each group (no re-sorting is done here).

  Args:
    df: frame-level table containing the ``group_by``, ``feats`` and
      ``outcomes`` columns.
    group_by: columns that together identify one sequence.
    feats: feature columns.
    outcomes: target columns.
    naive: if True, return a flat per-row design matrix (for a non-sequential
      model); otherwise return one padded sequence per group.
    pad_value: sentinel used both to fill missing feature values and to
      pre-pad short sequences in sequence mode. The default (-99) matches the
      ``mask_value`` used by the Masking layers in this notebook.
      NOTE(review): a frame whose features are only partly missing is filled
      with ``pad_value`` but will presumably not be masked, since not every
      feature equals the sentinel - confirm this is intended.

  Returns:
    ``(X, y)``.
    naive=True: ``X`` has one row per input row and only the feature columns
    without any missing value; ``y`` has shape ``(n_rows, len(outcomes))``.
    naive=False: ``X`` has shape ``(n_groups, max_len, len(feats))`` and is
    pre-padded with ``pad_value``; ``y`` has shape
    ``(n_groups, len(outcomes))`` and holds the outcome of each group's first
    row as floats.

  Raises:
    ValueError: if ``df`` yields no groups.
  """
  X, y = [], []
  for _, group_df in df.groupby(group_by):
    if naive:
      X.append(group_df[feats].to_numpy())
      y.append(group_df[outcomes].to_numpy())
    else:
      X.append(group_df[feats].fillna(pad_value).to_numpy())
      # the outcome is constant within a group, so the first row is enough
      y.append(group_df[outcomes].to_numpy()[0])

  if not X:
    raise ValueError("make_features: `df` produced no groups")

  if naive:
    X = np.concatenate(X)
    # Drop every feature column containing a missing value. Boolean indexing
    # is used instead of np.delete, which on older NumPy casts a boolean mask
    # to the integer indices 0/1 and removes the wrong columns.
    has_missing = pd.isnull(X).any(axis=0)
    X = X[:, ~has_missing]
    y = np.concatenate(y)
  else:
    X = tf.keras.utils.pad_sequences(X, dtype="float", padding="pre", value=pad_value)
    # targets are fixed-length already, so no padding is required
    y = np.asarray(y, dtype="float")

  return X, y

In [18]:
# One sequence = one nfl_id on one play of one game.
group_by = ["game_id", "play_id", "nfl_id"]
outcomes = ["pff_sack"]
# Features are all remaining columns; identifiers, the outcome and "havoc" are excluded.
feats = [x for x in list(df.columns) if x not in group_by + outcomes + ["havoc"]]

X_naive, y_naive = make_features(df, group_by, feats, outcomes, naive = True)
X_net, y_net = make_features(df, group_by, feats, outcomes)

# train_test_split shuffles by default, so no extra shuffling is needed.
# NOTE(review): the naive split is row-level, so frames of the same play can end
# up in both train and test; consider a group-aware split for the baseline.
X_train_naive, X_test_naive, y_train_naive, y_test_naive = train_test_split(X_naive, y_naive, test_size = 0.2, random_state = seed)
X_train_net, X_test_net, y_train_net, y_test_net = train_test_split(X_net, y_net, test_size = 0.2, random_state = seed)

# Inverse-frequency class weights from the sequence-level training labels.
# np.sum + int() give plain scalars; the built-in sum() over an (n, 1) array
# returned shape-(1,) arrays, so the weights were arrays rather than floats.
n_train = len(y_train_net)
n_negative = int(np.sum(y_train_net == 0))
n_positive = int(np.sum(y_train_net == 1))
weight_for_0 = (1 / n_negative) * (n_train / 2.0)
weight_for_1 = (1 / n_positive) * (n_train / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

In [10]:
# Release the full-size intermediates to free memory; the train/test splits are kept.
del df
del X_naive, y_naive
del X_net, y_net

In [19]:
# Shapes of every split: the four frame-level arrays first, then the four sequence arrays.
print(
    *(
        split.shape
        for split in (
            X_train_naive, X_test_naive, y_train_naive, y_test_naive,
            X_train_net, X_test_net, y_train_net, y_test_net,
        )
    ),
    sep="\n",
)
# Show the first training sequence as a table (leading rows are the -99 padding).
pd.DataFrame(X_train_net[0])

(862084, 25)
(215521, 25)
(862084, 1)
(215521, 1)
(27485, 193, 39)
(6872, 193, 39)
(27485, 1)
(6872, 1)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,-99.0,-99.00,-99.00,-99.00,-99.000000,-99.0,-99.00,-99.00,-99.00,-99.00,...,-99.000000,-99.000000,-99.000000,-99.000000,-99.000000,-99.0,-99.0,-99.0,-99.0,-99.0
1,-99.0,-99.00,-99.00,-99.00,-99.000000,-99.0,-99.00,-99.00,-99.00,-99.00,...,-99.000000,-99.000000,-99.000000,-99.000000,-99.000000,-99.0,-99.0,-99.0,-99.0,-99.0
2,-99.0,-99.00,-99.00,-99.00,-99.000000,-99.0,-99.00,-99.00,-99.00,-99.00,...,-99.000000,-99.000000,-99.000000,-99.000000,-99.000000,-99.0,-99.0,-99.0,-99.0,-99.0
3,-99.0,-99.00,-99.00,-99.00,-99.000000,-99.0,-99.00,-99.00,-99.00,-99.00,...,-99.000000,-99.000000,-99.000000,-99.000000,-99.000000,-99.0,-99.0,-99.0,-99.0,-99.0
4,-99.0,-99.00,-99.00,-99.00,-99.000000,-99.0,-99.00,-99.00,-99.00,-99.00,...,-99.000000,-99.000000,-99.000000,-99.000000,-99.000000,-99.0,-99.0,-99.0,-99.0,-99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,30.0,3.44,-4.93,5.08,4.902051,1.0,-3.15,-4.46,-6.03,-3.31,...,6.068748,6.849241,6.263194,4.185236,3.164427,2.0,2.0,4.0,59.0,4.0
189,31.0,2.93,-5.24,5.02,4.710170,1.0,-3.33,-4.76,-6.25,-3.46,...,5.956887,6.672219,6.859636,4.678002,2.910756,2.0,2.0,4.0,59.0,4.0
190,32.0,2.41,-5.49,4.95,4.550363,1.0,-3.50,-5.02,-6.43,-3.59,...,5.858498,6.468423,7.471312,5.227791,2.689721,2.0,2.0,4.0,59.0,4.0
191,33.0,1.81,-5.68,4.86,4.401375,1.0,-3.62,-5.27,-6.56,-3.70,...,5.760139,6.258466,8.104048,5.650779,2.546998,2.0,2.0,4.0,59.0,4.0


# "Naive" Logistic Model

In [20]:
# Frame-level baseline: every row is treated as an independent observation.
# NOTE(review): class_weight was computed from the sequence-level labels
# (y_train_net), not from y_train_naive - confirm that is intended here.
log_reg = LogisticRegression(class_weight = class_weight, max_iter = 1000)

# scikit-learn expects 1-D targets; passing the (n, 1) column triggers a
# DataConversionWarning, so flatten it explicitly.
log_reg.fit(X_train_naive, np.ravel(y_train_naive))

# probability of the positive class (sack)
preds = log_reg.predict_proba(X_test_naive)[:,1]

fpr, tpr, thresholds = metrics.roc_curve(np.ravel(y_test_naive), preds, pos_label=1)
metrics.auc(fpr, tpr)

  y = column_or_1d(y, warn=True)


0.7566924388713208

# LSTM Approach

In [21]:
# Upper bound on training epochs for the neural models below.
NUM_EPOCHS = 10
# Seed NumPy and TensorFlow with the shared notebook seed.
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# create the model: timesteps whose features all equal -99 are masked out,
# an LSTM summarises the sequence and a sigmoid head outputs the probability
model = tf.keras.Sequential([
    tf.keras.layers.Masking(mask_value=-99., input_shape=X_train_net.shape[1:]),
    # the input shape is already fixed by the Masking layer above
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(y_train_net.shape[1], activation="sigmoid"),
])

# compile and fit the model
model.compile(loss= "binary_crossentropy", optimizer="adam", metrics = [tf.keras.metrics.AUC()])

# patience must be smaller than NUM_EPOCHS, otherwise early stopping can never
# trigger (the previous patience=10 equalled the 10-epoch budget); 3 matches
# the value used for the transformer model.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)]

model.fit(X_train_net, y_train_net, epochs = NUM_EPOCHS, validation_split = 0.2, class_weight = class_weight, callbacks = callbacks)

# returns [loss, AUC] on the held-out sequences
model.evaluate(X_test_net, y_test_net)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

# Transformer Approach

In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one pre-norm transformer encoder block to ``inputs``.

    The block is layer norm -> multi-head self-attention -> dropout with a
    residual connection, followed by layer norm -> two kernel-size-1 Conv1D
    layers (a position-wise feed-forward network) with a second residual
    connection.

    Args:
        inputs: Keras tensor; the last axis is the channel dimension.
        head_size: ``key_dim`` of the attention heads.
        num_heads: number of attention heads.
        ff_dim: number of filters of the hidden feed-forward layer.
        dropout: dropout rate used inside attention and after each sub-layer.

    Returns:
        Keras tensor with the same last-axis size as ``inputs``.
    """
    # `layers` was never imported in this notebook; resolve it from tf.keras
    layers = tf.keras.layers

    # Normalization and Attention
    x = layers.LayerNormalization(epsilon=1e-6)(inputs)
    x = layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(x, x)
    x = layers.Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = layers.LayerNormalization(epsilon=1e-6)(res)
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    # project back to the input channel count so the residual sum is valid
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    return x + res

In [None]:
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    lstm_units,
    dropout=0,
    mlp_dropout=0,
    num_outputs=1,
):
    """Build the masked LSTM + transformer-encoder classifier.

    Architecture: Masking(-99) -> LSTM (returning the full sequence) ->
    ``num_transformer_blocks`` encoder blocks -> global average pooling ->
    dropout -> MLP -> sigmoid output.

    Args:
        input_shape: shape of one sample, without the batch axis.
        head_size: ``key_dim`` of the attention heads.
        num_heads: number of attention heads per encoder block.
        ff_dim: hidden filters of each encoder block's feed-forward part.
        num_transformer_blocks: number of stacked encoder blocks.
        mlp_units: iterable with the width of each dense layer of the head.
        lstm_units: number of LSTM units.
        dropout: dropout rate inside the encoder blocks.
        mlp_dropout: dropout rate after each dense layer of the head.
        num_outputs: number of sigmoid outputs. Replaces the previous lookup of
            an undefined global ``y``; the default of 1 matches the
            single-column targets used in this notebook.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    # `keras` / `layers` were never imported in this notebook; use tf.keras
    layers = tf.keras.layers

    inputs = tf.keras.Input(shape=input_shape)
    # timesteps whose features all equal -99 are masked
    x = layers.Masking(mask_value=-99.)(inputs)
    x = layers.LSTM(lstm_units, return_sequences=True)(x)

    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    x = layers.GlobalAveragePooling1D(data_format="channels_last")(x)
    x = layers.Dropout(0.1)(x)

    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)

    outputs = layers.Dense(num_outputs, activation="sigmoid")(x)
    return tf.keras.Model(inputs, outputs)

In [None]:
# The notebook only defines the *_net sequence splits; the bare X_train /
# y_train / X_test / y_test names do not exist on a fresh kernel.
input_shape = X_train_net.shape[1:]

model = build_model(
    input_shape,
    head_size=128,
    num_heads=4,
    ff_dim=4,
    num_transformer_blocks=1,
    mlp_units=[128],
    mlp_dropout=0.2,
    dropout=0.25,
    lstm_units=32
)

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    #optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics = [tf.keras.metrics.AUC()]
)
#model.summary()

# `keras` was never imported on its own; use tf.keras
callbacks = [tf.keras.callbacks.EarlyStopping(min_delta=0.01, patience=3, restore_best_weights=True)]

# NOTE(review): unlike the LSTM model, no class_weight is passed here - confirm
# whether that difference is intentional.
model.fit(
    X_train_net,
    y_train_net,
    validation_split=0.15,
    epochs=NUM_EPOCHS,
    #batch_size=64,
    callbacks=callbacks,
)

# returns [loss, AUC] on the held-out sequences
model.evaluate(X_test_net, y_test_net, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.2248472422361374, 0.6851325035095215]