<a href="https://colab.research.google.com/github/jskaza/nfl-big-data-bowl-2023/blob/master/model_training_multiclass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [107]:
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import pandas as pd
import tensorflow as tf
import math
from google.colab import drive
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# Mount Google Drive at /content/drive (Colab only); the dataset read in the
# next cell lives under /content/drive/MyDrive.
drive.mount("/content/drive")
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [108]:
# Read the dataset JSON from the mounted Drive into a DataFrame.
dataset_path = "/content/drive/MyDrive/nfl-big-data-bowl-2023/data/dataset_week1.json"
df = pd.read_json(dataset_path)

# Show the first rows as the cell output.
df.head()

Unnamed: 0,nfl_id,ball_x,ball_y,opponent_x_1,opponent_x_2,opponent_x_3,opponent_x_4,opponent_x_5,opponent_x_6,opponent_x_7,...,x,pff_hurry,pff_hit,pff_sack,pff_no_impact,quarter,down,yards_to_go,absolute_yardline_number,score_delta
0,41263,0.0,0.0,-7.72,-2.61,-1.29,0.1,-1.65,1.73,0.34,...,-5.03,1,0,0,0,1,3,2,43,0
1,42403,0.0,0.0,5.42,11.17,12.85,3.0,1.73,0.1,-1.29,...,8.71,1,0,0,0,1,3,2,43,0
2,44955,0.0,0.0,1.73,0.1,3.0,-1.29,-2.61,5.42,0.34,...,1.26,0,0,0,1,1,3,2,43,0
3,53441,0.0,0.0,-1.29,0.1,-2.61,1.73,3.0,-7.72,-1.65,...,-1.99,1,0,0,0,1,3,2,43,0
4,53504,0.0,0.0,1.73,3.0,0.1,5.42,-1.29,-2.61,0.34,...,2.75,1,0,0,0,1,3,2,43,0


In [109]:
def make_features(df: pd.DataFrame, feats: list, outcomes: list, pad_value: float = -99):
    """Build padded per-(player, game, play) feature sequences and their labels.

    Rows of ``df`` are grouped by ``["nfl_id", "game_id", "play_id"]``. Every
    group becomes one sample: its ``feats`` rows (kept in the order they appear
    in ``df``) form the sequence, and the ``outcomes`` values of its first row
    form the label.

    Args:
        df: Frame holding the three grouping columns plus every column named in
            ``feats`` and ``outcomes``.
        feats: Names of the per-row feature columns.
        outcomes: Names of the label columns; only the first row of each group
            is read.
        pad_value: Filler placed in front of sequences shorter than the longest
            one. The model cells in this notebook mask on ``-99``, so keep the
            default unless the ``Masking`` layer is changed to match.

    Returns:
        A tuple ``(X, y)``:
        ``X`` is a float array of shape ``(n_groups, longest_group, len(feats))``
        with shorter sequences pre-padded by ``pad_value``;
        ``y`` is a float array of shape ``(n_groups, len(outcomes))``.

    Raises:
        ValueError: If ``df`` yields no groups (e.g. it is empty).
    """
    X, y = [], []
    for _, group_df in df.groupby(["nfl_id", "game_id", "play_id"]):
        X.append(group_df[feats].to_numpy())
        y.append(group_df[outcomes].to_numpy()[0])

    if not X:
        # pad_sequences would otherwise fail with an opaque NumPy reduction error.
        raise ValueError("df contains no (nfl_id, game_id, play_id) groups")

    X = tf.keras.utils.pad_sequences(X, dtype="float", padding="pre", value=pad_value)
    # Labels all have len(outcomes) entries, so they only need stacking, not padding.
    y = np.asarray(y, dtype="float")
    return X, y

In [110]:
# Multiclass setup: player/ball position and game-situation columns, plus every
# opponent_* / teammate_* column present in the frame.
base_feats = ["x","y","ball_x","ball_y", "score_delta", "absolute_yardline_number", "quarter", "down", "yards_to_go"]
relative_feats = [col for col in df.columns if "opponent_" in col or "teammate_" in col]
feats = base_feats + relative_feats

# Four outcome flags used as the label vector (use ["pff_sack"] for the binary variant).
outcomes = ["pff_hurry" ,"pff_hit", "pff_sack", "pff_no_impact"]

X, y = make_features(df, feats, outcomes)

# Positional 80/20 split: the first num_train samples train, the rest test.
# NOTE(review): no shuffling happens here, so the split follows whatever order
# make_features returns samples in -- confirm that holdout is intended.
split = 0.8
num_train = round(split * X.shape[0])

X_train, X_test = X[:num_train], X[num_train:]
y_train, y_test = y[:num_train], y[num_train:]

# `del df` would free memory here, but df is still needed by later cells.

In [111]:
# Seed NumPy and TensorFlow for reproducibility.
np.random.seed(42)
tf.random.set_seed(42)
num_epochs = 50

# Masking skips the -99 padding timesteps, the LSTM summarises each sequence,
# and the softmax layer emits one probability per outcome column.
model = tf.keras.Sequential([
    tf.keras.layers.Masking(mask_value=-99., input_shape=X.shape[1:]),
    tf.keras.layers.LSTM(32, input_shape=X.shape[1:]),
    tf.keras.layers.Dense(y.shape[1], activation="softmax"),
])

# Train silently, then report [loss, accuracy] on the held-out samples.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.fit(X_train, y_train, epochs=num_epochs, verbose=0)
model.evaluate(X_test, y_test)



[0.5023863315582275, 0.8698845505714417]

In [112]:
# Print predicted probabilities next to the true label vector for test samples 100-124.
sample = slice(100, 125)
sample_probs = model.predict(X_test[sample])
for predicted, actual in zip(sample_probs, y_test[sample]):
  print(predicted, actual)

[0.03396212 0.00225544 0.00510459 0.95867795] [0. 0. 0. 1.]
[1.0307083e-05 4.7134372e-04 1.6471419e-06 9.9951673e-01] [0. 0. 0. 1.]
[0.00723533 0.00273429 0.0045269  0.9855035 ] [0. 0. 0. 1.]
[0.1323409  0.00866639 0.08384424 0.7751485 ] [0. 0. 0. 1.]
[7.2882761e-04 1.2832174e-04 1.4001319e-05 9.9912882e-01] [0. 0. 0. 1.]
[6.0066279e-05 4.6702055e-05 4.7518603e-05 9.9984574e-01] [0. 0. 0. 1.]
[5.2289529e-06 2.6198813e-06 5.3819645e-07 9.9999160e-01] [0. 0. 0. 1.]
[0.00212622 0.06397607 0.00099435 0.93290335] [0. 1. 0. 0.]
[1.3189148e-03 4.4264612e-03 1.0472942e-04 9.9414986e-01] [0. 0. 0. 1.]
[0.00790045 0.05725579 0.00219561 0.9326482 ] [0. 0. 0. 1.]
[2.9907972e-03 4.9779592e-03 2.5778891e-05 9.9200553e-01] [0. 0. 0. 1.]
[0.04120376 0.00591319 0.00415257 0.94873047] [0. 0. 0. 1.]
[0.20256712 0.05397035 0.6372721  0.10619043] [0. 0. 0. 1.]
[9.1743737e-04 9.9804802e-03 4.3335538e-03 9.8476857e-01] [0. 0. 0. 1.]
[5.7796369e-05 6.8122626e-04 2.2912009e-04 9.9903190e-01] [0. 0. 0. 1.]
[0.0

In [113]:
# Binary setup: same feature columns as the multiclass run, but the only label
# is the sack flag.
feats = ["x","y","ball_x","ball_y", "score_delta", "absolute_yardline_number", "quarter", "down", "yards_to_go"] + [c for c in list(df.columns) if "opponent_" in c or "teammate_" in c]
# outcomes = ["pff_hurry" ,"pff_hit", "pff_sack", "pff_no_impact"]
outcomes = ["pff_sack"]

X, y = make_features(df, feats, outcomes)

# Positional 80/20 split; labels are flattened from (n, 1) to (n,).
split = 0.8
num_train = round(split * X.shape[0])

X_train = X[:num_train]
y_train = y[:num_train].flatten()

X_test = X[num_train:]
y_test = y[num_train:].flatten()

# Balanced class weights: each class contributes half of the total weight,
# i.e. weight_c = n_samples / (2 * n_c).
num_neg = int(np.count_nonzero(y_train == 0))
num_pos = int(np.count_nonzero(y_train == 1))
if num_neg == 0 or num_pos == 0:
    # A missing class would otherwise yield a divide-by-zero (infinite) weight.
    raise ValueError(
        f"training split needs both classes (got {num_neg} negatives, {num_pos} positives)"
    )

weight_for_0 = (1 / num_neg) * (len(y_train) / 2.0)
weight_for_1 = (1 / num_pos) * (len(y_train) / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

# del df # free up some mem

# Last expression of the cell, so the weights are actually displayed.
class_weight

In [114]:
# Seed NumPy and TensorFlow for reproducibility.
np.random.seed(42)
tf.random.set_seed(42)
num_epochs = 1000

# Same Masking -> LSTM stack as the multiclass model, with a sigmoid output
# sized from the label array.
model = tf.keras.Sequential([
    tf.keras.layers.Masking(mask_value=-99., input_shape=X.shape[1:]),
    tf.keras.layers.LSTM(32, input_shape=X.shape[1:]),
    tf.keras.layers.Dense(y.shape[1], activation="sigmoid"),
])

# Train silently with the class weights computed earlier, then report
# [loss, AUC] on the held-out samples.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[tf.keras.metrics.AUC()],
)
model.fit(
    X_train,
    y_train,
    epochs=num_epochs,
    class_weight=class_weight,
    verbose=0,
)
model.evaluate(X_test, y_test)



[0.0911240428686142, 0.7212216854095459]

In [115]:
# Print the predicted probability for every test sample whose true label is 1.
# y_test is flat here, so each label is a scalar and is compared as one
# (the previous `j == [1]` relied on the truthiness of a one-element array).
test_probs = model.predict(X_test)
for i, j in zip(test_probs, y_test):
  if j == 1:
    print(i, j)

[8.067905e-06] 1.0
[0.00168219] 1.0
[0.01676147] 1.0
[0.02591879] 1.0
[0.5811265] 1.0
[1.3351973e-05] 1.0
[0.01554512] 1.0
[2.2134785e-05] 1.0
[0.91253096] 1.0


In [116]:
# Disabled experiment, kept for reference: the same Masking -> LSTM(32) ->
# sigmoid model, compiled with AUC, precision, recall and confusion-matrix
# count metrics, and fit for 50 epochs with a TensorBoard callback that logs
# to a timestamped directory on Drive. Note it is fit without class_weight.
# # set random seed for reproducibility

# tf.random.set_seed(42)
# np.random.seed(42)
# NUM_EPOCHS = 50

# # create the model
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Masking(mask_value=-99.,
#                                   input_shape= X.shape[1:]))
# model.add(tf.keras.layers.LSTM(32, input_shape = X.shape[1:]))
# model.add(tf.keras.layers.Dense(y.shape[1], activation="sigmoid"))

# # compile and fit the model
# model.compile(loss="binary_crossentropy", optimizer="adam", metrics = [tf.keras.metrics.AUC(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall(),
#                                                                        tf.keras.metrics.TruePositives(), tf.keras.metrics.TrueNegatives(), tf.keras.metrics.FalsePositives(),
#                                                                        tf.keras.metrics.FalseNegatives()])

# logdir = os.path.join("/content/drive/MyDrive/nfl-big-data-bowl-2023/logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

# model.fit(X_train, y_train, epochs=NUM_EPOCHS, verbose=0, callbacks=[tensorboard_callback])
# model.evaluate(X_test, y_test)

In [117]:
# Disabled baseline, kept for reference: logistic regression on the last row
# of each (nfl_id, game_id, play_id) group using only x, y, ball_x and ball_y,
# split positionally 80/20 and scored by ROC AUC on the test part.
# X2, y2  = [], []
# grouped_df = df.groupby(["nfl_id","game_id","play_id"])
# for group_name, group_df in grouped_df:
#   X2.append(group_df[["x","y","ball_x","ball_y"]].to_numpy()[-1])
#   y2.append(group_df[outcomes].to_numpy()[-1])
# X2 = np.asmatrix(X2)
# y2 = np.array(y2).flatten()
# split = 0.8
# num_train = round(split * X2.shape[0])

# X2_train = np.stack(X2[:num_train])
# y2_train = np.stack(y2[:num_train])

# X2_test = np.stack(X2[num_train:])
# y2_test = np.stack(y2[num_train:])

# mod = LogisticRegression()
# mod.fit(X2_train, y2_train)
# preds = mod.predict_proba(X2_test)[:,1]
# fpr, tpr, thresholds = metrics.roc_curve(y2_test, preds)
# metrics.auc(fpr, tpr)

In [118]:
# Disabled: opens TensorBoard on the Drive logs directory that the
# commented-out training run writes to.
# %tensorboard --logdir /content/drive/MyDrive/nfl-big-data-bowl-2023/logs