<a href="https://colab.research.google.com/github/jskaza/nfl-big-data-bowl-2023/blob/master/model_training_google.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import math
from google.colab import drive
# Mount Google Drive (Colab only) so files under /content/drive can be read.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Load the tracking dataset from the mounted Drive. Later cells read the
# "nfl_id", "game_id", "play_id" and "coords" columns of this frame.
dataset_path = "/content/drive/MyDrive/nfl-big-data-bowl-2023/data/dataset.json"
df = pd.read_json(dataset_path)

In [None]:
def split_sequence(sequence, n_steps):
    """Cut one coordinate track into sliding windows and next-step targets.

    Args:
        sequence: indexable sequence of points; element 0 of each point is
            used as its x coordinate and element 1 as its y coordinate.
        n_steps: number of consecutive points in each input window.

    Returns:
        A ``(windows, targets)`` tuple of lists. Each window is
        ``[xs, ys]``: the x values and the y values of ``n_steps``
        consecutive points. The matching target is the point that
        immediately follows the window, returned unchanged. Both lists are
        empty when the sequence has no point left after a full window.
    """
    windows, targets = [], []
    total = len(sequence)
    start = 0
    while start < total:
        stop = start + n_steps
        # A window is only usable if a following point exists to act as target.
        if stop >= total:
            break
        history = sequence[start:stop]
        xs = [point[0] for point in history]
        ys = [point[1] for point in history]
        windows.append([xs, ys])
        targets.append(sequence[stop])
        start += 1
    return windows, targets

In [None]:
def make_features(df, n_steps):
    """Build sliding-window samples for every (nfl_id, game_id, play_id) group.

    Args:
        df: DataFrame with "nfl_id", "game_id", "play_id" and "coords"
            columns; the "coords" values of each group are passed to
            ``split_sequence`` in the group's row order.
        n_steps: window length forwarded to ``split_sequence``.

    Returns:
        ``(X, y)`` as NumPy arrays: the windows and next-step targets of all
        groups concatenated in groupby iteration order. Windows are built
        within a group only, so none spans two groups.
    """
    windows, targets = [], []
    per_player_play = df.groupby(["nfl_id","game_id","play_id"])
    for _, track in per_player_play:
        track_windows, track_targets = split_sequence(track["coords"].tolist(), n_steps)
        windows.extend(track_windows)
        targets.extend(track_targets)
    return np.array(windows), np.array(targets)

In [None]:
# Dataset layout
#
# make_features returns one sample per sliding window. Each X sample holds the
# x and y coordinates of the previous N_STEPS frames as two rows (shape
# (2, N_STEPS)); the matching y entry is the coordinate pair of the frame that
# follows. The task is therefore to predict the current position from the
# previous N_STEPS positions. Windows are built per (nfl_id, game_id, play_id)
# group, but the train/test split below is purely positional, so windows from
# the same game or play may end up on both sides of the split.

N_STEPS = 5          # frames of history per sample
N_TRAIN = 1_000_000  # train on the first 1M windows
N_TEST = 100_000     # test on the last 100k windows

X, y = make_features(df, N_STEPS)

# The two slices below are taken from opposite ends of X. With fewer than
# N_TRAIN + N_TEST windows they would overlap and leak training samples into
# the test set, so fail loudly instead of reporting an optimistic error.
assert len(X) >= N_TRAIN + N_TEST, (
    f"only {len(X)} windows available; need {N_TRAIN + N_TEST} "
    "for disjoint train and test sets"
)

X_train = X[:N_TRAIN]
y_train = y[:N_TRAIN]

X_test = X[-N_TEST:]
y_test = y[-N_TEST:]

# Free the raw frame; only the arrays are needed from here on. Re-running this
# cell afterwards requires reloading df first.
del df

In [None]:
# Seed TensorFlow and NumPy so repeated runs start from the same state.
tf.random.set_seed(42)
np.random.seed(42)

# One LSTM layer followed by a linear head that emits the (x, y) pair.
# NOTE(review): with input_shape=(2, 5) Keras reads each sample as 2 timesteps
# (the x row, then the y row) of 5 features each — confirm this is intended
# rather than 5 timesteps of 2 features.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(32, input_shape=(2, 5)),
    tf.keras.layers.Dense(2),
])

# Train silently with Adam on mean squared error.
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, verbose=0, epochs=100)

<keras.callbacks.History at 0x7f6ab5d6d970>

In [None]:
# Mean Euclidean distance between the model's predictions and the true
# positions on the test set. Distances are accumulated one sample at a time.
predictions = model.predict(X_test)
err = 0
for predicted_xy, actual_xy in zip(predictions, y_test):
    err += math.dist(predicted_xy, actual_xy)
print(err/len(y_test)) # avg prediction error (yards)

0.02592587260091053


In [None]:
# Naive baseline for comparison: predict each frame's position as the most
# recent position in its input window, i.e. assume the player did not move.
err = 0
for window, actual_xy in zip(X_test, y_test):
    # window[0] / window[1] are the x and y histories; index -1 selects the
    # latest frame for any window length instead of assuming exactly 5 steps.
    err += math.dist([window[0][-1], window[1][-1]], actual_xy)
print(err/len(y_test)) # avg prediction error (yards)

0.303620646639027
