In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, BatchNormalization
from keras.utils import to_categorical
from keras.metrics import AUC
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
import tensorflow as tf
import keras.backend as K

# Define column data types
column_dtypes = {
    'down': float, 'qtr': float, 'ydstogo': float, 'yardline_100': float, 'time': str, 
    'score_differential': float, 'no_score_prob': float, 'opp_fg_prob': float, 
    'opp_safety_prob': float, 'opp_td_prob': float, 'fg_prob': float, 
    'safety_prob': float, 'td_prob': float, 'play_type': str
}

# Load the dataset
data = pd.read_csv("NFL Play by Play 2009-2018 (v5).csv", dtype=column_dtypes, low_memory=False)

# Convert 'time' column to 'time_elapsed'
def convert_to_seconds(row):
    if isinstance(row['time'], str):
        time_parts = row['time'].split(':')
        if len(time_parts) == 2:
            minutes, seconds = map(int, time_parts)
            time_in_current_qtr = minutes * 60 + seconds
            elapsed_time_previous_qtrs = (row['qtr'] - 1) * 900
            return elapsed_time_previous_qtrs + time_in_current_qtr
    return np.nan

data['time_elapsed'] = data.apply(convert_to_seconds, axis=1)
data = data.dropna(subset=filtered_columns + ['time_elapsed'])

# Filter and Encode 'play_type'
data = data[data['play_type'].isin(['kickoff', 'extra_point', 'pass', 'run', 'punt', 'field_goal'])]
le = LabelEncoder()
data['PlayType_encoded'] = le.fit_transform(data['play_type'])

# Define features and target
filtered_columns = ['down', 'qtr', 'ydstogo', 'yardline_100', 'score_differential', 
                    'no_score_prob', 'opp_fg_prob', 'opp_safety_prob', 'opp_td_prob',
                    'fg_prob', 'safety_prob', 'td_prob']
features = filtered_columns + ['time_elapsed']
X = data[features]
y = data['PlayType_encoded']

# Scaling
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create sequences
SEQUENCE_LENGTH = 5  # Increased sequence length for more context

def create_sequences(data, sequence_length):
    sequences = []
    for index in range(len(data) - sequence_length + 1):
        sequences.append(data[index: index + sequence_length])
    return np.array(sequences)

X_train_sequences = create_sequences(X_train_scaled, SEQUENCE_LENGTH)
X_test_sequences = create_sequences(X_test_scaled, SEQUENCE_LENGTH)
y_train_cat = to_categorical(y_train[SEQUENCE_LENGTH - 1:])
y_test_cat = to_categorical(y_test[SEQUENCE_LENGTH - 1:])

# Custom F1 Score metric
def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_val

# Define the LSTM model with Batch Normalization
model = Sequential([
    LSTM(100, input_shape=(SEQUENCE_LENGTH, len(features)), return_sequences=True),
    BatchNormalization(),
    Dropout(0.5),
    LSTM(100, return_sequences=True),  # Additional LSTM layer
    BatchNormalization(),
    Dropout(0.5),
    LSTM(100),  # Additional LSTM layer
    BatchNormalization(),
    Dense(64, activation='relu'),  # Additional dense layer
    Dropout(0.5),
    Dense(y_train_cat.shape[1], activation='softmax')
])

# Optimizer with reduced learning rate
optimizer = Adam(learning_rate=0.0005)

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy', f1_metric, AUC(name='auc')])

# Train with increased epochs
model.fit(X_train_sequences, y_train_cat, epochs=10, batch_size=64,
          validation_data=(X_test_sequences, y_test_cat), callbacks=[early_stopping], verbose=1)

# Evaluate
score = model.evaluate(X_test_sequences, y_test_cat, verbose=1)
print(f"Test loss: {score[0]}, Test accuracy: {score[1]}, Test F1 Score: {score[2]}, Test AUC: {score[3]}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.5222935676574707, Test accuracy: 0.7167325615882874, Test F1 Score: 0.7165713906288147, Test AUC: 0.935920774936676
