In [1]:
# IMPORT LIBRARIES
import mysql.connector
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# LOAD AND PREPARE DATA
MICHAEL_LORENZEN_2023 = pd.read_csv('MICHAEL_LORENZEN_2023.csv')
MICHAEL_LORENZEN_2023.rename(columns={'pitch_type_condensed': 'pitch_type'}, inplace=True)
pitch_type_dummies = pd.get_dummies(MICHAEL_LORENZEN_2023['pitch_type'])
MICHAEL_LORENZEN_2023_PREP = pd.concat([MICHAEL_LORENZEN_2023, pitch_type_dummies], axis=1)
scaler = MinMaxScaler(feature_range=(0, 1))
MICHAEL_LORENZEN_2023_PREP['pitch_number'] = scaler.fit_transform(MICHAEL_LORENZEN_2023_PREP[['pitch_number']])
all_pitch_types = MICHAEL_LORENZEN_2023_PREP['pitch_type'].unique()
label_encoder = LabelEncoder()
label_encoder.fit(all_pitch_types)

In [3]:
# GROUP DATA BY GAME AND AT-BAT
grouped = MICHAEL_LORENZEN_2023_PREP.groupby(['game_pk', 'at_bat_number'])
grouped_sequences = []
labels = []
for name, group in grouped:
    if len(group) > 1:
        sequences = group.iloc[:-1][['CB', 'CH', 'CT', 'FAHCK', 'FF', 'SI', 'SL']].values.tolist()
        grouped_sequences.append(sequences)
        label = group.iloc[-1]['pitch_type']
        encoded_label = label_encoder.transform([label])[0]
        labels.append(encoded_label)

In [4]:
# PREPARE DATA FOR MODEL
sequences_tensors = [torch.tensor(seq, dtype=torch.float) for seq in grouped_sequences]	
padded_sequences = pad_sequence(sequences_tensors, batch_first=True, padding_value=0)
labels_tensor = torch.tensor(labels, dtype=torch.long)

In [5]:
# DEFINE DATASET CLASS
class PitchDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return sequence, label

In [6]:
# SPLIT DATA INTO TRAINING, VALIDATION, AND TEST SETS
train_val_seqs, test_seqs, train_val_labels, test_labels = train_test_split(padded_sequences, labels_tensor, test_size=0.2, random_state=42)
train_seqs, valid_seqs, train_labels, valid_labels = train_test_split(train_val_seqs, train_val_labels, test_size=0.25, random_state=42)

In [7]:
# CREATE DATALOADERS
train_dataset = PitchDataset(train_seqs, train_labels)
valid_dataset = PitchDataset(valid_seqs, valid_labels)
test_dataset = PitchDataset(test_seqs, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [8]:
# DEFINE MODEL
class PitchPredictionModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers):
        super(PitchPredictionModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        h0, c0 = self.init_hidden(x.size(0))
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out
    
    def init_hidden(self, batch_size):
        h0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        c0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return [t for t in (h0, c0)]

In [9]:
# TRAIN MODEL
input_dim = 7
hidden_dim = 128
output_dim = len(label_encoder.classes_)
n_layers = 2
model = PitchPredictionModel(input_dim, hidden_dim, output_dim, n_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
best_val_loss = float('inf')
patience = 3
patience_counter = 0
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for sequences, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for sequences, labels in valid_loader:
            outputs = model(sequences)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    val_loss /= len(valid_loader)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
    if patience_counter >= patience:
        break

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# EVALUATE MODEL
model = PitchPredictionModel(input_dim, hidden_dim, output_dim, n_layers)
model.load_state_dict(torch.load('best_model.pth'))
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for sequences, labels in test_loader:
        outputs = model(sequences)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.view(-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')
conf_matrix = confusion_matrix(true_labels, predictions)
print('Confusion Matrix:\n', conf_matrix)

Accuracy: 26.50%
Confusion Matrix:
 [[ 0  0  0  0  1  0  0]
 [ 0  0  0  0 30  0  0]
 [ 0  0  0  0  3  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0  0  0 31  0  0]
 [ 0  0  0  0 18  0  0]
 [ 0  0  0  0 33  0  0]]
