In [27]:
import mysql.connector
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder

In [None]:
'''
import tensorflow as tf

print(tf.__version__)
print("CUDA Available: ", tf.test.is_built_with_cuda())
print("GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
'''

In [None]:
'''
STATCAST_DB = mysql.connector.connect(
  host=host,
  user=user,
  password=password,
  database=database
)
'''

In [28]:
'''
MICHAEL_LORENZEN_2023 = """SELECT player_name, stand, pitch_type_condensed, game_pk, at_bat_number, pitch_number 
FROM sc_raw 
WHERE pitcher = '547179' AND game_year = '2023'
ORDER BY game_pk, at_bat_number, pitch_number;"""

MICHAEL_LORENZEN_2023 = pd.read_sql_query(MICHAEL_LORENZEN_2023, STATCAST_DB)
'''

# print(MICHAEL_LORENZEN_2023)
# MICHAEL_LORENZEN_2023.to_csv('MICHAEL_LORENZEN_2023.csv', index=False)

MICHAEL_LORENZEN_2023 = pd.read_csv('MICHAEL_LORENZEN_2023.csv')
MICHAEL_LORENZEN_2023.rename(columns={'pitch_type_condensed': 'pitch_type'}, inplace=True)

pitch_type_dummies = pd.get_dummies(MICHAEL_LORENZEN_2023['pitch_type'])
MICHAEL_LORENZEN_2023_PREP = pd.concat([MICHAEL_LORENZEN_2023, pitch_type_dummies], axis=1)

scaler = MinMaxScaler(feature_range=(0, 1))
MICHAEL_LORENZEN_2023_PREP['pitch_number'] = scaler.fit_transform(MICHAEL_LORENZEN_2023_PREP[['pitch_number']])

# print(MICHAEL_LORENZEN_2023_PREP)
# MICHAEL_LORENZEN_2023_PREP.to_csv('MICHAEL_LORENZEN_2023_PREP.csv', index=False)


In [32]:
all_pitch_types = MICHAEL_LORENZEN_2023_PREP['pitch_type'].unique()
label_encoder = LabelEncoder()
label_encoder.fit(all_pitch_types)

grouped = MICHAEL_LORENZEN_2023_PREP.groupby(['game_pk', 'at_bat_number'])
grouped_sequences = []
labels = []

for name, group in grouped:
    if len(group) > 1:
        sequences = group.iloc[:-1][['CB', 'CH', 'CT', 'FAHCK', 'FF', 'SI', 'SL']].values.tolist()
        grouped_sequences.append(sequences)
        
        label = group.iloc[-1]['pitch_type']
        encoded_label = label_encoder.transform([label])[0]
        labels.append(encoded_label)

In [33]:
sequences_tensors = [torch.tensor(seq, dtype=torch.float) for seq in grouped_sequences]
'''
if not sequences:
    raise ValueError("All sequences are empty or no sequence is available.")
    
# Assuming sequences is not empty and contains tensors
try:
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
except RuntimeError as e:
    print(f"RuntimeError encountered: {e}")
    print("Checking sequence lengths and types...")
    for i, seq in enumerate(sequences):
        print(f"Sequence {i} length: {len(seq)}, type: {seq.dtype}")
    raise
'''
# print(sequences)
padded_sequences = pad_sequence(sequences_tensors, batch_first=True, padding_value=0)
labels_tensor = torch.tensor(labels, dtype=torch.long)

In [34]:
from torch.utils.data import Dataset, DataLoader

class PitchDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return sequence, label
    
dataset = PitchDataset(padded_sequences, labels_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [35]:
import torch.nn as nn

class PitchPredictionModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers):
        super(PitchPredictionModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        h0, c0 = self.init_hidden(x.size(0))
        out, (hn, cn) = self.lstm(x, (h0, c0))
        
        out = self.fc(out[:, -1, :])
        return out
    
    def init_hidden(self, batch_size):
        h0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        c0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return [t for t in (h0, c0)]

In [36]:

input_dim = 7
hidden_dim = 128
output_dim = len(label_encoder.classes_)
n_layers = 2

model = PitchPredictionModel(input_dim, hidden_dim, output_dim, n_layers)

In [45]:
from sklearn.model_selection import train_test_split

# Assuming padded_sequences and labels_tensor are your full dataset
# First, split into a combined training+validation set and a separate test set
train_val_seqs, test_seqs, train_val_labels, test_labels = train_test_split(
    padded_sequences, labels_tensor, test_size=0.2, random_state=42)

# Then, split the training+validation set into separate training and validation sets
train_seqs, valid_seqs, train_labels, valid_labels = train_test_split(
    train_val_seqs, train_val_labels, test_size=0.25, random_state=42)  # Adjust test_size as needed

# Now, train_seqs and train_labels are for training, valid_seqs and valid_labels are for validation,
# and test_seqs and test_labels are reserved for final testing.

# Create datasets
train_dataset = PitchDataset(train_seqs, train_labels)
valid_dataset = PitchDataset(valid_seqs, valid_labels)
test_dataset = PitchDataset(test_seqs, test_labels)  # Test dataset

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)  # Test DataLoader



In [None]:
criterion = nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

best_val_loss = float('inf')
patience = 3
patience_counter = 0

num_epochs = 100

for epoch in range(num_epochs):
    model.train() 
    total_loss = 0
    
    for sequences, labels in train_loader:
        optimizer.zero_grad()  
        outputs = model(sequences)  
        loss = criterion(outputs, labels)  
        loss.backward()  
        optimizer.step()
        
        total_loss += loss.item()
        
    model.eval()  # Set model to evaluation mode
    val_loss = 0
    with torch.no_grad():
        for sequences, labels in valid_loader:
            outputs = model(sequences)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    
    val_loss /= len(valid_loader)
    print(f'Epoch {epoch+1}, Training Loss: {total_loss/len(train_loader)}, Validation Loss: {val_loss}')
    
    # Early Stopping Check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0  # Reset counter
        torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
    else:
        patience_counter += 1
        print(f'Validation loss has not improved for {patience_counter} epoch(s).')
        
    if patience_counter >= patience:
        print("Stopping early due to no improvement in validation loss.")
        break
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

In [47]:
model = PitchPredictionModel(input_dim, hidden_dim, output_dim, n_layers)  # Initialize the model
model.load_state_dict(torch.load('best_model.pth'))  # Load the saved weights
model.eval()  # Set the model to evaluation mode

PitchPredictionModel(
  (lstm): LSTM(7, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=8, bias=True)
)

In [48]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Assuming you have a DataLoader for your test data named 'test_loader'
predictions, true_labels = [], []
with torch.no_grad():
    for sequences, labels in test_loader:  # Replace 'test_loader' with 'valid_loader' if you don't have a separate test set
        outputs = model(sequences)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.view(-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Optionally, compute and print the confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions)
print('Confusion Matrix:\n', conf_matrix)


Accuracy: 37.61%
Confusion Matrix:
 [[ 0  0  0  0  0  0  1]
 [ 0 20  0  0  2  0  8]
 [ 0  3  0  0  0  0  0]
 [ 0  0  0  0  0  0  1]
 [ 0 13  0  0  3  0 15]
 [ 0  2  0  0  0  0 16]
 [ 0 10  0  0  2  0 21]]
