In [None]:
import json
import os
import random

import numpy as np
import pandas as pd
import torch
from denseweight import DenseWeight
from torch.utils.data import DataLoader, random_split,WeightedRandomSampler

from models.pitch_grader_mlp import PitchGraderMLP
from utils.dataloader import BaseballDataset
from utils.trainer import BaseballClassifierTrainer

# Load prepared data
Create the dataset used for training, both without and with embeddings.

In [None]:
# Locations of the preprocessed arrays handed to the dataset wrapper.
x_data_file = "data/preprocessed/X_train_norm.npy"
y_data_file = "data/preprocessed/Y_train.npy"

# JSON lookup tables for batters and pitchers; their lengths are passed to
# the model constructor in a later cell.
with open("data/preprocessed/batter_map_2015_2024.json", "r") as infile:
    batter_map = json.load(infile)
with open("data/preprocessed/pitcher_map_2015_2024.json", "r") as infile:
    pitcher_map = json.load(infile)

# Build the dataset from the two array files.
full_dataset = BaseballDataset(x_data_file, y_data_file)

# Create model

In [None]:
# define hyper parameters
seed = 42
hidden_dim = 64
# size of the first axis of the first element of sample 0
# (assumed to be the per-pitch feature vector -- TODO confirm in BaseballDataset)
pitch_features = full_dataset[0][0].shape[0]
# ball, strike, in-play
output_dim = 3
batch_size = 512
epochs = 500
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without a CUDA device instead of failing at the first .to(device).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
learning_rate = 0.001
verbose = True

# Seed every RNG this notebook imports, not just torch, so that any code path
# relying on `random` or numpy is reproducible as well.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# use MLP with 1 input layer, 1 hidden layer, and 1 output layer
model = PitchGraderMLP(
    pitch_features,
    len(batter_map), len(pitcher_map),
    hidden_dim,
    output_dim
)

## Setup our dataloader
A `DataLoader` is more memory-efficient than loading the entire dataset into memory at once, and it also handles shuffling and weighted sampling for us.

In [None]:
train_split = 0.8
test_split = 0.2
# Use a dedicated, explicitly seeded generator for the split. Relying on the
# global torch RNG makes train/test membership depend on whatever random draws
# earlier cells happened to make, and changes it every time this cell is
# re-run -- which makes test metrics incomparable between runs.
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)
train_dataset, test_dataset = random_split(
    full_dataset, [train_split, test_split], generator=split_generator
)
# Weight the samples to compensate for class imbalance
dw = DenseWeight(alpha=0.5)
# element 1 of the sliced subset is converted to numpy and used as the target
# array for the weighting (assumed to be the labels -- TODO confirm)
weights = dw.fit(train_dataset[:][1].numpy())
# Draw len(weights) samples per epoch, with replacement, according to the weights
sampler = WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=4, sampler=sampler)
# test batch size can be as big as memory allows
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False, num_workers=4)

## Setup our model trainer
The trainer defines our training loop and automatically evaluates the model periodically.

In [None]:
# Hand the model and both loaders to the trainer, along with the learning
# rate, target device and verbosity chosen in the hyperparameter cell.
trainer = BaseballClassifierTrainer(
    model,
    train_loader,
    test_loader,
    learning_rate=learning_rate,
    device=device,
    verbose=verbose,
)

## Pre-training evaluation
Let us see where we are starting from

In [None]:
# Baseline: run the trainer's evaluation before trainer.train() is called
# (presumably scored on test_loader -- confirm in utils.trainer).
trainer.evaluate()

## Train the model

In [None]:
# Resolve the checkpoint location and create its directory *before* training:
# torch.save does not create missing folders, so without this a fresh checkout
# would raise only after the whole training run has finished, losing the model.
checkpoint_path = f'models/checkpoints/pitch_grader_{epochs}.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

trainer.train(epochs)
# Persist only the weights (state_dict), not the whole trainer object.
torch.save(trainer.model.state_dict(), checkpoint_path)

## Post-training Evaluation

In [None]:
# Run the same evaluation again after trainer.train(epochs), for comparison
# with the pre-training result above.
trainer.evaluate()