In [None]:
import os
import pandas as pd
import numpy as np


%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, random_split
import torch.nn as nn

from modules.classifer_utils import NormalizedClassifierDataset, NormalizedClassifierDatasetMetadata, TrainingManager, GeneralNN


device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Load the raw pitch records from disk. Everything below derives a cleaned frame from it,
# so re-running this cell always starts from the on-disk data.
games_df = pd.read_parquet("games/statcast-2020.parquet")
print(f'starting with {len(games_df)} records on disk')


## some cleanup stuff

# First off, ditch rows without our target label. A boolean mask is used rather than
# dropping by index label: if the stored index is not unique, a label-based drop would
# also remove non-null rows that happen to share a label with a null one.
# .copy() gives us an independent frame so the column assignments below are unambiguous.
games_df = games_df[games_df["pitch_type"].notna()].copy()

# For now, let's make this a binary classifier: 1 when the pitch_type code starts with
# 'F', 0 otherwise. We can later try to predict specific pitches.
# NOTE(review): this treats every 'F*' code as a fastball — confirm that is intended for
# all codes present in the data.
LABEL_COLUMN_NAME = "is_fastball"
games_df[LABEL_COLUMN_NAME] = games_df["pitch_type"].str.startswith('F').astype(int)
print(f'target label breakdown\n{games_df[LABEL_COLUMN_NAME].value_counts()}')

# Turn player ids on base into 0/1 "occupied" flags (dont want to bother embedding, too
# sparse), then drop the raw id columns in a single non-in-place call.
base_runner_cols = ["on_1b", "on_2b", "on_3b"]
for base_col in base_runner_cols:
    games_df[base_col + "_notna"] = games_df[base_col].notna().astype(int)
games_df = games_df.drop(columns=base_runner_cols)

# TODO figure out date as a categorical
DAY_OF_YEAR = 'day_of_year'
games_df[DAY_OF_YEAR] = games_df["game_date"].dt.dayofyear

# Cast the player id columns to int. errors='ignore' means a failed cast (e.g. because of
# missing values) silently leaves the column with its original dtype.
games_df = games_df.astype({
    'pitcher': 'int',
    'batter': 'int',
}, errors='ignore')


# ok, ready to set up our dataset

# Columns handed to the metadata object as ordinal/numeric features.
ordinal_numeric_cols = [
    "bat_score",
    "fld_score",
    "balls",
    "strikes",
    "outs_when_up",
    DAY_OF_YEAR,
    "at_bat_number",
    "pitch_number",
    "n_thruorder_pitcher",
    "age_pit",
    "age_bat",
    "pitcher_days_since_prev_game",
]

# Categorical columns, each mapped to the list of distinct values observed in games_df.
# ('if_fielding_alignment' and 'of_fielding_alignment' are currently left out.)
categorical_cols = ['p_throws', 'stand']
categorical_map = {name: list(games_df[name].unique()) for name in categorical_cols}

# Describe the dataset layout: label column, numeric columns, categorical value lists,
# and the id columns registered as embedding columns.
ds_meta = NormalizedClassifierDatasetMetadata(LABEL_COLUMN_NAME)
ds_meta.set_ordinal_numeric_cols(ordinal_numeric_cols)
ds_meta.set_categorical_map(categorical_map)
ds_meta.set_embedding_cols(["pitcher", "batter"])

# Keep only the columns the metadata reports, and discard any row with a missing value
# in one of them.
selected_columns = ds_meta.get_columns()
target_df = games_df[selected_columns].dropna()


overall_ds = NormalizedClassifierDataset(target_df, ds_meta)


# 80/20 train/test split. A seeded generator makes the split reproducible across kernel
# restarts, so recorded metrics can be compared between runs.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_ds, test_ds = random_split(overall_ds, [.80, .20], generator=split_generator)

# Roughly ten batches per training epoch; never let the batch size reach zero on a
# very small dataset.
batch_size = max(1, len(train_ds) // 10)
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
# Evaluation should see every test record exactly once: do not drop the final partial
# batch (with this batch size that would discard a large share of the test set), and
# there is no reason to shuffle.
test_dataloader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, drop_last=False)

# len(dataset) is a record count; len(dataloader) is the batch count.
print(
    f'{len(train_ds)} training records in {len(train_dataloader)} batches with batch_size: {batch_size}, '
    f'{len(test_ds)} test records in {len(test_dataloader)} batches.'
)
num_features = overall_ds.get_feature_count()
print(f'datasets have {num_features} features')


starting with 279660 records on disk
target label breakdown
is_fastball
0    156297
1    121669
Name: count, dtype: int64
201266 batches with batch_size: 20126, 50316 batches for test.
datasets have 29 features


In [10]:

# Tunable settings for this training run.
dropoutRate = 0.2
NUM_EPOCHS = 100

# Take the feature count straight from the dataset and use it everywhere in this cell,
# instead of also depending on a `num_features` variable set by another cell.
input_features = overall_ds.get_feature_count()

# Sizes passed as GeneralNN's second argument: two entries at twice the input width,
# then 32, 32, 16 and a final 1.
# NOTE(review): presumably these are layer output widths ending in a single output unit
# for the binary label — confirm against GeneralNN.
layer_sizes = [input_features * 2, input_features * 2, 32, 32, 16, 1]
model = GeneralNN(input_features, layer_sizes, dropoutRate)

total_params = sum(param.numel() for param in model.parameters())
print(f"The model has {total_params} parameters.")

training_mgr = TrainingManager(model)
training_mgr.train(train_dataloader, NUM_EPOCHS)



The model has 8651 parameters.
trainging using: cpu device
Epoch [1/100], Avg training Loss: 51.4083, Accuracy: 96261/201260 (0.4783)


KeyboardInterrupt: 

In [None]:

# Scratch cell: rebuild the dataset and a training loader to sanity-check record counts.
# WARNING: this rebinds overall_ds, train_ds, test_ds and batch_size for the whole kernel.

num_epochs  = 10

# The frame this cell originally read (`normed_df`) is not defined anywhere in this
# notebook, so the cell failed with NameError on a fresh kernel. It now uses target_df,
# the frame the dataset is built from.
# NOTE(review): assumes target_df is the intended replacement — confirm.
print(f'target_df has {len(target_df)} records')


overall_ds = NormalizedClassifierDataset(target_df, ds_meta)

print(f'overall_ds has {len(overall_ds)} records')

# Fixed seed so the split is reproducible across re-runs of this cell.
train_ds, test_ds = random_split(
    overall_ds, [.80, .20], generator=torch.Generator().manual_seed(42)
)

# Roughly ten batches per epoch; guard against a zero batch size on tiny datasets.
batch_size = max(1, len(train_ds) // 10)
dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)


# print(f'overall_ds len {len(overall_ds)} over {len(normed_df)} records')
# for epoch_idx, epoch in enumerate(range(num_epochs)):

#     epoch_records = 0
#     for X, y in dataloader:
#         batch_records = len(X)
#         epoch_records += batch_records
#         print(f'processing in dataloader loop {batch_records} records, {epoch_records} so far')
        
#     print(f'epoch {epoch_idx} of {num_epochs} over: {epoch_records} overall')





In [None]:
# Run the training manager's evaluation over the test split's loader. As the last
# expression in the cell, any non-None return value is displayed as the cell output.
training_mgr.eval(test_dataloader)