In [None]:
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, random_split

from modules.classifer_utils import NormalizedClassifierDataset, NormalizedClassifierDatasetMetadata, TrainingManager, GeneralNN

# Notebook configuration.
# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell runs, so edits to the project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)


In [None]:

# Load one parquet file per year and stack them into a single frame.
SEASONS = [2020, 2021, 2022]
dfs = [pd.read_parquet(f"games/statcast-{y}.parquet") for y in SEASONS]

# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it each file keeps its own row labels, so labels can repeat across
# files and any later label-based operation (e.g. DataFrame.drop by index,
# or column assignment from a Series) can touch the wrong rows.
games_df = pd.concat(dfs, ignore_index=True)
print(f'starting with {len(games_df)} records on disk')

## some cleanup stuff

# first off, ditch rows without our target label.
# Filter by value rather than by index label (label-based drop removes every row
# sharing a label, which is wrong when labels repeat), then rebuild a unique
# 0..n-1 index so later index-aligned assignments line up row for row.
games_df = games_df.dropna(subset=["pitch_type"]).reset_index(drop=True)

# for now, let's make this a binary classifier. we can later try to predict specific pitches
LABEL_COLUMN_NAME = "is_fastball"
# NOTE(review): the label is 1.0 for every pitch_type code that starts with 'F'
# and 0.0 otherwise -- confirm this prefix rule matches the intended definition.
games_df[LABEL_COLUMN_NAME] = games_df.pitch_type.str.startswith('F').astype(float)
print(f'target label breakdown\n{games_df[LABEL_COLUMN_NAME].value_counts()}')

# turn player ids on base into flags (dont want to bother embedding, too sparse)
BASE_COLUMNS = ["on_1b", "on_2b", "on_3b"]
for col in BASE_COLUMNS:
    # 1 when the column holds a value for this row, 0 when it is missing
    games_df[col + "_notna"] = games_df[col].notna().astype(int)
games_df = games_df.drop(columns=BASE_COLUMNS)

# TODO figure out date as a categorical
DAY_OF_YEAR = 'day_of_year'
games_df[DAY_OF_YEAR] = games_df.game_date.dt.dayofyear

# cast some types (best effort: errors='ignore' leaves a column untouched if the cast fails)
games_df = games_df.astype({
    'pitcher': 'int',
    'batter': 'int',
}, errors='ignore')

# here we capture the match-up -- just store the index from a map of combos.
# The Series is built on games_df's own index: column assignment aligns on index
# labels, so a Series with a fresh 0..n-1 index would put ids on the wrong rows
# (or NaN) whenever games_df's index is not exactly 0..n-1.
matchups = pd.Series(list(zip(games_df.pitcher, games_df.batter)), index=games_df.index)
# each distinct (pitcher, batter) pair gets an integer id in order of first appearance
matchup_idx = {matchup: idx for idx, matchup in enumerate(matchups.unique())}
MATCH_UP = 'matchup' # col name for reuse later
games_df[MATCH_UP] = matchups.map(matchup_idx)

# one last bit of info, the current lead at the time of the pitch
PITCHER_LEAD = 'pitcher_lead'
games_df[PITCHER_LEAD] = games_df.fld_score - games_df.bat_score


# ok, ready to set up our dataset

# columns registered with the metadata as ordinal / numeric features
ordinal_numeric_cols = [
    "inning",
    "bat_score",
    "fld_score",
    "home_score",
    "away_score",
    PITCHER_LEAD,
    "balls",
    "strikes",
    "outs_when_up",
    DAY_OF_YEAR,
    "at_bat_number",
    "pitch_number",
    "n_thruorder_pitcher",
    "age_pit",
    "age_bat",
    "pitcher_days_since_prev_game",
    MATCH_UP,
]

# TODO make assertion that these columns dont have duplicates

# categorical columns, each mapped to the list of distinct values seen in the data
categorical_cols = ['p_throws', 'stand']  #, 'if_fielding_alignment', 'of_fielding_alignment']
categorical_map = dict(
    (name, list(games_df[name].unique())) for name in categorical_cols
)

# describe the dataset: label column first, then the feature groups
ds_meta = NormalizedClassifierDatasetMetadata(LABEL_COLUMN_NAME)
ds_meta.set_ordinal_numeric_cols(ordinal_numeric_cols)
ds_meta.set_categorical_map(categorical_map)

# make an embedding out of the matchup only
# ds_meta.set_embedding_cols([MATCH_UP])

# keep only the columns the metadata reports, then discard any row that has a
# missing value in one of them
selected_df = games_df[ds_meta.get_columns()]
target_df = selected_df.dropna()
overall_ds = NormalizedClassifierDataset(target_df, ds_meta)


# Split 80/20 into train and test. A seeded generator makes the split the same
# on every run, so results are comparable across kernel restarts.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_ds, test_ds = random_split(overall_ds, [.80, .20], generator=split_generator)

# aim for roughly 20 batches per pass over the training data; never let the
# batch size fall to 0 (DataLoader rejects it) when the training set is tiny
batch_size = max(1, len(train_ds) // 20)
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
# Evaluate on every test record in a fixed order: shuffling adds nothing at
# evaluation time, and drop_last would silently discard the final partial batch.
# NOTE(review): the last test batch may now be smaller than batch_size --
# confirm TrainingManager.eval handles a partial batch.
test_dataloader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, drop_last=False)

# len(dataset) is a record count; len(dataloader) is the number of batches
print(f'{len(train_ds)} training records in {len(train_dataloader)} batches with batch_size: {batch_size}, {len(test_ds)} records for test.')
num_features = overall_ds.get_feature_count()
print(f'datasets have {num_features} features')


In [None]:

# Build the network and train it on the training loader.
dropoutRate = 0.2

input_features = overall_ds.get_feature_count()

# layer sizes handed to GeneralNN: two layers at twice the feature count
# (num_features is set in the data-preparation cell), two of 16, and a final 1
wide_layer = num_features * 2
layer_sizes = [wide_layer, wide_layer, 16, 16, 1]
model = GeneralNN(input_features, layer_sizes, dropoutRate)

# count every scalar element across all parameter tensors of the model
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print(f"The model has {total_params} parameters.")

training_mgr = TrainingManager(model)
# NOTE(review): the second argument is presumably the number of epochs -- confirm
# against TrainingManager.train
training_mgr.train(train_dataloader, 100)



In [None]:
# Run the training manager's evaluation on the held-out test loader.
# NOTE(review): what eval reports or returns is defined in
# modules.classifer_utils.TrainingManager -- not visible from this notebook.
training_mgr.eval(test_dataloader)