In [2]:
import json
import os

import numpy as np
import pandas as pd
from pybaseball import (
    batting_stats_bref,
    cache,
    get_splits,
    pitching_stats_bref,
    playerid_reverse_lookup,
)

from utils.preprocess_statcast import pull_statcast_multiyear, preprocess, read_from_cache


# Set configuration variables
The Statcast data we want is only available from 2015 onward, so let's get everything from then through 2024.

In [6]:
# season range to pull, given as strings
start_year, end_year = "2015", "2024"
verbose = True
# get players who have thrown or received minimum amount of pitches
min_pitches = 5000

# pitch features after correlation analysis with pitch outcomes
pitch_features = [
    'batter', 'pitcher',
    'release_pos_y', 'release_pos_z',
    'release_spin_rate', 'effective_speed',
    'sz_top', 'sz_bot',
    'ay', 'plate_z', 'pfx_z', 'zone',
    'arm_angle', 'api_break_x_arm', 'api_break_x_batter_in',
    'previous_pitch_speed', 'previous_zone', 'previous_plate_z',
    'balls', 'strikes',
]
# outcome is either ball, strike, or in play
outcome_features = ["type"]

# bookkeeping columns first, then the pitch features, then the outcome label
bookkeeping_columns = [
    "game_date", "at_bat_number", "game_pk", "game_type",
    "pitch_number", "release_speed", "pitch_type", "plate_x",
]
columns = bookkeeping_columns + pitch_features + outcome_features

# the previous pitch features are not in statcast, so leave them out of the query
statcast_columns = list(
    filter(lambda name: not name.startswith('previous'), columns)
)

# Get Statcast pitch data
We will use the pybaseball library to pull statcast data for all pitches in our year range.

To avoid re-running long queries, use pybaseball's caching function.

In [7]:
# enable caching of the statcast data (saves them to disk)
cache.enable()
# First run: uncomment the next line to query statcast for the whole year range.
#df = pull_statcast_multiyear(start_year, end_year, columns=statcast_columns, verbose=verbose)
# Later runs: read the previously queried data back from the on-disk cache.
# The cache directory is resolved from the current user's home directory instead
# of one machine's absolute path; the trailing slash is kept because the
# directory was originally passed with one.
cache_directory = os.path.expanduser("~/.pybaseball/cache/")
df = read_from_cache(start_year, end_year, columns=statcast_columns, directory=cache_directory)

In [None]:
df['game_date'] = pd.to_datetime(df['game_date'])
# add game date and outcome to pitch_features so we retain it after preprocessing;
# only append names that are still missing, so re-running this cell does not
# leave duplicate entries in the list
for extra_feature in [*outcome_features, 'game_date']:
    if extra_feature not in pitch_features:
        pitch_features.append(extra_feature)
# get the preprocessed dataframe and the mapping of player id to embedding index
df, batter_map, pitcher_map = preprocess(df, pitch_features)
# persist the preprocessed pitches and both player maps so later sessions can
# start from disk instead of repeating the preprocessing
df.to_csv("data/statcast/2015-2024_preproc_pitch_outcomes.csv", index=False)
with open("data/preprocessed/batter_map_2015_2024.json", "w") as outfile:
    json.dump(batter_map, outfile, indent=6)
with open("data/preprocessed/pitcher_map_2015_2024.json", "w") as outfile:
    json.dump(pitcher_map, outfile, indent=6)

In [3]:
# Checkpoint: reload the preprocessed pitches from disk
df = pd.read_csv("data/statcast/2015-2024_preproc_pitch_outcomes.csv", index_col=False)
df['game_date'] = pd.to_datetime(df['game_date'])
# Also restore the player maps that were saved alongside the CSV. The player
# information cells further down read batter_map / pitcher_map, which would
# otherwise be undefined when the notebook is resumed from this checkpoint
# without re-running the preprocessing cell.
# Note: JSON object keys are always strings, so ids come back as str here.
with open("data/preprocessed/batter_map_2015_2024.json") as map_file:
    batter_map = json.load(map_file)
with open("data/preprocessed/pitcher_map_2015_2024.json") as map_file:
    pitcher_map = json.load(map_file)

In [4]:
# show the reloaded frame; the rendered table below is abbreviated with "..."
df

Unnamed: 0,batter,pitcher,release_pos_y,release_pos_z,release_spin_rate,effective_speed,sz_top,sz_bot,ay,plate_z,...,arm_angle,api_break_x_arm,api_break_x_batter_in,previous_pitch_speed,previous_zone,previous_plate_z,balls,strikes,type,game_date
0,325,33,54.50,5.82,2096.0,84.5,3.09,1.43,29.880000,1.75,...,0.0,0.88,-0.88,0.0,0.0,0.00,0,0,0,2015-04-06
1,325,33,54.50,5.82,2096.0,84.5,3.09,1.43,29.880000,1.75,...,0.0,0.88,-0.88,90.1,8.0,1.75,0,0,0,2015-04-06
2,325,33,54.50,5.80,1851.0,89.7,3.01,1.47,27.740000,1.77,...,0.0,-1.24,1.24,90.1,8.0,1.75,0,1,0,2015-04-06
3,325,33,54.50,5.80,1851.0,89.7,3.01,1.47,27.740000,1.77,...,0.0,-1.24,1.24,79.7,8.0,1.77,0,1,0,2015-04-06
4,325,33,54.50,5.94,2332.0,79.0,3.01,1.47,33.810000,1.94,...,0.0,0.09,-0.09,79.7,8.0,1.77,0,2,0,2015-04-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9539135,338,503,53.36,5.16,2823.0,82.4,3.40,1.65,23.932078,1.91,...,16.5,-1.19,1.19,81.3,9.0,1.91,1,0,0,2024-03-28
9539136,338,503,53.38,5.06,2116.0,97.2,3.40,1.67,31.245332,2.22,...,11.9,1.40,-1.40,81.3,9.0,1.91,1,1,0,2024-03-28
9539137,338,503,53.38,5.06,2116.0,97.2,3.40,1.67,31.245332,2.22,...,11.9,1.40,-1.40,96.0,6.0,2.22,1,1,0,2024-03-28
9539138,338,503,53.08,5.20,1980.0,94.7,3.40,1.65,29.704748,2.32,...,17.4,1.30,-1.30,96.0,6.0,2.22,1,2,2,2024-03-28


# Get data for training and validation

We will do simple min-max scaling to normalize our pitch features

In [None]:
# get min max of pitch features
X_df = df.drop(['type','batter','pitcher','game_date'], axis=1)
X_min = X_df.min()
X_max = X_df.max()
# save min and max
X_min.to_csv("data/preprocessed/X_min.csv",header=None)
X_max.to_csv("data/preprocessed/X_max.csv",header=None)
# train/test on 2015 to 2023, validate on 2024
X_train = df[df.game_date.dt.year < 2024].drop(
    ['type','batter','pitcher','game_date'], axis=1
)
# min max normalize training data
X_train_norm = (X_train-X_min)/(X_max-X_min)
Y_train = df[df.game_date.dt.year < 2024]['type']
# validate on 2024 season
X_val = df[df.game_date.dt.year == 2024].drop(
    ['type','batter','pitcher','game_date'], axis=1
)
# normalize
X_val_norm = (X_val-X_min)/(X_max-X_min)
Y_val = df[df.game_date.dt.year == 2024]['type']

## Get data for the model without embeddings
We will ignore the pitchers and batters here, since we are only using the Statcast pitch features.

In [None]:
# output to numpy and save
np.save(
    "data/preprocessed/X_train_norm",
    X_train_norm.to_numpy(dtype=np.float32)
)
np.save(
    "data/preprocessed/Y_train",    
    Y_train.to_numpy(dtype=np.float64)
)
np.save(
    "data/preprocessed/X_val_norm",
    X_val_norm.to_numpy(dtype=np.float32)
)
np.save(
    "data/preprocessed/Y_val",
    Y_val.to_numpy(dtype=np.float64)
)

## Get data for the model with embeddings
We will add the pitchers and batters back in to train pitcher and batter embeddings along with the Statcast pitch features.

In [None]:
# Build separate "*_embed" frames instead of adding the player columns to
# X_train_norm / X_val_norm in place. Mutating those frames would make the
# "without embeddings" save cell write the player columns too if it were
# re-run after this one.
is_train = df.game_date.dt.year < 2024
is_val = df.game_date.dt.year == 2024
# batter and pitcher are appended after the normalized pitch features, in that order
X_train_norm_embed = X_train_norm.assign(
    batter=df.loc[is_train, 'batter'],
    pitcher=df.loc[is_train, 'pitcher'],
)
X_val_norm_embed = X_val_norm.assign(
    batter=df.loc[is_val, 'batter'],
    pitcher=df.loc[is_val, 'pitcher'],
)

# output to numpy and save
np.save(
    "data/preprocessed/X_train_norm_embed",
    X_train_norm_embed.to_numpy(dtype=np.float32)
)
np.save(
    "data/preprocessed/X_val_norm_embed",
    X_val_norm_embed.to_numpy(dtype=np.float32)
)

# Get player information
Get pitcher and batter handedness and statistics for evaluation.

In [None]:
# get player_id info from pitcher/batter map, needed to get baseball reference id
pitchers = playerid_reverse_lookup(
    int(x) for x in list(pitcher_map.keys())
)
batters = playerid_reverse_lookup(
    int(x) for x in list(batter_map.keys())
)

In [None]:
def get_players_info(bbrefs):
    """
    Get player info (Position, Bats, Throws) for each player
    identified by their bbref id.

    Lookups are best effort: a player whose lookup raises is reported with
    print() and left out of the result.

    Args:
        bbrefs: iterable of bbref ids; missing ids (None/NaN/"") are skipped.

    Returns:
        dataframe containing (Position, Bats, Throws, key_bbref). The
        key_bbref column is always present, even when no player could be
        looked up, so callers can index or join on it.
    """
    infos = []
    for bbref in bbrefs:
        # a missing id cannot be looked up, so skip it instead of querying for it
        if pd.isna(bbref) or bbref == "":
            continue
        try:
            _, info = get_splits(bbref, player_info=True)
            # copy so the dict handed back by get_splits is not modified
            record = dict(info)
            record['key_bbref'] = bbref
            infos.append(record)
        except Exception as e:
            # keep going with the remaining players
            print(e, bbref)
    result = pd.DataFrame(infos)
    if 'key_bbref' not in result.columns:
        # nothing was collected: still expose the key column
        result['key_bbref'] = pd.Series(dtype=object)
    return result

###  Pull player info and save to csv

In [None]:
# pitchers: look up each bbref id, attach the info to the id table, save
pitcher_bbrefs = pitchers['key_bbref'].tolist()
pitcher_infos = get_players_info(pitcher_bbrefs)
pitcher_infos_by_bbref = pitcher_infos.set_index('key_bbref')
all_pitchers_info = pitchers.join(pitcher_infos_by_bbref, on='key_bbref')
all_pitchers_info.to_csv("data/preprocessed/pitchers.csv", index=False)

# batters: same steps
batter_bbrefs = batters['key_bbref'].tolist()
batter_infos = get_players_info(batter_bbrefs)
batter_infos_by_bbref = batter_infos.set_index('key_bbref')
all_batters_info = batters.join(batter_infos_by_bbref, on='key_bbref')
all_batters_info.to_csv("data/preprocessed/batters.csv", index=False)

### Get pitching stats

In [None]:
def get_pitching_stats(start_year, end_year):
    """
    Get the pitching stats of all pitchers for every year in a range.

    Args:
        start_year: first year to pull (int, inclusive)
        end_year: last year to pull (int, inclusive)

    Returns:
        dataframe containing the stats of every year stacked row-wise, with
        a `year` column; an empty dataframe when the range contains no years
    """
    yearly_frames = []
    for year in range(start_year, end_year+1):
        print(year)
        # assign() returns a new frame, so the fetched frame is not modified
        yearly_frames.append(pitching_stats_bref(year).assign(year=year))
    if not yearly_frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    # concatenate once at the end instead of re-copying the result every iteration
    return pd.concat(yearly_frames)
    
# pull pitching stats for 2015 through 2024 and save them as one csv
pitcher_stats = get_pitching_stats(start_year=2015, end_year=2024)
pitcher_stats.to_csv("data/stats/pitching_stats_2015_2024.csv", index=False)

### Get batting stats

In [None]:
def get_batting_stats(start_year, end_year):
    """
    Get the batting stats of all batters for every year in a range.

    Args:
        start_year: first year to pull (int, inclusive)
        end_year: last year to pull (int, inclusive)

    Returns:
        dataframe containing the stats of every year stacked row-wise, with
        a `year` column; an empty dataframe when the range contains no years
    """
    yearly_frames = []
    for year in range(start_year, end_year+1):
        print(year)
        # assign() returns a new frame, so the fetched frame is not modified
        yearly_frames.append(batting_stats_bref(year).assign(year=year))
    if not yearly_frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    # concatenate once at the end instead of re-copying the result every iteration
    return pd.concat(yearly_frames)
    
# pull batting stats for 2015 through 2024 and save them as one csv
batter_stats = get_batting_stats(start_year=2015, end_year=2024)
batter_stats.to_csv("data/stats/batter_stats_2015_2024.csv", index=False)