In [5]:
# %pip install pybaseball pandas torch tiktoken matplotlib

from pybaseball import statcast, cache as pybaseball, playerid_reverse_lookup
pybaseball.enable()

import pandas as pd
import numpy as np

pd.options.display.max_rows = 100 


In [13]:


## the following loads all the pitch by pitch game info year by year

# for yearStr in ["2020", "2021", "2023", "2024","2022",]: 
#     print("processing: ", yearStr)
#     year_df = statcast(start_dt=f"{yearStr}-01-01", end_dt=f"{yearStr}-12-31" )
#     year_df.to_parquet(f"games/statcast-{yearStr}.parquet")


## the following goes through all pitchers and batters and gets their name

combined_player_array = []
for yearStr in [ "2020", "2021", "2022", "2023", "2024",]:
    print("processing players from: ", yearStr)
    year_df = pd.read_parquet(f"games/statcast-{yearStr}.parquet")

    batters = year_df["batter"].unique()
    pitchers = year_df["pitcher"].unique()

    both_player_arrays = np.concat([batters, pitchers])

    players_df = playerid_reverse_lookup(both_player_arrays)

    combined_player_array.append(players_df)


combined_player_df = pd.concat(combined_player_array).drop_duplicates()
combined_player_df = combined_player_df.assign(
    player_fullname = combined_player_df["name_last"].str.capitalize() + ", " + combined_player_df["name_first"].str.capitalize()
)

all_players_df = combined_player_df[["key_mlbam", "player_fullname"]].set_index("key_mlbam")
all_players_df.to_parquet("games/statcast-players-all.parquet",)

all_players_df

processing players from:  2020


processing players from:  2021
processing players from:  2022
processing players from:  2023
processing players from:  2024


Unnamed: 0_level_0,player_fullname
key_mlbam,Unnamed: 1_level_1
594987,"Sadler, Casey"
669242,"Edman, Tommy"
570666,"Cessa, Luis"
666674,"Colina, Edwar"
664180,"Hall, Matt"
...,...
686799,"Kochanowicz, Jack"
680732,"Burke, Sean"
683003,"Jones, Jared"
676661,"Lazar, Max"


In [None]:

## the following prints the games dataframe scheme in n colummns

games_df = pd.read_parquet("games/statcast-2022.parquet").head(1)

rowsize = 4

for i, col in enumerate(games_df.columns.sort_values()):
    print(f"{col:<32}", end="")
    if i%rowsize == rowsize-1:
        print("") # a newline
        


In [19]:
all_players_df = pd.read_parquet("games/statcast-players-all.parquet")
games_df = pd.read_parquet("games/statcast-2022.parquet")

pitches_df = pd.merge(games_df, all_players_df, left_on="batter", right_on="key_mlbam")

pitchesGroupedByGame = pitches_df.groupby("game_pk")

overallPitchesRecorded = 0
for game_pk, g in pitchesGroupedByGame:
    game_date, home_team, away_team = g.iloc[0][["game_date", "home_team", "away_team"]]

    print(f'On {game_date}, with {home_team} playing {away_team} at home in gamae {game_pk}')
    
    for (inning, inning_topbot), i in g.groupby(["inning", "inning_topbot"], sort=True):
        print(f'in the {inning_topbot} of the {inning} inning')

        for (at_bat_number), ab in i.groupby("at_bat_number", sort=True):

            # get atbat info from the first pitch, fp
            fp = ab.iloc[0]

            on_base, baserunners = [], ""
            if pd.notna(fp.on_1b): on_base.append(f"{fp.on_1b} on first")
            if pd.notna(fp.on_2b): on_base.append(f"{fp.on_2b} on second")
            if pd.notna(fp.on_3b): on_base.append(f"{fp.on_3b} on third")
            if len(on_base)>0:
                baserunners = f"with {', '.join(on_base)}"

            print("".join([
                f'{fp.p_throws} handed pitcher {fp.player_name} ({fp.pitcher}) ',
                f'faces {fp.stand} batter {fp.player_fullname} ({fp.batter}), ',
                f'{home_team} {fp.home_score}, {away_team} {fp.away_score} {baserunners}',
            ]))

            for p in ab.sort_values(["pitch_number"]).itertuples():
                # TODO consider dealing with plurals for the count and outs
                print("".join([
                    f'pitch from {p.pitcher} with {p.balls} balls {p.strikes} strikes and ',
                    f'{p.outs_when_up} outs is a {p.pitch_name} {p.description}'
                ]), end="")

                if p.post_bat_score > p.bat_score:
                    print(f" {p.post_bat_score - p.bat_score} runs scored")
                else:
                    print() # just terminate the p status

                overallPitchesRecorded += 1
                if (overallPitchesRecorded > 500):
                    raise UserWarning('Exit Early')



On 2022-04-26 00:00:00, with LAA playing CLE at home in gamae 661032
in the Bot of the 1 inning
R handed pitcher McKenzie, Triston (663474) faces R batter Ward, Taylor (621493), LAA 0, CLE 0 
pitch from 663474 with 0 balls 0 strikes and 0 outs is a 4-Seam Fastball called_strike
pitch from 663474 with 0 balls 1 strikes and 0 outs is a 4-Seam Fastball called_strike
pitch from 663474 with 0 balls 2 strikes and 0 outs is a 4-Seam Fastball foul
pitch from 663474 with 0 balls 2 strikes and 0 outs is a 4-Seam Fastball foul
pitch from 663474 with 0 balls 2 strikes and 0 outs is a Curveball hit_into_play
R handed pitcher McKenzie, Triston (663474) faces L batter Ohtani, Shohei (660271), LAA 0, CLE 0 
pitch from 663474 with 0 balls 0 strikes and 1 outs is a 4-Seam Fastball hit_into_play
R handed pitcher McKenzie, Triston (663474) faces R batter Trout, Mike (545361), LAA 0, CLE 0 
pitch from 663474 with 0 balls 0 strikes and 2 outs is a 4-Seam Fastball called_strike
pitch from 663474 with 0 balls

UserWarning: Exit Early