In [5]:
# %pip install pybaseball pandas torch tiktoken matplotlib

from pybaseball import statcast, cache as pybaseball, playerid_reverse_lookup
pybaseball.enable()

import pandas as pd
import numpy as np

pd.options.display.max_rows = 100 


In [None]:


## the following loads all the pitch by pitch game info year by year

# for yearStr in ["2020", "2021", "2023", "2024","2022",]: 
#     print("processing: ", yearStr)
#     year_df = statcast(start_dt=f"{yearStr}-01-01", end_dt=f"{yearStr}-12-31" )
#     year_df.to_parquet(f"games/statcast-{yearStr}.parquet")


## the following goes through all pitchers and batters and gets their name

## TODO initial caps for the names (or lowercase the player name)
combined_player_array = []
for yearStr in [ "2020", "2021", "2022", "2023", "2024",]:
    print("processing players from: ", yearStr)
    year_df = pd.read_parquet(f"games/statcast-{yearStr}.parquet")

    batters = year_df["batter"].unique()
    pitchers = year_df["pitcher"].unique()

    both_player_arrays = np.concat([batters, pitchers])

    players_df = playerid_reverse_lookup(both_player_arrays)

    combined_player_array.append(players_df)


combined_player_df = pd.concat(combined_player_array).drop_duplicates()
combined_player_df=  combined_player_df.assign(
    pitcher_fullname = combined_player_df["name_last"].capitalize() + ", " + combined_player_df["name_first"].capitalize()
)

all_players_df = combined_player_df[["key_mlbam", "pitcher_fullname"]].set_index("key_mlbam")
all_players_df.to_parquet("games/statcast-players-all.parquet",)

all_players_df

processing players from:  2020


processing players from:  2021
processing players from:  2022
processing players from:  2023
processing players from:  2024


Unnamed: 0_level_0,pitcher_fullname
key_mlbam,Unnamed: 1_level_1
594987,"sadler, casey"
669242,"edman, tommy"
570666,"cessa, luis"
666674,"colina, edwar"
664180,"hall, matt"
...,...
686799,"kochanowicz, jack"
680732,"burke, sean"
683003,"jones, jared"
676661,"lazar, max"


In [None]:

## the following prints the games dataframe scheme in n colummns

games_df = pd.read_parquet("games/statcast-2022.parquet")

rowsize = 4

for i, col in enumerate(games_df.columns.sort_values()):
    print(f"{col:<32}", end="")
    if i%rowsize == rowsize-1:
        print("") # a newline
        


In [None]:
all_pitchers_df = pd.read_parquet("games/statcast-pitchers-all.parquet",)

games_df = pd.read_parquet("games/statcast-2022.parquet")

pitches_df = pd.merge(games_df, all_pitchers_df, left_on="pitcher", right_on="key_mlbam")

pitchesGroupedByGame = pitches_df.groupby("game_pk")

overallPitchesRecorded = 0
for game_pk, g in pitchesGroupedByGame:
    game_date, home_team, away_team = g.iloc[0][["game_date", "home_team", "away_team"]]

    print(f'On {game_date}, with {home_team} playing {away_team} at home in gamae {game_pk}')
    
    for (inning, inning_topbot), i in g.groupby(["inning", "inning_topbot"], sort=True):
        print(f'in the {inning_topbot} of the {inning} inning')

        for (at_bat_number), ab in i.groupby("at_bat_number", sort=True):

            # get atbat info from the first pitch, fp
            fp = ab.iloc[0]

            on_base, baserunners = [], ""
            if pd.notna(fp.on_1b): on_base.append(f"{fp.on_1b} on first")
            if pd.notna(fp.on_2b): on_base.append(f"{fp.on_2b} on second")
            if pd.notna(fp.on_3b): on_base.append(f"{fp.on_3b} on third")
            if len(on_base)>0:
                baserunners = f"with {', '.join(on_base)}"

            print(f'{fp.p_throws} handed pitcher {fp.pitcher_fullname} {fp.pitcher} faces {fp.stand} batter {fp.player_name}, {home_team} {fp.home_score}, {away_team} {fp.away_score} {baserunners}')

            for p in ab.sort_values(["pitch_number"]).itertuples():
                # TODO consider dealing with plurals for the count and outs
                print(f"pitch from {p.pitcher} with {p.balls} balls {p.strikes} strikes {p.outs_when_up} outs is a {p.pitch_name} {p.description}", end="")
                if p.post_bat_score > p.bat_score:
                    print(f" {p.post_bat_score - p.bat_score} runs scored")
                else:
                    print() # just terminate the p status

                overallPitchesRecorded += 1
                if (overallPitchesRecorded > 500):
                    raise UserWarning('Exit Early')

