In [1]:
import pandas as pd
import pybaseball as pyb
from connections import AWS

$\textbf{Non-Injured Cohort Design}$

In [2]:
# Create the project's AWS helper and open its connection.
# `aws_connection` is the handle every later cell uses for S3 loads and uploads
# (`load_s3_object`, `upload_to_s3`).
aws_connection = AWS()
aws_connection.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


In [3]:
# Pull the injured cohort from S3; its pitcher IDs are used downstream to
# exclude injured pitchers from the non-injured pool.
cohort = aws_connection.load_s3_object(
    'epidemiology/cohorts/injured/pitchers_0825.csv'
)

# distinct MLBAM IDs of the injured pitchers
cohort_pitchers = pd.unique(cohort['mlbamid'])

In [None]:
# per-year storage for the raw Statcast pulls (filled by the fetch loop)
statcast_bulk = {}

# first-of-month date strings, March through November, for each season 2015-2025;
# consecutive entries are later paired up to form monthly query windows
date_strings = {
    season: [f"{season}-{month_no:02d}-01" for month_no in range(3, 12)]
    for season in range(2015, 2026)
}

In [None]:
# Fetch Statcast data month by month for each year, drop pitchers in the injured
# cohort, and upload one concatenated CSV per year to S3.
# NOTE(review): the [1:] slice skips the first year in `date_strings` -- confirm this is intentional.
for yr, dates in list(date_strings.items())[1:]:
    statcast_bulk[yr] = []                                      # setup list for data storage within the year
    failed_ranges = []                                          # (start, end) windows whose fetch raised
    print(f"Processing monthly ranges for year: {yr}")

    # iterate through consecutive month-start pairs
    last_window = len(dates) - 2
    for i in range(len(dates) - 1):
        start_dt = dates[i]

        # pyb.statcast includes both bounds, so ending a window on the next window's
        # start date would fetch that day twice and duplicate its pitches. End each
        # window the day before the next one begins; the final window keeps its
        # original end date so the overall date coverage is unchanged.
        if i < last_window:
            end_dt = (pd.Timestamp(dates[i + 1]) - pd.Timedelta(days=1)).strftime('%Y-%m-%d')
        else:
            end_dt = dates[i + 1]

        try:
            statcast_data = pyb.statcast(start_dt=start_dt, end_dt=end_dt)
            if not statcast_data.empty:
                # filter out pitchers in the cohort
                filtered_data = statcast_data[~statcast_data['pitcher'].isin(cohort_pitchers)]
                if not filtered_data.empty:
                    statcast_bulk[yr].append(filtered_data)

        except Exception as e:
            # best-effort: keep going with the remaining months, but remember the gap
            failed_ranges.append((start_dt, end_dt))
            print(f"Error fetching data for {start_dt} to {end_dt}: {e}")

    # surface partial failures in one place so an incomplete year is not mistaken for a full one
    if failed_ranges:
        print(f"Warning: {len(failed_ranges)} date range(s) failed for {yr}; data for this year is incomplete: {failed_ranges}")

    # nothing retrieved: do not write an empty CSV to S3
    if not statcast_bulk[yr]:
        print(f"No data retrieved for {yr}; skipping S3 upload.")
        continue

    # concatenate all data for the year & upload to S3
    full_year_data = pd.concat(statcast_bulk[yr], ignore_index=True)
    aws_connection.upload_to_s3(
        full_year_data,
        f'epidemiology/cohorts/noninjured/bulk_statcast/{yr}.csv'
    )

$\textbf{Determine Non-injured Matches}$

- __Matching criteria__: Height, mass, and pitches thrown prior to injury
- Metadata scraped from each pitcher's webpage using `mlbamid` from Statcast data

In [None]:
from services.scraping import *

In [42]:
# Injured-cohort metadata from S3 (per the matching criteria: mass, height,
# and pitches prior to injury).
cohort_metadata = aws_connection.load_s3_object(
    'epidemiology/cohorts/injured/pitchers_metadata.csv'
)

# Align the ID column name with `cohort`, then left-join each pitcher's injury date(s).
cohort_injured = (
    cohort_metadata
    .rename(columns={'mlbam_id': 'mlbamid'})
    .merge(
        cohort[['mlbamid', 'injury_date']],
        how='left',
        on='mlbamid',
    )
)

In [55]:
""" SCRAPE NON-INJURED PITCHER METADATA """
# Build one metadata record per non-injured pitcher per season:
# scraped player metadata plus that season's pitch count.
player_metadata_list = []

# set of injured pitcher IDs for constant-time exclusion checks
injured_ids = set(cohort_pitchers)

# successfully scraped metadata keyed by pitcher ID, so each player is scraped
# once rather than once per season (failed lookups are not cached and get retried)
scraped_metadata = {}

# Iterate over every year held in `statcast_bulk`. The years were already chosen
# when `statcast_bulk` was filled; slicing off the first entry again here would
# silently drop the first fetched season on a clean top-to-bottom run.
for year, data in statcast_bulk.items():

    print(f"Acquiring pitcher metadata for year: {year}")

    if not data:
        continue

    # concatenate all data for the year
    full_year_data = pd.concat(data, ignore_index=True)

    # rows per pitcher, computed once instead of re-filtering the frame for every pitcher
    pitch_counts = full_year_data['pitcher'].value_counts()

    # iterate through each pitcher in the year (order of first appearance)
    for pitcher_id in full_year_data['pitcher'].unique():

        # skip pitchers in the injured cohort
        if pitcher_id in injured_ids:
            continue

        # scrape the player's metadata the first time this pitcher is seen
        if pitcher_id not in scraped_metadata:
            scraped = get_player_metadata(pitcher_id)
            if not scraped:
                print(f"No metadata found for pitcher ID: {pitcher_id}")
                continue
            scraped_metadata[pitcher_id] = scraped

        # copy so the season-specific fields do not leak into the cached entry
        # (assumes get_player_metadata returns a dict-like mapping -- TODO confirm)
        player_metadata = dict(scraped_metadata[pitcher_id])

        # add season, pitches thrown
        player_metadata['pitches_thrown'] = int(pitch_counts.at[pitcher_id])
        player_metadata['season'] = year

        # append to full list
        player_metadata_list.append(player_metadata)

    print(f"Completed metadata acquisition for year: {year}")

Acquiring pitcher metadata for year: 2016
Completed metadata acquisition for year: 2016
Acquiring pitcher metadata for year: 2017
Completed metadata acquisition for year: 2017
Acquiring pitcher metadata for year: 2018
Completed metadata acquisition for year: 2018
Acquiring pitcher metadata for year: 2019
Completed metadata acquisition for year: 2019
Acquiring pitcher metadata for year: 2020
Completed metadata acquisition for year: 2020
Acquiring pitcher metadata for year: 2021
Completed metadata acquisition for year: 2021
Acquiring pitcher metadata for year: 2022
Completed metadata acquisition for year: 2022
Acquiring pitcher metadata for year: 2023
Completed metadata acquisition for year: 2023
Acquiring pitcher metadata for year: 2024
Completed metadata acquisition for year: 2024
Acquiring pitcher metadata for year: 2025
Completed metadata acquisition for year: 2025


In [56]:
# One row per scraped pitcher-season record.
all_noninjured_pitchers = pd.DataFrame(data=player_metadata_list)

# Persist the non-injured pool to S3.
aws_connection.upload_to_s3(all_noninjured_pitchers, 'epidemiology/cohorts/noninjured/pitchers_0825.csv')

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/cohorts/noninjured/pitchers_0825.csv


In [None]:
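# TODO: match each injured pitcher to non-injured candidates
#   For each injured pitcher:
#     1. Grab the pitcher's height, mass, and season --> find match candidates.
#     2. Get the injury date (last pitch before injury) --> for each candidate, count pitches
#        from the start of that season up to the injury date, to match on number of pitches.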

Unnamed: 0,mlbamid,full_name,height,mass,pitches_prior_to_injury,injury_date
0,525768,Tim Collins,1.702,75.296272,17,2015-03-11
1,525768,Tim Collins,1.702,75.296272,17,2016-04-15
2,572831,Josh Edgin,1.854,111.130040,12,2015-03-17
3,506433,Yu Darvish,1.956,99.790240,2341,2015-03-17
4,506433,Yu Darvish,1.956,99.790240,2341,2023-08-25
...,...,...,...,...,...,...
707,695549,Jackson Jobe,1.880,86.182480,2244,2025-06-16
708,669203,Corbin Burnes,1.905,111.130040,1297,2025-06-01
709,594902,Ben Lively,1.930,106.594120,1009,2025-06-04
710,669854,Ronel Blanco,1.905,120.201880,1009,2025-06-06


$\textbf{Close AWS Connection}$