In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Pitcher Matching}$

Attempts to find the best possible non-injured match for each injured pitcher based on:
- `height`
- `mass`
- `number of pitches thrown`
- `time interval`

The last is the most difficult because, for every injured pitcher, the number of pitches thrown by each non-injured candidate must be re-computed over that injured pitcher's own date range (first to last pitch of the injury season). A manual distance-based algorithm is used to identify matching pitchers, and the closest match is chosen.

In [2]:
# set up AWS connection
#   `aws` is the shared handle that later cells use to load objects from / upload objects to S3
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

- Cohorts
- Ball Tracking Data

In [3]:
def clean_ball_tracking_data(
        data: pd.DataFrame,
        model_fts: list = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate', 'spin_axis', 'ax0', 'ay0', 'az0']
) -> pd.DataFrame:
    """
    Cleans the ball tracking data by renaming the raw columns to the model feature names
    and trimming the frame to the columns needed downstream.

    The input frame is NOT modified: the rename is applied to a new frame, so calling this
    function (or re-running the cell that calls it) never changes the caller's data.

    Args:
        data: Raw ball tracking data. Must contain `pitcher`, `p_throws`, `game_date`,
            `pitcher_days_since_prev_game`, and every feature in `model_fts` (under either
            its raw name, e.g. `release_speed`, or its model name, e.g. `rel_speed`).
        model_fts: Model feature columns to keep (any list-like of column names).
            The default list is never mutated.

    Returns:
        A copy with the original index moved into a `pitch_id` column, followed by the
        identifier columns and the model features.

    Raises:
        KeyError: If any required column is missing after renaming.
    """
    # raw column name --> model feature name (columns already using model names pass through)
    renamed = data.rename(columns={
        'release_speed': 'rel_speed',
        'release_pos_x': 'rel_side',
        'release_pos_z': 'rel_ht',
        'release_spin_rate': 'spin_rate',
        'ax': 'ax0',
        'ay': 'ay0',
        'az': 'az0',
    })

    # setup model dataset for cohort
    keep_cols = ['pitcher', 'p_throws', 'game_date', 'pitcher_days_since_prev_game'] + list(model_fts)
    clean_data = renamed[keep_cols].copy().reset_index(names='pitch_id')

    return clean_data

# get list of seasons from a date column (e.g., game_date)
    # used to determine season(s) from which a pitcher has pitched
def get_season_from_date(date: pd.Series) -> list:
    """
    Returns the distinct season(s) (calendar years) present in a date column.

    Values that cannot be parsed as dates are ignored.
    """
    def _year_or_none(stamp):
        # unparseable / missing dates carry no season
        return None if pd.isnull(stamp) else stamp.year

    parsed = pd.to_datetime(date, errors='coerce')
    return list(parsed.apply(_year_or_none).dropna().unique())

In [4]:
""" Injured Cohort """
# pull the injured roster (source of injury dates) and the per-pitcher metadata
# (mass, height, and pitches prior to injury)
cohort = aws.load_s3_object('epidemiology/cohorts/injured/pitcher_info/pitchers_0825.csv')
cohort_metadata = aws.load_s3_object('epidemiology/cohorts/injured/pitcher_info/pitchers_metadata.csv')

# align the id column name, then attach each pitcher's injury date
# (left join: every metadata row is kept)
cohort_injured = pd.merge(
    cohort_metadata.rename(columns={'mlbam_id': 'mlbamid'}),
    cohort[['mlbamid', 'injury_date']],
    on='mlbamid',
    how='left',
)

# season := calendar year of the injury date (left empty when the date is missing/unparseable)
cohort_injured['injury_date'] = pd.to_datetime(cohort_injured['injury_date'], errors='coerce')
cohort_injured['season'] = cohort_injured['injury_date'].apply(
    lambda injury_dt: injury_dt.year if pd.notnull(injury_dt) else None
)

# flag every row of this cohort as injured (:= 1)
cohort_injured['injured'] = 1

# expose the pre-injury pitch count under the column name used for matching
cohort_injured = cohort_injured.rename(columns={'pitches_prior_to_injury': 'pitches_thrown'})

""" Non-injured Cohort """
cohort_noninjured = aws.load_s3_object('epidemiology/cohorts/noninjured/pitcher_info/pitchers_0825.csv')

In [None]:
""" Ball Tracking Data """
# every tracked pitch, together with its model predictions
ball_tracking_data = aws.load_s3_object('epidemiology/ml/datasets/preds/model_application.csv')

# season := leading four characters of game_date (the year), stored as an integer
ball_tracking_data['season'] = (
    ball_tracking_data['game_date']
    .str.slice(stop=4)
    .astype(int)
)

$\textbf{Matching}$

Scales based on matching criteria: __height, mass, pitches thrown__.
- __Note__: Pitches thrown is based on the number of pitches w/ model predictions rather than the number of counted pitches
    - Sometimes there is a slight discrepancy between the two... using model-predicted pitches does seem to preserve more matches, though, likely because of Spring Training data
- Pitches thrown are counted between the injured pitcher's first and last pitch
- Only pitchers throwing ±10% of the injured pitcher's total pitches are considered

Euclidean distance over the scaled matching criteria is computed between each injured pitcher and every eligible non-injured pitcher; the candidate with the minimum distance is selected as the match.

In [6]:
import pickle
from sklearn.preprocessing import StandardScaler
from services.matching import compute_matching_info

In [47]:
# compute scaler for matching criteria
#   one StandardScaler per season, fit on the pooled injured + non-injured cohorts,
#   so height / mass / pitches thrown are put on comparable scales before computing distances
matching_cols = ['mlbamid', 'season', 'height', 'mass', 'pitches_thrown']
scaler_features = ['height', 'mass', 'pitches_thrown']
cohort_matching_data = pd.concat([cohort_injured, cohort_noninjured])

# create scalers for each year
matching_scalers = {}
skipped_seasons = []
for s in range(2015, 2026):
    # filter data for the current season
    season_data = cohort_matching_data.loc[cohort_matching_data['season'] == s, matching_cols]

    # StandardScaler cannot be fit on zero rows (raises ValueError) --> skip seasons w/o cohort data
    if season_data.empty:
        skipped_seasons.append(s)
        continue

    # fit scaler, save to dictionary (keyed by season)
    matching_scalers[s] = StandardScaler().fit(season_data[scaler_features])

# surface skipped seasons so a missing scaler is not a surprise later
if skipped_seasons:
    print(f'No cohort rows for season(s) {skipped_seasons}; no scaler fitted.')

# save to disc
with open('models/matching_scalers.pkl', 'wb') as f:
    pickle.dump(matching_scalers, f)

In [65]:
# iterate through injured pitchers
#   1. skip the pitcher if he has no tracked pitches in his season of injury
#   2. count every non-injured pitcher's pitches between the injured pitcher's first & last pitch dates
#   3. keep non-injured pitchers within ±10% of the injured pitcher's pitch count
#   4. scale (height, mass, pitches thrown) with the season's scaler --> compute matching info
#      (compute_matching_info is a project helper; per the record displayed below it returns a dict
#       with 'mlbamid_injured', 'mlbamid_noninjured' and 'min_distance')
matching_cols = ['height', 'mass', 'pitches_thrown']

# loop-invariant: ball tracking rows thrown by non-injured cohort pitchers
#   (computed once instead of re-scanning the full table for every injured pitcher)
noninj_bt = ball_tracking_data[ball_tracking_data['pitcher'].isin(list(cohort_noninjured['mlbamid']))]

matches = []
for idx, row in cohort_injured.iterrows():

    # extract pitcher, season
    pitcher = row['mlbamid']
    season = row['season']

    # get injured pitcher's ball tracking data from season of injury, season(s) w/ pitches
    pitcher_bt = ball_tracking_data[
        (ball_tracking_data['pitcher'] == pitcher) &
        (ball_tracking_data['season'] == season)
    ]
    seasons_pitched = get_season_from_date(pitcher_bt['game_date'])

    # skip if pitcher has no pitches in season of injury
    if pitcher_bt.empty or season not in seasons_pitched:
        continue

    # get all pitcher metadata
    #   NOTE: pitch count := number of rows w/ model predictions (not the counted pitches in the metadata)
    inj_ht = row['height']
    inj_mass = row['mass']
    inj_pitches = pitcher_bt.shape[0]
    inj_first_pitch_date = pitcher_bt['game_date'].min()
    inj_last_pitch_date = pitcher_bt['game_date'].max()

    # get non-injured pitcher ball tracking data from season of injury
    noninj_data_season = noninj_bt[noninj_bt['season'] == season]

    # get all pitches btw inj_first_pitch_date & inj_last_pitch_date (inclusive)
    noninj_data_season = noninj_data_season[
        (noninj_data_season['game_date'] >= inj_first_pitch_date) &
        (noninj_data_season['game_date'] <= inj_last_pitch_date)
    ]

    # pitch counts for each non-injured pitcher during the interval
    # skip if no non-injured pitchers have pitches in date range (e.g., spring training injury)
    non_inj_pitch_counts = (
        noninj_data_season
        .groupby(['pitcher', 'season'])
        .size()
        .reset_index(name='pitches_thrown_interval')
    )
    if non_inj_pitch_counts.empty:
        continue

    # get eligible non-injured pitchers w/ pitches thrown during interval
    #   trim to matching columns, rename interval count to match the scaler's feature name
    eligible_noninj = cohort_noninjured.merge(
        non_inj_pitch_counts,
        left_on=['mlbamid', 'season'],
        right_on=['pitcher', 'season'],
        how='inner'
    )
    eligible_noninj = (
        eligible_noninj[['mlbamid', 'height', 'mass', 'pitches_thrown_interval']]
        .rename(columns={'pitches_thrown_interval': 'pitches_thrown'})
    )

    # only pitchers within 10% of the injured pitcher's pitches thrown are considered eligible
    eligible_noninj = eligible_noninj[
        (eligible_noninj['pitches_thrown'] >= 0.9 * inj_pitches) &
        (eligible_noninj['pitches_thrown'] <= 1.1 * inj_pitches)
    ].reset_index(drop=True)

    # skip if no eligible non-injured pitchers
    if eligible_noninj.empty:
        continue

    # compute matching criteria
    inj_pitcher_info = pd.DataFrame([{
        'mlbamid': pitcher,
        'height': inj_ht,
        'mass': inj_mass,
        'pitches_thrown': inj_pitches
    }])

    # scale matching criteria w/ the scaler fit for this season
    season_scaler = matching_scalers[season]
    inj_scaled = inj_pitcher_info.copy()
    noninj_scaled = eligible_noninj.copy()
    inj_scaled[matching_cols] = season_scaler.transform(inj_pitcher_info[matching_cols])
    noninj_scaled[matching_cols] = season_scaler.transform(eligible_noninj[matching_cols])

    # get matching info
    match_info = compute_matching_info(inj_scaled, noninj_scaled, matching_cols, metric='euclidean')

    # store differences btw matched pitchers (inj - noninj), in original (unscaled) units
    is_match = eligible_noninj['mlbamid'] == match_info['mlbamid_noninjured']
    match_info['ht_diff'] = inj_ht - eligible_noninj.loc[is_match, 'height'].values[0]
    match_info['mass_diff'] = inj_mass - eligible_noninj.loc[is_match, 'mass'].values[0]
    match_info['pitches_thrown_diff'] = inj_pitches - eligible_noninj.loc[is_match, 'pitches_thrown'].values[0]

    # update metadata & store
    match_info['season'] = season
    matches.append(match_info)


In [66]:
# sanity check: display the last match record produced by the matching loop
match_info

{'mlbamid_injured': 700363,
 'mlbamid_noninjured': 682990.0,
 'min_distance': 0.2233738913261837,
 'ht_diff': 0.0,
 'mass_diff': -2.267960000000002,
 'pitches_thrown_diff': -31,
 'season': 2025}

In [72]:
# assemble the match records into a single table (exact duplicate rows removed), then push to S3
#   avg diffs:
#     ht: 0.013 m
#     mass: 3.01 kg
#     pitches thrown = 22.72 (mean), 12 (median)
matches_full = pd.DataFrame(matches)
matches_full = matches_full.drop_duplicates()
aws.upload_to_s3(
    matches_full,
    'epidemiology/cohorts/injured/pitcher_info/matches_0825.csv'
)

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/cohorts/injured/pitcher_info/matches_0825.csv


$\textbf{Close AWS Connection}$

In [73]:
# release the AWS connection opened at the top of the notebook
aws.close()

[AWS]: No active connection to close.
