In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Pitcher Matching}$

Attempts to find the best possible non-injured match for each injured pitcher based on:
- `height`
- `mass`
- `number of pitches thrown`
- `time interval`

The last is the most difficult because the number of pitches thrown by each non-injured candidate must be re-computed over the date interval of each injured pitcher. A manual distance-based algorithm is used to identify matching pitchers, and the closest match is chosen.

In [2]:
# set up AWS connection (`AWS` is the project helper imported from `connections`);
# the resulting `aws` handle is used for every S3 load/upload below
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

- Cohorts
- Ball Tracking Data

In [3]:
def clean_ball_tracking_data(
        data: pd.DataFrame,
        model_fts: list = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate', 'spin_axis', 'ax0', 'ay0', 'az0']
) -> pd.DataFrame:
    """
    Cleans the ball tracking data by renaming the raw columns to the model
    feature names and trimming to the columns needed downstream.

    The input frame is NOT modified; a new frame is returned.

    Args:
        data: Raw pitch-level ball tracking data. Must contain the identifier
            columns ('pitcher', 'p_throws', 'game_date',
            'pitcher_days_since_prev_game') and the (raw or already renamed)
            columns listed in `model_fts`.
        model_fts: Model feature columns to keep, using the post-rename names.
            The default list is only read, never mutated.

    Returns:
        A copy containing 'pitch_id' (the original index of `data`), the
        identifier columns, and `model_fts`, in that order.

    Raises:
        KeyError: If any required column is missing after renaming.
    """
    # raw column name -> model feature name ('spin_axis' already matches)
    column_map = {
        'release_speed': 'rel_speed',
        'release_pos_x': 'rel_side',
        'release_pos_z': 'rel_ht',
        'release_spin_rate': 'spin_rate',
        'ax': 'ax0',
        'ay': 'ay0',
        'az': 'az0',
    }

    # rename on a new object so the caller's raw frame keeps its column names
    renamed = data.rename(columns=column_map)

    # setup model dataset for cohort; the original index is kept as 'pitch_id'
    id_cols = ['pitcher', 'p_throws', 'game_date', 'pitcher_days_since_prev_game']
    clean_data = renamed[[*id_cols, *model_fts]].copy().reset_index(names='pitch_id')

    return clean_data

# get list of seasons from a date column (e.g., game_date);
# used to determine season(s) from which a pitcher has pitched
def get_season_from_date(date: pd.Series) -> list:
    """
    Extracts the list of season(s) from a date column.

    Args:
        date: Series of dates (strings or datetimes). Values that cannot be
            parsed are ignored.

    Returns:
        Unique calendar years as plain Python ints, in order of first
        appearance. Empty list if no value could be parsed.
    """
    date_dt = pd.to_datetime(date, errors='coerce')

    # drop unparseable dates *before* extracting the year so the result stays
    # integer-typed (a NaT would otherwise force the years to float)
    years = date_dt.dropna().dt.year

    return [int(y) for y in years.unique()]

In [None]:
""" Injured Cohort """
# --- Injured cohort ---
# pitcher list (carries the injury dates) and per-pitcher metadata
# (mass, height, and pitches prior to injury)
# NOTE(review): this key uses 'pitchers_info' while every other key uses
#   'pitcher_info' -- confirm the S3 path is intended
cohort = aws.load_s3_object('epidemiology/cohorts/injured/pitchers_info/pitchers_0825.csv')
cohort_metadata = aws.load_s3_object('epidemiology/cohorts/injured/pitcher_info/pitchers_metadata.csv')

# build the injured cohort in one pass:
#   - attach the injury date to the metadata
#   - derive the season (year of injury; missing when the date cannot be parsed)
#   - flag every row as injured (:= 1)
#   - rename the pitch-count column to 'pitches_thrown'
cohort_injured = (
    cohort_metadata
    .rename(columns={'mlbam_id': 'mlbamid'})
    .merge(cohort[['mlbamid', 'injury_date']], on='mlbamid', how='left')
    .assign(injury_date=lambda df: pd.to_datetime(df['injury_date'], errors='coerce'))
    .assign(
        season=lambda df: df['injury_date'].apply(lambda x: x.year if pd.notnull(x) else None),
        injured=1,
    )
    .rename(columns={'pitches_prior_to_injury': 'pitches_thrown'})
)

# --- Non-injured cohort ---
cohort_noninjured = aws.load_s3_object('epidemiology/cohorts/noninjured/pitcher_info/pitchers_0825.csv')

In [None]:
""" Ball Tracking Data """
# injured pitcher ball tracking data (to get date of first pitch thrown)
inj_ball_tracking = aws.load_s3_object('epidemiology/cohorts/injured/statcast/data_0825.csv')
inj_bt_clean = clean_ball_tracking_data(inj_ball_tracking)

# season of each pitch comes from that pitch's own game_date
#   (cohort_injured is pitcher-level and is not row-aligned with this
#   pitch-level frame, so it cannot be the source of the date)
inj_bt_clean['season'] = pd.to_datetime(inj_bt_clean['game_date'], errors='coerce').dt.year

# all ball tracking data by season (2015 through 2025 inclusive)
season_range = list(range(2015, 2026))
noninj_bt = {}
for s in season_range:
    # load, clean, & store ball tracking data for non-injured pitchers
    raw_bt = aws.load_s3_object(f'epidemiology/cohorts/noninjured/bulk_statcast/{s}.csv')
    noninj_bt[s] = clean_ball_tracking_data(raw_bt)

    # add season column (each file holds a single season)
    noninj_bt[s]['season'] = s

In [None]:
# TODO: change to imputed values (see `missing_val_imputation.ipynb`)

$\textbf{Matching}$

- Scales matching criteria (e.g., height, mass, pitches thrown)
- Computes matches using Euclidean distance

In [72]:
import pickle
from sklearn.preprocessing import StandardScaler
from services.matching import compute_matching_info

In [76]:
# display the non-injured cohort to check the columns used for matching below
cohort_noninjured

Unnamed: 0,mlbamid,full_name,height,mass,pitches_thrown,season,injured
0,544727,Jeurys Familia,1.905,108.862080,1407,2015,0
1,595014,Blake Treinen,1.956,102.058200,1067,2015,0
2,407819,Matt Thornton,1.981,106.594120,628,2015,0
3,456776,Alex Torres,1.778,86.182480,650,2015,0
4,453286,Max Scherzer,1.905,94.347136,3455,2015,0
...,...,...,...,...,...,...,...
7665,696131,Mason Black,1.905,103.418976,74,2025,0
7666,676742,Cam Sanders,1.880,79.378600,31,2025,0
7667,670912,Johan Oviedo,1.981,124.737800,43,2025,0
7668,541640,Erasmo Ramírez,1.829,99.790240,28,2025,0


In [78]:
# fit the standardiser for the matching criteria on the non-injured cohort
#   NOTE: 'pitches_thrown' is renamed to 'pitches_thrown_interval' so the
#   fitted feature names agree with the columns used in the matching process
matching_scaler = StandardScaler().fit(
    cohort_noninjured
    .rename(columns={'pitches_thrown': 'pitches_thrown_interval'})
    .loc[:, ['height', 'mass', 'pitches_thrown_interval']]
)

# persist the fitted scaler to disk
with open('models/matching_scaler.pkl', 'wb') as f:
    pickle.dump(matching_scaler, f)

In [84]:
def compute_matching_info(
        inj: pd.DataFrame,
        noninj: pd.DataFrame,
        matching_cols: list,
        scaler: "StandardScaler | None" = None,
        metric: str = 'euclidean'
) -> dict:
    """
    Computes the distance between an injured pitcher and all eligible
    non-injured pitchers and returns the closest match.

    Neither input frame is modified.

    Args:
        inj: Frame describing the injured pitcher. Must contain 'mlbamid',
            'mass', 'height', 'pitches_thrown_interval' and `matching_cols`.
            The reported injured-pitcher fields are read from the first row;
            the matched index only maps back to `noninj` rows when `inj` has
            exactly one row.
        noninj: Frame of eligible non-injured pitchers with the same columns.
        matching_cols: Columns used to compute the distance.
        scaler: Optional fitted scaler. When given, `matching_cols` are passed
            through `scaler.inverse_transform` before being reported, so the
            returned criteria are on the original scale.
        metric: Distance metric name forwarded to `scipy`'s `cdist`.

    Returns:
        Dict with the injured/matched pitcher IDs, the minimum distance, and
        mass / height / pitches thrown for both pitchers.

    Raises:
        ValueError: If `inj` or `noninj` has no rows.
    """
    # imported locally so the function is self-contained in the notebook
    from scipy.spatial.distance import cdist

    if inj.empty or noninj.empty:
        raise ValueError('`inj` and `noninj` must each contain at least one row.')

    # compute distance btw injured pitcher & all eligible non-injured pitchers based on matching cols
    distances = cdist(
        noninj[matching_cols].values,
        inj[matching_cols].values,
        metric=metric
    ).flatten()

    # get min. distance
    min_idx = distances.argmin()
    min_distance = distances[min_idx]
    matched_pitcher_id = noninj.iloc[min_idx]['mlbamid']

    # work on copies so the caller's frames keep their (scaled) values; only
    # the matched row is needed from the non-injured side
    inj_out = inj.copy()
    matched_out = noninj.iloc[[min_idx]].copy()

    if scaler is not None:
        # inverse transform matching criteria back to original scale
        inj_out[matching_cols] = scaler.inverse_transform(inj_out[matching_cols])
        matched_out[matching_cols] = scaler.inverse_transform(matched_out[matching_cols])

    matched = matched_out.iloc[0]

    return {
        'injured_id': inj_out['mlbamid'].values[0],
        'matched_id': matched_pitcher_id,
        'min_distance': min_distance,
        'inj_mass': inj_out['mass'].values[0],
        'inj_height': inj_out['height'].values[0],
        'inj_pitches_thrown': inj_out['pitches_thrown_interval'].values[0],
        'noninj_mass': matched['mass'],
        'noninj_height': matched['height'],
        'noninj_pitches_thrown': matched['pitches_thrown_interval'],
    }

In [87]:
# iterate through injured pitchers
#   1. check if pitcher has pitches from season of injury (if not --> skip)
#   2. count pitches thrown by each non-injured pitcher between the injured
#      pitcher's first & last pitch of that season
#   3. compute matching info against the eligible non-injured pitchers
matching_cols = ['height', 'mass', 'pitches_thrown_interval']
matches = []
for idx, row in cohort_injured.iterrows():

    # extract pitcher, season
    pitcher = row['mlbamid']
    season = row['season']

    # get injured pitcher's ball tracking data, season(s) w/ pitches
    pitcher_bt = inj_bt_clean[
        (inj_bt_clean['pitcher'] == pitcher) & (inj_bt_clean['season'] == season)
    ].sort_values('game_date')
    seasons_pitched = get_season_from_date(pitcher_bt['game_date'])

    # skip if pitcher has no pitches in season of injury
    if pitcher_bt.empty or season not in seasons_pitched:
        continue

    # date interval covered by the injured pitcher's pitches
    inj_first_pitch_date = pitcher_bt['game_date'].min()
    inj_last_pitch_date = pitcher_bt['game_date'].max()

    # non-injured ball tracking data from season of injury
    #   NOTE: the shared frame in `noninj_bt` is only read here; the season is
    #   attached to the per-pitcher counts instead of being written back into it
    noninj_season_bt = noninj_bt[season]
    in_interval = (
        (noninj_season_bt['game_date'] >= inj_first_pitch_date) &
        (noninj_season_bt['game_date'] <= inj_last_pitch_date)
    )

    # pitch counts for each non-injured pitcher within the interval
    non_inj_pitch_counts = (
        noninj_season_bt[in_interval]
        .groupby('pitcher')
        .size()
        .reset_index(name='pitches_thrown_interval')
        .assign(season=season)
    )

    # skip if no non-injured pitchers have pitches in date range (e.g., spring training injury)
    if non_inj_pitch_counts.empty:
        continue

    # get eligible non-injured pitchers w/ pitches thrown during interval
    eligible_noninj = cohort_noninjured.merge(
        non_inj_pitch_counts,
        left_on=['mlbamid', 'season'],
        right_on=['pitcher', 'season'],
        how='inner'
    )

    # matching criteria for the injured pitcher
    inj_pitcher_info = pd.DataFrame([{
        'mlbamid': pitcher,
        'height': row['height'],
        'mass': row['mass'],
        'pitches_thrown_interval': row['pitches_thrown']
    }])

    # missing criteria cannot be matched: NaN propagates through the distance
    # computation and argmin then returns the first NaN row as the "closest"
    # match. Skip/drop such rows until imputed values are used.
    if inj_pitcher_info[matching_cols].isna().to_numpy().any():
        continue
    eligible_noninj = eligible_noninj.dropna(subset=matching_cols)

    # skip if the inner merge / NaN filter left no candidates
    #   (scaler.transform raises on an empty frame)
    if eligible_noninj.empty:
        continue

    # scale matching criteria
    inj_pitcher_info[matching_cols] = matching_scaler.transform(inj_pitcher_info[matching_cols])
    eligible_noninj[matching_cols] = matching_scaler.transform(eligible_noninj[matching_cols])

    # get matching info
    #   NOTE: re-supply scaler to return criteria on original scale
    match_info = compute_matching_info(inj_pitcher_info, eligible_noninj, matching_cols, scaler=matching_scaler, metric='euclidean')
    matches.append(match_info)

In [90]:
# create dataframe of matches
#   (one row per dict appended to `matches`; columns are the keys of those dicts)
matches_full = pd.DataFrame(matches)

# upload to S3 (stored alongside the injured cohort's pitcher info)
aws.upload_to_s3(matches_full, 'epidemiology/cohorts/injured/pitcher_info/matches_0825.csv')

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/cohorts/injured/pitcher_info/matches_0825.csv


$\textbf{Close AWS Connection}$

In [91]:
# close the connection opened by `aws.connect()` at the top of the notebook
aws.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
