In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Pitcher Matching}$

Attempts to find the best possible non-injured match for each pitcher based on: 
- `height`
- `mass`
- `number of pitches thrown`
- `time interval`

The last is the most difficult because, for each injured pitcher, the number of pitches thrown by every candidate must be re-computed over that injured pitcher's own date range. A manual distance-based algorithm is used to identify matching pitchers, and the closest match is chosen.

In [2]:
# set up AWS connection
#   `aws` is the shared client that the later cells use to load objects from
#   S3 and to upload results, so this cell has to run before any of them
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

- Cohorts
- Ball Tracking Data

In [None]:
from services.ball_tracking import *

In [4]:
""" Injured Cohort """
# pull the two injured-cohort tables: the roster (carries injury_date) and the
# per-pitcher metadata (mass, height, and pitches prior to injury)
cohort = aws.load_s3_object('epidemiology/cohorts/injured/pitcher_info/pitchers_0825.csv')
cohort_metadata = aws.load_s3_object('epidemiology/cohorts/injured/pitcher_info/pitchers_metadata.csv')

# attach each pitcher's injury date to the metadata, then derive:
#   injury_date    -> parsed to datetime (unparseable values are coerced to NaT)
#   season         -> year of the injury date (left missing when the date is missing)
#   injured        -> constant flag (:= 1)
#   pitches_thrown -> renamed from pitches_prior_to_injury
injury_dates = cohort[['mlbamid', 'injury_date']]
cohort_injured = (
    cohort_metadata
    .rename(columns={'mlbam_id': 'mlbamid'})
    .merge(injury_dates, on='mlbamid', how='left')
    .assign(
        injury_date=lambda df: pd.to_datetime(df['injury_date'], errors='coerce'),
        season=lambda df: df['injury_date'].apply(lambda x: x.year if pd.notnull(x) else None),
        injured=1,
    )
    .rename(columns={'pitches_prior_to_injury': 'pitches_thrown'})
)

""" Non-injured Cohort """
cohort_noninjured = aws.load_s3_object('epidemiology/cohorts/noninjured/pitcher_info/pitchers_0825.csv')

In [None]:
""" Ball Tracking Data """
# full ball tracking dataset plus the model predictions made on it
raw_data = aws.load_s3_object('epidemiology/ml/datasets/full/model_application_data.csv')

# NOTE: some duplicates exist in the predictions, so keep one row per
#       (pitcher, game_date, pred_peak_evt) and tag each row with its season,
#       taken from the first four characters of game_date
model_preds = (
    aws.load_s3_object('epidemiology/ml/datasets/preds/model_application.csv')
    .drop_duplicates(subset=['pitcher', 'game_date', 'pred_peak_evt'])
    .assign(season=lambda preds: preds['game_date'].str[:4].astype(int))
)

In [None]:
# load pitch labels
inj_bt_labels = aws.load_s3_object('epidemiology/ml/datasets/full/injured_pitch_labels.csv')
noninj_bt_labels = aws.load_s3_object('epidemiology/ml/datasets/full/noninjured_pitch_labels.csv')

# map pitch types to condensed groups (FB / CB / SL / KN / CH)
pt_map = {
    'FF': 'FB', 'FT': 'FB', 'FC': 'FB', 'FS': 'FB', 'SI': 'FB',
    'CU': 'CB', 'KC': 'CB', 'SL': 'SL', 'KN': 'KN', 'SV': 'CB', 'ST': 'SL', 'SC': 'CB',
    'CH': 'CH', 'FO': 'CH', 'EP': 'CH'
}

# condense the labels and discard pitches whose type is not in pt_map
#   NOTE: writing `df['pitch_type'] = df['pitch_type'].map(pt_map).dropna()` does NOT
#         remove anything -- column assignment re-aligns on the index, so the dropped
#         positions come back as NaN. The rows have to be dropped from the frame itself.
inj_bt_labels = (
    inj_bt_labels
    .assign(pitch_type=inj_bt_labels['pitch_type'].map(pt_map))
    .dropna(subset=['pitch_type'])
)
noninj_bt_labels = (
    noninj_bt_labels
    .assign(pitch_type=noninj_bt_labels['pitch_type'].map(pt_map))
    .dropna(subset=['pitch_type'])
)

# aggregate pitch counts & pivot to wide for matching
#   the pitcher column is renamed to mlbamid so the counts join onto the cohort tables
inj_pt_counts = pivot_pitch_labels(inj_bt_labels).rename(columns={'pitcher': 'mlbamid'})
noninj_pt_counts = pivot_pitch_labels(noninj_bt_labels).rename(columns={'pitcher': 'mlbamid'})

# upload counts to S3
aws.upload_to_s3(inj_pt_counts, 'epidemiology/ml/datasets/full/injured_pitch_counts.csv')
aws.upload_to_s3(noninj_pt_counts, 'epidemiology/ml/datasets/full/noninjured_pitch_counts.csv')

$\textbf{Matching}$

Scales based on matching criteria: __height, mass, pitches thrown, pitch type usage__.
- __Note__: Pitches thrown is based on the number of pitches w/ model predictions vs. the number of counted pitches
    - Sometimes there is a slight discrepancy... this does seem to preserve more matches, though, likely because of Spring Training data
- Pitches thrown are counted between the injured pitcher's first and last pitch
- Only pitchers throwing within ±10% of the injured pitcher's total pitches, and within 2 outings of the injured pitcher's outing count over the same interval, are considered

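Euclidean distance over the scaled matching criteria is computed between each injured pitcher and every eligible non-injured pitcher, and the candidate with the minimum distance is selected as the match.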

In [180]:
import pickle
from sklearn.preprocessing import StandardScaler
from services.matching import compute_matching_info

In [181]:
# compute scaler for matching criteria
matching_cols = ['mlbamid', 'season', 'height', 'mass', 'pitches_thrown']

# columns the scalers are fit on: height, mass, pitches thrown, plus every
# pitch-count column produced by the pivot (everything except the join keys)
pitch_count_cols = [col for col in inj_pt_counts.columns if col not in ['mlbamid', 'season']]
scaler_feature_cols = ['height', 'mass', 'pitches_thrown'] + pitch_count_cols

# create cohort w/ matching data
#   inner joins: only pitcher-seasons that have pitch counts are kept
cohort_injured_counts = cohort_injured.merge(inj_pt_counts, on=['mlbamid', 'season'], how='inner')
cohort_noninjured_counts = cohort_noninjured.merge(noninj_pt_counts, on=['mlbamid', 'season'], how='inner')
cohort_matching_data = pd.concat([cohort_injured_counts, cohort_noninjured_counts])

# create scalers for each year
#   each scaler is fit on injured + non-injured pitchers of that season together
matching_scalers = {}
for s in range(2015, 2026):
    # filter data for the current season
    season_data = cohort_matching_data.loc[cohort_matching_data['season'] == s, scaler_feature_cols]

    # StandardScaler.fit raises on zero rows; a season with no pitchers simply gets no scaler
    if season_data.empty:
        continue

    # fit scaler, save to dictionary
    scaler = StandardScaler().fit(season_data)
    matching_scalers[s] = scaler

# save to disc
with open('models/matching_scalers.pkl', 'wb') as f:
    pickle.dump(matching_scalers, f)

In [185]:
# iterate through injured pitchers and pick one non-injured match for each
#   1. get the injured pitcher's pitches from the season of injury (if none --> skip)
#   2. count every non-injured pitcher's pitches & outings between the injured
#      pitcher's first and last pitch dates of that season
#   3. keep candidates with a comparable workload, scale the matching criteria
#      with that season's scaler, and store the match from compute_matching_info

# eligibility thresholds for non-injured candidates
MIN_PITCH_RATIO = 0.9       # candidate pitch count must be >= 90% of the injured pitcher's ...
MAX_PITCH_RATIO = 1.1       # ... and <= 110% of it (i.e., within 10%)
MAX_OUTING_DIFF = 2         # candidate outing count must be within 2 outings

# loop-invariant: model predictions restricted to the non-injured cohort
noninj_preds = model_preds[model_preds['pitcher'].isin(cohort_noninjured['mlbamid'])]

matches = []
for _, row in cohort_injured_counts.iterrows():

    # extract pitcher, season
    pitcher = row['mlbamid']
    season = row['season']

    # get injured pitcher's ball tracking data from the season of injury
    pitcher_bt = model_preds[
        (model_preds['pitcher'] == pitcher) &
        (model_preds['season'] == season)
    ].sort_values('game_date')

    # skip if pitcher has no pitches in season of injury
    if pitcher_bt.empty:
        continue
    seasons_pitched = get_season_from_date(pitcher_bt['game_date'])
    if season not in seasons_pitched:
        continue

    # get all pitcher metadata
    inj_ht = row['height']
    inj_mass = row['mass']
    inj_pitches = pitcher_bt.shape[0]                       # pitches w/ model predictions, not the cohort's counted pitches
    inj_first_pitch_date = pitcher_bt['game_date'].min()
    inj_last_pitch_date = pitcher_bt['game_date'].max()
    inj_outing_count = pitcher_bt['game_date'].nunique()    # for matching based on no. of outings

    # get non-injured pitches from the season of injury thrown between
    # inj_first_pitch_date & inj_last_pitch_date (inclusive)
    noninj_data_season = noninj_preds[
        (noninj_preds['season'] == season) &
        (noninj_preds['game_date'] >= inj_first_pitch_date) &
        (noninj_preds['game_date'] <= inj_last_pitch_date)
    ]

    # get pitch counts & outing counts for non-injured pitchers in date range
    noninj_pitch_counts = noninj_data_season.groupby(['pitcher', 'season']).size().reset_index(name='pitches_thrown_interval')
    noninj_outing_counts = noninj_data_season.groupby(['pitcher', 'season'])['game_date'].nunique().reset_index(name='outings_interval')
    noninj_counts = noninj_pitch_counts.merge(noninj_outing_counts, on=['pitcher', 'season'])

    # skip if no non-injured pitchers have pitches in date range (e.g., spring training injury)
    if noninj_counts.empty:
        continue

    # get non-injured pitchers w/ pitches thrown during interval
    #   trim to matching columns, rename the interval count to match the scaler's feature name
    eligible_noninj = cohort_noninjured.merge(noninj_counts, left_on=['mlbamid', 'season'], right_on=['pitcher', 'season'], how='inner')
    eligible_noninj = eligible_noninj[['season', 'mlbamid', 'height', 'mass', 'pitches_thrown_interval', 'outings_interval']].rename(columns={'pitches_thrown_interval': 'pitches_thrown'})

    # filter to candidates within 10% of the injured pitcher's pitches thrown
    # and within 2 outings of the injured pitcher's outing count
    within_pitch_band = eligible_noninj['pitches_thrown'].between(MIN_PITCH_RATIO * inj_pitches, MAX_PITCH_RATIO * inj_pitches)
    within_outing_band = (eligible_noninj['outings_interval'] - inj_outing_count).abs() <= MAX_OUTING_DIFF
    eligible_noninj_filt = eligible_noninj[within_pitch_band & within_outing_band].reset_index(drop=True)

    # join pitch counts onto the surviving candidates
    eligible_noninj_pitch_counts = eligible_noninj_filt.merge(noninj_pt_counts, on=['mlbamid', 'season'], how='inner')

    # skip if no eligible non-injured pitchers
    #   NOTE: check the merged frame -- the inner join can remove every remaining candidate,
    #         and an empty frame would fail in the scaler / `.values[0]` below
    if eligible_noninj_pitch_counts.empty:
        continue

    # compute matching criteria for the injured pitcher (same columns as the candidates)
    inj_pitcher_info = pd.DataFrame([{
        'season': season,
        'mlbamid': pitcher,
        'height': inj_ht,
        'mass': inj_mass,
        'pitches_thrown': inj_pitches
    }])
    inj_pitcher_info = inj_pitcher_info.merge(inj_pt_counts, on=['mlbamid', 'season'], how='inner')

    # scale matching criteria w/ the scaler fit for this season
    season_scaler = matching_scalers[season]
    scaled_cols = season_scaler.feature_names_in_
    inj_scaled = inj_pitcher_info.copy()
    noninj_scaled = eligible_noninj_pitch_counts.copy()
    inj_scaled[scaled_cols] = season_scaler.transform(inj_pitcher_info[scaled_cols])
    noninj_scaled[scaled_cols] = season_scaler.transform(eligible_noninj_pitch_counts[scaled_cols])

    # get matching info, then look up the matched pitcher's unscaled values
    match_info = compute_matching_info(inj_scaled, noninj_scaled, scaled_cols, metric='euclidean')
    matched_pitcher = eligible_noninj_pitch_counts[eligible_noninj_pitch_counts['mlbamid'] == match_info['mlbamid_noninjured']]

    # store differences btw matched pitchers (inj - noninj)
    match_info['ht_diff'] = inj_ht - matched_pitcher['height'].values[0]
    match_info['mass_diff'] = inj_mass - matched_pitcher['mass'].values[0]
    match_info['pitches_thrown_diff'] = inj_pitches - matched_pitcher['pitches_thrown'].values[0]
    match_info['outings_diff'] = inj_outing_count - matched_pitcher['outings_interval'].values[0]

    # update metadata & store
    match_info['season'] = season
    matches.append(match_info)

In [None]:
# gather the per-pitcher match records into a single table, drop exact
# duplicate rows, and push the result to S3
#   avg diffs:
#     ht: 0.04 m
#     mass: 6.5 kg
#     pitches thrown = 26.6 (mean), 12 (median)
#     outings = 0.98
match_records = pd.DataFrame(matches)
matches_full = match_records.drop_duplicates()
aws.upload_to_s3(matches_full, 'epidemiology/cohorts/injured/pitcher_info/matches_0825.csv')

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/cohorts/injured/pitcher_info/matches_0825.csv


$\textbf{Close AWS Connection}$

In [196]:
# release the AWS connection opened at the top of the notebook
aws.close()

[AWS]: No active connection to close.
