In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Clinical Comparison}$

Compares model predictions between injured pitchers and their height-, mass-, and pitch-count-matched controls.

In [2]:
""" INITIALIZE AWS CONNECTION """
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

In [None]:
# load cohort of matches (see matching_create_final_cohort.ipynb for details)
#   the matching loop further down reads the `season`, `mlbamid_injured`, and
#   `mlbamid_noninjured` columns of each row
cohort_matches_final = aws.load_s3_object('epidemiology/ml/datasets/full/cohort_matches_final.csv')

In [7]:
# Load the model predictions and prepare them in one pass:
#   1. drop repeated (pitcher, game_date, pred_peak_evt) rows
#   2. derive `season` from the first four characters of `game_date`
#   3. order rows by pitcher, date, and pitch id
    # NOTE(review): the de-duplication key does not include `pitch_id`, so two different pitches
    # with an identical prediction on the same date collapse into one row -- confirm this is intended
model_preds = (
    aws.load_s3_object('epidemiology/ml/datasets/preds/model_application.csv')
    .drop_duplicates(subset=['pitcher', 'game_date', 'pred_peak_evt'])
    .assign(season=lambda preds: preds['game_date'].str[:4].astype(int))
    .sort_values(by=['pitcher', 'game_date', 'pitch_id'])
)

$\textbf{Data Annotations \& Aggregates}$

__Pitch-Level Annotations__

Metadata: 
- `outing_number` (_int_): Specifies the outing number in the season 
- `last_outing_before_injury` (_bool_): Whether or not this was the last outing before a pitcher injury --> can be used for injury prediction
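- `outings_until_injury` (_int_): Count of how many outings remain until an injury is observed (if applicable). **TODO:** this column is described here but is not created by the annotation cell in this section.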

Workload: 
- `within_outing_pitch_count` (_int_): Running within-outing pitch count for a pitcher (1 for the first pitch of the outing)
- `within_outing_cumulative_evt_workload` (_float_): Total within outing workload for a pitcher, accumulated at the pitch-level; this is the rolling sum of normalized torques

In [8]:
""" PITCH LEVEL """
# add outing_number column
model_preds['outing_number'] = model_preds.groupby(['pitcher', 'season'])['game_date'].rank(method='dense').astype(int)

# add within outing workload metrics (pitches thrown, evt workload)
model_preds['within_outing_pitch_count'] = model_preds.groupby(['pitcher', 'season', 'outing_number'])['pitch_id'].cumcount() + 1
model_preds['within_outing_cumulative_evt_workload'] = model_preds.groupby(['pitcher', 'season', 'outing_number'])['pred_peak_evt_normalized'].cumsum()

# add outing_before_injury column
last_outings = model_preds[model_preds['injured_cohort_pitcher'] == 1].groupby(['pitcher', 'season'])['game_date'].max().reset_index()
last_outings['last_outing_before_injury'] = 1

# apply merges
model_preds_annotated = model_preds.merge(last_outings, on=['pitcher', 'season', 'game_date'], how='left')
model_preds_annotated['last_outing_before_injury'].fillna(0, inplace=True)
model_preds_annotated.drop_duplicates(inplace=True)

__Outing-Level Aggregates__

Total Workload Metrics:
- `outing_total_pitch_count` (_int_): Total pitches thrown by a pitcher during an outing
- `outing_total_evt_workload` (_float_): Total within-outing EVT workload for a pitcher, summed over all pitches in the outing
- `outing_avg_evt_workload` (_float_): Average per-pitch EVT workload within an outing
- `outing_median_evt_workload` (_float_): Median per-pitch EVT workload within an outing

In [9]:
""" OUTING LEVEL """
# create a copy for aggregation + store metadata
outing_metadata = model_preds_annotated[['pitcher', 'season', 'game_date', 'injured_cohort_pitcher', 'last_outing_before_injury']].drop_duplicates()

# total workload metrics
    # NOTE: avg & median workload help distinguish from pitch counts --> how "intense" was each pitch that was thrown
outing_metrics = model_preds_annotated.groupby(['pitcher', 'season', 'game_date', 'outing_number', 'pitcher_days_since_prev_game',]).agg(
    outing_total_pitch_count=('pitch_id', 'count'),
    outing_total_evt_workload=('within_outing_cumulative_evt_workload', 'max'),
    outing_avg_evt_workload=('pred_peak_evt_normalized', 'mean'),
    outing_median_evt_workload=('pred_peak_evt_normalized', 'median'),
).reset_index()


$\textbf{Gather Model Predictions}$

In [117]:
""" MATCHED COMPARISONS """
#   iterate through matched rows
#       for each match, get all model predictions for injured pitcher, non-injured pitcher (sort & store)
matched_preds = {}
for _, match in cohort_matches_final.sort_values('season').iterrows():

    # get metadata
    season = match['season']
    inj_id = match['mlbamid_injured']
    noninj_id = match['mlbamid_noninjured']

    # get window of injured pitches
    inj_first_pitch = model_preds_annotated[model_preds_annotated['pitcher'] == inj_id]['game_date'].min()
    inj_last_pitch = model_preds_annotated[model_preds_annotated['pitcher'] == inj_id]['game_date'].max()

    # filter model predictions, aggregates to date range of injured pitcher
    model_preds_match = model_preds_annotated[
        (model_preds_annotated['game_date'] >= inj_first_pitch) & 
        (model_preds_annotated['game_date'] <= inj_last_pitch) & 
        (model_preds_annotated['season'] == season) & 
        (model_preds_annotated['pitcher'].isin([inj_id, noninj_id]))
    ].copy()
    outing_metrics_match = outing_metrics[
        (outing_metrics['game_date'] >= inj_first_pitch) & 
        (outing_metrics['game_date'] <= inj_last_pitch) & 
        (outing_metrics['season'] == season) & 
        (outing_metrics['pitcher'].isin([inj_id, noninj_id]))
    ].copy()
    outing_metadata_match = outing_metadata[
        (outing_metadata['game_date'] >= inj_first_pitch) & 
        (outing_metadata['game_date'] <= inj_last_pitch) & 
        (outing_metadata['season'] == season) & 
        (outing_metadata['pitcher'].isin([inj_id, noninj_id]))
    ].copy()

    # get model predictions for injured pitcher
    inj_preds = model_preds_match[model_preds_match['pitcher'] == inj_id]
    noninj_preds = model_preds_match[model_preds_match['pitcher'] == noninj_id]

    # store metadata & pitch/outing-level data in dictionary
    matched_preds[inj_id] = {
        'injured': {
            'metadata': outing_metadata_match[outing_metadata_match['pitcher'] == inj_id].copy(),
            'pitch_level': inj_preds,
            'outing_level': outing_metrics_match[outing_metrics_match['pitcher'] == inj_id].copy()
        },
        'noninjured': {
            'metadata': outing_metadata_match[outing_metadata_match['pitcher'] == noninj_id].copy(),
            'pitch_level': noninj_preds,
            'outing_level': outing_metrics_match[outing_metrics_match['pitcher'] == noninj_id].copy()
        }
    }

$\textbf{Group Comparisons}$

- __Group comparison__ (i.e., inj. vs. noninj. means)
- __Pairwise comparison__ (i.e., inj. - noninj. compared to 0)

In [118]:
import numpy as np
from scipy.stats import ttest_1samp, ttest_ind
from statsmodels.stats.proportion import proportions_ztest

In [119]:
# Containers filled while walking the matches:
#   all_pitch_data --> one labelled pitch-level frame per match (feeds the group comparison)
#   all_diffs      --> per-match mean difference and whether the injured mean was the larger one
all_pitch_data = []
all_diffs = {'vals': [], 'inj_grt_than_noninj': []}

for match in matched_preds.values():
    injured_pitches = match['injured']['pitch_level']
    control_pitches = match['noninjured']['pitch_level']

    # label both sides of the match and stack them into a single frame
    labelled_pair = pd.concat(
        [injured_pitches.assign(injured=1), control_pitches.assign(injured=0)],
        ignore_index=True,
    )
    all_pitch_data.append(labelled_pair)

    # pairwise comparison: injured mean minus non-injured mean of pred_peak_evt
    mean_gap = injured_pitches['pred_peak_evt'].mean() - control_pitches['pred_peak_evt'].mean()
    all_diffs['vals'].append(mean_gap)
    all_diffs['inj_grt_than_noninj'].append(int(mean_gap > 0))

# one frame holding every match's pitches
all_pitch_df = pd.concat(all_pitch_data, ignore_index=True)

In [121]:
""" TEST 1: Group Comparison """
# t-test: compare means of injured vs non-injured groups
    # NOTE: significant (--> injured sig. higher than non-injured)
t_stat, p_value = ttest_ind(
    all_pitch_df[all_pitch_df['injured'] == 1]['pred_peak_evt'], 
    all_pitch_df[all_pitch_df['injured'] == 0]['pred_peak_evt'], 
    alternative='greater',
    equal_var=False
) 
print(f"T-statistic: {t_stat:.3f}, p-value: {p_value:.3f}")

""" TEST 2: Pairwise Comparison """
# t-test: compare differences to 0
    # NOTE: not significant (--> not enough evidence to say injured match avgs are greater than non-injured)
diffs = np.array(all_diffs['vals'])
t_stat, p_value = ttest_ind(diffs, np.zeros_like(diffs), alternative='greater', equal_var=False)
print(f"T-statistic: {t_stat:.3f}, p-value: {p_value:.3f}")

""" TEST 3: Proportion of Injured > Non-Injured """
# proportion test: compare proportion of matches where injured avg > non-injured avg
    # NOTE: not significant (obvious, proportion was < 0.5)
num_injured_exceed = sum(all_diffs['inj_grt_than_noninj'])
num_noninjured_exceed = len(all_diffs['inj_grt_than_noninj']) - num_injured_exceed
stat, pval = proportions_ztest(num_injured_exceed, num_noninjured_exceed, value=len(all_diffs['inj_grt_than_noninj']), alternative='larger')
print(f"Z-statistic: {stat:.3f}, p-value: {pval:.3f}")

T-statistic: 17.790, p-value: 0.000
T-statistic: -0.165, p-value: 0.565
Z-statistic: -4563.868, p-value: 1.000


$\textbf{Close AWS Connection}$

In [113]:
# close AWS connection
#   run last: the data-loading cells above read their inputs through this `aws` handle
aws.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
