In [None]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Model Application (Development)}$

__NOTE__ (8/11/25): Following some modifications to model development (i.e., retraining on all subjects) and data engineering (e.g., acquisition and storage), this notebook has been deprecated.

In [80]:
""" INITIALIZE AWS CONNECTION """
# Build the project's AWS helper and open its connection. Every later cell that
# downloads from or uploads to S3 goes through this same `aws_connection` object,
# and the last cell of the notebook closes it.
aws_connection = AWS()
aws_connection.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textbf{File Loading}$

- Model files (results storage to find best model)
- Data files (CSVs)

In [None]:
import pickle

$\textbf{Model Data + Summary}$

__Includes the following__: 
- All subject models & errors (best model is chosen as the subject w/ min. RMSE among those with > 15 pitches, ≥ 4 pitch types, and FB > 90 mph)
- Model development data, which is used to summarize the training data

In [44]:
# Pull the pickled per-subject RMSE results from S3 into local storage, then read them back.
#   NOTE: pickle.load can execute arbitrary code -- only load files from a trusted bucket.
rmse_s3_key = 'biomechanics/ml/modeling_summary/model_rmse.pkl'
rmse_local_path = 'storage/all_subject_rmse.pkl'
aws_connection.s3.download_file(aws_connection.bucket_name, rmse_s3_key, rmse_local_path)
with open(rmse_local_path, 'rb') as rmse_file:
    all_subject_errors = pickle.load(rmse_file)

# Model development datasets, kept for reference when summarizing the training data:
#   - model_dev_data: raw development data
#   - model_summary_data: converted version of the model data
model_dev_data = aws_connection.load_s3_object(
    'biomechanics/ml/modeling_datasets/model_dev_raw.csv'
)
model_summary_data = aws_connection.load_s3_object(
    'biomechanics/ml/modeling_datasets/model_data_converted.csv'
)

In [None]:
# TODO (alternative to the 2 cells below): retrain on all pitchers from D2 and report RMSE

In [None]:
# TODO: use best LOOCV model from D2?
#   A subject's model is only a candidate if the subject's data shows some generalizability:
#     - more than 15 pitches
#     - at least 4 distinct pitch types
#     - a max release speed of at least 90 mph
#   NOTE(review): the written criteria elsewhere say "FB > 90 mph", but this check uses `>= 90`
#   on the max speed across ALL pitch types -- confirm which rule is intended.
#   NOTE: this file is read from local storage and is not downloaded by this notebook.
#   NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('storage/all_subject_models.pkl', 'rb') as models_file:
    all_subject_models = pickle.load(models_file)

# Collect the subjects that pass every criterion (36 total when this was last run).
valid_subjects = []
for subject in all_subject_models:
    subject_data = model_dev_data.loc[model_dev_data['subject_id'] == subject]
    if len(subject_data) <= 15:
        continue
    # dropna=False keeps a missing pitch type counted as its own value, like `unique()` does
    if subject_data['pitch_type'].nunique(dropna=False) < 4:
        continue
    if subject_data['rel_speed'].max() >= 90:
        valid_subjects.append(subject)

In [None]:
# Pick the qualifying subject whose model has the lowest RMSE.
#   NOTE: errors were saved as a positional list with no subject ID for reference, so each
#   subject's RMSE is looked up by the subject's position among the keys of `all_subject_models`
#   (this assumes errors and models were stored in the same order -- TODO confirm).
#   The position lookup is built once instead of re-listing the keys on every iteration.
subject_positions = {subject_id: position for position, subject_id in enumerate(all_subject_models)}
baseline_rf_errors = all_subject_errors['baseline_rf']

min_rmse = float('inf')
best_subject = None
for subject in valid_subjects:
    subject_rmse = baseline_rf_errors[subject_positions[subject]]
    if subject_rmse < min_rmse:      # strict `<` keeps the first subject on ties
        min_rmse = subject_rmse
        best_subject = subject

# best subject details:
    # subject ID: 2636 (RHP)
    # RMSE: 1.63 Nm
    # number of pitches: 28
    # pitch types: FB (max: 91.2), CH, SL, CB
    # TODO: dataset summary of non-2636 pitches
print(f'Best subject: {best_subject} with RMSE: {min_rmse}')

# Store training set details: every pitch that does NOT belong to the best subject.
    # NOTE: not in metric system; use model_summary_data for metric system
summary_columns = [
    'rel_speed', 'rel_side', 'rel_ht',
    'spin_rate', 'spin_axis',
    'ax0', 'ay0', 'az0',
    'peak_value',
    # 'peak_value_normalized' is intentionally left out
]
training_data = model_dev_data[model_dev_data['subject_id'] != best_subject].copy()
training_summary = training_data.agg({column: ['mean', 'std'] for column in summary_columns}).T

# throwing hand counts (number of subjects per handedness value)
#   Each (subject, handedness) pair is counted once. The previous
#   groupby(...).unique().value_counts() form tried to count numpy arrays, which are unhashable.
throwing_hand_counts = (
    training_data
    .drop_duplicates(subset=['subject_id', 'pitcher_throws_rh'])['pitcher_throws_rh']
    .value_counts()
)

# pitch type counts
#   Raw labels are collapsed into a small set of pitch families; labels missing from the map
#   (e.g., 'Stretch', 'QuickPitch', 'Hesitation', 'Hitch' -- not pitch types) become NaN.
pitch_type_map = {
    'Fastball': 'Fastball',
    'Changeup': 'Changeup',
    'Curveball': 'Curveball',
    'Slider': 'Slider',
    'Sinker': 'Fastball',
    'Cutter': 'Other',
    'Splitter': 'Changeup',
    'FastBall': 'Fastball',
    'CurveBall': 'Curveball',
    'TwoSeamFastBall': 'Fastball',
    'TwoSeamFastball': 'Fastball',
    'Other': 'Other',
    '2020FB': 'Fastball',
}
training_data['pitch_type'] = training_data['pitch_type'].map(pitch_type_map)
# share of each pitch family among pitches with a mapped (non-NaN) type
pitch_type_counts = training_data['pitch_type'].value_counts(normalize=True)

Best subject: 2636 with RMSE: 1.6280098603441266


$\textbf{Cohort Data}$

__Includes the following__:
- Cohort details w/ injury date, type
- Ball tracking data from Statcast (~260k pitches)
- Additional injury metadata (e.g., tracked pitches prior to injury)
- Bios scraped from Statcast (height, mass, mlbamid)

There are __413__ injured pitchers with pitches tracked prior to injury. 

In [82]:
# load all cohort CSVs
cohort = aws_connection.load_s3_object('epidemiology/cohorts/injured/pitchers_0825.csv')
cohort_ball_tracking = aws_connection.load_s3_object('epidemiology/cohorts/injured/statcast_data.csv')
cohort_metadata = aws_connection.load_s3_object('epidemiology/cohorts/injured/statcast_metadata.csv')

$\textbf{Cohort Ball Tracking Summary}$

Compares summary metrics to the model training set.

In [None]:
# view training summary
#   Per-feature mean/std over the training pitches, i.e. every subject except the held-out
#   best subject. Displayed here as the reference for the cohort summary.
    # NOTE: not in metric system, not hand-normalized
training_summary

Unnamed: 0,mean,std
rel_speed,85.518971,6.242109
rel_side,0.771319,1.65626
rel_ht,5.689031,0.411926
spin_rate,2159.506519,338.047131
spin_axis,170.242415,74.714183
ax0,-2.547317,9.387685
ay0,26.07321,5.422035
az0,-22.104256,10.771334
peak_value,142.112568,24.832225


In [171]:
# Rename the cohort ball tracking columns to the names used by the model training data.
#   ('spin_axis' already matches, so it needs no entry.)
cohort_ball_tracking = cohort_ball_tracking.rename(
    columns={
        'release_speed': 'rel_speed',
        'release_pos_x': 'rel_side',
        'release_pos_z': 'rel_ht',
        'release_spin_rate': 'spin_rate',
        'ax': 'ax0',
        'ay': 'ay0',
        'az': 'az0',
    }
)
model_fts = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate', 'spin_axis', 'ax0', 'ay0', 'az0']

# Build the two per-pitch tables for the cohort. Both take `pitch_id` from the original row
# index of `cohort_ball_tracking`, so they can be joined back together later.
#   - days since previous game: kept for later workload analysis
#   - model data: rows with any missing feature are dropped (unusable by the model)
cohort_days_since_prev_game = (
    cohort_ball_tracking[['pitcher', 'game_date', 'pitcher_days_since_prev_game']]
    .dropna()
    .copy()
    .reset_index(names='pitch_id')
)
cohort_model_data = (
    cohort_ball_tracking[['pitcher', *model_fts]]
    .copy()
    .dropna()
    .reset_index(names='pitch_id')
)

# Per-feature mean/std of the cohort, for comparison with the training summary.
    # NOTE: not in metric system, not hand-normalized
    # NOTE(review): the recorded rel_side means have opposite signs (training 0.77 vs.
    #   cohort -0.86) -- confirm both sources use the same sign convention for this feature.
cohort_summary = cohort_model_data.agg({feature: ['mean', 'std'] for feature in model_fts}).T
cohort_summary

Unnamed: 0,mean,std
rel_speed,89.160242,5.88909
rel_side,-0.864445,1.813908
rel_ht,5.870694,0.440371
spin_rate,2266.035551,351.676205
spin_axis,178.043714,70.537853
ax0,-2.704812,10.382072
ay0,27.049483,3.991918
az0,-23.486279,8.781136


In [None]:
# Summarize the held-out best subject's pitches (the validation data; RMSE = 1.63 Nm)
# so they can be compared with the training and cohort summaries.
    # NOTE: should this more closely resemble the cohort data?
val_data = model_dev_data.loc[model_dev_data['subject_id'] == best_subject].copy()
val_summary = val_data.agg({
    column: ['mean', 'std']
    for column in [
        'rel_speed', 'rel_side', 'rel_ht',
        'spin_rate', 'spin_axis',
        'ax0', 'ay0', 'az0',
        'peak_value',
        # 'peak_value_normalized' is intentionally left out
    ]
}).T
val_summary

Unnamed: 0,mean,std
rel_speed,82.985714,5.499476
rel_side,1.061071,0.156828
rel_ht,6.178571,0.081227
spin_rate,2123.139286,279.416421
spin_axis,169.142857,53.030888
ax0,-3.968571,3.782671
ay0,24.246071,4.174302
az0,-19.943929,8.414144
peak_value,131.033491,7.752627


$\textbf{Run Model}$

In [None]:
# Use the best subject's model for the cohort application.
cohort_evt_model = all_subject_models[best_subject]

# Predict for every usable cohort pitch; predictions are keyed by pitch_id.
    # INJURED MEAN: 0.083 ± 0.008 (normalized) --> slightly higher than full training set mean of 0.081 ± 0.013
cohort_preds = cohort_evt_model.predict(cohort_model_data[model_fts])
cohort_preds_df = (
    pd.DataFrame(
        cohort_preds,
        columns=['pred_evt_normalized'],
        index=cohort_model_data['pitch_id'],
    )
    .reset_index()
)

# Attach the pitcher to each prediction, then join pitcher metadata and convert to Nm by
# scaling the normalized prediction by height * mass * 9.81.
    # INJURED MEAN: 152.68 ± 22.7 Nm --> higher than full training set mean of 142.0 ± 24.8 Nm
    # NOTE(review): assumes one metadata row per pitcher; duplicates would repeat pitches -- TODO confirm
cohort_preds_df = cohort_model_data[['pitch_id', 'pitcher']].merge(
    cohort_preds_df, on='pitch_id', how='left'
)
cohort_preds_metadata = (
    cohort_metadata
    .rename(columns={'mlbam_id': 'pitcher'})
    .merge(cohort_preds_df, on='pitcher', how='inner')
    .assign(
        pred_evt=lambda preds: preds['pred_evt_normalized']
        * (preds['height'] * preds['mass'] * 9.81)
    )
)

# Add days since previous game (inner join: pitches without that value are dropped),
# then order the final table by pitcher and game date.
cohort_preds_metadata = cohort_preds_metadata.merge(
    cohort_days_since_prev_game, on=['pitcher', 'pitch_id'], how='inner'
)
cohort_preds_final = (
    cohort_preds_metadata
    .sort_values(['pitcher', 'game_date'])
    .reset_index(drop=True)
)

In [205]:
# upload to S3
#   Persists the final per-pitch cohort predictions (with metadata and days since previous game)
#   under the injured-cohort prefix.
#   NOTE(review): presumably replaces any existing object at this key -- confirm before re-running.
aws_connection.upload_to_s3(
    cohort_preds_final, 
    'epidemiology/cohorts/injured/preds_final.csv'
)

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/cohorts/injured/preds_final.csv


$\textbf{Post-Model Analysis}$

In [None]:
# TODO:
    # separate starter/reliever?
    # compute cumulative (normalized) torque by pitcher & game; save days btw outings?

In [196]:
# store days btw outings metadata (+ save rest counts for context)
days_btw_outings = cohort_preds_final[['pitcher', 'game_date', 'pitcher_days_since_prev_game']].copy()
rest_counts = pd.DataFrame(days_btw_outings['pitcher_days_since_prev_game'].value_counts()).reset_index()[0:10]

$\textbf{Close AWS Connection}$

In [210]:
# Release the AWS connection opened at the top of the notebook; the recorded output shows
# this closes the database connection and stops the SSH tunnel.
aws_connection.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
