In [None]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Model Application (Development)}$

Currently only for injured pitchers.

In [2]:
""" INITIALIZE AWS CONNECTION """
# Open the AWS session that later cells use for S3 downloads/loads.
# NOTE: per the recorded output below, connect() may kill a process that is
# already bound to local port 5433 before connecting to the RDS endpoint.
aws_connection = AWS()
aws_connection.connect()

[AWS]: Port 5433 is in use by process python3.11 (PID 39536). Killing it.
[AWS]: Connected to RDS endpoint.


$\textbf{File Loading}$

- Model files (results storage to find best model)
- Data files (CSVs)

In [None]:
import pickle

$\textit{Model Data + Summary}$

__Includes the following__: 
- All subject models & errors (best model is chosen as the subject w/ min. RMSE among those with > 15 pitches, ≥ 4 pitch types, and FB > 90 mph)
- Model development data, which is used to summarize the training data

In [44]:
# Pull the saved per-subject RMSE results from S3 into local storage, then unpickle them.
RMSE_S3_KEY = 'biomechanics/ml/modeling_summary/model_rmse.pkl'
RMSE_LOCAL_PATH = 'storage/all_subject_rmse.pkl'

aws_connection.s3.download_file(aws_connection.bucket_name, RMSE_S3_KEY, RMSE_LOCAL_PATH)

# NOTE: unpickling can execute arbitrary code -- only load artifacts from a trusted bucket.
with open(RMSE_LOCAL_PATH, 'rb') as rmse_file:
    all_subject_errors = pickle.load(rmse_file)

# Model development datasets, kept for reference: the raw set is used to screen
# subjects, the converted set is used to summarise the training data.
model_dev_data = aws_connection.load_s3_object(
    'biomechanics/ml/modeling_datasets/model_dev_raw.csv'
)
model_summary_data = aws_connection.load_s3_object(
    'biomechanics/ml/modeling_datasets/model_data_converted.csv'
)

In [None]:
# TODO: use best LOOCV model from D2? or retrain on sample of pitchers used for D2 & compare that validation error to D2 RMSE?
with open('storage/all_subject_models.pkl', 'rb') as models_file:
    all_subject_models = pickle.load(models_file)


def _meets_selection_criteria(pitches):
    """Return a truthy value when one subject's pitches pass every screening rule.

    CRITERIA (must prove some generalizability):
      - > 15 pitches
      - ≥ 4 distinct `pitch_type` values
      - maximum `rel_speed` of at least 90

    NOTE(review): the written criterion is "FB > 90 mph", but this check is
    `>= 90` on the max `rel_speed` across all pitch types -- confirm intent.
    NOTE(review): `.unique()` counts a missing pitch type (NaN) as its own
    value -- confirm that is acceptable.
    """
    if len(pitches) <= 15:
        return False
    if len(pitches['pitch_type'].unique()) < 4:
        return False
    return pitches['rel_speed'].max() >= 90


# identify subjects meeting criteria --> 36 total
valid_subjects = [
    subject_id
    for subject_id in all_subject_models
    if _meets_selection_criteria(model_dev_data[model_dev_data['subject_id'] == subject_id])
]

In [63]:
# Pick the best model: the valid subject with the lowest baseline RF RMSE.
#   NOTE: errors were saved positionally (no subject ID for reference), so a
#   subject's RMSE is found via its position among the keys of `all_subject_models`.
# Build the subject -> position lookup once rather than rebuilding and scanning
# the key list on every iteration.
subject_positions = {subject: idx for idx, subject in enumerate(all_subject_models)}
baseline_rf_errors = all_subject_errors['baseline_rf']

min_rmse = float('inf')
best_subject = None
for subject in valid_subjects:
    subject_idx = subject_positions[subject]            # identify index in list of all models
    subject_rmse = baseline_rf_errors[subject_idx]      # get RMSE for subject
    if subject_rmse < min_rmse:
        min_rmse = subject_rmse
        best_subject = subject

# Without a best subject, the `!= best_subject` filter below would silently keep
# every row as "training data", so fail loudly instead.
if best_subject is None:
    raise ValueError('No valid subject with a comparable RMSE was found; cannot select a best model.')

# best subject details:
    # subject ID: 2636 (RHP)
    # RMSE: 1.63 Nm
    # number of pitches: 28
    # pitch types: FB (max: 91.2), CH, SL, CB
    # TODO: dataset summary of non-2636 pitches
print(f'Best subject: {best_subject} with RMSE: {min_rmse}')

""" Store training set details """
# NOTE: these are in metric system
# The training set is every pitch that does not belong to the best subject.
training_data = model_summary_data[model_summary_data['subject_id'] != best_subject].copy()

# Mean and standard deviation of each summarised column (one row per column after .T).
summary_columns = [
    'rel_speed',
    'rel_side',
    'rel_ht',
    'spin_rate',
    'spin_axis',
    'ax0',
    'ay0',
    'az0',
    'peak_value',
    'peak_value_normalized',
]
training_summary = training_data.agg({column: ['mean', 'std'] for column in summary_columns}).T

# throwing hand counts: collect each subject's unique `pitcher_throws_rh` values, then tally them
throwing_hand_counts = training_data.groupby('subject_id')['pitcher_throws_rh'].unique().value_counts()

# pitch type counts
# Collapse raw pitch labels into coarse groups. Labels missing from the map
# become NaN after .map() and are therefore excluded from the proportions below.
pitch_type_map = {
    'Fastball': 'Fastball',
    'Changeup': 'Changeup',
    'Curveball': 'Curveball',
    'Slider': 'Slider',
    'Sinker': 'Fastball',
    'Cutter': 'Other',
    'Splitter': 'Changeup',
    'FastBall': 'Fastball',
    'CurveBall': 'Curveball',
    'TwoSeamFastBall': 'Fastball',
    'TwoSeamFastball': 'Fastball',
    'Other': 'Other',
    # 'Stretch' -- not a pitch type
    # 'QuickPitch' -- not a pitch type
    # 'Hesitation' -- not a pitch type
    # 'Hitch' -- not a pitch type
    '2020FB': 'Fastball',
}
training_data['pitch_type'] = training_data['pitch_type'].map(pitch_type_map)
# Share of each mapped pitch group among pitches with a mapped (non-null) type.
pitch_type_counts = training_data['pitch_type'].value_counts() / training_data['pitch_type'].count()

Best subject: 2636 with RMSE: 1.6280098603441266


$\textit{Cohort Data}$

__Includes the following__:
- Cohort details w/ injury date, type
- Ball tracking data from Statcast (~260k pitches)
- Additional injury metadata (e.g., tracked pitches prior to injury)
- Bios scraped from Statcast (height, mass, mlbamid)

There are __357__ injured pitchers with pitches tracked prior to injury. 

In [None]:
# Load the injured-cohort CSVs from S3: cohort details, Statcast ball tracking,
# injury metadata, and pitcher bios.
cohort_s3_keys = {
    'cohort': 'epidemiology/cohorts/injured/combined_0825.csv',
    'ball_tracking': 'epidemiology/cohorts/injured/statcast_data.csv',
    'metadata': 'epidemiology/cohorts/injured/statcast_metadata.csv',
    'bios': 'epidemiology/cohorts/injured/statcast_bios.csv',
}
cohort_frames = {
    label: aws_connection.load_s3_object(s3_key)
    for label, s3_key in cohort_s3_keys.items()
}

cohort = cohort_frames['cohort']
cohort_ball_tracking = cohort_frames['ball_tracking']
cohort_metadata = cohort_frames['metadata']
cohort_bios = cohort_frames['bios']

In [77]:
# Preview the Statcast ball-tracking frame (the rendered output is truncated to the first/last rows and columns).
cohort_ball_tracking

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,SL,2025-05-17,80.4,-2.24,5.80,"Blanco, Ronel",543309,669854,strikeout,swinging_strike,...,4.0,2.90,-0.08,-0.08,41.3,30.524263,-49.946532,31.877045,33.382260,55.061602
1,FF,2025-05-17,89.2,-2.05,5.85,"Blanco, Ronel",543309,669854,,foul,...,4.0,1.44,0.76,0.76,42.4,26.484337,-26.121308,38.658785,31.162650,44.146327
2,SL,2025-05-17,83.5,-2.27,5.80,"Blanco, Ronel",543309,669854,,foul,...,4.0,2.57,-0.08,-0.08,41.4,32.188039,-26.675948,35.313217,36.787864,44.220920
3,CH,2025-05-17,80.6,-1.79,5.95,"Blanco, Ronel",543309,669854,,ball,...,4.0,2.90,1.09,1.09,46.4,,,,,
4,CU,2025-05-17,77.8,-1.96,5.96,"Blanco, Ronel",543309,669854,,ball,...,4.0,3.66,-0.93,-0.93,47.1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155978,CH,2016-04-02,85.1,2.72,6.38,"Doubront, Felix",452655,467094,field_out,hit_into_play,...,,1.78,0.29,0.29,,,,,,
155979,CH,2016-04-02,84.5,2.56,6.53,"Doubront, Felix",452655,467094,,ball,...,,1.54,0.38,0.38,,,,,,
155980,CH,2016-04-02,83.6,2.67,6.46,"Doubront, Felix",452655,467094,,called_strike,...,,1.75,0.18,0.18,,,,,,
155981,CH,2016-04-02,84.4,2.80,6.41,"Doubront, Felix",452655,467094,,ball,...,,1.64,0.53,0.53,,,,,,
