In [3]:
import pickle
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Model Application}$

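Loads the re-trained peak elbow varus torque (EVT) models (a 2015-specific model and a "modern" model for 2016 - present) and applies them to new ball tracking data (2015 - present).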

In [2]:
""" INITIALIZE AWS CONNECTION """
# create the AWS helper and open the connection used for the S3 loads/uploads in this notebook
    # NOTE(review): per the logged output, connect() kills whatever process is holding local port 5433
    #               before connecting to the RDS endpoint -- confirm that is safe if another kernel uses that port
aws_connection = AWS()
aws_connection.connect()

[AWS]: Port 5433 is in use by process python3.11 (PID 48052). Killing it.
[AWS]: Connected to RDS endpoint.


$\textbf{Model \& Data Loading}$

In [None]:
# load pitcher metadata (height & mass are needed to postprocess model outputs)
    # NOTE: the two cohort files follow different naming conventions
    #       ('pitchers_metadata.csv' vs. 'pitchers_0825.csv')
inj_pitcher_info = aws_connection.load_s3_object('epidemiology/cohorts/injured/pitcher_info/pitchers_metadata.csv')
noninj_pitcher_info = aws_connection.load_s3_object('epidemiology/cohorts/noninjured/pitcher_info/pitchers_0825.csv')

# harmonize the injured cohort's ID column name with the non-injured cohort's
    # re-assign instead of renaming in place so the step has an explicit lineage
inj_pitcher_info = inj_pitcher_info.rename(columns={'mlbam_id': 'mlbamid'})

# consolidate into model metadata: exactly one row per pitcher
    # de-duplicate on the merge key ONLY: de-duplicating on all three columns would keep
    # several rows for a pitcher whose height/mass differ between files, and the later
    # left merge on 'mlbamid' would then silently duplicate that pitcher's pitches
    # (first occurrence wins, i.e. the injured-cohort row when a pitcher is in both files)
metadata_cols = ['mlbamid', 'height', 'mass']
model_metadata = (
    pd.concat([inj_pitcher_info[metadata_cols], noninj_pitcher_info[metadata_cols]], ignore_index=True)
    .drop_duplicates(subset='mlbamid')
    .reset_index(drop=True)
)

In [25]:
# load models (2015 & "modern" ball tracking -- ie., 2016-present)
with open(f'models/evt_2015.pkl', 'rb') as f:
    evt_model_2015 = pickle.load(f)['model']
with open(f'models/evt_modern.pkl', 'rb') as f:
    evt_model = pickle.load(f)['model']


In [10]:
# load ball tracking data: one combined (injured + non-injured) frame per season
season_range = list(range(2015, 2026))   # seasons 2015 through 2025, inclusive
ball_tracking_data = {}
for s in season_range:
    # pull both cohorts' ball tracking files for this season from S3
    inj_data = aws_connection.load_s3_object(f'epidemiology/ml/datasets/{s}/injured_ball_tracking.csv')
    noninj_data = aws_connection.load_s3_object(f'epidemiology/ml/datasets/{s}/noninjured_ball_tracking.csv')

    # flag each row with its cohort (1 = injured, 0 = non-injured) for downstream checks
    noninj_data['injured_cohort_pitcher'] = 0
    inj_data['injured_cohort_pitcher'] = 1

    # stack the cohorts (injured first) under a fresh index & store by season
    ball_tracking_data[s] = pd.concat([inj_data, noninj_data], ignore_index=True)
    print(f'Loaded {s} data: {len(inj_data)} injured, {len(noninj_data)} non-injured')

Loaded 2015 data: 6406 injured, 556391 non-injured
Loaded 2016 data: 7562 injured, 558757 non-injured
Loaded 2017 data: 7056 injured, 520539 non-injured
Loaded 2018 data: 11860 injured, 502517 non-injured
Loaded 2019 data: 4626 injured, 460887 non-injured
Loaded 2020 data: 7363 injured, 149505 non-injured
Loaded 2021 data: 32054 injured, 391833 non-injured
Loaded 2022 data: 25765 injured, 371984 non-injured
Loaded 2023 data: 43642 injured, 364405 non-injured
Loaded 2024 data: 44174 injured, 395168 non-injured
Loaded 2025 data: 8081 injured, 277705 non-injured


In [83]:
# stack every season's ball tracking data (in season order) into a single frame
season_frames = [ball_tracking_data[s] for s in season_range]
ball_tracking_data_all = pd.concat(season_frames, ignore_index=True)

# persist the combined dataset to S3
aws_connection.upload_to_s3(ball_tracking_data_all, 'epidemiology/ml/datasets/full/model_application_data.csv')

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/full/model_application_data.csv


$\textbf{Apply Models to Data}$

In [58]:
# gravitational acceleration (m/s^2) used to de-normalize the predicted torque
    # the formula below multiplies the normalized prediction by (mass * g) * height
    # NOTE(review): presumably mass is in kg and height in m so the result is in Nm -- confirm against the metadata files
GRAVITY = 9.81

# columns kept in the stored predictions (loop-invariant, so defined once)
sto_cols = ['pitch_id', 'pitcher', 'game_date', 'pitcher_days_since_prev_game', 'pred_peak_evt_normalized', 'pred_peak_evt', 'injured_cohort_pitcher']

# guarantee one metadata row per pitcher so the left merge below cannot fan out pitch rows
    # (a pitcher listed twice with different height/mass would otherwise duplicate every one of their pitches)
metadata_unique = model_metadata.drop_duplicates(subset='mlbamid')

# initialize model preds
model_preds = {}
for s in season_range:
    print(f'Applying models to {s} data...', end='\r', flush=True)

    # extract copy of season's ball tracking data (the loaded frame is left untouched)
    data = ball_tracking_data[s].copy()

    # apply model based on year: 2015 has its own model, every other season uses the "modern" one
    season_model = evt_model_2015 if s == 2015 else evt_model
    model_fts = season_model.feature_names_in_
    data['pred_peak_evt_normalized'] = season_model.predict(data[model_fts])

    # post-process model outputs to get predicted peak elbow varus torque (in Nm)
        # validate= makes pandas raise instead of silently multiplying rows if the key is ever non-unique
        # pitchers without a metadata row keep NaN height/mass (left merge), hence NaN pred_peak_evt
    data_postprocessed = data.merge(
        metadata_unique,
        left_on='pitcher',
        right_on='mlbamid',
        how='left',
        validate='many_to_one',
    )
    data_postprocessed['pred_peak_evt'] = (
        data_postprocessed['pred_peak_evt_normalized']
        * (data_postprocessed['mass'] * GRAVITY)
        * data_postprocessed['height']
    )

    # store preds
    model_preds[s] = data_postprocessed[sto_cols]

    print(f'Models successfully applied to {s} data.')

Models successfully applied to 2015 data.
Models successfully applied to 2016 data.
Models successfully applied to 2017 data.
Models successfully applied to 2018 data.
Models successfully applied to 2019 data.
Models successfully applied to 2020 data.
Models successfully applied to 2021 data.
Models successfully applied to 2022 data.
Models successfully applied to 2023 data.
Models successfully applied to 2024 data.
Models successfully applied to 2025 data.


In [None]:
# stack the per-season predictions (in season order) into a single frame
    # NOTE: injured vs. non-injured separation isn't large here;
    #       however, pitchers haven't been matched
season_preds = [model_preds[s] for s in season_range]
model_preds_all = pd.concat(season_preds, ignore_index=True)

# save model predictions to S3
    # NOTE(review): the saved output of this cell reports a different key
    #               ('.../preds/all_ball_tracking_preds.csv'), so that output looks stale --
    #               confirm which key downstream consumers read
aws_connection.upload_to_s3(model_preds_all, 'epidemiology/ml/datasets/preds/model_application.csv')

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/preds/all_ball_tracking_preds.csv


$\textbf{Close AWS Connection}$

In [84]:
# release the AWS resources once all loads/uploads are done
    # (the logged output shows the database connection closing and the SSH tunnel stopping)
aws_connection.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
