In [3]:
import pandas as pd
from connections import AWS
from services.ball_tracking import clean_ball_tracking_data

$\textbf{Epidemiology: Pitch Usage Aggregates}$

Pitch usage aggregates are used as additional matching criteria. All pitch type labels are taken from Statcast and saved under `epidemiology/ml/datasets/full/<injury_status>_pitch_labels.csv` in S3.

In [2]:
""" INITIALIZE AWS CONNECTION """
# Create the shared AWS client and open its connection. The `aws` object is
# reused by the cells below for every S3 read (`load_s3_object`) and write
# (`upload_to_s3`), so this cell must run before any of them.
aws = AWS()
# Per the recorded output, connecting checks port 5433 and attaches to an RDS endpoint.
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


In [13]:
# Load all raw Statcast ball-tracking data (injured and non-injured cohorts)
# and reduce each frame to the pitch-label columns.

# Columns that make up a pitch-label record.
LABEL_COLUMNS = ['pitch_id', 'pitcher', 'game_date', 'pitch_type']

# First and last (inclusive) seasons with a bulk non-injured Statcast export.
FIRST_SEASON = 2015
LAST_SEASON = 2025


def _extract_pitch_labels(clean_bt: pd.DataFrame) -> pd.DataFrame:
    """Reduce a cleaned ball-tracking frame to its pitch-label columns.

    Args:
        clean_bt: Output of `clean_ball_tracking_data`; must contain every
            column named in `LABEL_COLUMNS`.

    Returns:
        A new frame holding only `LABEL_COLUMNS`, with any row that is
        missing a value in one of those columns removed and the index
        renumbered from 0.

    Raises:
        KeyError: If one of `LABEL_COLUMNS` is absent from `clean_bt`.
    """
    return clean_bt[LABEL_COLUMNS].dropna().reset_index(drop=True)


# --- Ball tracking data ---
# injured pitcher ball tracking data (to get date of first pitch thrown)
inj_ball_tracking = aws.load_s3_object('epidemiology/cohorts/injured/statcast/data_0825.csv')
inj_bt_clean = clean_ball_tracking_data(inj_ball_tracking)
inj_bt_labels = _extract_pitch_labels(inj_bt_clean)

# all ball tracking data by season
season_range = list(range(FIRST_SEASON, LAST_SEASON + 1))
noninj_bt = {}
for s in season_range:
    # load, clean, & store ball tracking data for non-injured pitchers
    raw_bt = aws.load_s3_object(f'epidemiology/cohorts/noninjured/bulk_statcast/{s}.csv')
    clean_bt = clean_ball_tracking_data(raw_bt)
    noninj_bt[s] = _extract_pitch_labels(clean_bt)

In [18]:
# Stamp each per-season non-injured frame with the season it was loaded for.
# The column is written onto the frames stored in `noninj_bt` themselves.
for season in season_range:
    noninj_bt[season]['season'] = season

# Stack the tagged frames, in season order, into a single label table.
noninj_full = [noninj_bt[season] for season in season_range]
noninj_bt_labels = pd.concat(noninj_full).reset_index(drop=True)

# Injured pitchers: take the season from the first four characters of
# `game_date` (presumably a year-first date string — confirm).
inj_bt_labels['season'] = (
    inj_bt_labels['game_date']
    .str.slice(stop=4)
    .astype(int)
)

In [19]:
# Write both pitch-label tables to S3: injured first, then non-injured.
label_uploads = (
    (inj_bt_labels, 'epidemiology/ml/datasets/full/injured_pitch_labels.csv'),
    (noninj_bt_labels, 'epidemiology/ml/datasets/full/noninjured_pitch_labels.csv'),
)
for labels_df, s3_key in label_uploads:
    aws.upload_to_s3(labels_df, s3_key)

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/full/noninjured_pitch_labels.csv
