In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Acute-to-Chronic Workload Ratio (ACWR)}$

In [2]:
""" INITIALIZE AWS CONNECTION """
# Instantiate the project AWS helper and open its connection. The same `aws`
# object is used further down for S3 reads (`load_s3_object`) and for the
# results upload (`upload_to_s3`).
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


In [3]:
# load datasets -- one S3 object per frame, fetched in the order listed
clinical_cohort, clinical_preds, clinical_outings = (
    aws.load_s3_object(s3_key)
    for s3_key in (
        'epidemiology/ml/datasets/full/cohort_matches_final.csv',     # cohort of TJ surgeries
        'epidemiology/ml/datasets/preds/model_cohort.csv',            # pitch-level predictions
        'epidemiology/ml/datasets/full/cohort_outing_metrics.csv',    # outing aggregates from cohort
    )
)

$\textbf{ACWR Calculations}$

From [Dowling et al. (2020)](https://ijspt.org/wp-content/uploads/2024/03/9-Reinold.pdf): 
- __Acute__: 7-day rolling average
- __Chronic__: 28-day rolling average

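Other intervals are recommended to better reflect the pitching cycle -- thus, we use a 6-day acute window and a 30-day chronic window, as outlined in the proposal. Note that the chronic load below is computed as an exponentially weighted moving average (EWMA) over the chronic window rather than as a simple rolling mean.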

In [4]:
import numpy as np
from datetime import timedelta

In [24]:
# get acute & chronic workload for a pitcher
def compute_workload(
        data: pd.DataFrame,
        game_date: "str | pd.Timestamp",
        workload_col: str,
        acute_length: int = 6,
        chronic_length: int = 30,
        chronic_halflife_days: int = 15
) -> dict:
    """
    Compute acute workload, chronic workload and their ratio (ACWR) as of `game_date`.

    Both windows end on `game_date` and are inclusive on both ends, i.e. the acute
    window is [game_date - acute_length days, game_date] and likewise for chronic.
    The function does not filter by pitcher; pass in one pitcher's outings.

    Args
    - data: DataFrame of outing metrics with a datetime `game_date` column and `workload_col`.
    - game_date: The date of the game to compute workloads for (anything `pd.Timestamp` accepts).
    - workload_col: The column name containing workload metrics.
    - acute_length: Number of days to look back for the acute workload.
    - chronic_length: Number of days to look back for the chronic workload.
    - chronic_halflife_days: Halflife passed to the exponential weighted moving average (EWMA)
      used for the chronic workload.
      NOTE(review): without `times=`, pandas interprets `halflife` as a number of
      observations (outings here), not calendar days -- confirm this matches the intent
      implied by the parameter name.

    Returns
    - dict with keys:
        'date'             -> `game_date` as a pd.Timestamp
        'workload_col'     -> the metric name that was summarised
        'acute_workload'   -> plain mean of the metric over the acute window (NaN if no outings)
        'chronic_workload' -> last EWMA value over the chronic window (NaN if no outings)
        'acwr'             -> acute / chronic; NaN when the chronic load is missing,
                              0 when the chronic load is zero or negative
    """
    # normalise once so string dates work in the timedelta arithmetic below
    as_of = pd.Timestamp(game_date)
    outing_dates = data['game_date']
    on_or_before = outing_dates <= as_of

    # define acute/chronic windows as masks
    acute_mask = on_or_before & (outing_dates >= as_of - timedelta(days=acute_length))
    chronic_mask = on_or_before & (outing_dates >= as_of - timedelta(days=chronic_length))

    # acute := simple mean over the acute window
    acute_workload = data.loc[acute_mask, workload_col].mean()

    # chronic := EWMA over window, take the *last* EWMA value as the chronic load at this date
    chronic_vals = data.loc[chronic_mask, ['game_date', workload_col]].sort_values('game_date')
    if chronic_vals.empty:
        chronic_workload = np.nan
    else:
        chronic_series = chronic_vals[workload_col].ewm(halflife=chronic_halflife_days, adjust=False).mean()
        chronic_workload = float(chronic_series.iloc[-1])

    # A missing chronic load must stay missing: `NaN > 0` is False, so a bare
    # `> 0` check would report "no data" as a ratio of 0 and bias later averages.
    if pd.isna(chronic_workload):
        acwr = np.nan
    elif chronic_workload > 0:
        acwr = acute_workload / chronic_workload
    else:
        acwr = 0

    return {
        'date': as_of,
        'workload_col': workload_col,
        'acute_workload': acute_workload,
        'chronic_workload': chronic_workload,
        'acwr': acwr
    }
        

In [25]:
# prepare df: parse dates and order each pitcher's outings chronologically
# (the per-pitcher result rows below inherit this order)
clinical_outings['game_date'] = pd.to_datetime(clinical_outings['game_date'])
clinical_outings = clinical_outings.sort_values(['pitcher', 'game_date'])

# workload metrics to compute an ACWR for, and the window lengths (days) shared by all of them
workload_cols = ['outing_total_pitch_count', 'outing_total_evt_workload', 'outing_median_evt_workload']
acute_days = 6
chronic_days = 30

# initialize storage for results, final data
acwr_results = {}       # pitcher_id -> {workload_col -> [compute_workload dict per outing]}
acwr_data = []          # one DataFrame per (pitcher, workload_col), concatenated in a later cell

# iterate through pitchers
for pitcher_id, group in clinical_outings.groupby('pitcher'):
    acwr_results[pitcher_id] = {col: [] for col in workload_cols}

    # iterate through the pitcher's outings; only the date is needed, so skip iterrows
    for outing_date in group['game_date']:
        # calculate workload for each column
        for col in workload_cols:
            workload_data = compute_workload(
                group,
                outing_date,
                workload_col=col,
                acute_length=acute_days,
                chronic_length=chronic_days
            )

            # store results
            acwr_results[pitcher_id][col].append(workload_data)

    # Season of each outing, aligned row-for-row with the result lists above.
    # Using the per-outing value (rather than the first row's season for every
    # row) keeps the label correct if a pitcher has outings in more than one season.
    outing_seasons = group['season'].to_numpy()

    # concatenate all workloads, add pitcher ID
    for col in workload_cols:
        acwr_results_col = pd.DataFrame(acwr_results[pitcher_id][col])

        # add ID, season
        acwr_results_col.insert(0, 'pitcher', pitcher_id)
        acwr_results_col.insert(1, 'season', outing_seasons)

        # append to final data
        acwr_data.append(acwr_results_col)

In [26]:
# concatenate all results into a single DataFrame
# `acwr_data` starts as a list of per-(pitcher, metric) frames and is rebound to
# the combined frame here. The guard keeps the cell re-runnable: pd.concat
# raises if it is handed a bare DataFrame instead of an iterable of frames.
if not isinstance(acwr_data, pd.DataFrame):
    acwr_data = pd.concat(acwr_data, ignore_index=True)

# upload results to S3
aws.upload_to_s3(
    acwr_data,
    'epidemiology/acwr/results_by_metric.csv',
)

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/acwr/results_by_metric.csv


$\textbf{Analysis}$

Compare ACWRs by method at the following:
- Last outing prior to injury
- Second-to-last outing prior to injury
- Third-to-last outing prior to injury

In [None]:
# join metadata to acwr: attach the outing-level cohort flags to every ACWR row,
# matching the ACWR 'date' to the outing 'game_date' for the same pitcher
metadata = clinical_outings.loc[
    :, ['pitcher', 'game_date', 'injured_cohort_pitcher', 'last_outing_before_injury']
]
acwr_with_metadata = pd.merge(
    acwr_data,
    metadata,
    how='left',
    left_on=['pitcher', 'date'],
    right_on=['pitcher', 'game_date'],
).drop(columns='date')

# filter outings: pick the last three rows of each (pitcher, season, metric) group.
# nth() is positional, so this relies on the rows already being in chronological order.
outings_by_metric = acwr_with_metadata.groupby(['pitcher', 'season', 'workload_col'])
third_to_last, second_to_last, last = (
    outings_by_metric.nth(position) for position in (-3, -2, -1)
)

In [None]:
# aggregate by timing: mean ACWR per (metric, injured-cohort flag) at each of the
# three pre-injury outings
    # author's observation: third to last shows an increase, others are actually decreased
cohort_keys = ['workload_col', 'injured_cohort_pitcher']
third_to_last_agg, second_to_last_agg, last_agg = (
    outings.groupby(cohort_keys)['acwr'].mean()
    for outings in (third_to_last, second_to_last, last)
)

$\textbf{Plotting}$

In [28]:
# example matched pair used for plotting:
    # 669203 (inj), 670102 (noninj)

# pitcher IDs for each side of the comparison
inj_pitchers = [669203]
noninj_pitchers = [670102]

# injured side: every ACWR row for the injured pitcher(s)
is_injured = acwr_data['pitcher'].isin(inj_pitchers)
inj_acwr = acwr_data[is_injured]

# non-injured side: same selection, additionally trimmed so it does not run past
# the injured pitcher's last available date
injury_window_end = inj_acwr['date'].max()
is_control = acwr_data['pitcher'].isin(noninj_pitchers)
within_injury_window = acwr_data['date'] <= injury_window_end
noninj_acwr = acwr_data[is_control & within_injury_window]