In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Acute-to-Chronic Workload Ratio (ACWR)}$

In [2]:
""" INITIALIZE AWS CONNECTION """
# AWS is the project-local helper imported from `connections`; `aws` is reused by
# every later cell that loads from or uploads to S3.
aws = AWS()
# open the connection (the recorded cell output reports a free port and an RDS endpoint)
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


In [3]:
# load datasets
# NOTE(review): only clinical_outings is referenced by the cells visible in this notebook;
# clinical_cohort and clinical_preds may be used elsewhere -- confirm before removing.
clinical_cohort = aws.load_s3_object('epidemiology/ml/datasets/full/cohort_matches_final.csv')          # cohort of TJ surgeries
clinical_preds = aws.load_s3_object('epidemiology/ml/datasets/preds/model_cohort.csv')                  # pitch-level predictions
clinical_outings = aws.load_s3_object('epidemiology/ml/datasets/full/cohort_outing_metrics.csv')        # outing aggregates from cohort

$\textbf{ACWR Calculations}$

From [Dowling et al. (2020)](https://ijspt.org/wp-content/uploads/2024/03/9-Reinold.pdf): 
- __Acute__: 7-day rolling average
- __Chronic__: 28-day rolling average

Other intervals are recommended to better reflect the pitching cycle; following the proposal, we use a __6-day acute__ and __30-day chronic__ window.

In [88]:
import numpy as np
from datetime import timedelta

In [89]:
# get acute & chronic workload for a pitcher
def compute_workload(
    data: pd.DataFrame,
    game_date: str,
    workload_col: str,
    acute_length: int = 6,
    chronic_length: int = 30,
    chronic_halflife_days: int = 6
) -> dict:
    """
    Compute acute and chronic workload (and their ratio) for one pitcher at `game_date`.

    Both windows are inclusive on both ends: the acute window covers
    [game_date - acute_length days, game_date] and the chronic window covers
    [game_date - chronic_length days, game_date].

    Args
    - data: DataFrame containing outing metrics; must have a datetime-like
      'game_date' column and the column named by `workload_col`.
    - game_date: The date of the game to compute workloads for. A date string,
      datetime or pandas Timestamp is accepted; it is coerced with `pd.to_datetime`.
    - workload_col: The column name containing workload metrics.
    - acute_length: Number of days looked back for the acute window.
    - chronic_length: Number of days looked back for the chronic window.
    - chronic_halflife_days: Currently unused. The chronic load is a plain mean,
      not an EWMA; the parameter is kept so existing callers keep working.

    Returns
    - dict with keys 'date', 'workload_col', 'acute_workload', 'chronic_workload'
      and 'acwr'. 'acwr' is acute / chronic when chronic > 0, otherwise 0.
    """
    # the signature advertises a string date, but the window arithmetic below needs a
    # Timestamp -- coerce once so both strings and Timestamps work
    game_date = pd.to_datetime(game_date)

    # define acute/chronic windows as masks (inclusive on both ends)
    on_or_before = data['game_date'] <= game_date
    acute_mask = on_or_before & (data['game_date'] >= game_date - timedelta(days=acute_length))
    chronic_mask = on_or_before & (data['game_date'] >= game_date - timedelta(days=chronic_length))

    # acute := plain mean of the outings inside the acute window (NaN if there are none)
    acute_workload = data.loc[acute_mask, workload_col].mean()

    # chronic := mean of the most recent (up to `chronic_length`) outings inside the
    # chronic window, i.e. the last value of a row-count rolling mean over the
    # date-sorted window
    chronic_vals = data.loc[chronic_mask, ['game_date', workload_col]].sort_values('game_date')
    if chronic_vals.empty:
        chronic_workload = np.nan
    else:
        chronic_series = chronic_vals[workload_col].rolling(window=chronic_length, min_periods=1).mean()
        chronic_workload = float(chronic_series.iloc[-1])

    # NOTE(review): an undefined ratio (chronic load missing, zero or negative) is
    # reported as 0 rather than NaN; downstream means/tests treat it as a real value.
    return {
        'date': game_date,
        'workload_col': workload_col,
        'acute_workload': acute_workload,
        'chronic_workload': chronic_workload,
        'acwr': acute_workload / chronic_workload if chronic_workload > 0 else 0
    }
        

In [90]:
# prepare df: parse outing dates, then order every pitcher's outings chronologically
clinical_outings['game_date'] = pd.to_datetime(clinical_outings['game_date'])
clinical_outings.sort_values(by=['pitcher', 'game_date'], inplace=True)

# workload metrics for which an ACWR is computed at every outing
workload_metrics = (
    'outing_total_pitch_count',
    'outing_total_evt_workload',
    'outing_avg_evt_workload',
)

# acwr_results: pitcher -> metric -> list of per-outing workload dicts
# acwr_data: one frame per (pitcher, metric), concatenated in a later cell
acwr_results = {}
acwr_data = []

for pitcher_id, pitcher_outings in clinical_outings.groupby('pitcher'):
    per_metric = {metric: [] for metric in workload_metrics}
    acwr_results[pitcher_id] = per_metric

    # one workload record per outing and metric, in chronological order
    for outing_date in pitcher_outings['game_date']:
        for metric in workload_metrics:
            per_metric[metric].append(
                compute_workload(
                    pitcher_outings,
                    outing_date,
                    workload_col=metric,
                    acute_length=6,
                    chronic_length=30,
                )
            )

    # NOTE(review): every row is labelled with the season of the pitcher's first
    # outing -- confirm each pitcher contributes a single season to this dataset
    first_season = pitcher_outings['season'].iloc[0]

    # turn each metric's records into a frame tagged with pitcher ID and season
    for metric, records in per_metric.items():
        metric_frame = pd.DataFrame(records)
        metric_frame.insert(0, 'pitcher', pitcher_id)
        metric_frame.insert(1, 'season', first_season)
        acwr_data.append(metric_frame)

In [91]:
# concatenate all results into a single DataFrame
# acwr_data starts out as a list of frames and is rebound to the combined frame here;
# the guard keeps this cell re-runnable (pd.concat raises TypeError when handed a
# DataFrame instead of a list of frames)
if isinstance(acwr_data, list):
    acwr_data = pd.concat(acwr_data, ignore_index=True)

# upload results to S3
aws.upload_to_s3(
    acwr_data,
    'epidemiology/acwr/results_by_metric.csv',
)

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/acwr/results_by_metric.csv


$\textbf{Analysis}$

Compare ACWRs by workload metric at the following outings:
- Last outing prior to injury
- Second-to-last outing prior to injury
- Third-to-last outing prior to injury

In [156]:
# attach cohort flags to every ACWR row (matched on pitcher + outing date)
metadata = clinical_outings[['pitcher', 'game_date', 'injured_cohort_pitcher', 'last_outing_before_injury']]
acwr_with_metadata = (
    acwr_data
    .merge(metadata, left_on=['pitcher', 'date'], right_on=['pitcher', 'game_date'], how='left')
    .drop(columns=['date'])
)

# pick the last three outings per pitcher / season / metric
# (relies on rows already being in chronological order within each group)
outings_by_metric = acwr_with_metadata.groupby(['pitcher', 'season', 'workload_col'])
third_to_last = outings_by_metric.nth(-3)
second_to_last = outings_by_metric.nth(-2)
last = outings_by_metric.nth(-1)

# mean ACWR per metric and cohort flag at each timing
#   third to last shows an increase, others are actually decreased
cohort_keys = ['workload_col', 'injured_cohort_pitcher']
third_to_last_agg = third_to_last.groupby(cohort_keys)['acwr'].mean()
second_to_last_agg = second_to_last.groupby(cohort_keys)['acwr'].mean()
last_agg = last.groupby(cohort_keys)['acwr'].mean()

In [157]:
# TODO: compare all workloads in last 3 outings

$\textbf{Statistical Tests}$

In [106]:
from scipy.stats import ttest_ind

In [160]:
def _acwr_ttest(outings, metric, alternative):
    """
    One-sided two-sample t-test of ACWR for a single workload metric.

    Compares the 'acwr' values of rows flagged injured_cohort_pitcher == 1 against
    rows flagged injured_cohort_pitcher == 0, restricted to `metric`.

    Args
    - outings: frame with 'workload_col', 'injured_cohort_pitcher' and 'acwr' columns.
    - metric: value of 'workload_col' to test.
    - alternative: passed straight to scipy's ttest_ind ('greater' or 'less').

    Returns
    - The ttest_ind result, which unpacks as (t_stat, p_value).
    """
    metric_rows = outings['workload_col'] == metric
    flagged = outings.loc[metric_rows & (outings['injured_cohort_pitcher'] == 1), 'acwr']
    unflagged = outings.loc[metric_rows & (outings['injured_cohort_pitcher'] == 0), 'acwr']
    return ttest_ind(flagged, unflagged, alternative=alternative)


# (heading, outings frame, [(metric, alternative), ...]) -- same tests, order and
# alternatives as before, written once instead of nine copy-pasted calls
# NOTE(review): the test directions differ per metric/timing and nine tests are run
# without a multiple-comparison correction -- interpret the p-values with care
ttest_plan = [
    # TEST 1: Last Outing
    ("Last Outing Tests:", last, [
        ('outing_avg_evt_workload', 'greater'),
        ('outing_total_evt_workload', 'less'),
        ('outing_total_pitch_count', 'less'),
    ]),
    # TEST 2: Second to Last Outing
    ("\nSecond to Last Outing Tests:", second_to_last, [
        ('outing_avg_evt_workload', 'greater'),
        ('outing_total_evt_workload', 'greater'),
        ('outing_total_pitch_count', 'greater'),
    ]),
    # TEST 3: Third to Last Outing
    ("\nThird to Last Outing Tests:", third_to_last, [
        ('outing_avg_evt_workload', 'less'),
        ('outing_total_evt_workload', 'greater'),
        ('outing_total_pitch_count', 'greater'),
    ]),
]

for heading, outings, metric_tests in ttest_plan:
    print(heading)
    for metric, alternative in metric_tests:
        t_stat, p_value = _acwr_ttest(outings, metric, alternative)
        # label every 'less' test; the third-to-last avg-EVT test previously ran with
        # alternative='less' but printed without the label
        label = "(NOTE: LESS) " if alternative == 'less' else ""
        print(f"{label}T-statistic: {t_stat:.3f}, p-value: {p_value:.3f}")

Last Outing Tests:
T-statistic: 0.423, p-value: 0.336
(NOTE: LESS) T-statistic: -1.379, p-value: 0.084
(NOTE: LESS) T-statistic: -1.342, p-value: 0.090

Second to Last Outing Tests:
T-statistic: -0.536, p-value: 0.704
T-statistic: -1.108, p-value: 0.866
T-statistic: -1.031, p-value: 0.848

Third to Last Outing Tests:
T-statistic: -1.372, p-value: 0.086
T-statistic: 2.304, p-value: 0.011
T-statistic: 2.286, p-value: 0.011


$\textbf{Close AWS Connection}$

In [136]:
# release the AWS connection opened by aws.connect() in the first cells
# NOTE(review): the recorded output says there was no active connection, and this cell's
# execution count (136) is lower than the analysis cells above (156, 160) -- the notebook
# was run out of order; verify with Restart Kernel -> Run All.
aws.close()

[AWS]: No active connection to close.
