In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Clinical Model Setup}$

Injury risk estimation at the pitch-level using variable-length sequences. This notebook sets up the data structures and model architecture for development; the full, cleaned versions of each model are trained in `clinical_model_training.ipynb`.

In [2]:
""" INITIALIZE AWS CONNECTION """
# create the project AWS helper and open its connection; the `aws` handle is reused
# by later cells for S3 loads and is closed in the last cell of the notebook
aws = AWS()
aws.connect()

[AWS]: Port 5433 is in use by process python3.11 (PID 27916). Killing it.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

- Cohort data (matches, preds, statcast)
- Ball flight aggregates 

In [3]:
import ast

In [4]:
# helper function for calculating averages
def get_avgs(
        data: pd.DataFrame,
        group_cols: list,
        avg_cols: list = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate']
) -> pd.DataFrame:
    """
    Compute per-group means for a set of columns.

    Args:
        data (pd.DataFrame): Source rows to aggregate.
        group_cols (list): Columns that define each group.
        avg_cols (list): Columns to average within each group. Defaults to the four ball flight columns.

    Returns:
        pd.DataFrame: One row per group, with the grouping columns restored as regular columns
            followed by the mean of each column in `avg_cols`.
    """
    grouped = data.groupby(group_cols)
    group_means = grouped[avg_cols].mean()

    # move the group keys out of the index and back into columns
    return group_means.reset_index()

# filter pitches to date range for a given ID
def filter_pitches_by_date(
        pitch_data: pd.DataFrame, 
        pitcher_id: int, 
        start_date: str, 
        end_date: str
) -> pd.DataFrame:
    """
    Select one pitcher's pitches that fall inside an inclusive date window.

    Args:
        pitch_data (pd.DataFrame): Pitch rows with 'pitcher' and 'game_date' columns.
        pitcher_id (int): Value to match against the 'pitcher' column.
        start_date (str): First date to keep, in 'YYYY-MM-DD' format.
        end_date (str): Last date to keep, in 'YYYY-MM-DD' format.

    Returns:
        pd.DataFrame: The matching rows, with their original index labels.
    """
    is_pitcher = pitch_data['pitcher'] == pitcher_id
    on_or_after_start = pitch_data['game_date'] >= start_date
    on_or_before_end = pitch_data['game_date'] <= end_date

    # both window bounds are inclusive
    return pitch_data.loc[is_pitcher & on_or_after_start & on_or_before_end]

# build an 'outing_before_injury' lookup for the outing-level model: one row per pitcher/season
# holding the second-to-last outing number, flagged with outing_before_injury = 1
def get_outing_before_injury(data: pd.DataFrame) -> pd.DataFrame:
    """
    Find the second-to-last outing of every pitcher/season and flag it.

    Pitcher/seasons with fewer than two distinct outings have no second-to-last outing
    and are omitted (they previously came back flagged with a datetime `NaT` placeholder
    in the numeric 'outing_number' column).

    NOTE(review): no filtering on injury status happens here -- every pitcher/season in
    `data` is flagged. Restrict `data` to injured pitchers first if only those should be
    labelled.

    Args:
        data (pd.DataFrame): Rows with 'pitcher', 'season' and 'outing_number' columns.

    Returns:
        pd.DataFrame: Columns 'pitcher', 'season', 'outing_number', 'outing_before_injury',
            sorted by pitcher and season, with 'outing_before_injury' always equal to 1.
    """
    # one row per distinct outing (the input is not modified, so no copy is needed)
    outings = data[['pitcher', 'season', 'outing_number']].drop_duplicates()

    # rank outings from the end of each season: 1 = last outing, 2 = second-to-last
    rank_from_end = (
        outings
        .groupby(['pitcher', 'season'])['outing_number']
        .rank(method='dense', ascending=False)
    )

    second_last_df = (
        outings[rank_from_end == 2]
        .sort_values(['pitcher', 'season'])
        .reset_index(drop=True)
        .assign(outing_before_injury=1)
    )

    return second_last_df


In [5]:
# load/collect all matches
n_matches = 5
cohort_matches = aws.load_s3_object(f'epidemiology/cohorts/injured/pitcher_info/matches_{n_matches}_per_pitcher.csv')

# flatten every match row into (pitcher, season, injured) records:
# the injured pitcher first, followed by each of their non-injured matches
cohort_records = []
for _, match in cohort_matches.iterrows():
    season = match['season']
    cohort_records.append({'pitcher': match['mlbamid_injured'], 'season': season, 'injured': 1})

    # 'mlbamid_noninjured' holds a stringified Python literal of IDs; parse it before iterating
    cohort_records.extend(
        {'pitcher': mlbamid, 'season': season, 'injured': 0}
        for mlbamid in ast.literal_eval(match['mlbamid_noninjured'])
    )

# single frame with all pitcher info
cohort_info = pd.DataFrame(cohort_records)

In [6]:
# pull the statcast dataset and the model-generated predictions from S3
    # likely fts: velo, release position, spin rate (no pitch labels)
statcast_data_key = 'epidemiology/ml/datasets/full/model_application_data.csv'
statcast_preds_key = 'epidemiology/ml/datasets/preds/model_application.csv'

statcast_data = aws.load_s3_object(statcast_data_key)
statcast_preds = aws.load_s3_object(statcast_preds_key)

In [7]:
# merge statcast data with model predictions on the columns the two files share
statcast_full = statcast_data.merge(
    statcast_preds,
    on=['pitch_id', 'pitcher', 'game_date', 'pitcher_days_since_prev_game', 'injured_cohort_pitcher']
)

# season = first four characters of game_date
statcast_full['season'] = statcast_full['game_date'].str[0:4].astype(int)

# latest game date per pitcher/season (their last outing)
pitcher_last_outings = (
    statcast_full
    .groupby(['pitcher', 'season'])['game_date']
    .max()
    .reset_index()
    .rename(columns={'game_date': 'last_outing_date'})
)

# merge last outing date w/ statcast data
statcast_full = statcast_full.merge(pitcher_last_outings, on=['pitcher', 'season'], how='left')

# compute time until last outing
statcast_full['days_until_last_outing'] = (
    pd.to_datetime(statcast_full['last_outing_date']) -
    pd.to_datetime(statcast_full['game_date'])
).dt.days

# clip to the final LOOKBACK_DAYS days before each pitcher/season's last outing
    # NOTE: earlier comments said 90 days, but the filter (and the variable name) use 45
LOOKBACK_DAYS = 45
statcast_full_45 = statcast_full[
    statcast_full['days_until_last_outing'] <= LOOKBACK_DAYS
].reset_index(drop=True)

$\textit{Baseline Ball Flight Averages by Pitcher}$

In [8]:
# baseline ball flight averages per pitcher/season, kept for reference
    # NOTE: these may be referenced for the pitch level model
    # NOTE(review): described as season-long, but the input is statcast_full_45 (the clipped
    # window before the last outing) -- confirm which is intended
ball_flight_fts = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate']
baseline_group_cols = ['pitcher', 'season']

ball_flight_season_avgs = get_avgs(
    data=statcast_full_45,
    group_cols=baseline_group_cols,
    avg_cols=ball_flight_fts
)


$\textbf{Train/Test Splits}$

Loads the previously defined train/test splits (see `clinical_splits.ipynb`) and applies them to `statcast_full_45`.

In [9]:
# load the stored train/test ID files
path_stem = 'epidemiology/ml/datasets/full'
cohort_train_ids = aws.load_s3_object(f'{path_stem}/cohort_train_ids.csv')
cohort_test_ids = aws.load_s3_object(f'{path_stem}/cohort_test_ids.csv')

# keep only the pitch rows whose (pitcher, season) appears in each split
split_keys = ['pitcher', 'season']
cohort_preds_train = pd.merge(statcast_full_45, cohort_train_ids, on=split_keys, how='inner')
cohort_preds_test = pd.merge(statcast_full_45, cohort_test_ids, on=split_keys, how='inner')

In [None]:
# write the test split to local storage for post-training evaluation
test_data_path = 'storage/model_test_data.csv'
cohort_preds_test.to_csv(test_data_path, index=False)

$\textbf{Setup Data Sequences}$

Two features are manually created:
- Outing number
- Within outing cumulative EVT workload

All sequences must then be converted to arrays (then tensors) for model development.

In [10]:
import numpy as np

In [11]:
# create outing number
def create_outing_number(data: pd.DataFrame) -> pd.DataFrame:
    """
    Number each pitcher's outings within a season.

    Outings are numbered by the dense rank of 'game_date' inside each (pitcher, season)
    group, so all rows sharing a game date get the same number and numbering starts at 1.

    Args:
        data (pd.DataFrame): Pitch rows with 'pitcher', 'season' and 'game_date' columns.

    Returns:
        pd.DataFrame: A new DataFrame (the input is left untouched) with an integer
            'outing_number' column.
    """
    season_groups = data.groupby(['pitcher', 'season'])
    date_rank = season_groups['game_date'].rank(method='dense')

    # assign() returns a new frame, so the caller's data is not modified
    return data.assign(outing_number=date_rank.astype(int))

# create within outing cumulative EVT workload
def create_within_outing_cumulative_evt_workload(data: pd.DataFrame) -> pd.DataFrame:
    """
    Add a running total of 'pred_peak_evt_normalized' within each outing.

    The running sum follows the current row order inside every
    (pitcher, season, outing_number) group; rows are not re-sorted here.

    Args:
        data (pd.DataFrame): Pitch rows with 'pitcher', 'season', 'outing_number'
            and 'pred_peak_evt_normalized' columns.

    Returns:
        pd.DataFrame: A new DataFrame (the input is left untouched) with an additional
            'within_outing_cumulative_evt_workload' column.
    """
    outing_groups = data.groupby(['pitcher', 'season', 'outing_number'])
    running_workload = outing_groups['pred_peak_evt_normalized'].cumsum()

    return data.assign(within_outing_cumulative_evt_workload=running_workload)


In [12]:
# add the engineered features (outing number, then within-outing cumulative workload)
# and order every split by pitcher, season, outing number
sequence_sort_keys = ['pitcher', 'season', 'outing_number']

trn_final = (
    cohort_preds_train
    .pipe(create_outing_number)
    .pipe(create_within_outing_cumulative_evt_workload)
    .sort_values(by=sequence_sort_keys)
)
test_final = (
    cohort_preds_test
    .pipe(create_outing_number)
    .pipe(create_within_outing_cumulative_evt_workload)
    .sort_values(by=sequence_sort_keys)
)

In [13]:
# model features, split into contextual and time series layers
    # commented-out entries are currently excluded from the feature set
CONTEXTUAL_FTS = [
    'p_throws',
    'pitcher_days_since_prev_game',
    # 'outing_number'
]
TIME_SERIES_FTS = [
    'pred_peak_evt_normalized',                     # predicted load
    # 'within_outing_cumulative_evt_workload',
    'rel_speed', 'rel_side', 'rel_ht', 'spin_rate'  # ball flight
]

# full feature list: contextual features first, then time series features
feature_set = [*CONTEXTUAL_FTS, *TIME_SERIES_FTS]

In [14]:
# create training and testing set arrays
def _build_sequence_arrays(frame: pd.DataFrame, features: list) -> np.ndarray:
    """
    Collect one feature matrix per (pitcher, season) group into a 1-D object array.

    Sequences have different lengths, so they cannot be stacked into a regular array:
    `np.array(list_of_ragged_arrays)` raises a ValueError on NumPy >= 1.24 and silently
    returns a stacked 3-D array when all lengths happen to match. Filling a pre-allocated
    object array always yields shape (n_groups,), with each element a
    (n_rows_in_group, n_features) array.

    Args:
        frame (pd.DataFrame): Rows with 'pitcher', 'season' and every column in `features`.
        features (list): Columns to extract, in order.

    Returns:
        np.ndarray: Object array of per-group feature matrices, in groupby key order.
    """
    sequences = [rows[features].values for _, rows in frame.groupby(['pitcher', 'season'])]

    packed = np.empty(len(sequences), dtype=object)
    for idx, seq in enumerate(sequences):
        # element-wise assignment keeps each matrix intact (slice assignment would try to broadcast)
        packed[idx] = seq
    return packed

trn_arrays = _build_sequence_arrays(trn_final, feature_set)
test_arrays = _build_sequence_arrays(test_final, feature_set)

$\textbf{Create Outcomes}$

In [15]:
import torch
import numpy as np

In [65]:
# whether to label outcomes per pitch (True) or once per pitcher/season sequence (False)
PER_PITCH = False

In [68]:
# create outcome arrays
def _sequence_level_outcomes(frame: pd.DataFrame) -> np.ndarray:
    """
    One 'injured' label per (pitcher, season), as a 1-D array in groupby key order.

    Grouping on the same keys used to build the input sequences guarantees exactly one
    label per sequence, in the same order. (`drop_duplicates` on pitcher/season/injured
    would emit an extra row for any group with more than one distinct 'injured' value
    and shift every later label.) If a group has mixed labels, the maximum is used.
    """
    return frame.groupby(['pitcher', 'season'])['injured'].max().values


def _pitch_level_outcomes(frame: pd.DataFrame) -> list:
    """
    One label array per (pitcher, season), with one entry per pitch.

    A pitch is labelled 1 only when the pitcher is injured and the pitch comes from
    their last outing (days_until_last_outing == 0); every other pitch is labelled 0.
    """
    return [
        np.where(
            (group['injured'] == 1) & (group['days_until_last_outing'] == 0),
            1,
            0
        )
        for _, group in frame.groupby(['pitcher', 'season'])
    ]


if not PER_PITCH:
    # one label per pitcher/season
    trn_outcomes = _sequence_level_outcomes(trn_final)
    test_outcomes = _sequence_level_outcomes(test_final)

else:
    # per-pitch labels, stored separately for each pitcher/season
        # NOTE: all pitches in the last outing are labeled as "injured"
    trn_outcomes = _pitch_level_outcomes(trn_final)
    test_outcomes = _pitch_level_outcomes(test_final)

$\textbf{Aggregate for Streamlined Data Storage}$

In [44]:
import pickle

In [80]:
# reset all into a dictionary
def _pack_outcomes(outcomes) -> np.ndarray:
    """
    Convert outcomes to an array without relying on ragged `np.array` construction.

    Sequence-level outcomes (an array, or a list of scalars) are converted directly.
    Per-pitch outcomes are a list of label arrays of different lengths, which
    `np.array` rejects on NumPy >= 1.24, so they are stored element by element in a
    1-D object array instead.
    """
    if isinstance(outcomes, np.ndarray) or all(np.ndim(o) == 0 for o in outcomes):
        return np.array(outcomes)

    packed = np.empty(len(outcomes), dtype=object)
    for idx, labels in enumerate(outcomes):
        packed[idx] = labels
    return packed


# NOTE: the 'val' entry is populated from the test split
pitch_level = {
    'trn': {
        'inputs': trn_arrays,
        'outcomes': _pack_outcomes(trn_outcomes)
    },
    'val': {
        'inputs': test_arrays, 
        'outcomes': _pack_outcomes(test_outcomes)
    }
}

# save to local disk
with open('storage/pitch_level_arrays.pkl', 'wb') as f:
    pickle.dump(pitch_level, f)

# TODO: upload to AWS (--> pytorch folder)
# with open('storage/pitch_level_arrays.pkl', 'rb') as f:
#     content = f.read()
#     aws.upload_to_s3(content, 'epidemiology/ml/datasets/pytorch/pitch_level_arrays.pkl')

$\textbf{Tensor Setup}$

In [82]:
import torch
from torch.nn.utils.rnn import pad_sequence

In [83]:
# convert to tensors and pad to the same length
def create_padded_tensor(sequences: list) -> torch.Tensor:
    """ 
    Convert a list of sequences to a single zero-padded float32 tensor.
    
    Args:
        sequences (list): Sequences (arrays or nested lists) that may differ in length
            along their first axis.
    
    Returns:
        torch.Tensor: Tensor with the batch axis first; for 2-D sequences the shape is
            (batch, max_seq_len, features). Shorter sequences are padded with 0 at the end.
    """
    seq_tensors = [torch.tensor(np.array(seq, dtype=np.float32)) for seq in sequences]

    # batch_first=True puts the batch axis ahead of the time axis
    return pad_sequence(seq_tensors, batch_first=True, padding_value=0)

# helper function for creating sorted tensor dictionaries
def setup_all_tensors(
        input_arrays: list,
        output_arrays: list,
        pad_outcomes: bool = False
) -> dict:
    """
    Build the tensor dictionary (sequences, mask, lengths, outcomes) for one data split.

    The mask and lengths come from the true length of each input sequence, not from
    which padded rows are non-zero, so a real timestep whose features are all zero is
    still counted as data.

    Args:
        input_arrays (list): Per-sequence feature arrays, each of shape (seq_len, features).
        output_arrays (list): Outcomes. One value per sequence when `pad_outcomes` is False;
            one label array per sequence when `pad_outcomes` is True.
        pad_outcomes (bool): If True, pad the per-sequence label arrays to a common length;
            otherwise convert the outcomes directly to a float32 tensor.

    Returns:
        dict: 'seq' (batch, max_seq_len, features) float32 tensor, 'mask' (batch, max_seq_len)
            bool tensor that is True on real timesteps, 'lengths' (batch,) integer tensor of
            true sequence lengths, and 'binary' outcome tensor.
    """
    # shape: (batch, max_seq_len, features)
    seq = create_padded_tensor(input_arrays)

    # true (unpadded) length of every sequence
    lengths = torch.tensor([len(s) for s in input_arrays], dtype=torch.long)

    # position t of sequence b is real data iff t < lengths[b]
    positions = torch.arange(seq.shape[1]).unsqueeze(0)
    mask = positions < lengths.unsqueeze(1)

    # convert outcomes to tensors (padded when they are per-timestep labels)
    if pad_outcomes:
        binary = create_padded_tensor(output_arrays)
    else:
        binary = torch.tensor(output_arrays, dtype=torch.float32)

    return {
        'seq': seq,
        'mask': mask,
        'lengths': lengths,
        'binary': binary
    }


In [84]:
# build the tensor dictionary for each split
    # outcomes are padded only when they are labelled per pitch
pitch_level_tensors = {
    split: setup_all_tensors(
        pitch_level[split]['inputs'],
        pitch_level[split]['outcomes'],
        pad_outcomes=bool(PER_PITCH)
    )
    for split in ('trn', 'val')
}

In [85]:
# save to local disk; the file name follows the outcome labelling mode
tensor_file = 'pitch_level_tensors.pkl' if PER_PITCH else 'outing_level_tensors.pkl'

with open('storage/' + tensor_file, 'wb') as f:
    pickle.dump(pitch_level_tensors, f)

# TODO: upload to AWS (--> pytorch folder)
# with open('storage/' + tensor_file, 'rb') as f:
#     content = f.read()
#     aws.upload_to_s3(content, 'epidemiology/ml/datasets/pytorch/' + tensor_file)

$\textbf{Close AWS Connection}$

In [27]:
# close the AWS helper's connection now that all loads are done
aws.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
