In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Clinical Model}$

Injury risk estimation from two perspectives: 
- __Season__ (i.e., post-outing injury probability; postgame)
- __Pitch-Level__ (i.e., next-pitch injury probability; within game)

In [2]:
""" INITIALIZE AWS CONNECTION """
# Instantiate the project's AWS helper and open its connection.
# The resulting `aws` object is reused by later cells to load S3 objects.
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

- Cohort data (matches, preds, statcast)
- Ball flight aggregates 

In [3]:
# helper function for calculating averages
def get_avgs(
        data: pd.DataFrame,
        group_cols: list,
        avg_cols: list = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate']
) -> pd.DataFrame:
    """
    Compute the per-group mean of a set of columns.

    Args:
        data (pd.DataFrame): Source rows to aggregate.
        group_cols (list): Columns that define each group.
        avg_cols (list): Columns to average within each group. Defaults to the
            ball flight features (release speed, side, height, spin rate).
            The default list is shared between calls; it is only read here.

    Returns:
        pd.DataFrame: One row per group, with the grouping columns restored as
        regular columns followed by the mean of each column in `avg_cols`.
    """
    grouped = data.groupby(group_cols)
    group_means = grouped[avg_cols].mean()

    # move the group keys out of the index and back into columns
    return group_means.reset_index()

# filter pitches to date range for a given ID
def filter_pitches_by_date(
        pitch_data: pd.DataFrame, 
        pitcher_id: int, 
        start_date: str, 
        end_date: str
) -> pd.DataFrame:
    """
    Select one pitcher's pitches that fall inside an inclusive date window.

    Args:
        pitch_data (pd.DataFrame): Pitch rows with 'pitcher' and 'game_date' columns.
        pitcher_id (int): Value of 'pitcher' to keep.
        start_date (str): First date to keep ('YYYY-MM-DD'), inclusive.
        end_date (str): Last date to keep ('YYYY-MM-DD'), inclusive.

    Returns:
        pd.DataFrame: The matching rows, with their original index labels.
    """
    is_pitcher = pitch_data['pitcher'] == pitcher_id
    on_or_after_start = pitch_data['game_date'] >= start_date
    on_or_before_end = pitch_data['game_date'] <= end_date

    return pitch_data[is_pitcher & on_or_after_start & on_or_before_end]


In [4]:
# load cohort of matches, model predictions
# (cohort_matches is later read via its 'mlbamid_injured' / 'mlbamid_noninjured' columns)
cohort_matches = aws.load_s3_object('epidemiology/ml/datasets/full/cohort_matches_final.csv')
cohort_preds = aws.load_s3_object('epidemiology/ml/datasets/preds/model_cohort.csv')

# load all statcast data
    # likely fts: velo, release position, spin rate (no pitch labels)
statcast_data = aws.load_s3_object('epidemiology/ml/datasets/full/model_application_data.csv')
# join predictions to statcast rows on the shared pitch keys
# NOTE: no `how=` is given, so this is pandas' default inner join -- rows present
# in only one of the two tables are dropped
cohort_preds_statcast = cohort_preds.merge(statcast_data, on=['pitch_id', 'pitcher', 'game_date', 'pitcher_days_since_prev_game', 'injured_cohort_pitcher'])

In [5]:
# TODO: update dataset to match windows btw each injured & noninjured pitcher
# Build the modelling cohort: for every injured pitcher, keep all of their pitches
# plus the pitches their matched non-injured pitcher threw over the same date span.
cohort_frames = []
for injured_id in cohort_matches['mlbamid_injured'].unique():
    # get date of first and last pitch for the injured pitcher
    injured_dates = cohort_preds_statcast.loc[
        cohort_preds_statcast['pitcher'] == injured_id, 'game_date'
    ]
    first_pitch = injured_dates.min()
    last_pitch = injured_dates.max()

    # get all pitches for this pitcher, append to final dataset
    pitcher_data = filter_pitches_by_date(
        cohort_preds_statcast,
        injured_id,
        first_pitch,
        last_pitch
    ).reset_index(drop=True)
    cohort_frames.append(pitcher_data)

    # get pitches for match, restricted to the injured pitcher's date span
    # NOTE: only the first listed match for this injured pitcher is used
    matched_id = cohort_matches[cohort_matches['mlbamid_injured'] == injured_id]['mlbamid_noninjured'].values[0]
    matched_data = filter_pitches_by_date(
        cohort_preds_statcast,
        matched_id,
        first_pitch,
        last_pitch
    ).reset_index(drop=True)
    cohort_frames.append(matched_data)

# concatenate all data
cohort_preds_final = pd.concat(cohort_frames, ignore_index=True)
injured_proportion = cohort_preds_final['injured_cohort_pitcher'].mean()        # check proportion of injured pitches --> should be close-ish to 50%

In [6]:
# season-long ball flight baselines, one row per pitcher-season
    # NOTE: these may be referenced for the pitch level model
ball_flight_fts = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate']
ball_flight_season_avgs = get_avgs(cohort_preds_final, ['pitcher', 'season'], ball_flight_fts)


$\textbf{Setup Data Sequences}$

- __Outing-Level Model__: Up to last outing prior to injury (last outing not included)
- __Pitch-Level Model__: Up to final pitch prior to injury

Both will be stored as days prior to last outing/pitch (7-, 15-, 30-, 45-, and 90-days prior). All sequences must be converted to arrays (then likely tensors) for model development.

In [7]:
# days until second-to-last outing (outing-level model)
def days_until_2nd_last_outing(group):
    """
    Day offset of every row from the group's second-to-last outing date.

    Args:
        group (pd.DataFrame): Rows for one pitcher-season; must contain 'game_date'.

    Returns:
        pd.Series: Indexed like `group`. Values are (second-to-last date - row date)
        in whole days, so rows from the final outing are negative. When the group
        has fewer than two distinct dates, every value is pd.NA.
    """
    outing_dates = pd.to_datetime(group['game_date'])
    distinct_dates = sorted(outing_dates.unique())

    # with two or more outings, measure each row against the penultimate date
    if len(distinct_dates) >= 2:
        return (distinct_dates[-2] - outing_dates).dt.days

    # a single outing has no second-to-last date to measure against
    return pd.Series([pd.NA] * len(group), index=group.index)

# days until last outing (pitch-level model)
def days_until_last_outing(group):
    """
    Day offset of every row from the group's most recent outing date.

    Args:
        group (pd.DataFrame): Rows for one pitcher-season; must contain 'game_date'.

    Returns:
        pd.Series: Indexed like `group`; (latest date - row date) in whole days,
        so rows from the final outing are 0.
    """
    outing_dates = pd.to_datetime(group['game_date'])
    gap_to_last = outing_dates.max() - outing_dates

    return gap_to_last.dt.days


In [108]:
# set model features -- contextual layer
# These columns are appended after TIME_SERIES_FTS when sequences are built.
# NOTE(review): 'p_throws' looks like a handedness label -- confirm it is already
# numerically encoded, since the sequences are later converted with torch.tensor
CONTEXTUAL_FTS = [
    'p_throws',
    'pitcher_days_since_prev_game',
    'outing_number'
]

# set model features -- time series layer
# Column order here fixes the feature order of every sequence array/tensor.
TIME_SERIES_FTS = [
    # predicted load
    'pred_peak_evt_normalized',
    'within_outing_cumulative_evt_workload',
    
    # ball flight
    'rel_speed', 
    'rel_side',
    'rel_ht', 
    'spin_rate'
]


In [119]:
# setup pitch-level dataset
    # pitch_uuid: unique identifier for model dataset (the row index of the cohort frame)
pitch_dataset = cohort_preds_final.copy()
pitch_dataset.insert(0, 'pitch_uuid', pitch_dataset.index)

# compute days until last outing for each pitcher-season
    # the group keys are dropped from the result so it aligns on the original row index
days_to_last = pitch_dataset.groupby(['pitcher', 'season']).apply(days_until_last_outing)
pitch_dataset['days_until_last_outing'] = days_to_last.reset_index(level=[0,1], drop=True)

# store sequences by days until last outing prior to injury
    # one (rows x features) array per pitcher-season, for each look-back window
pitch_level_sequences = {}
for day in (7, 15, 30, 45, 90):
    in_window = pitch_dataset['days_until_last_outing'] <= day
    pitch_level_sequences[day] = [
        rows[TIME_SERIES_FTS + CONTEXTUAL_FTS].values
        for _, rows in pitch_dataset[in_window].groupby(['pitcher', 'season'])
    ]

In [None]:
# setup outing-level dataset
    # originally: 27,462 distinct outings --> these will be reduced
    # outing_id: one integer per (pitcher, game_date) pair
outing_dataset = cohort_preds_final.copy()
outing_ids = outing_dataset.groupby(['pitcher', 'game_date']).ngroup()
outing_dataset.insert(0, 'outing_id', outing_ids)

# compute days until second-to-last outing for each pitcher-season
    # NOTE: second-to-last because we're trying to predict "next" outing injury
    # last outings will have negative days --> drop
    # (pitcher-seasons with a single outing are NA and are dropped by the same filter)
days_to_2nd_last = outing_dataset.groupby(['pitcher', 'season']).apply(days_until_2nd_last_outing)
outing_dataset['days_until_2nd_last_outing'] = days_to_2nd_last.reset_index(level=[0,1], drop=True)
keep_rows = outing_dataset['days_until_2nd_last_outing'] >= 0
outing_dataset_clean = outing_dataset[keep_rows].reset_index(drop=True)

# store sequences by days until second-to-last outing (ie., prior to injured outing)
    # NOTE(review): each array holds every row in the window (no per-outing aggregation),
    # while the outing-level outcomes are built per outing_id -- confirm the time axes
    # are meant to differ
outing_level_sequences = {}
for day in (7, 15, 30, 45, 90):
    in_window = outing_dataset_clean['days_until_2nd_last_outing'] <= day
    outing_level_sequences[day] = [
        rows[TIME_SERIES_FTS + CONTEXTUAL_FTS].values
        for _, rows in outing_dataset_clean[in_window].groupby(['pitcher', 'season'])
    ]
    

$\textbf{Create Outcome Probability Grid}$

Binary outcome (__1 := injured, 0 := non-injured__) is also converted to a probability for injured pitchers to encourage model learning:
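- For _outing sequences_, an injured pitcher's label is $1/k$, where $k$ counts outings back from the final one in the window ($\dots, 1/3, 1/2, 1$), so it rises to 1 at the last outing. Note that this is a reciprocal ramp rather than a linear one.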
- For _pitch-level sequences_, a sigmoid is created over all pitches

In [45]:
import numpy as np

In [81]:
def sigmoid(length: int) -> np.ndarray:
    """
    Generate a sigmoid curve with `length` points.

    The logistic function is evaluated on `length` evenly spaced inputs from
    -6 to 6 (inclusive), so values rise from ~0.0025 to ~0.9975.
    """
    grid = np.linspace(-6, 6, length)
    return 1 / (1 + np.exp(-grid))

def update_outcome_probabilities(
        data: pd.DataFrame, 
        model_type: str,
        outcome_col: str = 'injured_cohort_pitcher',
) -> pd.DataFrame:
    """
    Update the outcome probabilities for each pitcher based on their outing history.
    
    Args:
        data (pd.DataFrame): The DataFrame containing the data.
        model_type (str): The type of model being used ('pitch_level' or 'outing_level').
        outcome_col (str): The column containing the binary outcome. Defaults to 'injured_cohort_pitcher'.
    
    Returns:
        pd.DataFrame: The DataFrame with updated probabilities.
    """
    # update group columns based on model type
    match model_type:
        case 'pitch_level':
            pass
        case 'outing_level':
            group_cols = ['pitcher', 'outing_id', 'season']

    # trim to outcomes
        # applies a linear increasing probability with each outing for injured pitchers
    outcomes_orig = data[group_cols + [outcome_col]].drop_duplicates().reset_index(drop=True).copy()
    outcome_probs = outcomes_orig[outcome_col] / (outcomes_orig.sort_values(by=group_cols, ascending=False).groupby(['pitcher', 'season']).cumcount() + 1)

    # clean outcomes to binary
    outcomes_binary = np.where((outcome_probs < 1) & (outcome_probs > 0), 0, outcome_probs)

    return outcome_probs, outcomes_binary


In [168]:
# setup outcome dictionaries
    # one {'probs': [...], 'binary': [...]} pair per look-back window (days)
pitch_level_outcomes = {
    day: {'probs': [], 'binary': []} for day in (7, 15, 30, 45, 90)
}
outing_level_outcomes = {
    day: {'probs': [], 'binary': []} for day in (7, 15, 30, 45, 90)
}

# iterate through outing-level and update (1/k probs & binary)
for day in outing_level_outcomes.keys():
    day_outings = outing_dataset_clean[outing_dataset_clean['days_until_2nd_last_outing'] <= day].reset_index(drop=True)
    
    # iterate through pitcher-season combos in the data
    for group, rows in day_outings.groupby(['pitcher', 'season']):
        day_probs, day_binary = update_outcome_probabilities(
            rows,
            model_type='outing_level'
        )
        outing_level_outcomes[day]['probs'].append(list(day_probs))
        outing_level_outcomes[day]['binary'].append(list(day_binary))

# iterate through pitch-level and update (sigmoid probs & binary)
for day in pitch_level_outcomes.keys():
    day_pitches = pitch_dataset[pitch_dataset['days_until_last_outing'] <= day].reset_index(drop=True)
    
    # iterate through pitcher-season combos in the df
    for group, rows in day_pitches.groupby(['pitcher', 'season']):
        # only injured pitchers get a rising label; non-injured pitchers stay at 0
        # (previously every pitcher-season received the sigmoid and a final-pitch 1)
        injured_flag = int(bool(rows['injured_cohort_pitcher'].max() == 1))

        # get sigmoid probs over all pitches in the window
        sigmoid_probs = sigmoid(rows.shape[0])
        pitch_level_outcomes[day]['probs'].append(list(sigmoid_probs * injured_flag))

        # convert to binary outcomes: 1 on the final (max-probability) pitch only
        final_pitch = np.where(sigmoid_probs == sigmoid_probs.max(), 1, 0)
        pitch_level_outcomes[day]['binary'].append(list(final_pitch * injured_flag))


$\textbf{Tensor Setup}$

In [195]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# convert to tensors and pad to the same length
def create_padded_tensor(sequences: list) -> torch.Tensor:
    """ 
    Stack variable-length sequences into a single zero-padded tensor.

    Args:
        sequences (list): Sequences (arrays or nested lists); each is converted
            with `torch.tensor` and may differ in length along its first axis.

    Returns:
        torch.Tensor: Batch-first tensor whose second dimension is the longest
        sequence length; shorter sequences are right-padded with 0.
    """
    return pad_sequence(
        [torch.tensor(item) for item in sequences],
        batch_first=True,
        padding_value=0,
    )


In [227]:
# setup tensor dictionaries
    # per look-back window: padded sequences, validity mask, true lengths
pitch_level_tensors = {
    day: {'seq': None, 'mask': None, 'lengths': None} for day in (7, 15, 30, 45, 90)
}
outing_level_tensors = {
    day: {'seq': None, 'mask': None, 'lengths': None} for day in (7, 15, 30, 45, 90)
}

# iterate through outing-level, then pitch-level, and create tensors
    # seq shape: (batch, max_seq_len, features)
    # lengths come from the unpadded sequences themselves and the mask is derived
    # from them, so a genuine all-zero timestep is never mistaken for padding
for _tensors, _sequences, _outcomes in (
    (outing_level_tensors, outing_level_sequences, outing_level_outcomes),
    (pitch_level_tensors, pitch_level_sequences, pitch_level_outcomes),
):
    for day in _tensors.keys():
        padded_seq = create_padded_tensor(_sequences[day])
        seq_lengths = torch.tensor([len(seq) for seq in _sequences[day]])

        _tensors[day]['seq'] = padded_seq
        # True where the timestep index is below that sequence's real length
        _tensors[day]['mask'] = torch.arange(padded_seq.shape[1])[None, :] < seq_lengths[:, None]
        _tensors[day]['lengths'] = seq_lengths

        # convert outcomes to padded tensors
        # NOTE: overwrites the list-based outcomes in place with tensors
        _outcomes[day]['probs'] = create_padded_tensor(_outcomes[day]['probs'])
        _outcomes[day]['binary'] = create_padded_tensor(_outcomes[day]['binary'])
    

In [233]:
# setup tensor datasets
    # also store relevant model information: loss, num. of sequences, etc
    # every window starts with the same keys; 'max_seq_len' is added in the loop below
pitch_level_datasets = {
    day: {'probs': None, 'binary': None, 'loss_function': None, 'num_sequences': None}
    for day in (7, 15, 30, 45, 90)
}
outing_level_datasets = {
    day: {'probs': None, 'binary': None, 'loss_function': None, 'num_sequences': None}
    for day in (7, 15, 30, 45, 90)
}

# update datasets (binary & probs)
    # each TensorDataset yields (sequence, outcome, mask, length) per pitcher-season
for day in [7, 15, 30, 45, 90]:
    pitch_tensors = pitch_level_tensors[day]
    outing_tensors = outing_level_tensors[day]

    for outcome_type in ['binary', 'probs']:
        pitch_level_datasets[day][outcome_type] = TensorDataset(
            pitch_tensors['seq'],
            pitch_level_outcomes[day][outcome_type],
            pitch_tensors['mask'],
            pitch_tensors['lengths']
        )
        outing_level_datasets[day][outcome_type] = TensorDataset(
            outing_tensors['seq'],
            outing_level_outcomes[day][outcome_type],
            outing_tensors['mask'],
            outing_tensors['lengths']
        )

    # update metadata (independent of outcome type, so set once per window)
    pitch_level_datasets[day]['num_sequences'] = pitch_tensors['seq'].shape[0]
    outing_level_datasets[day]['num_sequences'] = outing_tensors['seq'].shape[0]
    pitch_level_datasets[day]['max_seq_len'] = pitch_tensors['seq'].shape[1]
    outing_level_datasets[day]['max_seq_len'] = outing_tensors['seq'].shape[1]

    # TODO: set loss function

$\textbf{Model Architecture}$

Both models use a CNN-(bi)LSTM architecture.

In [235]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
""" MODEL ARCHITECTURES """
# model architecture: pitch-level estimates
class PitchLevelModel(nn.Module):
    """
    Pitch-level model for estimating injury risk. Uses CNN layers for local feature extraction and a bi-directional LSTM for long-term patterns.

    **Args (Instantiation)**:
        num_fts (int): Number of input features.
        cnn_channels (int): Number of convolutional channels. Default is 64.
        lstm_hidden (int): Number of hidden units in the LSTM. Default is 96.
        kernel (int): Kernel size for the convolutional layer. Default is 5.
            Use an odd value: padding is `kernel // 2`, which preserves the
            sequence length only for odd kernels.
        dropout (float): Dropout rate. Default is 0.1.
        bidir (bool): Whether to use a bidirectional LSTM. Default is True.

    **Inputs**:
        x (torch.Tensor): Input tensor of shape [B, T, K] where B is batch size, T is (padded) sequence length, and K is number of features.
        lengths (torch.Tensor): True (unpadded) length of each sequence in the batch, shape [B]. Every length must be >= 1.

    **Outputs**:
        logits_step (torch.Tensor): Output logits for each time step of shape [B, T].
            T always equals the padded length of `x`, even when the longest
            sequence in the batch is shorter, so the logits line up with the mask.

    **Note**:
        Use BCEWithLogitsLoss with a mask for training. Values at padded
        positions are not meaningful and must be masked out.
    """
    def __init__(
            self,
            num_fts: int, 
            cnn_channels: int = 64, 
            lstm_hidden: int = 96, 
            kernel: int = 5, 
            dropout: float = 0.1, 
            bidir: bool = True
    ) -> None:
        super().__init__()
        
        # CNN layers
        self.conv = nn.Conv1d(num_fts, cnn_channels, kernel_size=kernel, padding=kernel//2)
        
        # activation (ReLU) + dropout
        self.act = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        
        # LSTM layers
        self.lstm = nn.LSTM(cnn_channels, lstm_hidden, batch_first=True, bidirectional=bidir)
        
        # output heads
        hdim = lstm_hidden * (2 if bidir else 1)
        self.head_step = nn.Linear(hdim, 1)

    def forward(
            self, 
            x: torch.Tensor, 
            lengths: torch.Tensor
    ) -> torch.Tensor:
        # match the parameter dtype (arrays coming from pandas are float64)
        x = x.to(self.conv.weight.dtype)

        # [B,T,K] -> conv over time
        z = self.act(self.conv(x.transpose(1,2))).transpose(1,2)        # [B,T,C]
        z = self.drop(z)

        # pack padded sequence for LSTM
        packed = pack_padded_sequence(
            z, 
            lengths.cpu(), 
            batch_first=True, 
            enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)

        # re-pad to the input's full T; without total_length the output would be
        # truncated to max(lengths) and no longer align with a [B,T] mask
        out, _ = pad_packed_sequence(
            packed_out,
            batch_first=True,
            total_length=x.size(1)
        )                                                               # [B,T,H]
        
        # get logits per timestep
        logits_step = self.head_step(out).squeeze(-1)                   # [B,T]
        
        return logits_step



In [240]:
""" LOSS FUNCTIONS """
def pitch_level_loss(
        logits_step: torch.Tensor, 
        y_step: torch.Tensor, 
        mask: torch.Tensor, 
        pos_weight: bool = False
) -> torch.Tensor:
    """ 
    Compute pitch-level loss given ground truth. Valid for binary or smoothed (e.g., sigmoid) outcome labels.

    Only time steps where `mask` is True contribute; the loss is the mean binary
    cross-entropy (with logits) over those steps.

    Args:
        logits_step (torch.Tensor): Logits from the model of shape [B, T].
        y_step (torch.Tensor): Ground truth labels of shape [B, T]. Should be in [0, 1].
            Integer labels are cast to the logits' dtype.
        mask (torch.Tensor): Mask indicating valid time steps of shape [B, T].
        pos_weight (bool, optional): Whether or not to use weights for positive class in BCE loss.
            When True, the positive class is weighted by (#negative / #positive) computed
            over the valid steps (the denominator is clamped to at least 1).
            False or None gives the unweighted loss. A tensor is also accepted and is
            used directly as the positive-class weight. Default is False.

    Returns:
        torch.Tensor: Scalar loss.
    """
    valid = mask.bool()
    logits = logits_step[valid]
    targets = y_step[valid].to(logits.dtype)

    # explicit weight supplied by the caller
    if isinstance(pos_weight, torch.Tensor):
        return F.binary_cross_entropy_with_logits(logits, targets, pos_weight=pos_weight)

    # unweighted loss (previously the bool flag itself was passed as the weight)
    if not pos_weight:
        return F.binary_cross_entropy_with_logits(logits, targets)

    # setup weights: up-weight positives by the negative/positive mass ratio
    pos = targets.sum()
    neg = targets.numel() - pos
    class_weight = neg / pos.clamp(min=1.0)

    return F.binary_cross_entropy_with_logits(logits, targets, pos_weight=class_weight)



$\textbf{Sandbox: Development}$

$\textit{Outcome Handling}$

In [None]:
# gather outcomes
    # non-injured pitcher --> outcomes = 0
    # injured pitcher --> outcomes = ...
    # NOTE: outing_level_sequences[7] is a list of arrays (no column names), so the
    # 7-day window is rebuilt as a DataFrame from the cleaned outing dataset instead
example = outing_dataset_clean[outing_dataset_clean['days_until_2nd_last_outing'] <= 7].reset_index(drop=True)
example_outcomes = example[['pitcher', 'season', 'outing_id', 'injured_cohort_pitcher']].drop_duplicates().reset_index(drop=True).copy()
test_probs, test_binary = update_outcome_probabilities(example, model_type='outing_level')

In [51]:
# 7-day pitch window as a DataFrame (pitch_level_sequences[7] is a list of arrays
# and cannot be grouped by pitcher/season)
example_pitches = pitch_dataset[pitch_dataset['days_until_last_outing'] <= 7].reset_index(drop=True)

In [None]:
# iterate through all pitcher-pitches in the datespan
# NOTE: a bare expression inside a loop displays nothing; the loop's only lasting
# effect is that `rows` holds the final pitcher-season group, which the next cell reads
for group, rows in example_pitches.groupby(['pitcher', 'season']):
    rows

In [80]:
# preview the sigmoid label curve for the last pitcher-season group from the loop above
sigmoid(rows.shape[0])

array([0.00247262, 0.00273425, 0.00302348, 0.0033432 , 0.0036966 ,
       0.00408721, 0.0045189 , 0.00499596, 0.0055231 , 0.00610552,
       0.00674895, 0.00745967, 0.00824462, 0.0091114 , 0.01006839,
       0.01112476, 0.01229059, 0.01357692, 0.01499583, 0.01656054,
       0.01828548, 0.02018641, 0.02228046, 0.0245863 , 0.02712415,
       0.02991593, 0.03298531, 0.03635781, 0.04006084, 0.04412375,
       0.04857786, 0.05345645, 0.05879472, 0.06462967, 0.07100002,
       0.07794595, 0.08550885, 0.09373097, 0.10265494, 0.11232324,
       0.12277754, 0.1340579 , 0.14620194, 0.15924378, 0.17321298,
       0.18813338, 0.20402187, 0.22088711, 0.23872837, 0.25753434,
       0.27728217, 0.29793663, 0.31944957, 0.3417597 , 0.36479277,
       0.38846205, 0.41266938, 0.43730648, 0.46225681, 0.48739763,
       0.51260237, 0.53774319, 0.56269352, 0.58733062, 0.61153795,
       0.63520723, 0.6582403 , 0.68055043, 0.70206337, 0.72271783,
       0.74246566, 0.76127163, 0.77911289, 0.79597813, 0.81186