In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Clinical Model Setup}$

Injury risk estimation from two perspectives: 
- __Season__ (i.e., post-outing injury probability; postgame)
- __Pitch-Level__ (i.e., next-pitch injury probability; within game)

This notebook sets up the data structures and model architecture for development; the full, cleaned versions of each model are trained in `clinical_model_training.ipynb`.

In [2]:
""" INITIALIZE AWS CONNECTION """
# `aws` is reused by later cells (via `aws.load_s3_object`) to pull datasets from S3
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

- Cohort data (matches, preds, statcast)
- Ball flight aggregates 

In [3]:
# helper function for calculating averages
def get_avgs(
        data: pd.DataFrame,
        group_cols: list,
        avg_cols: list = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate']
) -> pd.DataFrame:
    """
    Compute the per-group mean of the requested columns.

    Args:
        data (pd.DataFrame): Input rows to aggregate.
        group_cols (list): Columns that define each group.
        avg_cols (list): Columns to average. Defaults to the four ball flight features
            ('rel_speed', 'rel_side', 'rel_ht', 'spin_rate').

    Returns:
        pd.DataFrame: One row per group, with the group columns followed by the averaged columns.
    """
    grouped = data.groupby(group_cols)
    group_means = grouped[avg_cols].mean()

    # move the group keys out of the index and back into regular columns
    return group_means.reset_index()

# filter pitches to date range for a given ID
def filter_pitches_by_date(
        pitch_data: pd.DataFrame, 
        pitcher_id: int, 
        start_date: str, 
        end_date: str
) -> pd.DataFrame:
    """
    Select one pitcher's pitches that fall inside an inclusive date window.

    Args:
        pitch_data (pd.DataFrame): Pitch-level data with 'pitcher' and 'game_date' columns.
        pitcher_id (int): ID of the pitcher to keep.
        start_date (str): First date to keep, in 'YYYY-MM-DD' format.
        end_date (str): Last date to keep, in 'YYYY-MM-DD' format.

    Returns:
        pd.DataFrame: The matching rows, with their original index labels.
    """
    is_pitcher = pitch_data['pitcher'] == pitcher_id
    on_or_after_start = pitch_data['game_date'] >= start_date
    on_or_before_end = pitch_data['game_date'] <= end_date

    # both window bounds are inclusive
    keep = is_pitcher & on_or_after_start & on_or_before_end
    return pitch_data[keep]

# add 'outing_before_injury' column for outing-level model: 1 if injured pitcher and second-to-last outing in season, else 0
def get_outing_before_injury(
        data: pd.DataFrame,
        injured_col: str = 'injured_cohort_pitcher'
) -> pd.DataFrame:
    """
    Flag the second-to-last outing of each injured pitcher-season (the outing before the injury outing).

    Args:
        data (pd.DataFrame): Pitch-level data with 'pitcher', 'season', and 'outing_number' columns.
        injured_col (str): Binary column marking injured pitchers. When present in `data`, only rows
            where it equals 1 are considered, so non-injured pitchers never receive a positive label.
            Defaults to 'injured_cohort_pitcher'.

    Returns:
        pd.DataFrame: One row per qualifying pitcher-season with columns
            ['pitcher', 'season', 'outing_number', 'outing_before_injury'], where
            'outing_before_injury' is always 1. Pitcher-seasons with fewer than two outings are
            omitted (there is no "outing before" to label), which keeps 'outing_number' in its
            original dtype for the downstream merge.
    """
    keys = ['pitcher', 'season']

    # restrict to injured pitchers when the indicator is available
    candidates = data
    if injured_col in data.columns:
        candidates = data[data[injured_col] == 1]

    # one row per (pitcher, season, outing), ordered by outing number
    outings = (
        candidates[keys + ['outing_number']]
        .dropna(subset=['outing_number'])
        .drop_duplicates()
        .sort_values(keys + ['outing_number'])
    )

    # position counted from the end of each pitcher-season: 0 = last outing, 1 = second-to-last
    position_from_end = outings.groupby(keys).cumcount(ascending=False)
    second_last_df = outings[position_from_end == 1].reset_index(drop=True)

    return second_last_df.assign(outing_before_injury=1)


In [4]:
# load cohort of matches, model predictions
cohort_matches = aws.load_s3_object('epidemiology/ml/datasets/full/cohort_matches_final.csv')
cohort_preds = aws.load_s3_object('epidemiology/ml/datasets/preds/model_cohort.csv')

# load all statcast data
    # likely fts: velo, release position, spin rate (no pitch labels)
statcast_data = aws.load_s3_object('epidemiology/ml/datasets/full/model_application_data.csv')
# inner join (pandas' default `how`): only pitches present in BOTH tables with identical key values are kept
    # NOTE(review): row counts before/after are not checked here -- consider asserting no pitches are dropped or duplicated
cohort_preds_statcast = cohort_preds.merge(statcast_data, on=['pitch_id', 'pitcher', 'game_date', 'pitcher_days_since_prev_game', 'injured_cohort_pitcher'])

In [5]:
# update dataset to match windows btw each injured & noninjured pitcher
    # frames are collected in their own list so `cohort_preds_final` is only ever a DataFrame
    # the loop variable is `injured_id` (not `id`) so the builtin `id` is not shadowed for later cells
window_frames = []
for injured_id in cohort_matches['mlbamid_injured'].unique():
    # get date of first and last pitch for each injured pitcher
    injured_pitches = cohort_preds_statcast[cohort_preds_statcast['pitcher'] == injured_id]
    first_pitch = injured_pitches['game_date'].min()
    last_pitch = injured_pitches['game_date'].max()

    # get all pitches for this pitcher, append to final dataset
    pitcher_data = filter_pitches_by_date(
        cohort_preds_statcast,
        injured_id,
        first_pitch,
        last_pitch
    ).reset_index(drop=True)
    window_frames.append(pitcher_data)

    # get pitches for match, restricted to the injured pitcher's date window
        # NOTE: only the first listed match is used if an injured pitcher has several
    matched_id = cohort_matches[cohort_matches['mlbamid_injured'] == injured_id]['mlbamid_noninjured'].values[0]
    matched_data = filter_pitches_by_date(
        cohort_preds_statcast,
        matched_id,
        first_pitch,
        last_pitch
    ).reset_index(drop=True)

    # append to final dataset
    window_frames.append(matched_data)

# concatenate all data
cohort_preds_final = pd.concat(window_frames, ignore_index=True)
injured_proportion = cohort_preds_final['injured_cohort_pitcher'].mean()        # check proportion of injured pitches --> should be close-ish to 50%

# create outing_before_injury column to use as outcome for outing-level model
    # left merge keeps every pitch; outings without a match get NaN, which is then filled with 0
outings_before_injury = get_outing_before_injury(cohort_preds_final)
cohort_preds_final = cohort_preds_final.merge(
    outings_before_injury,
    on=['pitcher', 'season', 'outing_number'],
    how='left'
)
cohort_preds_final['outing_before_injury'] = cohort_preds_final['outing_before_injury'].fillna(0)

In [158]:
cohort_preds_final.shape

(230779, 22)

In [6]:
# get baseline season-long ball flight averages for reference
    # NOTE: these may be referenced for the pitch level model
    # result has one row per (pitcher, season) with the mean of each ball flight feature
ball_flight_fts = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate']
ball_flight_season_avgs = get_avgs(
    cohort_preds_final,
    group_cols=['pitcher', 'season'],
    avg_cols=ball_flight_fts
)


$\textbf{Train/Test Splits}$

Loads previously set train/test splits (see `clinical_splits.ipynb`) and applies to `cohort_preds_final`.

In [11]:
# load splits
path_stem = 'epidemiology/ml/datasets/full'
cohort_train_ids = aws.load_s3_object(f'{path_stem}/cohort_train_ids.csv')
cohort_test_ids = aws.load_s3_object(f'{path_stem}/cohort_test_ids.csv')

# filter full dataset by train/test
    # membership is decided per pitcher via the 'mlbamid' column of each split file
train_pitchers = list(cohort_train_ids['mlbamid'])
test_pitchers = list(cohort_test_ids['mlbamid'])

in_train = cohort_preds_final['pitcher'].isin(train_pitchers)
in_test = cohort_preds_final['pitcher'].isin(test_pitchers)

cohort_preds_train = cohort_preds_final[in_train]
cohort_preds_test = cohort_preds_final[in_test]

$\textbf{Setup Data Sequences}$

- __Outing-Level Model__: Up to last outing prior to injury (last outing not included)
- __Pitch-Level Model__: Up to final pitch prior to injury

Both will be stored as days prior to last outing/pitch (7-, 15-, 30-, 45-, and 90-days prior). All sequences must be converted to arrays (then likely tensors) for model development.

In [12]:
# days until second-to-last outing (outing-level model)
def days_until_2nd_last_outing(group):
    """
    Calculate the number of days from each row's game date to the group's second-to-last outing date.

    Args:
        group (pd.DataFrame): Rows for a single pitcher and season; must contain a 'game_date' column.

    Returns:
        pd.Series: Indexed like `group`. Values are positive before the second-to-last outing, 0 on it,
            and negative for the last outing. If the group has fewer than two distinct outing dates,
            every value is NaN (float64) so that downstream numeric comparisons such as `>= 0`
            simply evaluate to False instead of hitting ambiguous `pd.NA` objects.
    """
    # parse once and reuse
    game_dates = pd.to_datetime(group['game_date'])

    # distinct outing dates in chronological order (missing dates are ignored)
    outing_dates = game_dates.dropna().drop_duplicates().sort_values()

    # not enough outings (eg., only 1) --> no second-to-last outing exists
    if len(outing_dates) < 2:
        return pd.Series(float('nan'), index=group.index, dtype='float64')

    second_last = outing_dates.iloc[-2]

    return (second_last - game_dates).dt.days

# days until last outing (pitch-level model)
def days_until_last_outing(group):
    """
    Calculate the number of days from each row's game date to the group's final outing date.

    Args:
        group (pd.DataFrame): Rows for a single pitcher and season; must contain a 'game_date' column.

    Returns:
        pd.Series: Indexed like `group`; 0 for rows on the final outing date, positive otherwise.
    """
    game_dates = pd.to_datetime(group['game_date'])
    final_outing = game_dates.max()

    # difference to the latest date in the group, expressed in whole days
    time_remaining = final_outing - game_dates
    return time_remaining.dt.days

# create pitch-level sequence dictionary
def create_pitch_level_sequences(
        data: pd.DataFrame,
        features: list,
        windows: tuple = (7, 15, 30, 45, 90)
) -> dict:
    """ 
    Creates a dictionary of pitch level sequences organized by days until last outing (by default 7, 15, 30, 45, and 90 days).
    
    Args:
        data (pd.DataFrame): A sorted DataFrame containing all pitch-level sequences. Must contain
            'pitcher', 'season', and 'game_date' columns.
        features (list): The columns to be used as features in the dataset.
        windows (tuple): Look-back windows, in days before each pitcher-season's last outing.
            Defaults to (7, 15, 30, 45, 90).
    
    Returns:
        dict: Window length (days) as keys; each value is a list with one array of shape
            (num_pitches, num_features) per pitcher-season that has pitches inside the window.
    """
    
    # setup pitch-level dataset
        # pitch_uuid: unique identifier for model dataset
    pitch_dataset = data.copy()
    pitch_dataset.insert(0, 'pitch_uuid', pitch_dataset.index)

    # compute days until last outing for each pitcher-season
        # `transform` returns a result aligned to the original rows, so this also works when the data
        # holds a single pitcher-season (where `groupby.apply` would return a wide DataFrame instead)
    game_dates = pd.to_datetime(pitch_dataset['game_date'])
    last_dates = game_dates.groupby(
        [pitch_dataset['pitcher'], pitch_dataset['season']]
    ).transform('max')
    pitch_dataset['days_until_last_outing'] = (last_dates - game_dates).dt.days

    # store sequences by days until last outing prior to injury
    pitch_level_sequences = {}
    for day in windows:
        in_window = pitch_dataset[pitch_dataset['days_until_last_outing'] <= day]
        pitch_level_sequences[day] = [
            group[features].values
                for _, group in in_window.groupby(['pitcher', 'season'])
        ]

    return pitch_level_sequences

# create outing-level sequence dictionary
def create_outing_level_sequences(
        data: pd.DataFrame,
        features: list,
        windows: tuple = (7, 15, 30, 45, 90)
) -> dict:
    """ 
    Creates a dictionary of outing level sequences organized by days until **second-to-last** outing (ie., outing prior to injury occurrence). 
    By default 7, 15, 30, 45, and 90 days of data are included.
    
    Args:
        data (pd.DataFrame): A sorted DataFrame containing all pitch-level sequences. Must contain
            'pitcher', 'season', and 'game_date' columns.
        features (list): The columns to be used as features in the dataset.
        windows (tuple): Look-back windows, in days before each pitcher-season's second-to-last outing.
            Defaults to (7, 15, 30, 45, 90).
    
    Returns:
        dict: Window length (days) as keys; each value is a list with one array of shape
            (num_rows, num_features) per pitcher-season that has rows inside the window.
            Pitcher-seasons with fewer than two distinct outing dates contribute no sequence.
    """
    # date of the second-to-last distinct outing in one pitcher-season (NaT if fewer than two)
    def _second_last_date(dates: pd.Series):
        unique_dates = dates.dropna().drop_duplicates().sort_values()
        return unique_dates.iloc[-2] if len(unique_dates) >= 2 else pd.NaT

    # setup outing-level dataset
    outing_dataset = data.copy()
    outing_dataset.insert(0, 'outing_id', outing_dataset.groupby(['pitcher', 'game_date']).ngroup())

    # compute days until second-to-last outing for each pitcher
        # NOTE: second-to-last because we're trying to predict "next" outing injury
        # last outings will have negative days --> drop
        # `transform` broadcasts the per-group date back onto the original rows, so single-group
        # inputs work and missing values stay numeric NaN (filtered out by `>= 0` below)
    game_dates = pd.to_datetime(outing_dataset['game_date'])
    second_last = pd.to_datetime(
        game_dates.groupby(
            [outing_dataset['pitcher'], outing_dataset['season']]
        ).transform(_second_last_date)
    )
    outing_dataset['days_until_2nd_last_outing'] = (second_last - game_dates).dt.days
    outing_dataset_clean = outing_dataset[outing_dataset['days_until_2nd_last_outing'] >= 0].reset_index(drop=True)

    # store sequence lengths by days until second-to-last outing (ie., prior to injured outing)
    outing_level_sequences = {}
    for day in windows:
        in_window = outing_dataset_clean[outing_dataset_clean['days_until_2nd_last_outing'] <= day]
        outing_level_sequences[day] = [
            group[features].values
                for _, group in in_window.groupby(['pitcher', 'season'])
        ]

    return outing_level_sequences
        


In [13]:
# set model features -- contextual & time series layers
    # NOTE(review): every column listed here is later cast to float32 in `create_padded_tensor`
    # --> confirm 'p_throws' is numerically encoded upstream
CONTEXTUAL_FTS = [
    'p_throws',
    'pitcher_days_since_prev_game',
    'outing_number'
]
TIME_SERIES_FTS = [
    # predicted load
    'pred_peak_evt_normalized',
    'within_outing_cumulative_evt_workload',
    
    # ball flight
    'rel_speed', 
    'rel_side',
    'rel_ht', 
    'spin_rate'
]

# combine for full features
    # column order here is the feature order of every sequence array built from `feature_set`
feature_set = CONTEXTUAL_FTS + TIME_SERIES_FTS

In [14]:
# setup pitch- and outing-level training sets: inputs
    # each result is a dict keyed by look-back window (days) holding a list of per pitcher-season arrays
pitch_level_trn = create_pitch_level_sequences(cohort_preds_train, feature_set)
outing_level_trn = create_outing_level_sequences(cohort_preds_train, feature_set)

# setup pitch- and outing-level validation sets: inputs
    # built from the held-out test pitchers (`cohort_preds_test`)
pitch_level_val = create_pitch_level_sequences(cohort_preds_test, feature_set)
outing_level_val = create_outing_level_sequences(cohort_preds_test, feature_set)

$\textbf{Create Outcome Probability Grid}$

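Binary outcome (__1 := injured, 0 := non-injured__) is also converted to a probability for injured pitchers to encourage model learning. The intended design is:
- For _outing sequences_, apply a linear increase with each outing until 1 is reached
- For _pitch-level sequences_, create a sigmoid over all pitches

__Note__: as currently implemented below, both `create_pitch_level_outcomes` and `create_outing_level_outcomes` obtain `probs` by passing the binary label through a logistic function (0 → 0.5, 1 → ≈0.73); the per-sequence ramps described above are not yet applied.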

In [15]:
import torch
import numpy as np

In [136]:
# helper function to approximate a sigmoid given a sequence length
def sigmoid(length: int) -> np.ndarray:
    """ Generate a sigmoid curve with `length` evenly spaced points over the input range [-6, 6]."""
    grid = np.linspace(-6, 6, length)
    denominator = 1 + np.exp(-grid)
    return 1 / denominator

# update outcome probabilities based on sequence length
    # --> creates soft probabilities to aid with loss calculations
def update_outcome_probabilities(
        data: pd.DataFrame, 
        model_type: str,
        outcome_col: str = 'injured_cohort_pitcher',
) -> tuple:
    """
    Build soft and binary outcomes per outing, ramping toward 1 at each injured pitcher's final outing.

    For every pitcher-season, outings are counted back from the last one (1 = last, 2 = second-to-last, ...)
    and the soft label is `outcome / count`, i.e. 1, 1/2, 1/3, ... for injured pitchers and 0 otherwise.
    
    Args:
        data (pd.DataFrame): The DataFrame containing the data. For 'outing_level' it must contain
            'pitcher', 'outing_id', 'season', and `outcome_col`.
        model_type (str): The type of model being used ('pitch_level' or 'outing_level').
            Only 'outing_level' is currently supported.
        outcome_col (str): The column containing the binary outcome. Defaults to 'injured_cohort_pitcher'.
    
    Returns:
        tuple: `(outcome_probs, outcomes_binary)` where `outcome_probs` is a pd.Series of soft labels
            (one per unique outing row) and `outcomes_binary` is a np.ndarray that is 1 where the
            soft label equals 1 and 0 elsewhere.

    Raises:
        NotImplementedError: If `model_type` is 'pitch_level' (no grouping is defined for it yet).
        ValueError: If `model_type` is not a recognised model type.
    """
    # update group columns based on model type
    if model_type == 'outing_level':
        group_cols = ['pitcher', 'outing_id', 'season']
    elif model_type == 'pitch_level':
        # previously fell through and crashed on an unbound `group_cols`
        raise NotImplementedError("update_outcome_probabilities does not support 'pitch_level' yet")
    else:
        raise ValueError(f"Unknown model_type: {model_type!r} (expected 'pitch_level' or 'outing_level')")

    # trim to outcomes
        # applies a linear increasing probability with each outing for injured pitchers
    outcomes_orig = data[group_cols + [outcome_col]].drop_duplicates().reset_index(drop=True)

    # 1 for the last outing of a pitcher-season, 2 for the one before, ... (aligned back by index)
    outings_from_end = (
        outcomes_orig
        .sort_values(by=group_cols, ascending=False)
        .groupby(['pitcher', 'season'])
        .cumcount() + 1
    )
    outcome_probs = outcomes_orig[outcome_col] / outings_from_end

    # clean outcomes to binary: only a soft label of exactly 1 counts as positive
    outcomes_binary = np.where(outcome_probs == 1, 1, 0)

    return outcome_probs, outcomes_binary

# create pitch level outcomes 
    # uses outcome & last outing cols to create a model outcome --> approx 20% of pitches will be labeled as injured
    # returns binary and soft labels
def create_pitch_level_outcomes(
        input_data: pd.DataFrame,
        outcome_col: str = 'injured_cohort_pitcher',
        last_outing_col: str = 'days_until_last_outing',
        model_outcome_name: str = 'injured_pitch',
        windows: tuple = (7, 15, 30, 45, 90)
) -> dict:
    """ 
    Creates a dictionary of pitch level outcomes organized by days until last outing (by default 7, 15, 30, 45, and 90 days).
    
    Args:
        input_data (pd.DataFrame): Original data used to create the sequences. Must contain
            'pitcher', 'season', 'game_date', and `outcome_col`.
        outcome_col (str): The column containing the binary outcome. Defaults to 'injured_cohort_pitcher'.
        last_outing_col (str): The column containing the days until last outing, used to decide which
            pitches are labeled. Defaults to 'days_until_last_outing', which this function computes;
            any other name must already exist in `input_data`.
        model_outcome_name (str): The name of the model outcome column to be created. Defaults to 'injured_pitch'.
        windows (tuple): Look-back windows, in days before each pitcher-season's last outing.
            Defaults to (7, 15, 30, 45, 90).
    
    Returns:
        dict: Window length (days) as keys; each value is a dict with
            'probs' (list of per pitcher-season soft-label lists) and
            'binary' (list of per pitcher-season 0/1 label lists).
    """
    # compute days until last outing once (it does not depend on the window)
        # `transform` keeps the result aligned to the original rows, so single pitcher-season inputs work
    all_pitches = input_data.copy()
    game_dates = pd.to_datetime(all_pitches['game_date'])
    last_dates = game_dates.groupby(
        [all_pitches['pitcher'], all_pitches['season']]
    ).transform('max')
    all_pitches['days_until_last_outing'] = (last_dates - game_dates).dt.days

    # iterate through days 
    pitch_level_outcomes = {}
    for day in windows:
        # filter to proper date range given by the key
        day_pitches = all_pitches[all_pitches['days_until_last_outing'] <= day].reset_index(drop=True)

        # create model outcome col: 1 only for an injured pitcher's pitches in the final outing
        day_pitches[model_outcome_name] = np.where(
            (day_pitches[outcome_col] == 1) & (day_pitches[last_outing_col] == 0),
            1, 
            0
        )
        
        # iterate through pitcher-season combos in the df
        probs, binary = [], []
        for _, rows in day_pitches.groupby(['pitcher', 'season']):
            # get sigmoid probs
                # NOTE(review): this is the logistic of the 0/1 label, so values are 0.5 (label 0) or ~0.73 (label 1);
                # confirm this is the intended soft label rather than a ramp over the sequence
            sigmoid_probs = 1 / (1 + np.exp(-(np.array(rows[model_outcome_name]))))
            probs.append(list(sigmoid_probs))

            # convert to binary outcomes
            binary.append(list(np.where(sigmoid_probs > 0.5, 1, 0)))

        pitch_level_outcomes[day] = {'probs': probs, 'binary': binary}

    return pitch_level_outcomes

# create outing level outcomes 
    # returns binary and soft labels
def create_outing_level_outcomes(
        input_data: pd.DataFrame,
        outcome_col: str = 'outing_before_injury',
        windows: tuple = (7, 15, 30, 45, 90)
) -> dict:
    """ 
    Creates a dictionary of outing level outcomes organized by days until **second-to-last** outing (by default 7, 15, 30, 45, and 90 days).
    
    Args:
        input_data (pd.DataFrame): Original data used to create the sequences. Must contain
            'pitcher', 'season', 'game_date', and `outcome_col`.
        outcome_col (str): The column containing the binary outcome. Defaults to 'outing_before_injury'.
        windows (tuple): Look-back windows, in days before each pitcher-season's second-to-last outing.
            Defaults to (7, 15, 30, 45, 90).
    
    Returns:
        dict: Window length (days) as keys; each value is a dict with
            'probs' (list of per pitcher-season soft-label lists) and
            'binary' (list of per pitcher-season label lists taken from `outcome_col`).
            Pitcher-seasons with fewer than two distinct outing dates contribute nothing.
    """
    # date of the second-to-last distinct outing in one pitcher-season (NaT if fewer than two)
    def _second_last_date(dates: pd.Series):
        unique_dates = dates.dropna().drop_duplicates().sort_values()
        return unique_dates.iloc[-2] if len(unique_dates) >= 2 else pd.NaT

    # setup data once (it does not depend on the window)
        # --> compute days until second-to-last outing for each pitcher
    outing_dataset = input_data.copy()
    outing_dataset.insert(0, 'outing_id', outing_dataset.groupby(['pitcher', 'game_date']).ngroup())
    game_dates = pd.to_datetime(outing_dataset['game_date'])
    second_last = pd.to_datetime(
        game_dates.groupby(
            [outing_dataset['pitcher'], outing_dataset['season']]
        ).transform(_second_last_date)
    )
    outing_dataset['days_until_2nd_last_outing'] = (second_last - game_dates).dt.days

    # drop negative days (last outing) and missing values (pitcher-seasons without two outings)
    outing_dataset_clean = outing_dataset[outing_dataset['days_until_2nd_last_outing'] >= 0].reset_index(drop=True)

    # iterate through windows and collect soft & binary labels
    outing_level_outcomes = {}
    for day in windows:
        # filter to window
        day_outings = outing_dataset_clean[outing_dataset_clean['days_until_2nd_last_outing'] <= day].reset_index(drop=True)
        
        # iterate through pitcher-season combos in the df
        probs, binary = [], []
        for _, rows in day_outings.groupby(['pitcher', 'season']):
            # get sigmoid probs
                # NOTE(review): this is the logistic of the 0/1 label, so values are 0.5 (label 0) or ~0.73 (label 1);
                # confirm this is the intended soft label rather than a linear ramp over outings
            sigmoid_probs = 1 / (1 + np.exp(-(np.array(rows[outcome_col]))))
            probs.append(list(sigmoid_probs))

            # binary outcomes come straight from the outcome column
            binary.append(list(rows[outcome_col]))

        outing_level_outcomes[day] = {'probs': probs, 'binary': binary}

    return outing_level_outcomes



In [137]:
# setup pitch- and outing-level training sets: outcomes
    # each result is a dict keyed by look-back window (days) with 'probs' and 'binary' label lists
pitch_level_trn_y = create_pitch_level_outcomes(cohort_preds_train)
outing_level_trn_y = create_outing_level_outcomes(cohort_preds_train)

# setup pitch- and outing-level validation sets: outcomes
    # built from the same frames as the validation inputs so sequences and labels line up
pitch_level_val_y = create_pitch_level_outcomes(cohort_preds_test)
outing_level_val_y = create_outing_level_outcomes(cohort_preds_test)

$\textbf{Aggregate for Streamlined Data Storage}$

Combine into pitch- and outing-level dictionaries for better storage, downstream processing.

In [144]:
import pickle

In [145]:
# reset all into a dictionary
    # layout: {'trn' | 'val': {'inputs': sequences by window, 'outputs': labels by window}}
pitch_level = {
    'trn': {
        'inputs': pitch_level_trn,
        'outputs': pitch_level_trn_y
    },
    'val': {
        'inputs': pitch_level_val, 
        'outputs': pitch_level_val_y
    }
}
outing_level = {
    'trn': {
        'inputs': outing_level_trn,
        'outputs': outing_level_trn_y
    },
    'val': {
        'inputs': outing_level_val, 
        'outputs': outing_level_val_y
    }
}

# save to local disk
    # NOTE: paths are relative to the notebook's working directory; the 'storage/' folder must already exist
with open('storage/pitch_level.pkl', 'wb') as f:
    pickle.dump(pitch_level, f)
with open('storage/outing_level.pkl', 'wb') as f:
    pickle.dump(outing_level, f)

# TODO: upload to AWS (--> pytorch folder)
# with open('storage/pitch_level.pkl', 'rb') as f:
#     content = f.read()
#     aws.upload_to_s3(content, 'epidemiology/ml/datasets/pytorch/pitch_level_arrays.pkl')
# with open('storage/outing_level.pkl', 'rb') as f:
#     content = f.read()
#     aws.upload_to_s3(content, 'epidemiology/ml/datasets/pytorch/outing_level_arrays.pkl')

$\textbf{Tensor Setup}$

In [146]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

In [147]:
# convert to tensors and pad to the same length
def create_padded_tensor(sequences: list) -> torch.Tensor:
    """ 
    Stack variable-length sequences into one zero-padded float32 tensor.
    
    Args:
        sequences (list): Sequences (arrays or nested lists) that may differ in their first dimension.
    
    Returns:
        torch.Tensor: Batch-first tensor whose second dimension is the longest sequence length;
            shorter sequences are right-padded with 0.
    """
    as_tensors = []
    for seq in sequences:
        # cast every sequence to float32 before handing it to torch
        float_array = np.array(seq, dtype=np.float32)
        as_tensors.append(torch.tensor(float_array))

    # shape: (batch, max_seq_len, ...)
    return pad_sequence(as_tensors, batch_first=True, padding_value=0)

# helper function for creating sorted tensor dictionaries
def setup_all_tensors(
        input_arrays: dict,
        output_arrays: dict
) -> dict:
    """ 
    Helper function for creating sorted (pitch- or outing-level) tensor dictionaries.

    Args:
        input_arrays (dict): Look-back window (days) -> list of per-sequence feature arrays.
        output_arrays (dict): Look-back window (days) -> {'probs': [...], 'binary': [...]} with one
            label list per sequence, in the same order as `input_arrays`.

    Returns:
        dict: Look-back window (days) -> {
            'seq': padded inputs, shape (batch, max_seq_len, features),
            'mask': bool tensor (batch, max_seq_len), True on real (non-padded) timesteps,
            'lengths': int64 tensor (batch,), true length of each sequence,
            'probs': padded soft labels, 'binary': padded binary labels
        }
    """
    sorted_tensors = {}

    # iterate through every window present in the inputs and create tensors
    for day, sequences in input_arrays.items():
        seq = create_padded_tensor(sequences)

        # lengths come from the sequences themselves rather than from "row is all zeros", so a real
        # timestep whose features all happen to be 0 is not mistaken for padding
        lengths = torch.tensor([len(s) for s in sequences], dtype=torch.long)

        # True for positions before each sequence's end
        positions = torch.arange(seq.shape[1]).unsqueeze(0)
        mask = positions < lengths.unsqueeze(1)

        sorted_tensors[day] = {
            'seq': seq,
            'mask': mask,
            'lengths': lengths,

            # convert outcomes to padded tensors
            'probs': create_padded_tensor(output_arrays[day]['probs']),
            'binary': create_padded_tensor(output_arrays[day]['binary'])
        }

    return sorted_tensors



In [148]:
# create tensors -- pitch level
    # one tensor dictionary per split, built from that split's inputs & outputs
pitch_level_tensors = {
    'trn': setup_all_tensors(pitch_level['trn']['inputs'], pitch_level['trn']['outputs']),
    'val': setup_all_tensors(pitch_level['val']['inputs'], pitch_level['val']['outputs'])
}

# create tensors -- outing level
outing_level_tensors = {
    'trn': setup_all_tensors(outing_level['trn']['inputs'], outing_level['trn']['outputs']),
    'val': setup_all_tensors(outing_level['val']['inputs'], outing_level['val']['outputs'])
}

In [149]:
# save to local disk
    # NOTE: paths are relative to the notebook's working directory; the 'storage/' folder must already exist
with open('storage/pitch_level_tensors.pkl', 'wb') as f:
    pickle.dump(pitch_level_tensors, f)
with open('storage/outing_level_tensors.pkl', 'wb') as f:
    pickle.dump(outing_level_tensors, f)

# TODO: upload to AWS (--> pytorch folder)
# with open('storage/pitch_level_tensors.pkl', 'rb') as f:
#     content = f.read()
#     aws.upload_to_s3(content, 'epidemiology/ml/datasets/pytorch/pitch_level_tensors.pkl')
# with open('storage/outing_level_tensors.pkl', 'rb') as f:
#     content = f.read()
#     aws.upload_to_s3(content, 'epidemiology/ml/datasets/pytorch/outing_level_tensors.pkl')

$\textit{Create Tensor Datasets}$

__Note__: In the future, these can be created using dictionaries loaded from above. They won't be necessary if manual batch iteration is used.

In [150]:
# setup tensor datasets
    # also store relevant model information: loss, num. of sequences, etc
    # every (split, window) entry has the same keys; datasets & metadata are filled in below
DATASET_SPLITS = ('trn', 'val')
DATASET_WINDOWS = (7, 15, 30, 45, 90)

pitch_level_datasets = {
    split: {
        day: {
            'probs': None,
            'binary': None,
            'loss_function': None,
            'num_sequences': None,
            'max_seq_len': None
        }
        for day in DATASET_WINDOWS
    }
    for split in DATASET_SPLITS
}
outing_level_datasets = {
    split: {
        day: {
            'probs': None,
            'binary': None,
            'loss_function': None,
            'num_sequences': None,
            'max_seq_len': None
        }
        for day in DATASET_WINDOWS
    }
    for split in DATASET_SPLITS
}

# update datasets (binary & probs)
    # the loop variable is `split` (not `set`) so the builtin `set` is not shadowed for later cells
for split in DATASET_SPLITS:
    for day in DATASET_WINDOWS:
        pitch_tensors = pitch_level_tensors[split][day]
        outing_tensors = outing_level_tensors[split][day]

        # each dataset yields (sequence, outcome, mask, length) per item
        for outcome_type in ['binary', 'probs']:
            pitch_level_datasets[split][day][outcome_type] = TensorDataset(
                pitch_tensors['seq'],
                pitch_tensors[outcome_type],
                pitch_tensors['mask'],
                pitch_tensors['lengths']
            )
            outing_level_datasets[split][day][outcome_type] = TensorDataset(
                outing_tensors['seq'],
                outing_tensors[outcome_type],
                outing_tensors['mask'],
                outing_tensors['lengths']
            )

        # update metadata (independent of outcome type, so set once per split/window)
        pitch_level_datasets[split][day]['num_sequences'] = pitch_tensors['seq'].shape[0]
        outing_level_datasets[split][day]['num_sequences'] = outing_tensors['seq'].shape[0]
        pitch_level_datasets[split][day]['max_seq_len'] = pitch_tensors['seq'].shape[1]
        outing_level_datasets[split][day]['max_seq_len'] = outing_tensors['seq'].shape[1]

        # TODO: set loss function?

$\textbf{Model Architecture}$

Both models use a __CNN-(bi)LSTM__ architecture:
- The CNN layers progressively extract salient features
- The bi-LSTM looks for temporal patterns that may distinguish injured and non-injured pitchers

In [151]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [152]:
""" MODEL ARCHITECTURES """
# CNN block for local patterns
class CNNBlock(nn.Module):
    """ 
    Depthwise-separable 1D convolutional block over time with residual. 

    The input passes through a depthwise conv (one filter per channel), a 1x1 pointwise conv,
    batch normalization, GELU, and dropout; the original input is then added back.

    Args:
        num_channels (int): Number of input (and output) channels.
        kernel (int): Size of the depthwise convolutional kernel; must be a positive odd number so
            that the output keeps the input's time length. Defaults to 7.
        dropout (float): Dropout rate. Defaults to 0.1.

    Raises:
        ValueError: If `kernel` is not a positive odd integer.
    
    **Note**: CNN expects tensor with shape [B, C, T]. 
    """
    def __init__(
            self, 
            num_channels: int, 
            kernel: int = 7, 
            dropout: float = 0.1
    ) -> None:
        super().__init__()

        # with padding = kernel // 2 an even kernel yields T + 1 timesteps, which cannot be added
        # to the residual -- fail early with a clear message instead of at the first forward pass
        if kernel < 1 or kernel % 2 == 0:
            raise ValueError(f"kernel must be a positive odd integer, got {kernel}")
        pad = kernel // 2
        
        # depthwise convolution (groups == channels), followed by 1x1 pointwise mixing
        self.dw = nn.Conv1d(num_channels, num_channels, kernel_size=kernel, padding=pad, groups=num_channels)
        self.pw = nn.Conv1d(num_channels, num_channels, kernel_size=1)
        
        # batch normalization, activation, and dropout
        self.bn = nn.BatchNorm1d(num_channels)
        self.act = nn.GELU()
        self.drop = nn.Dropout(dropout)

    def forward(
            self, 
            x: torch.Tensor
    ) -> torch.Tensor:  # x: [B,C,T]
        """ Apply the block to `x` ([B, C, T]) and return a tensor of the same shape. """
        residual = x
        
        # apply layers
        x = self.dw(x)
        x = self.pw(x)
        
        # batch normalization, activation, and dropout
        x = self.bn(x)
        x = self.act(x)
        x = self.drop(x)
        
        return x + residual

class CNNbiLSTM(nn.Module):
    """
    CNN + BiLSTM model for time series data with per-pitch head.

    Args:
        k_in (int): Number of input features (K).
        stem (int): Number of channels in the stem layer. Defaults to 64.
        c (int): Number of channels after projection. Defaults to 96.
        kernel (int): Size of the convolutional kernel. Defaults to 7.
        lstm_hidden (int): Hidden size for the LSTM layer. Defaults to 128.
        dropout (float): Dropout rate. Defaults to 0.1.
        bidir (bool): Whether to use a bidirectional LSTM. Defaults to True.

    **Note**: `forward` takes batch-first input of shape [B, T, K]; it is transposed
    to [B, K, T] internally because the convolutional layers operate over the last axis.
    """
    def __init__(
            self,
            k_in: int,
            stem: int = 64,
            c: int = 96,
            kernel: int = 7,
            lstm_hidden: int = 128,
            dropout: float = 0.1,
            bidir: bool = True
    ) -> None:
        super().__init__()

        # 1x1 stem to mix K features into 'stem' channels
        self.stem = nn.Sequential(
            nn.Conv1d(k_in, stem, kernel_size=1),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        # progressive temporal convs
        self.conv1 = CNNBlock(stem, kernel=kernel, dropout=dropout)
        self.proj1 = nn.Conv1d(stem, c, kernel_size=1)                    # project to C channels
        self.conv2 = CNNBlock(c, kernel=kernel, dropout=dropout)

        # BiLSTM over time
        self.lstm = nn.LSTM(input_size=c, hidden_size=lstm_hidden, batch_first=True, bidirectional=bidir)
        hdim = lstm_hidden * (2 if bidir else 1)      # both directions are concatenated when bidirectional

        # per-pitch head: one logit per time step
        self.head_step = nn.Sequential(
            nn.LayerNorm(hdim),
            nn.Linear(hdim, 1)
        )

    def forward(
            self,
            x: torch.Tensor,
            lengths: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute one logit per time step.

        Args:
            x (torch.Tensor): Padded input sequences of shape [B, T, K].
            lengths (torch.Tensor): Number of valid (non-padded) steps per sequence, shape [B].

        Returns:
            torch.Tensor: Logits of shape [B, T]. Padded steps also receive a value
            (computed from the zero-filled LSTM output) and must be masked by the caller.
        """
        # NOTE(review): padded time steps still pass through the stem/conv/BatchNorm layers
        # below; only the LSTM ignores them (via packing).

        # CNN expects [B,K,T]
        x = x.transpose(1, 2)                  # [B,K,T]
        x = self.stem(x)                       # [B,stem,T]
        x = self.conv1(x)                      # [B,stem,T]
        x = self.proj1(x)                      # [B,C,T]
        x = self.conv2(x)                      # [B,C,T]
        x = x.transpose(1, 2)                  # [B,T,C] for LSTM

        # pack to ignore padding inside LSTM; pack_padded_sequence requires the lengths
        # as an int64 tensor on the CPU, so normalise whatever the caller passed in
        lengths = lengths.to(device="cpu", dtype=torch.int64)
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        # total_length restores the full padded length T even if the batch's longest sequence is shorter
        out, _ = pad_packed_sequence(packed_out, batch_first=True, total_length=x.size(1))    # [B,T,H]

        # apply head to get logits for each pitch
        logits_step = self.head_step(out).squeeze(-1)  # [B,T]

        return logits_step


In [153]:
""" LOSS FUNCTIONS """
def pitch_level_loss(
        logits_step: torch.Tensor,
        y_step: torch.Tensor,
        mask: torch.Tensor,
        pos_weight: "bool | float | torch.Tensor | None" = False
) -> torch.Tensor:
    """
    Compute pitch-level loss given ground truth. Valid for binary or smoothed (e.g., sigmoid) outcome labels.

    Only the time steps selected by `mask` contribute to the loss.

    Args:
        logits_step (torch.Tensor): Logits from the model of shape [B, T].
        y_step (torch.Tensor): Ground truth labels of shape [B, T]. Should be in [0, 1].
        mask (torch.Tensor): Mask indicating valid time steps of shape [B, T].
        pos_weight (bool | float | torch.Tensor | None, optional): Positive-class weighting for the BCE loss.
            - False / None (default): unweighted loss.
            - True: weight is computed from the valid labels in this call as neg / pos.
            - tensor or number: used as the positive-class weight as given
              (e.g., a value precomputed over the whole training set).

    Returns:
        torch.Tensor: Scalar mean binary cross-entropy (with logits) over the valid steps.
    """
    # boolean indexing requires a bool mask; accept 0/1 masks of other dtypes too
    valid = mask.bool()
    logits = logits_step[valid]
    targets = y_step[valid]

    if pos_weight is None or pos_weight is False:
        weight = None
    elif pos_weight is True:
        # weight positives by the negative-to-positive ratio of the valid labels;
        # clamp avoids division by zero when there are no positives
        pos = targets.sum()
        neg = targets.numel() - pos
        weight = neg / pos.clamp(min=1.0)
    else:
        # caller supplied an explicit weight; align dtype/device with the logits
        weight = torch.as_tensor(pos_weight, dtype=logits.dtype, device=logits.device)

    return F.binary_cross_entropy_with_logits(logits, targets, pos_weight=weight)

@torch.no_grad()
def compute_pos_weight(
    train_loader: DataLoader, 
    device: torch.device = torch.device('cpu')
) -> torch.Tensor:
    """
    Compute the positive-class weight (neg / pos) over all valid steps of a training loader.

    Args:
        train_loader (DataLoader): Iterable yielding 4-tuples; the second element holds the
            labels and the third the mask used to index them (the first and last are ignored).
        device (torch.device): Device the labels/masks are moved to and on which the result lives.

    Returns:
        torch.Tensor: Scalar float32 tensor with neg / pos; both counts are floored at 1.0.
    """
    n_positive, n_valid = 0.0, 0.0

    # accumulate label mass and number of valid steps batch by batch
    for _, labels, valid, _ in train_loader:
        labels = labels.to(device)
        valid = valid.to(device)
        n_positive += labels[valid].sum().item()
        n_valid += valid.sum().item()

    # floor both counts at 1.0 so the ratio is always finite
    n_negative = max(n_valid - n_positive, 1.0)
    n_positive = max(n_positive, 1.0)

    return torch.tensor(n_negative / n_positive, dtype=torch.float32, device=device)


$\textbf{Model Development}$

Some additional notes: 
- __Train/Test Split__: Applied above. 75% of matches are used for the training set; 25% are held out for validation. 
- __Data Scaling__: Applied to training splits.
- __Number of Epochs__: Run until early stopping criterion is met.

In [154]:
from tqdm import tqdm

In [155]:
def compute_masked_scalers(x, mask):
    """
    Compute per-feature mean and standard deviation over the valid (masked-in) steps.

    Args:
        x: Tensor of shape [B, T, K].
        mask: Tensor of shape [B, T]; nonzero / True marks a valid step.

    Returns:
        tuple: (mean, std), each of shape [K]. The valid-step count is floored at 1 and
        the std at 1e-6, so the result is finite even when nothing is valid.
    """
    weights = mask.unsqueeze(-1)                          # [B,T,1], broadcasts over K
    n_valid = weights.sum(dim=(0, 1)).clamp(min=1)        # [1]

    # masked mean
    total = (x * weights).sum(dim=(0, 1))
    mean = total / n_valid

    # masked (population) variance and std
    centered = (x - mean) * weights
    var = centered.pow(2).sum(dim=(0, 1)) / n_valid
    std = var.sqrt().clamp(min=1e-6)

    return mean, std

# standardize a split with precomputed scalers (returns a new tensor; input is not modified)
def apply_scalers(x, mean, std):
    """ Return (x - mean) / std, broadcasting `mean` and `std` against `x`. """
    centered = x - mean
    return centered / std


In [156]:
# set number of epochs, batch size
NUM_EPOCHS = 5      # if None, run until early stopping
                    # NOTE(review): the training loop below uses range(1, NUM_EPOCHS+1), so None is not supported yet
BATCH_SIZE = 32

## TODO: create compile_model() function --->

# standardize example sequence --> example training tensor sequence (x)
# (mean/std are computed over the valid steps of this training split only)
example_mean, example_std = compute_masked_scalers(pitch_level_tensors['trn'][15]['seq'], pitch_level_tensors['trn'][15]['mask'])
x = apply_scalers(pitch_level_tensors['trn'][15]['seq'], example_mean, example_std)

# setup (device, shapes)
device = "cuda" if torch.cuda.is_available() else "cpu"
B, T, K = x.shape       # sequences, padded time steps, features
print(f'Input shape: {x.shape}, Device: {device}')

# move full tensors to device once (since we're not using a DataLoader yet)
x               = x.float().to(device)
y_step          = pitch_level_tensors['trn'][15]['probs'].float().to(device)      # smoothed targets used in the loss
y_step_binary   = pitch_level_tensors['trn'][15]['binary'].float().to(device)     # 0/1 labels used for pos_weight and accuracy
mask            = pitch_level_tensors['trn'][15]['mask'].bool().to(device)        # bool so it can index valid steps
lengths         = pitch_level_tensors['trn'][15]['lengths'].long().to(device)     # integer lengths for sequence packing

# setup model
model = CNNbiLSTM(k_in=K, stem=64, c=96, kernel=7, lstm_hidden=128, dropout=0.1, bidir=True).to(device)

# optimizer + class weights
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

# optional: compute pos_weight over valid steps once (neg / pos, from the binary labels)
    # NOTE: for probs this should be 1.0
with torch.no_grad():
    pos = y_step_binary[mask].sum()
    tot = mask.sum()
    neg = tot - pos
    pos_weight = (neg / pos.clamp(min=1)).float()       # clamp avoids division by zero when there are no positives

print("pos_weight =", float(pos_weight))


Input shape: torch.Size([321, 510, 9]), Device: cpu
pos_weight = 6.168867588043213


In [None]:
# Full-tensor mini-batch training loop: each epoch shuffles the sequence indices,
# optimises the masked, positively-weighted BCE on the smoothed targets (y_step),
# and tracks accuracy against the binary labels (y_step_binary).
for epoch in range(1, NUM_EPOCHS+1):
    # set training info
    model.train()
    idx = torch.randperm(B, device=device)      # new sequence order every epoch

    # loss / accuracy counters
    running_loss = 0.0      # sum of (batch mean loss * sequences in batch)
    running_correct = 0     # correctly classified valid pitches
    running_total = 0       # valid pitches seen
    seen = 0                # sequences seen

    # wrap range() with tqdm
    pbar = tqdm(range(0, B, BATCH_SIZE), desc=f"Epoch {epoch}/{NUM_EPOCHS}", leave=False)
    for start in pbar:
        end = min(start + BATCH_SIZE, B)
        bidx = idx[start:end]

        # gather the mini-batch
        xb = x[bidx]
        yb = y_step[bidx]
        mb = mask[bidx]
        Lb = lengths[bidx]

        # forward pass + masked loss (padded steps are excluded via mb)
        logits = model(xb, Lb)
        loss = F.binary_cross_entropy_with_logits(
            logits[mb], yb[mb], pos_weight=pos_weight
        )

        # back propagation (gradient norm clipped at 1.0)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # running loss, weighted by the number of sequences in the batch
        bs = xb.size(0)
        running_loss += loss.item() * bs
        seen += bs

        # running accuracy over valid pitches (no gradients needed for metrics)
        with torch.no_grad():
            probs = torch.sigmoid(logits[mb])
            preds = (probs > 0.5).float()
            correct = (preds == y_step_binary[bidx][mb]).sum().item()
        total = mb.sum().item()

        # update totals
        running_correct += correct
        running_total += total

        # running averages; dividing by the sequences actually seen stays correct
        # when the final batch is smaller than BATCH_SIZE
        run_avg_loss = running_loss / seen
        run_acc      = running_correct / max(running_total, 1)

        # show both current and running losses in the bar
        pbar.set_postfix(loss=f"{loss.item():.4f}", avg_loss=f"{run_avg_loss:.4f}", acc=f"{run_acc:.4f}")

    # update epoch loss (guards keep an empty dataset from dividing by zero)
    epoch_loss = running_loss / max(seen, 1)
    epoch_acc  = running_correct / max(running_total, 1)
    
    print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.3f} | Training Accuracy: {epoch_acc:.3f}")

$\textbf{Close AWS Connection}$

In [159]:
# release the AWS connection opened at the top of the notebook
aws.close()

[AWS]: No active connection to close.


$\textbf{Sandbox: Development}$

In [288]:
# list of features for reference (time-series features followed by contextual features);
# bare expression so the notebook displays the combined list
TIME_SERIES_FTS + CONTEXTUAL_FTS

['pred_peak_evt_normalized',
 'within_outing_cumulative_evt_workload',
 'rel_speed',
 'rel_side',
 'rel_ht',
 'spin_rate',
 'p_throws',
 'pitcher_days_since_prev_game',
 'outing_number']

$\textit{Example Training Sequence}$

In [None]:
# standardize example sequence --> example training tensor sequence (x)
# NOTE(review): this sandbox cell indexes pitch_level_tensors[7] directly, whereas the
# model-development cell above uses pitch_level_tensors['trn'][15] -- presumably written
# against an earlier data structure; confirm before re-running.
example_mean, example_std = compute_masked_scalers(pitch_level_tensors[7]['seq'], pitch_level_tensors[7]['mask'])
x = apply_scalers(pitch_level_tensors[7]['seq'], example_mean, example_std)

In [332]:
# setup (device, shapes)
device = "cuda" if torch.cuda.is_available() else "cpu"
B, T, K = x.shape       # sequences, padded time steps, features
print(f'Input shape: {x.shape}, Device: {device}')

# move full tensors to device once (since we're not using a DataLoader yet)
# NOTE(review): outcomes are read from pitch_level_outcomes[7] here, while the model-development
# cell above reads 'probs'/'binary' from pitch_level_tensors['trn'][15] -- confirm which is current.
x               = x.float().to(device)
y_step          = pitch_level_outcomes[7]['probs'].float().to(device)     # smoothed targets
y_step_binary   = pitch_level_outcomes[7]['binary'].float().to(device)    # 0/1 labels
mask            = pitch_level_tensors[7]['mask'].bool().to(device)        # bool so it can index valid steps
lengths         = pitch_level_tensors[7]['lengths'].long().to(device)     # integer lengths for sequence packing

# setup model (overwrites any `model` defined by an earlier cell)
model = CNNbiLSTM(k_in=K, stem=64, c=96, kernel=7, lstm_hidden=128, dropout=0.1, bidir=True).to(device)

Input shape: torch.Size([401, 348, 9]), Device: cpu


In [333]:
# optimizer + class weights
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

# optional: compute pos_weight over valid steps once
    # NOTE: for probs this should be 1.0
# (here the ratio is taken from the smoothed targets y_step, not from y_step_binary)
with torch.no_grad():
    pos = y_step[mask].sum()
    tot = mask.sum()
    neg = tot - pos
    pos_weight = (neg / pos.clamp(min=1)).float()       # clamp avoids division by zero
print("pos_weight =", float(pos_weight))


pos_weight = 1.0


In [None]:
# Sandbox training loop: one pass of masked, positively-weighted BCE over the example tensors.
epochs     = 1
batch_size = 32

for epoch in range(1, epochs+1):
    model.train()

    # shuffle indices each epoch
    idx = torch.randperm(B, device=device)

    running_loss = 0.0      # sum of (batch mean loss * sequences in batch)
    seen = 0                # sequences seen

    # iterate through mini-batches
    for start in range(0, B, batch_size):
        end = min(start + batch_size, B)
        bidx = idx[start:end]                    # [b]

        xb = x[bidx]                             # [b,T,K]
        yb = y_step[bidx]                        # [b,T]
        mb = mask[bidx]                          # [b,T]
        # keep lengths as integers: pack_padded_sequence (inside the model) expects
        # int64 lengths, and the main training loop passes them uncast as well
        Lb = lengths[bidx].long()                # [b]

        # forward
        logits = model(xb, Lb)                   # [b,T]

        # masked BCE with optional pos_weight
        loss = F.binary_cross_entropy_with_logits(
            logits[mb], yb[mb],
            pos_weight=pos_weight
        )

        # back propagation (gradient norm clipped at 1.0)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # update running loss
        running_loss += loss.item() * xb.size(0)
        seen += xb.size(0)

    # sequence-weighted mean of the batch losses
    epoch_loss = running_loss / max(seen, 1)
    print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f}")


epoch 01 | train_loss 0.7532


In [336]:
# evaluate per-pitch accuracy on the same tensors the sandbox loop trained on
# (`probs`, `preds` and `labels` are reused by the cells that follow)
model.eval()
with torch.no_grad():
    logits = model(x, lengths)             # [B,T]
    probs  = torch.sigmoid(logits)         # [B,T]

    # mask out padded steps
    preds  = (probs >= 0.5).float()[mask]  # [N_valid], thresholded at 0.5
    labels = y_step_binary[mask]                  # [N_valid]

    # fraction of valid pitches whose thresholded prediction matches the binary label
    acc = (preds == labels).float().mean().item()
    
    print(f"Per-pitch accuracy = {acc:.3f}")


Per-pitch accuracy = 0.561


In [337]:
from sklearn.metrics import roc_auc_score

# per-pitch ROC AUC over valid (non-padded) steps
y_true = labels.cpu().numpy()
y_score = probs[mask].cpu().numpy()
# AUC needs both classes present in y_true; report NaN otherwise
auc = roc_auc_score(y_true, y_score) if y_true.min() < y_true.max() else float("nan")
print(f"Per-pitch AUC = {auc:.3f}")


Per-pitch AUC = 0.653


In [338]:
# fraction of valid pitches predicted positive (preds holds 0/1 values)
preds.mean()

tensor(0.8792)

In [326]:
# peek at the binary labels of the valid pitches
labels

tensor([0., 0., 0.,  ..., 1., 1., 1.])

In [325]:
# peek at the thresholded predictions of the valid pitches
preds

tensor([0., 0., 1.,  ..., 1., 1., 1.])

$\textit{Outcome Handling}$

In [None]:
# gather outcomes
    # non-injured pitcher --> outcomes = 0
    # injured pitcher --> outcomes = ...
example = outing_level_sequences[7]
# one row per unique (pitcher, season, outing_id, injured_cohort_pitcher) combination
example_outcomes = example[['pitcher', 'season', 'outing_id', 'injured_cohort_pitcher']].drop_duplicates().reset_index(drop=True).copy()
# update_outcome_probabilities is defined elsewhere in the notebook;
# presumably returns smoothed and binary outcome labels -- TODO confirm
test_probs, test_binary = update_outcome_probabilities(example, model_type='outing_level')

In [51]:
# example pitch-level sequence set used by the cells below
example_pitches = pitch_level_sequences[7]

In [None]:
# iterate through all pitcher-pitches in the datespan
# NOTE: the loop body is a no-op expression; after the loop `group`/`rows` hold the LAST
# (pitcher, season) group, which the next cell reads via rows.shape[0]
for group, rows in example_pitches.groupby(['pitcher', 'season']):
    rows

In [80]:
# logistic (sigmoid) ramp evaluated on evenly spaced points from -6 to 6, one value per row in `rows`
(1 / (1 + np.exp(-np.linspace(-6, 6, rows.shape[0]))))

array([0.00247262, 0.00273425, 0.00302348, 0.0033432 , 0.0036966 ,
       0.00408721, 0.0045189 , 0.00499596, 0.0055231 , 0.00610552,
       0.00674895, 0.00745967, 0.00824462, 0.0091114 , 0.01006839,
       0.01112476, 0.01229059, 0.01357692, 0.01499583, 0.01656054,
       0.01828548, 0.02018641, 0.02228046, 0.0245863 , 0.02712415,
       0.02991593, 0.03298531, 0.03635781, 0.04006084, 0.04412375,
       0.04857786, 0.05345645, 0.05879472, 0.06462967, 0.07100002,
       0.07794595, 0.08550885, 0.09373097, 0.10265494, 0.11232324,
       0.12277754, 0.1340579 , 0.14620194, 0.15924378, 0.17321298,
       0.18813338, 0.20402187, 0.22088711, 0.23872837, 0.25753434,
       0.27728217, 0.29793663, 0.31944957, 0.3417597 , 0.36479277,
       0.38846205, 0.41266938, 0.43730648, 0.46225681, 0.48739763,
       0.51260237, 0.53774319, 0.56269352, 0.58733062, 0.61153795,
       0.63520723, 0.6582403 , 0.68055043, 0.70206337, 0.72271783,
       0.74246566, 0.76127163, 0.77911289, 0.79597813, 0.81186