In [5]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Clinical Model Setup}$

Injury risk estimation at the pitch-level using variable-length sequences. This notebook sets up the data structures and model architecture for development; the full, cleaned versions of each model are trained in `clinical_model_training.ipynb`.

In [2]:
""" INITIALIZE AWS CONNECTION """
# Instantiate the project-local AWS helper (imported from `connections`) and open its connection.
# The resulting `aws` handle is what the later cells use for their S3 loads.
aws = AWS()
aws.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

- Cohort data (matches, preds, statcast)
- Ball flight aggregates 

In [18]:
import ast

In [42]:
# helper function for calculating averages
def get_avgs(
        data: pd.DataFrame,
        group_cols: list,
        avg_cols: list = None
) -> pd.DataFrame:
    """
    Calculate averages for specified columns grouped by the given columns.
    
    Args:
        data (pd.DataFrame): The input DataFrame containing the data.
        group_cols (list): List of columns to group by.
        avg_cols (list): List of columns to calculate averages for. When omitted (None), the ball flight
            features used in the injury risk model are averaged: 'rel_speed', 'rel_side', 'rel_ht', 'spin_rate'.
    
    Returns:
        pd.DataFrame: One row per group, holding the group columns followed by the mean of each averaged column.
    """
    # resolve the default here rather than in the signature: a list literal used as a default value
    # is created once and shared by every call
    if avg_cols is None:
        avg_cols = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate']

    return data.groupby(group_cols)[avg_cols].mean().reset_index()

# filter pitches to date range for a given ID
def filter_pitches_by_date(
        pitch_data: pd.DataFrame, 
        pitcher_id: int, 
        start_date: str, 
        end_date: str
) -> pd.DataFrame:
    """
    Select one pitcher's pitches that fall inside an inclusive date window.
    
    Args:
        pitch_data (pd.DataFrame): Pitch data with 'pitcher' and 'game_date' columns.
        pitcher_id (int): The pitcher ID to keep.
        start_date (str): First date of the window, in 'YYYY-MM-DD' format.
        end_date (str): Last date of the window, in 'YYYY-MM-DD' format.
    
    Returns:
        pd.DataFrame: The rows of `pitch_data` for that pitcher whose 'game_date' lies between
            `start_date` and `end_date` (both ends included); the original index is kept.
    """
    # build each condition separately, then combine
    is_pitcher = pitch_data['pitcher'] == pitcher_id
    on_or_after_start = pitch_data['game_date'] >= start_date
    on_or_before_end = pitch_data['game_date'] <= end_date

    keep = is_pitcher & on_or_after_start & on_or_before_end
    return pitch_data[keep]

# build the outing-level lookup of each pitcher-season's second-to-last outing (flagged outing_before_injury = 1)
def get_outing_before_injury(data: pd.DataFrame) -> pd.DataFrame:
    """
    Locate the second-to-last outing of every pitcher-season.

    The function does not look at injury status: it returns a lookup table for every pitcher-season in
    `data`. To obtain a 1/0 outcome, restrict to (or merge against) injured pitchers on the caller side
    and fill unmatched outings with 0.

    Args:
        data (pd.DataFrame): Pitch data with 'pitcher', 'season' and 'outing_number' columns.

    Returns:
        pd.DataFrame: Columns ['pitcher', 'season', 'outing_number', 'outing_before_injury'], one row per
            pitcher-season that has at least two distinct outings. 'outing_number' is that pitcher-season's
            second-largest outing number and 'outing_before_injury' is always 1. Pitcher-seasons with
            fewer than two outings have no second-to-last outing and are omitted (previously they were
            returned with a NaT placeholder that was still flagged 1 and broke the integer dtype).
    """
    # distinct outings per pitcher-season, in ascending outing order
    outings = (
        data[['pitcher', 'season', 'outing_number']]
        .dropna(subset=['outing_number'])
        .drop_duplicates()
        .sort_values(['pitcher', 'season', 'outing_number'])
    )

    # position of each outing counted back from the end of its pitcher-season (0 = last outing)
    from_end = outings.groupby(['pitcher', 'season']).cumcount(ascending=False)

    second_last_df = (
        outings[from_end == 1]
        .reset_index(drop=True)
        .assign(outing_before_injury=1)
    )

    return second_last_df


In [30]:
# load the injured/non-injured match table (one row per injured pitcher-season)
n_matches = 5
cohort_matches = aws.load_s3_object(f'epidemiology/cohorts/injured/pitcher_info/matches_{n_matches}_per_pitcher.csv')

# flatten to one record per pitcher-season, labelled by injury status
cohort_info = []
for _, row in cohort_matches.iterrows():
    season = row['season']

    # the injured pitcher for this match row
    cohort_info.append(dict(pitcher=row['mlbamid_injured'], season=season, injured=1))

    # the matched non-injured pitchers (the column holds a Python literal stored as text)
    for mlbamid in ast.literal_eval(row['mlbamid_noninjured']):
        cohort_info.append(dict(pitcher=mlbamid, season=season, injured=0))

# records --> DataFrame with columns pitcher / season / injured
cohort_info = pd.DataFrame(cohort_info)

In [None]:
# load all statcast data & model generated predictions
    # likely fts: velo, release position, spin rate (no pitch labels)
    # both objects are read from S3 through the `aws` helper; the merge cell that follows joins them
    # on their shared pitch identifier columns
statcast_data = aws.load_s3_object('epidemiology/ml/datasets/full/model_application_data.csv')
statcast_preds = aws.load_s3_object('epidemiology/ml/datasets/preds/model_application.csv')

In [199]:
# merge statcast data with model predictions
    # inner join on the identifier columns shared by both tables
MERGE_KEYS = ['pitch_id', 'pitcher', 'game_date', 'pitcher_days_since_prev_game', 'injured_cohort_pitcher']
LOOKBACK_DAYS = 45    # window kept before each pitcher-season's final outing (matches the `_45` suffix below)

statcast_full = statcast_data.merge(statcast_preds, on=MERGE_KEYS)

# create season column
    # assumes game_date strings start with the four-digit year (e.g. 'YYYY-MM-DD')
statcast_full['season'] = statcast_full['game_date'].str[0:4].astype(int)

# find the last outing date of each pitcher-season (used to clip to the final LOOKBACK_DAYS days)
pitcher_last_outings = (
    statcast_full
    .groupby(['pitcher', 'season'])['game_date']
    .max()
    .reset_index()
    .rename(columns={'game_date': 'last_outing_date'})
)

# merge last outing date w/ statcast data
statcast_full = statcast_full.merge(pitcher_last_outings, on=['pitcher', 'season'], how='left')

# compute time until last outing
statcast_full['days_until_last_outing'] = (
    pd.to_datetime(statcast_full['last_outing_date']) -
    pd.to_datetime(statcast_full['game_date'])
).dt.days

# filter to the last LOOKBACK_DAYS (45) days before each pitcher-season's final outing
statcast_full_45 = statcast_full[statcast_full['days_until_last_outing'] <= LOOKBACK_DAYS].reset_index(drop=True)

$\textit{Baseline Ball Flight Averages by Pitcher}$

In [200]:
# baseline ball flight averages per pitcher-season, kept for reference
    # NOTE: these may be referenced for the pitch level model
    # NOTE(review): the averages are taken over `statcast_full_45` (the clipped frame), not the full
    #   season -- confirm that this is the intended "season-long" baseline
ball_flight_fts = ['rel_speed', 'rel_side', 'rel_ht', 'spin_rate']
ball_flight_season_avgs = get_avgs(statcast_full_45, ['pitcher', 'season'], ball_flight_fts)


$\textbf{Train/Test Splits}$

Loads the previously defined train/test splits (see `clinical_splits.ipynb`) and applies them to the clipped dataset (`statcast_full_45`), producing `cohort_preds_train` and `cohort_preds_test`.

In [201]:
# read the stored pitcher-season IDs for each split
path_stem = 'epidemiology/ml/datasets/full'
cohort_train_ids = aws.load_s3_object(f'{path_stem}/cohort_train_ids.csv')
cohort_test_ids = aws.load_s3_object(f'{path_stem}/cohort_test_ids.csv')

# keep only the pitches whose (pitcher, season) appears in the respective ID table
split_keys = ['pitcher', 'season']
cohort_preds_train = pd.merge(statcast_full_45, cohort_train_ids, how='inner', on=split_keys)
cohort_preds_test = pd.merge(statcast_full_45, cohort_test_ids, how='inner', on=split_keys)

$\textbf{Setup Data Sequences}$

Two features are manually created:
- Outing number
- Within outing cumulative EVT workload

All sequences must then be converted to arrays (then tensors) for model development.

In [202]:
import numpy as np

In [203]:
# create outing number
def create_outing_number(data: pd.DataFrame) -> pd.DataFrame:
    """
    Number each pitcher's outings within a season.

    Outings are identified by 'game_date': within every (pitcher, season) group the distinct dates
    are dense-ranked, so the earliest date is outing 1 and all pitches sharing a date share a number.
    
    Args:
        data (pd.DataFrame): Pitch data with 'pitcher', 'season' and 'game_date' columns.
    
    Returns:
        pd.DataFrame: A copy of `data` with an integer 'outing_number' column added.
    """
    date_rank = (
        data
        .groupby(['pitcher', 'season'])['game_date']
        .rank(method='dense')
    )

    # ranks come back as floats; store them as integers on a copy of the input
    return data.assign(outing_number=date_rank.astype(int))

# create within outing cumulative EVT workload
def create_within_outing_cumulative_evt_workload(data: pd.DataFrame) -> pd.DataFrame:
    """
    Accumulate predicted EVT load pitch by pitch inside each outing.

    The running total follows the row order of `data` within every (pitcher, season, outing_number) group.
    
    Args:
        data (pd.DataFrame): Pitch data with 'pitcher', 'season', 'outing_number' and
            'pred_peak_evt_normalized' columns.
    
    Returns:
        pd.DataFrame: A copy of `data` with a 'within_outing_cumulative_evt_workload' column added.
    """
    outing_keys = ['pitcher', 'season', 'outing_number']
    running_load = data.groupby(outing_keys)['pred_peak_evt_normalized'].cumsum()

    return data.assign(within_outing_cumulative_evt_workload=running_load)


In [204]:
# add the engineered features (outing number, then within-outing cumulative workload) to each split,
# and order the rows by pitcher, season, outing number
sort_keys = ['pitcher', 'season', 'outing_number']

trn_final = (
    cohort_preds_train
    .pipe(create_outing_number)
    .pipe(create_within_outing_cumulative_evt_workload)
    .sort_values(by=sort_keys)
)
test_final = (
    cohort_preds_test
    .pipe(create_outing_number)
    .pipe(create_within_outing_cumulative_evt_workload)
    .sort_values(by=sort_keys)
)

In [205]:
# set model features -- contextual & time series layers
# NOTE(review): create_padded_tensor casts every sequence to float32, so each column listed here must be
#   numeric by the time the arrays are built -- confirm 'p_throws' is numerically encoded upstream.
CONTEXTUAL_FTS = [
    'p_throws',
    'pitcher_days_since_prev_game',
    # 'outing_number'
]
TIME_SERIES_FTS = [
    # predicted load
    'pred_peak_evt_normalized',
    # 'within_outing_cumulative_evt_workload',
    
    # ball flight
    'rel_speed', 
    'rel_side',
    'rel_ht', 
    'spin_rate'
]

# combine for full features
# (this fixes the column order of every sequence array: contextual features first, then time series features)
feature_set = CONTEXTUAL_FTS + TIME_SERIES_FTS

In [206]:
# create training and testing set arrays -- one (n_pitches, n_features) array per pitcher-season
def _sequences_by_pitcher_season(frame: pd.DataFrame, features: list) -> np.ndarray:
    """
    Collect the feature matrix of every (pitcher, season) group into a 1-D object array.

    Sequences have different lengths, so they cannot be stacked into one rectangular array; calling
    np.array on such a ragged list raises ValueError on NumPy >= 1.24. Filling a pre-allocated object
    array element by element always yields shape (n_sequences,), whatever the individual lengths are.

    Args:
        frame (pd.DataFrame): Pitch data with 'pitcher', 'season' and every column in `features`.
        features (list): Columns to keep, in the order they should appear in each sequence.

    Returns:
        np.ndarray: Object array whose i-th element is the 2-D feature array of the i-th pitcher-season
            (groups follow the sorted (pitcher, season) order produced by groupby).
    """
    groups = [rows[features].values for _, rows in frame.groupby(['pitcher', 'season'])]

    sequences = np.empty(len(groups), dtype=object)
    for i, seq in enumerate(groups):
        sequences[i] = seq

    return sequences

trn_arrays = _sequences_by_pitcher_season(trn_final, feature_set)
test_arrays = _sequences_by_pitcher_season(test_final, feature_set)

$\textbf{Create Outcomes}$

In [207]:
import torch
import numpy as np

In [208]:
# one injury label per pitcher-season (i.e., per sequence)
    # NOTE(review): the labels line up with the sequence arrays only if each pitcher-season carries a
    #   single 'injured' value and the frame is sorted by pitcher/season -- confirm
outcome_cols = ['pitcher', 'season', 'injured']
trn_outcomes = trn_final.drop_duplicates(subset=outcome_cols)['injured'].values
test_outcomes = test_final.drop_duplicates(subset=outcome_cols)['injured'].values

$\textbf{Aggregate for Streamlined Data Storage}$

In [209]:
import pickle

In [210]:
# reset all into a dictionary
    # 'trn' holds the training split; 'val' holds the held-out test split
    # (previously 'val' was filled with the training arrays, so validation re-scored the training data)
pitch_level = {
    'trn': {
        'inputs': trn_arrays,
        'outcomes': trn_outcomes
    },
    'val': {
        'inputs': test_arrays, 
        'outcomes': test_outcomes
    }
}

# save to local disk
with open('storage/pitch_level_arrays.pkl', 'wb') as f:
    pickle.dump(pitch_level, f)

# TODO: upload to AWS (--> pytorch folder)
# with open('storage/pitch_level_arrays.pkl', 'rb') as f:
#     content = f.read()
#     aws.upload_to_s3(content, 'epidemiology/ml/datasets/pytorch/pitch_level_arrays.pkl')

$\textbf{Tensor Setup}$

In [211]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

In [212]:
# convert to tensors and pad to the same length
def create_padded_tensor(sequences: list) -> torch.Tensor:
    """ 
    Stack variable-length sequences into one zero-padded float32 tensor.
    
    Args:
        sequences (list): Sequences (arrays or nested lists) that share every dimension except the first.
    
    Returns:
        torch.Tensor: Tensor of shape (batch, max_seq_len, features); shorter sequences are
            right-padded with zeros up to the longest sequence.
    """
    # cast each sequence to float32 before handing it to torch
    as_tensors = []
    for seq in sequences:
        values = np.array(seq, dtype=np.float32)
        as_tensors.append(torch.tensor(values))

    # batch_first=True --> (batch, max_seq_len, features)
    return pad_sequence(as_tensors, batch_first=True, padding_value=0)

# helper function for creating sorted tensor dictionaries
def setup_all_tensors(
        input_arrays,
        output_arrays,
        pad_outcomes: bool = False
) -> dict:
    """
    Helper function for creating sorted (pitch- or outing-level) tensor dictionaries.

    Args:
        input_arrays: Iterable of per-sequence feature arrays, each of shape (seq_len, features).
        output_arrays: When `pad_outcomes` is False, an array of one outcome per sequence. When
            `pad_outcomes` is True, a dict with 'probs' and 'binary' entries, each an iterable of
            per-sequence outcome arrays.
        pad_outcomes (bool): Whether outcomes are per-timestep sequences that need padding.

    Returns:
        dict: Keys 'seq' (padded float32 inputs, (batch, max_seq_len, features)), 'mask' (bool,
            (batch, max_seq_len), True on real timesteps), 'lengths' (int64, (batch,)), 'probs'
            (padded tensor, or None when `pad_outcomes` is False) and 'binary'.
    """
    # setup tensor dictionaries
    sorted_tensors = {
        'seq': None,
        'mask': None,
        'lengths': None,
        'probs': None,
        'binary': None
    }

    # create tensors
        # shape: (batch, max_seq_len, features)
    sorted_tensors['seq'] = create_padded_tensor(input_arrays)

    # lengths come from the inputs themselves and the mask from the lengths; inferring padding from
    # all-zero feature rows would also drop any real timestep whose features happen to be all zero
    lengths = torch.tensor([len(seq) for seq in input_arrays], dtype=torch.long)
    max_seq_len = sorted_tensors['seq'].shape[1]
    sorted_tensors['mask'] = torch.arange(max_seq_len).unsqueeze(0) < lengths.unsqueeze(1)
    sorted_tensors['lengths'] = lengths

    # convert outcomes to padded tensors
    if pad_outcomes:
        sorted_tensors['probs'] = create_padded_tensor(output_arrays['probs'])
        sorted_tensors['binary'] = create_padded_tensor(output_arrays['binary'])
    else:
        # a single label per sequence needs no padding
        sorted_tensors['binary'] = torch.tensor(output_arrays, dtype=torch.float32)

    return sorted_tensors


In [213]:
# build the tensor dictionary for each split from its stored arrays
    # each entry holds the dictionary returned by setup_all_tensors for that split
pitch_level_tensors = {
    split: setup_all_tensors(pitch_level[split]['inputs'], pitch_level[split]['outcomes'])
    for split in ('trn', 'val')
}

In [214]:
# save to local disk
    # writes the whole `pitch_level_tensors` dictionary as one pickle file;
    # open() does not create directories, so the relative 'storage/' folder must already exist
with open('storage/pitch_level_tensors.pkl', 'wb') as f:
    pickle.dump(pitch_level_tensors, f)

# TODO: upload to AWS (--> pytorch folder)
# with open('storage/pitch_level_tensors.pkl', 'rb') as f:
#     content = f.read()
#     aws.upload_to_s3(content, 'epidemiology/ml/datasets/pytorch/pitch_level_tensors.pkl')

$\textbf{Close AWS Connection}$

In [215]:
# release the AWS connection opened at the top of the notebook
# NOTE(review): the stored output reads "No active connection to close." -- presumably the connection
#   had already been closed when this cell last ran; confirm on a fresh Restart & Run All
aws.close()

[AWS]: No active connection to close.


$\textbf{Sandbox: Development}$

In [288]:
# list of features for reference
# NOTE(review): this concatenation is time series first, whereas `feature_set` (the column order of the
#   arrays) is contextual first. The stored output also lists 'within_outing_cumulative_evt_workload' and
#   'outing_number', which are commented out in the feature definitions -- re-run to refresh.
TIME_SERIES_FTS + CONTEXTUAL_FTS

['pred_peak_evt_normalized',
 'within_outing_cumulative_evt_workload',
 'rel_speed',
 'rel_side',
 'rel_ht',
 'spin_rate',
 'p_throws',
 'pitcher_days_since_prev_game',
 'outing_number']

$\textit{Example Training Sequence}$

In [None]:
# standardize example sequence --> example training tensor sequence (x)
# NOTE(review): `pitch_level_tensors` is built with the keys 'trn' and 'val' in the Tensor Setup section, so
#   indexing it with 7 raises KeyError on a fresh run. `compute_masked_scalers` and `apply_scalers` are
#   not defined in the visible part of this notebook -- confirm where they come from.
example_mean, example_std = compute_masked_scalers(pitch_level_tensors[7]['seq'], pitch_level_tensors[7]['mask'])
x = apply_scalers(pitch_level_tensors[7]['seq'], example_mean, example_std)

In [332]:
# setup (device, shapes)
# NOTE(review): `pitch_level_outcomes` and `CNNbiLSTM` are not defined in the visible part of this notebook,
#   and `pitch_level_tensors` is keyed by 'trn'/'val' rather than 7 -- this cell relies on earlier kernel state.
device = "cuda" if torch.cuda.is_available() else "cpu"
# presumably B = sequences, T = padded timesteps, K = features (the stored output shows [401, 348, 9])
B, T, K = x.shape
print(f'Input shape: {x.shape}, Device: {device}')

# move full tensors to device once (since we’re not using a DataLoader yet)
x               = x.float().to(device)
y_step          = pitch_level_outcomes[7]['probs'].float().to(device)
y_step_binary   = pitch_level_outcomes[7]['binary'].float().to(device)
mask            = pitch_level_tensors[7]['mask'].bool().to(device)
lengths         = pitch_level_tensors[7]['lengths'].long().to(device)

# setup model
# k_in is set to the feature count K read from x.shape above
model = CNNbiLSTM(k_in=K, stem=64, c=96, kernel=7, lstm_hidden=128, dropout=0.1, bidir=True).to(device)

Input shape: torch.Size([401, 348, 9]), Device: cpu


In [333]:
# optimizer + class weights
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

# optional: compute pos_weight over valid steps once
    # NOTE: for probs this should be 1.0
    # pos = sum of targets over unmasked steps, neg = (number of unmasked steps) - pos;
    # pos is clamped to at least 1 so the ratio never divides by zero
with torch.no_grad():
    pos = y_step[mask].sum()
    tot = mask.sum()
    neg = tot - pos
    pos_weight = (neg / pos.clamp(min=1)).float()
print("pos_weight =", float(pos_weight))


pos_weight = 1.0


In [None]:
# manual mini-batch training loop over the full in-memory tensors (no DataLoader)
# NOTE(review): `F` and `nn` are not imported in the visible part of this notebook -- confirm that
#   `torch.nn.functional as F` and `torch.nn as nn` are imported before this cell runs.
epochs     = 1
batch_size = 32

for epoch in range(1, epochs+1):
    model.train()

    # shuffle indices each epoch
    idx = torch.randperm(B, device=device)

    running_loss = 0.0
    seen = 0

    # iterate through mini-batches
    for start in range(0, B, batch_size):
        end = min(start + batch_size, B)
        bidx = idx[start:end]                    # [b]

        xb = x[bidx]                             # [b,T,K]
        yb = y_step[bidx]                        # [b,T]
        mb = mask[bidx]                          # [b,T]
        # NOTE(review): lengths are cast to float here but passed as long in the evaluation cell -- confirm
        #   which dtype the model expects
        Lb = lengths[bidx].float()                       # [b]

        # forward
        logits = model(xb, Lb)                   # [b,T]

        # masked BCE with optional pos_weight
        # (boolean indexing with mb keeps only the unmasked timesteps of the batch)
        loss = F.binary_cross_entropy_with_logits(
            logits[mb], yb[mb],
            pos_weight=pos_weight
        )

        # back propagation
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        # clip the global gradient norm to 1.0 before the optimizer step
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # update running loss
        # (the batch loss is weighted by the number of sequences in the batch, not by its count of valid timesteps)
        running_loss += loss.item() * xb.size(0)
        seen += xb.size(0)

    epoch_loss = running_loss / max(seen, 1)
    # NOTE(review): the stored output ("epoch 01 | train_loss ...") does not match this format string,
    #   so it was produced by an earlier version of the cell
    print(f"Epoch {epoch:02d} | Training Loss: {epoch_loss:.4f}")


epoch 01 | train_loss 0.7532


In [336]:
# evaluate on the same tensors used for training: per-timestep accuracy at a 0.5 probability threshold
model.eval()
with torch.no_grad():
    logits = model(x, lengths)             # [B,T]
    probs  = torch.sigmoid(logits)         # [B,T]

    # mask out padded steps
    preds  = (probs >= 0.5).float()[mask]  # [N_valid]
    labels = y_step_binary[mask]                  # [N_valid]

    # fraction of unmasked timesteps whose thresholded prediction equals the binary label
    acc = (preds == labels).float().mean().item()
    
    print(f"Per-pitch accuracy = {acc:.3f}")


Per-pitch accuracy = 0.561


In [337]:
from sklearn.metrics import roc_auc_score

# per-pitch ROC AUC over the unmasked timesteps; reported as NaN when the labels contain a single value
# (`labels`, `probs` and `mask` come from the evaluation cell above)
y_true = labels.cpu().numpy()
y_score = probs[mask].cpu().numpy()
auc = roc_auc_score(y_true, y_score) if y_true.min() < y_true.max() else float("nan")
print(f"Per-pitch AUC = {auc:.3f}")


Per-pitch AUC = 0.653


In [338]:
preds.mean()

tensor(0.8792)

In [326]:
labels

tensor([0., 0., 0.,  ..., 1., 1., 1.])

In [325]:
preds

tensor([0., 0., 1.,  ..., 1., 1., 1.])

$\textit{Outcome Handling}$

In [None]:
# gather outcomes
    # non-injured pitcher --> outcomes = 0
    # injured pitcher --> outcomes = ...
# NOTE(review): `outing_level_sequences` and `update_outcome_probabilities` are not defined in the visible
#   part of this notebook -- this cell relies on earlier kernel state.
example = outing_level_sequences[7]
# one row per distinct (pitcher, season, outing_id, injured_cohort_pitcher) combination
example_outcomes = example[['pitcher', 'season', 'outing_id', 'injured_cohort_pitcher']].drop_duplicates().reset_index(drop=True).copy()
test_probs, test_binary = update_outcome_probabilities(example, model_type='outing_level')

In [51]:
# NOTE(review): `pitch_level_sequences` is not defined in the visible part of this notebook -- confirm its source
example_pitches = pitch_level_sequences[7]

In [None]:
# iterate through all pitcher-pitches in the datespan
# (the loop body is a bare expression, so nothing is computed; after the loop `group` and `rows`
#  stay bound to the last pitcher-season group, and the next cell reads `rows`)
for group, rows in example_pitches.groupby(['pitcher', 'season']):
    rows

In [80]:
# logistic (sigmoid) curve evaluated at evenly spaced points from -6 to 6, one value per row of `rows`
(1 / (1 + np.exp(-np.linspace(-6, 6, rows.shape[0]))))

array([0.00247262, 0.00273425, 0.00302348, 0.0033432 , 0.0036966 ,
       0.00408721, 0.0045189 , 0.00499596, 0.0055231 , 0.00610552,
       0.00674895, 0.00745967, 0.00824462, 0.0091114 , 0.01006839,
       0.01112476, 0.01229059, 0.01357692, 0.01499583, 0.01656054,
       0.01828548, 0.02018641, 0.02228046, 0.0245863 , 0.02712415,
       0.02991593, 0.03298531, 0.03635781, 0.04006084, 0.04412375,
       0.04857786, 0.05345645, 0.05879472, 0.06462967, 0.07100002,
       0.07794595, 0.08550885, 0.09373097, 0.10265494, 0.11232324,
       0.12277754, 0.1340579 , 0.14620194, 0.15924378, 0.17321298,
       0.18813338, 0.20402187, 0.22088711, 0.23872837, 0.25753434,
       0.27728217, 0.29793663, 0.31944957, 0.3417597 , 0.36479277,
       0.38846205, 0.41266938, 0.43730648, 0.46225681, 0.48739763,
       0.51260237, 0.53774319, 0.56269352, 0.58733062, 0.61153795,
       0.63520723, 0.6582403 , 0.68055043, 0.70206337, 0.72271783,
       0.74246566, 0.76127163, 0.77911289, 0.79597813, 0.81186