In [1]:
import pandas as pd
from connections import AWS

$\textbf{Epidemiology: Missing Value Imputation}$

Missing values are imputed to maximize the number of usable pitches. A separate imputer is created for each season to account for year-to-year (YTY) changes in ball tracking systems.

In [2]:
# set up AWS connection
# `AWS` is the project-local helper imported from `connections`. Per the recorded
# cell output, connect() reported killing a process that held local port 5433 and
# then connecting to an RDS endpoint.
aws = AWS()
aws.connect()

[AWS]: Port 5433 is in use by process python3.11 (PID 70678). Killing it.
[AWS]: Connected to RDS endpoint.


$\textbf{Data Loading}$

In [None]:
from services.ball_tracking import clean_ball_tracking_data

In [None]:
""" Ball Tracking Data """
# ball tracking data for injured pitchers (used to get the date of the first pitch thrown)
inj_ball_tracking = aws.load_s3_object('epidemiology/cohorts/injured/statcast_data.csv')
inj_bt_clean = clean_ball_tracking_data(inj_ball_tracking)

# seasons covered: 2015 through 2025 inclusive
season_range = list(range(2015, 2026))

# one cleaned ball tracking frame per season for non-injured pitchers, keyed by season
noninj_bt = {}
for s in season_range:
    raw_bt = aws.load_s3_object(f'epidemiology/cohorts/noninjured/bulk_statcast/{s}.csv')
    noninj_bt[s] = clean_ball_tracking_data(raw_bt)

$\textbf{Imputer Development}$

Imputers are fit separately for each season on non-injured pitcher data, then applied to the injured pitcher data from the same season.

In [16]:
import pickle
from sklearn.impute import KNNImputer

In [9]:
def update_throws_col(col: pd.Series) -> pd.Series:
    """
    Update the `p_throws` column to be 1 (R) or 0 (L).

    The encoding is idempotent: values that are already 1 stay 1, so applying
    it a second time (e.g. re-executing the notebook cell that overwrites the
    column in place) does not turn every right-hander into 0.

    Args:
        col: Handedness values, either raw ('R' / 'L') or already encoded (1 / 0).

    Returns:
        A Series on the same index holding 1 where the value is 'R' or 1 and
        0 for everything else (including missing values).
    """
    def encode(value) -> int:
        # accept the raw label as well as the result of an earlier encoding pass
        return 1 if value in ('R', 1) else 0

    return col.apply(encode)


In [None]:
# columns passed to the imputer
imputer_cols = [
    'p_throws',
    'rel_speed', 'rel_side', 'rel_ht',
    'spin_rate', 'spin_axis',
    'ax0', 'ay0', 'az0',
]

# per-column missing-value counts, taken before `p_throws` is re-encoded below
missing_counts = {'injured': inj_bt_clean[imputer_cols].isna().sum()}
missing_counts['noninjured'] = {
    s: noninj_bt[s][imputer_cols].isna().sum() for s in season_range
}

# encode `p_throws` as 1 (R) / 0 (L) in the injured frame and in every season frame
inj_bt_clean['p_throws'] = update_throws_col(inj_bt_clean['p_throws'])
for s in season_range:
    season_bt = noninj_bt[s]
    season_bt['p_throws'] = update_throws_col(season_bt['p_throws'])

In [None]:
# fit imputers to non-injured data for each season
missing_pitches = {}
imputers = {}
for s in season_range:
    season_bt = noninj_bt[s]

    # pitch IDs of rows that are missing a value in at least one imputer column
    has_missing = season_bt[imputer_cols].isna().any(axis=1)
    missing_pitches[s] = season_bt.loc[has_missing, 'pitch_id'].tolist()

    # fit imputer to non-injured data using the 10 nearest neighbors, distance-weighted
    imputer = KNNImputer(n_neighbors=10, weights='distance')
    imputers[s] = imputer.fit(season_bt[imputer_cols])

# serialize once so the local file and the S3 object hold identical bytes
# (avoids writing the pickle and immediately reading it back)
imputer_content = pickle.dumps(imputers)

# write imputers to disk
with open('models/imputers.pkl', 'wb') as f:
    f.write(imputer_content)

# upload imputers to S3
aws.upload_to_s3(
    imputer_content,
    'epidemiology/ml/imputers/models_by_year.pkl',
)

$\textbf{Imputer Evaluation}$

In [49]:
# run imputers on non-injured data
    # NOTE: 2015 model must be re-trained without spin axis; other columns will be imputed
    # The 2015 transform output has one column fewer than `imputer_cols` (no `spin_axis`).
    # Presumably `spin_axis` is entirely missing in 2015 and KNNImputer drops features that
    # were all-missing at fit time -- TODO confirm against the 2015 missing counts.
imputed_noninj_bt = {}
for s in season_range:
    print(f'Imputing missing values for {s} data...', end='\r', flush=True)
    season_bt = noninj_bt[s]
    imputed_values = imputers[s].transform(season_bt[imputer_cols])

    # column names of the transform output
    if s == 2015:
        output_cols = [c for c in imputer_cols if c != 'spin_axis']
    else:
        output_cols = imputer_cols

    # keep the source frame's index: transform returns a bare array, and a fresh
    # RangeIndex would misalign any later index-based assignment back onto noninj_bt[s]
    imputed_noninj_bt[s] = pd.DataFrame(imputed_values, columns=output_cols, index=season_bt.index)

    print(f'Finished imputing missing values for {s} data.')

Finished imputing missing values for 2015 data.
Finished imputing missing values for 2016 data.
Finished imputing missing values for 2017 data.
Finished imputing missing values for 2018 data.
Finished imputing missing values for 2019 data.
Finished imputing missing values for 2020 data.
Finished imputing missing values for 2021 data.
Finished imputing missing values for 2022 data.
Finished imputing missing values for 2023 data.
Finished imputing missing values for 2024 data.
Finished imputing missing values for 2025 data.


In [50]:
# per-season column means after imputation vs. before imputation
imputed_means, nonimputed_means = {}, {}
for s in season_range:
    imputed_means[s] = imputed_noninj_bt[s].mean()
    nonimputed_means[s] = noninj_bt[s][imputer_cols].mean()

In [64]:
# apply imputer to injured data
imputed_inj_bt = {}

# parse the game dates once; the result is the same for every season in the loop
inj_game_years = pd.to_datetime(inj_bt_clean['game_date']).dt.year

for s in season_range:
    print(f'Imputing missing values for injured data from {s}...', end='\r', flush=True)

    # extract season data & imputer (fit on the same season's non-injured data)
    season_data = inj_bt_clean[inj_game_years == s].copy()
    season_imputer = imputers[s]

    # setup imputer
        # note that 2015 was trained w/ spin axis but doesn't actually use it
    impute_cols = season_imputer.feature_names_in_
    if s == 2015:
        output_cols = ['p_throws', 'rel_speed', 'rel_side', 'rel_ht', 'spin_rate', 'ax0', 'ay0', 'az0']
    else:
        output_cols = impute_cols

    # apply imputer, update columns
    # (skipped for a season with no injured pitches: transform rejects zero-row input)
    if not season_data.empty:
        season_data[output_cols] = season_imputer.transform(season_data[impute_cols])

    # store imputed data
    imputed_inj_bt[s] = season_data

    print(f'Finished imputing missing values for injured data from {s}.')

Finished imputing missing values for injured data from 2015.
Finished imputing missing values for injured data from 2016.
Finished imputing missing values for injured data from 2017.
Finished imputing missing values for injured data from 2018.
Finished imputing missing values for injured data from 2019.
Finished imputing missing values for injured data from 2020.
Finished imputing missing values for injured data from 2021.
Finished imputing missing values for injured data from 2022.
Finished imputing missing values for injured data from 2023.
Finished imputing missing values for injured data from 2024.
Finished imputing missing values for injured data from 2025.


$\textbf{S3 Uploads}$

In [53]:
# upload means to S3
imputer_summary = {
    'imputed_means': imputed_means,
    'nonimputed_means': nonimputed_means
}

# serialize once and reuse the bytes for both the local copy and the upload
# (avoids writing the pickle and immediately reading it back)
imputer_summary_content = pickle.dumps(imputer_summary)
with open('storage/imputer_summary.pkl', 'wb') as f:
    f.write(imputer_summary_content)
aws.upload_to_s3(
    imputer_summary_content,
    'epidemiology/ml/imputers/imputer_summary.pkl',
)


[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/imputers/imputer_summary.pkl


In [80]:
# upload bulk data to S3
for s in season_range:
    print(f'Uploading imputed data for {s}...', end='\r', flush=True)

    # --- non-injured pitcher data ---
    # NOTE: these were not preprocessed correctly, so it's handled here
    season_data_noninj = noninj_bt[s].copy()
    season_data_noninj_imp = imputed_noninj_bt[s]

    # The imputed frame was produced by transforming noninj_bt[s], so its rows are in
    # the same order. Assign by position rather than by index label, so the values land
    # on the right rows even if the two frames do not share an index.
    assert len(season_data_noninj_imp) == len(season_data_noninj), f'row count mismatch for {s}'
    season_data_noninj[list(season_data_noninj_imp.columns)] = season_data_noninj_imp.to_numpy()

    # upload correctly formatted data to S3, dropping NaNs in `pitcher_days_since_prev_game`
    aws.upload_to_s3(
        season_data_noninj.dropna(subset='pitcher_days_since_prev_game'),
        f'epidemiology/ml/datasets/{s}/noninjured_ball_tracking.csv'
    )

    # data upload (injured)
        # NOTE: these are already preprocessed correctly
    aws.upload_to_s3(
        imputed_inj_bt[s].dropna(subset='pitcher_days_since_prev_game'),
        f'epidemiology/ml/datasets/{s}/injured_ball_tracking.csv'
    )

    print(f'Finished uploading imputed data for {s}.')

[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/2015/noninjured_ball_tracking.csv
[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/2015/injured_ball_tracking.csv
Finished uploading imputed data for 2015.
[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/2016/noninjured_ball_tracking.csv
[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/2016/injured_ball_tracking.csv
Finished uploading imputed data for 2016.
[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/2017/noninjured_ball_tracking.csv
[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/2017/injured_ball_tracking.csv
Finished uploading imputed data for 2017.
[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/2018/noninjured_ball_tracking.csv
[AWS]: Uploaded object to s3://pitch-ml/epidemiology/ml/datasets/2018/injured_ball_tracking.csv
Finished uploading imputed data for 2018.
[AWS]: Uploaded object to s3://pitch-ml/epidemiology

$\textbf{Close AWS Connection}$

In [81]:
# close the connection opened by aws.connect() at the top of the notebook
# NOTE(review): the recorded output says there was no active connection to close,
# so the connection appears to have ended before this cell ran -- confirm the uploads finished
aws.close()

[AWS]: No active connection to close.
