In [413]:
import io
import numpy as np
import pandas as pd
from connections import AWS

$\textbf{Functions}$

In [175]:
# load a .sto file (e.g., results from an JRA run)
def load_sto_file(path: str, skip_rows: int = 10) -> pd.DataFrame:
    return pd.read_csv(path, sep='\\s+', skiprows=skip_rows)

# mirror columns to match RHP
def mirror_columns(
        data: pd.DataFrame,
        cols: list
):
    data_mirrored = data.copy()
    for col in cols:
        data_mirrored.loc[data['throws'] == 'left', col] *= -1
    
    return data_mirrored

$\textbf{Data Loading}$

- AWS connection
- Result summaries, subject info, etc.
- All IK and JRA files
- Ball flight data

In [2]:
""" INITIALIZE AWS CONNECTION """
aws_connection = AWS()
aws_connection.connect()

[AWS]: Port 5433 is in use by process python3.11 (PID 45748). Killing it.
[AWS]: Connected to RDS endpoint.


In [None]:
""" S3 OBJECTS """

# load all files from S3
s3_objects = aws_connection.list_s3_objects(prefix='subjects/')

# filter to IK, JRA results files
ik_result_files = [obj for obj in s3_objects if 'ik_results' in obj]
jra_result_files = [obj for obj in s3_objects if 'jra_results' in obj]

In [None]:
""" SUMMARIES & SUBJECT INFO """

# subject info
subject_info = aws_connection.load_subject_info()

# JRA summary (at peak)
    # NOTE: outliers not included here --> 3,401 trials
jra_summary_bytes = aws_connection.load_s3_object('subjects/summary/results_jra.csv', return_info=False)
jra_summary = pd.read_csv(io.BytesIO(jra_summary_bytes))

In [414]:
""" LOAD JRA RESULTS """

# initialize jra results 
jra_results = []

# read all JRA result CSVs
for file in jra_result_files:
    
    # get study ID, check for eligible trial
    study_id = file.split('/')[-1].split('_jra')[0]
    if study_id not in jra_summary['study_id'].values:
        continue  # skip if not eligible
    
    # load JRA results
    jra_bytes = aws_connection.load_s3_object(file, return_info=False)
    jra_df = pd.read_csv(io.BytesIO(jra_bytes), sep='\\s+', skiprows=11)

    # insert study ID
    jra_df.insert(0, 'study_id', study_id)

    # append to lists
    jra_results.append(jra_df)

# concatenate all JRA results (NOTE: do not reset index)
jra_results_df = pd.concat(jra_results, ignore_index=False)

In [416]:
# separate JRA throwing hand columns
MOMENT_COLS_LEFT = [
    'study_id', 'time', 'elbow_l_on_ulna_l_in_ulna_l_mx', 'elbow_l_on_ulna_l_in_ulna_l_my', 'elbow_l_on_ulna_l_in_ulna_l_mz'
]
MOMENT_COLS_RIGHT = [
    'study_id', 'time', 'elbow_r_on_ulna_r_in_ulna_r_mx', 'elbow_r_on_ulna_r_in_ulna_r_my', 'elbow_r_on_ulna_r_in_ulna_r_mz'
]

# extract separate dataframes
jra_left = jra_results_df[MOMENT_COLS_LEFT].dropna(how='any')
jra_right = jra_results_df[MOMENT_COLS_RIGHT].dropna(how='any')

# rename columns of each
MOMENT_COLS = [
    'study_id', 'time', 'elbow_mx', 'elbow_my', 'elbow_mz'
]
jra_left.columns = MOMENT_COLS
jra_right.columns = MOMENT_COLS

# combine left and right dfs
jra_combined = pd.concat([jra_left, jra_right], ignore_index=False).sort_values(by=['study_id', 'time'])

# compute elbow moment magnitude
ELBOW_MOMENT_COLS = ['elbow_mx', 'elbow_my', 'elbow_mz']
jra_combined['elbow_magnitude'] = np.sqrt(jra_combined['elbow_mx']**2 + jra_combined['elbow_my']**2 + jra_combined['elbow_mz']**2)


In [417]:
# extract peak elbow moment magnitude for each trial
peak_moments = {}
for trial in jra_combined['study_id'].unique():
    trial_data = jra_combined[jra_combined['study_id'] == trial]
    peak_idx = trial_data['elbow_magnitude'].idxmax()

    # store results
    peak_moments[trial] = {
        'study_id': trial,
        'peak_idx': peak_idx,
        'peak_elbow_magnitude': trial_data.loc[peak_idx, 'elbow_magnitude']
    }

# convert to DataFrame
peak_moments_final = pd.DataFrame.from_dict(peak_moments).T

In [342]:
""" LOAD IK RESULTS """

# set column names
ik_cols = [
    'study_id', 'time', 'arm_flex', 'arm_add', 'arm_rot', 'humerus_tx', 'humerus_ty', 
    'humerus_tz', 'elbow_flex', 'pro_sup', 'wrist_flex', 'wrist_dev'
]

# initialize lists of results
ik_results = []
ik_peaks = []

# read all IK result CSVs
for file in ik_result_files:
    
    # extract study ID, check if in JRA summary
    study_id = file.split('/')[-1].split('_ik')[0]
    if study_id not in jra_summary['study_id'].values:
        continue
    
    ik_bytes = aws_connection.load_s3_object(file, return_info=False)
    ik_df = pd.read_csv(io.BytesIO(ik_bytes))

    # update column names & insert study ID
    ik_df.insert(0, 'study_id', study_id)
    ik_df.columns = ik_cols

    # extract values at peak torque
        # NOTE: now uses peak moment magnitude
    peak_idx = peak_moments_final[peak_moments_final['study_id'] == study_id]['peak_idx'].values[0]
    peak_angles = pd.DataFrame(ik_df.iloc[peak_idx, :]).T

    # append to lists 
    ik_results.append(ik_df)
    ik_peaks.append(peak_angles)

# concatenate all IK results (NOTE: do not reset index)
ik_results_df = pd.concat(ik_results, ignore_index=False)
ik_peaks_df = pd.concat(ik_peaks, ignore_index=False)

In [102]:
""" LOAD IK ERRORS """

# list all error files (by trial)
ik_error_files = [obj for obj in s3_objects if 'ik_errors' in obj]

ik_errors = []
for file in ik_error_files:
    
    # extract study ID, check if in JRA summary
    study_id = file.split('/')[-1].split('_ik')[0]
    if study_id not in jra_summary['study_id'].values:
        continue
    
    # load error file
    error_bytes = aws_connection.load_s3_object(file, return_info=False)
    error_df = pd.read_csv(io.BytesIO(error_bytes))

    # insert study ID
    error_df.insert(0, 'study_id', study_id)

    # append to list
    ik_errors.append(error_df)

# concatenate all IK errors (NOTE: do not reset index)
ik_errors_df = pd.concat(ik_errors, ignore_index=False)

# summarize errors & filter to threshold
    # 3,505 trials
THRESHOLD = 0.025
ik_errors_avg = ik_errors_df.groupby('study_id')['marker_error_RMS'].mean().reset_index()
ik_eligible_trials = ik_errors_avg[ik_errors_avg['marker_error_RMS'] < THRESHOLD]['study_id'].tolist()

In [139]:
""" BALL TRACKING DATA """

# get ball tracking objects (trackman)
trackman_clean_bytes = aws_connection.load_s3_object('subjects/summary/ball_tracking_trackman.csv', return_info=False)
trackman_clean = pd.read_csv(io.BytesIO(trackman_clean_bytes))
trackman_clean.rename(columns={'taggedPitchType': 'pitchType'}, inplace=True)

# get ball tracking objects (rapsodo)
rapsodo_clean_bytes = aws_connection.load_s3_object('subjects/summary/ball_tracking_rapsodo.csv', return_info=False)
rapsodo_clean = pd.read_csv(io.BytesIO(rapsodo_clean_bytes))

In [359]:
""" FINAL MODELING DATA """

# modeling data: biomech
peak_moments_final.insert(
    0, 
    'subject_id', 
    peak_moments_final['study_id'].str.split('_').str[0].astype(int)
)
model_data_biomech = peak_moments_final.merge(
    ik_peaks_df,
    on='study_id',
    how='left',
)

# modeling data: ball tracking
ball_flight_cols = [
    'study_id', 'pitchType', 'relSpeed', 'vertRelAngle', 
    'horzRelAngle', 'spinRate', 'spinAxis', 'relHeight', 
    'relSide', 'inducedVertBreak', 'horzBreak', 
    'x0', 'y0', 'z0', 'vx0', 'vy0', 'vz0', 'ax0', 'ay0', 'az0' 
]
model_data_trackman = trackman_clean[ball_flight_cols]
model_data_rapsodo = rapsodo_clean[ball_flight_cols]
model_data_ball = pd.concat([model_data_trackman, model_data_rapsodo])

# drop NAs
model_data_ball.dropna(subset='pitchType', inplace=True)

# merge data together
    # 2,986 pitches; 153 pitchers
model_data_final = subject_info.merge(model_data_biomech, on='subject_id', how='left').merge(model_data_ball, on='study_id', how='inner')

In [363]:
# save to csv, write to S3
model_data_final.to_csv('model_data_raw.csv', index=False)
aws_connection.upload_to_s3(model_data_final, 'subjects/summary/model_data.csv')

[AWS]: Uploaded object to s3://pitch-ml/subjects/summary/model_data.csv


$\textbf{Modeling: Peak Elbow Moment Magnitude (Development)}$

In [None]:
## FINAL DATA PREPROCESSING
    # normalize torque values
    # mirror LHP columns to match RHP
    # scale to metric system

In [365]:
# normalize torque values
model_data_final['normalized_peak_moment'] = model_data_final['peak_elbow_magnitude'] / (model_data_final['height'] * model_data_final['mass'] * 9.81)

# mirror LHP columns to match RHP
cols_to_mirror = ['x0', 'vx0', 'ax0', 'relSide', 'horzBreak', 'horzRelAngle']
model_data_final = mirror_columns(model_data_final, cols_to_mirror).reset_index(drop=True)

# convert to metric system
conversion_factors = {
    'relSpeed': 0.44704,    # mph to m/s
    'relSide': 0.3048,      # ft to m
    'relHeight': 0.3048,    # ft to m
    'ax0': 0.3048,          # ft/s^2 to m/s^2
    'ay0': 0.3048,          # ft/s^2 to m/s^2
    'az0': 0.3048,          # ft/s^2 to m/s^2
}

# iterate through conversion factors and apply them in each df
for col, factor in conversion_factors.items():
    if col in model_data_final.columns:
        model_data_final[col] *= factor
    if col in model_data_final.columns:
        model_data_final[col] *= factor

$\textit{Data Scaling}$

In [366]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
scale_fts = ['relSide', 'relHeight', 'spinRate', 'spinAxis', 'relSpeed', 'ax0', 'ay0', 'az0'] + ['wrist_flex', 'wrist_dev', 'pro_sup', 'elbow_flex']

# fit scaler to data & apply
scaler.fit(model_data_final[scale_fts])
model_data_final[scale_fts] = scaler.transform(model_data_final[scale_fts])

# save the processed datasets
model_data_final.to_csv('model_data_processed.csv', index=False)
aws_connection.upload_to_s3(model_data_final, 'subjects/summary/model_data_processed.csv')

[AWS]: Uploaded object to s3://pitch-ml/subjects/summary/model_data_processed.csv


$\textit{Model Development}$

Implemented as follows. 

_For each subject_:
- Train baseline LR, RF models on all other subjects to estimate peak EVT, store errors
- Train LR, RF models on all other subjects to estimate kinematics at ball release, store errors; use model with lower error
- Train engineered LR, RF models to estimate peak EVT, store errors

This loop is more concise than previous iterations and makes it easier to prevent data leakage.

In [408]:
import numpy as np
import traceback
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [None]:
def train_subject_models(
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
    target: str = 'normalized_peak_moment'
) -> dict:
    
    """ Train models for each subject using training and validation data. 
    
    **Parameters**
    - train_data: DataFrame containing training data for each subject.
    - val_data: DataFrame containing validation data for each subject.
    - target: The target variable to predict (default is 'normalized_peak_value').

    **Returns**
    - results: Dictionary containing trained models and their performance metrics.
    """

    # initialize results dictionary, feature lists
    results = {}
    baseline_features = [
        'relSide', 'relHeight', 'spinRate', 'spinAxis', 'relSpeed', 'ax0', 'ay0', 'az0'
    ]
    eng_features = baseline_features + ['pred_wrist_flex', 'pred_wrist_dev', 'pred_pro_sup', 'pred_elbow_flex']

    """ BASELINE MODELS """
    # linear model
    baseline_lr = train_linear_model(train_data, val_data, baseline_features, target)
    results['baseline_lr'] = baseline_lr

    # random forest
    baseline_rf = train_rf_model(train_data, val_data, baseline_features, target, retrain=True)
    results['baseline_rf'] = baseline_rf

    """ KINEMATIC MODELS """
    # define kinematic features for each outcome, then train each
    kinematic_fts = {
        'wrist_flex': baseline_features,
        'wrist_dev': baseline_features,
        'pro_sup': baseline_features + ['pred_wrist_flex', 'pred_wrist_dev'],
        'elbow_flex': baseline_features + ['pred_wrist_flex', 'pred_pro_sup']
    }
    for outcome, fts in kinematic_fts.items():

        # train linear model, extract train preds --> add to train_data as pred_[kinematic]
        lr_kinematic_model = train_linear_model(train_data, val_data, fts, outcome)
        train_data.loc[:, f'pred_{outcome}'] = lr_kinematic_model['model'].predict(train_data[fts])
        val_data.loc[:, f'pred_{outcome}'] = lr_kinematic_model['model'].predict(val_data[fts])
        
        # train random forest model, extract train preds --> add to train_data as pred_[kinematic]
        rf_kinematic_model = train_rf_model(train_data, val_data, fts, outcome)
        train_data.loc[:, f'pred_{outcome}'] = rf_kinematic_model['model'].predict(train_data[fts])
        val_data.loc[:, f'pred_{outcome}'] = rf_kinematic_model['model'].predict(val_data[fts])

    """ ENGINEERED MODELS """
    # linear model
    eng_lr = train_linear_model(train_data, val_data, eng_features, target)
    results['eng_lr'] = eng_lr
    
    # random forest
    eng_rf = train_rf_model(train_data, val_data, eng_features, target, retrain=True)
    results['eng_rf'] = eng_rf
    
    # return dictionary w/ models & results, errors
    return results 

# linear model training
    # val_data: subject data for validation
def train_linear_model(
        train_data: pd.DataFrame,
        val_data: pd.DataFrame,
        features: list,
        target: str
) -> dict:
    lr_model = LinearRegression()                                               # initialize linear regression model
    lr_model.fit(train_data[features], train_data[target])       # fit model to training data

    # get validation predictions, error (RMSE) in original units
    val_predictions = lr_model.predict(val_data[features]) * val_data['height'] * val_data['mass'] * 9.81
    val_errors = abs(val_predictions - (val_data[target] * val_data['height'] * val_data['mass'] * 9.81))
    val_rmse = root_mean_squared_error(
        val_data[target] * val_data['height'] * val_data['mass'] * 9.81,
        val_predictions
    )

    return {
        'model': lr_model,
        'predictions': val_predictions,
        'errors': val_errors,
        'rmse': val_rmse
    }

# random forest model training
    # val_data: subject data for validation
def train_rf_model(
        train_data: pd.DataFrame,
        val_data: pd.DataFrame,
        features: list,
        target: str,
        params: dict = {'n_estimators': 250, 'random_state': 42},
        retrain: bool = False
):
    rf_model = RandomForestRegressor(**params)                            # initialize random forest model
    
    if retrain:
        retrain_data = pd.concat([train_data, val_data])
        rf_model.fit(retrain_data[features], retrain_data[target])
    else:
        rf_model.fit(train_data[features], train_data[target])

    # get validation predictions, error (RMSE) in original units
    val_predictions = rf_model.predict(val_data[features]) * val_data['height'] * val_data['mass'] * 9.81
    val_errors = abs(val_predictions - (val_data[target] * val_data['height'] * val_data['mass'] * 9.81))
    val_rmse = root_mean_squared_error(
        val_data[target] * val_data['height'] * val_data['mass'] * 9.81,
        val_predictions
    )

    return {
        'model': rf_model,
        'predictions': val_predictions,
        'errors': val_errors,
        'rmse': val_rmse
    }


In [410]:
# get list of all subjects (153 total)
subject_list = model_data_final['subject_id'].unique()

# initialize all results storage
subject_results = {}
model_errors = {
    'baseline_lr': [],
    'baseline_rf': [],
    'eng_lr': [],
    'eng_rf': []
}
model_rmse = {
    'baseline_lr': [],
    'baseline_rf': [],
    'eng_lr': [],
    'eng_rf': []
}
error_log = []

# iterate through subjects
for subject_id in subject_list[0:1]:
    try:
        print(f"Training models for holdout subject {subject_id}...", end='\r', flush=True)
        
        # setup LOOCV training/validation data
        train_data = model_data_final[model_data_final['subject_id'] != subject_id]
        val_data = model_data_final[model_data_final['subject_id'] == subject_id]

        # train models holding out subject
        subject_summary = train_subject_models(train_data, val_data) 
        
        # store results
        subject_results[subject_id] = subject_summary
        for model_name, model_info in subject_summary.items():
            if model_info is not None:
                model_errors[model_name].append(model_info['errors'])
                model_rmse[model_name].append(model_info['rmse'])

        # log error updates:
        print(f"Holdout model for subject {subject_id} trained successfully.")

    except Exception as e:
        print(f"Error training model for subject {subject_id}: {e}")
        error_log.append({
            'subject_id': subject_id,
            'error': str(e)
        })
        traceback.print_exc()

Holdout model for subject 2609 trained successfully.


In [411]:
model_rmse

{'baseline_lr': [12.91052935499846],
 'baseline_rf': [4.857586109528131],
 'eng_lr': [13.721727079228776],
 'eng_rf': [5.331295288970638]}

In [412]:
# close AWS connection
aws_connection.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
