In [1]:
import io
import numpy as np
import pandas as pd
from connections import AWS

$\textbf{Data Loading}$

In [2]:
# INITIALIZE AWS CONNECTION
# A single connection object is created here and reused by the cells below
# for S3 reads/writes and for loading subject info.
aws_connection = AWS()
aws_connection.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


In [None]:
# NOTE(review): in the visible notebook `model_data` is first assigned in the
# cell that follows this one, so on a fresh Restart & Run All this cell raises
# NameError. It should be moved below the data-loading cell (or removed).
# The saved output also shows a multi-column frame rather than this single
# column, i.e. the output is stale relative to the code.
model_data['peak_value']

Unnamed: 0,subject_id,study_id,peak_time,peak_value,peak_value_normalized,pitcher_throws_rh,pitch_type,rel_speed,vra,hra,...,humerus_ty,humerus_tz,elbow_flex,pro_sup,wrist_flex,wrist_dev,age,height,mass,throws
0,2609,2609_01,1.0164,101.484332,0.060541,1.0,FastBall,72.60000,-0.34000,-1.81000,...,1.294678,-0.250745,29.630059,135.869487,-5.616038,-12.609433,31.1,1.8796,90.91,right
1,2609,2609_03,1.0983,103.365228,0.061664,1.0,FastBall,72.80000,-0.82000,-2.47000,...,1.314900,-0.238200,31.186091,140.355943,-4.449118,-12.425181,31.1,1.8796,90.91,right
2,2609,2609_04,1.0668,107.313274,0.064019,1.0,FastBall,74.70000,-1.75000,-2.48000,...,1.304172,-0.262817,30.968620,143.873512,-3.385677,-14.516030,31.1,1.8796,90.91,right
3,2609,2609_05,1.1004,106.556224,0.063567,1.0,FastBall,73.80000,-1.22000,-2.50000,...,1.314514,-0.237082,30.215193,141.216140,-2.538984,-13.143293,31.1,1.8796,90.91,right
4,2609,2609_06,1.0416,130.399054,0.077791,1.0,FastBall,86.70000,-1.79000,-1.34000,...,1.241214,-0.307289,25.979811,133.388930,-5.123937,-17.360608,31.1,1.8796,90.91,right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2896,3055,3055_19,1.0626,165.663780,0.092341,1.0,Fastball,93.25796,-2.74313,-1.81866,...,1.172402,-0.370724,32.800065,132.161434,-18.388067,-45.536538,33.2,1.8288,100.00,right
2897,3055,3055_20,0.9030,163.981454,0.091403,1.0,Fastball,93.32194,-0.40567,-1.75552,...,1.206003,-0.362612,29.713630,123.865520,-23.746208,-42.178730,33.2,1.8288,100.00,right
2898,3055,3055_22,0.9807,161.702287,0.090132,1.0,Fastball,93.29086,-2.27179,-1.82887,...,1.192141,-0.421373,31.825301,127.511585,-21.555815,-37.909222,33.2,1.8288,100.00,right
2899,3055,3055_24,0.9303,164.004473,0.091416,1.0,Splitter,86.85003,-0.61324,-1.83149,...,1.178288,-0.335371,33.719209,130.725457,-14.034728,-33.711026,33.2,1.8288,100.00,right


In [None]:
# S3 keys of the development datasets (unscaled and scaled variants)
RAW_DATA_KEY = 'biomechanics/ml/modeling_datasets/model_dev_raw.csv'
SCALED_DATA_KEY = 'biomechanics/ml/modeling_datasets/model_dev_scaled.csv'

# pull both datasets from S3 as DataFrames
model_data = aws_connection.load_s3_object(RAW_DATA_KEY, as_dataframe=True)
model_data_scaled = aws_connection.load_s3_object(SCALED_DATA_KEY, as_dataframe=True)

# attach subject-level info to every row of the unscaled data
subject_info = aws_connection.load_subject_info()
model_data = model_data.merge(subject_info, on='subject_id', how='left')

# normalize torque values by height * mass * 9.81 and place the result at column position 4
# NOTE(review): only `model_data` receives the merge and the normalized column in this cell;
# `model_data_scaled` is left exactly as loaded -- confirm it already carries the columns
# that later cells read from it.
torque_normalizer = model_data['height'] * model_data['mass'] * 9.81
model_data.insert(
    4,
    'peak_value_normalized',
    model_data['peak_value'] / torque_normalizer
)

$\textbf{Model Development}$

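Models are developed with leave-one-subject-out cross-validation. __For each held-out subject__:
- Train baseline linear regression (LR) and random forest (RF) models on all other subjects to estimate peak EVT; store the errors
- Train LR and RF models on all other subjects to estimate kinematics at ball release; store the errors and use the model with the lower error
- Train engineered LR and RF models (baseline features plus the predicted kinematics) to estimate peak EVT; store the errors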

This loop is more concise than previous iterations and makes it easier to prevent data leakage.

In [15]:
import traceback
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [None]:
# looper function for training models on a heldout subject
    # NOTE: errors are reported in original units (Nm)
def train_subject_models(
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
    target: str = 'peak_value_normalized'
) -> dict:
    
    """ Train baseline, kinematic, and engineered models for one held-out subject.
    
    Args
    - train_data: DataFrame of rows used for fitting (in this notebook: all other subjects).
    - val_data: DataFrame of rows used for validation (in this notebook: the held-out subject).
    - target: The target variable to predict (default is 'peak_value_normalized').

    Returns
    - results: Dictionary keyed by 'baseline_lr', 'baseline_rf', 'eng_lr', 'eng_rf'. Each value is
      the dictionary returned by train_linear_model / train_rf_model
      (keys 'model', 'predictions', 'errors', 'rmse').

    Notes
    - The inputs are copied before the `pred_*` columns are added, so the caller's frames
      (typically filtered slices of a larger frame) are never modified.
    - Kinematic models are used only to create the `pred_*` features; they are not returned.
    """

    # work on private copies: the pred_* columns below must not leak into (or fail to
    # reach, under copy-on-write) the frames owned by the caller
    train_data = train_data.copy()
    val_data = val_data.copy()

    # initialize results dictionary, feature lists
    results = {}
    baseline_features = [
        'rel_speed', 'rel_side', 'rel_ht', 'spin_rate', 'spin_axis', 'ax0', 'ay0', 'az0'
    ]
    eng_features = baseline_features + ['pred_wrist_flex', 'pred_wrist_dev', 'pred_pro_sup', 'pred_elbow_flex']

    # ---- BASELINE MODELS ----
    # linear model
    baseline_lr = train_linear_model(train_data, val_data, baseline_features, target)
    results['baseline_lr'] = baseline_lr

    # random forest
    baseline_rf = train_rf_model(train_data, val_data, baseline_features, target, retrain=True)
    results['baseline_rf'] = baseline_rf

    # ---- KINEMATIC MODELS ----
    # define kinematic features for each outcome, then train each
    # (dict order matters: later outcomes consume pred_* columns created by earlier ones)
    kinematic_fts = {
        'wrist_flex': baseline_features,
        'wrist_dev': baseline_features,
        'pro_sup': baseline_features + ['pred_wrist_flex', 'pred_wrist_dev'],
        'elbow_flex': baseline_features + ['pred_wrist_flex', 'pred_pro_sup']
    }
    for outcome, fts in kinematic_fts.items():

        # fit both candidate models for this kinematic outcome
        lr_kinematic_model = train_linear_model(train_data, val_data, fts, outcome)
        rf_kinematic_model = train_rf_model(train_data, val_data, fts, outcome)

        # keep the model with the lower validation RMSE; RF wins ties (and NaN comparisons),
        # which matches the previous behaviour of always ending up with the RF predictions
        # NOTE(review): this selection uses the held-out subject's own error -- confirm that
        # is acceptable, or move the selection to an inner split of train_data.
        if lr_kinematic_model['rmse'] < rf_kinematic_model['rmse']:
            best_kinematic_model = lr_kinematic_model['model']
        else:
            best_kinematic_model = rf_kinematic_model['model']

        # add predictions from the selected model as pred_[kinematic] to both frames
        train_data[f'pred_{outcome}'] = best_kinematic_model.predict(train_data[fts])
        val_data[f'pred_{outcome}'] = best_kinematic_model.predict(val_data[fts])

    # ---- ENGINEERED MODELS ----
    # linear model
    eng_lr = train_linear_model(train_data, val_data, eng_features, target)
    results['eng_lr'] = eng_lr
    
    # random forest
    eng_rf = train_rf_model(train_data, val_data, eng_features, target, retrain=True)
    results['eng_rf'] = eng_rf
    
    # return dictionary w/ models & results, errors
    return results 

# linear model training
    # val_data: subject data for validation
def train_linear_model(
        train_data: pd.DataFrame,
        val_data: pd.DataFrame,
        features: list,
        target: str
) -> dict:
    """ Fit a LinearRegression on train_data and score it on val_data.

    Args
    - train_data: DataFrame providing the `features` columns and the `target` column for fitting.
    - val_data: DataFrame providing `features`, `target`, 'height' and 'mass' for scoring.
    - features: list of column names used as predictors.
    - target: name of the column to predict.

    Returns
    - dict with the fitted 'model', the validation 'predictions', the absolute 'errors'
      and the validation 'rmse'. Predictions and targets are both multiplied by
      height * mass * 9.81 before the errors are computed, whatever `target` is.
    """

    def to_original_units(values):
        # undo the height * mass * 9.81 normalization, row by row
        return values * val_data['height'] * val_data['mass'] * 9.81

    # fit on the training rows only
    lr_model = LinearRegression().fit(train_data[features], train_data[target])

    # validation predictions and ground truth, both rescaled
    val_predictions = to_original_units(lr_model.predict(val_data[features]))
    val_truth = to_original_units(val_data[target])

    # per-row absolute error and overall RMSE
    val_errors = (val_predictions - val_truth).abs()
    val_rmse = root_mean_squared_error(val_truth, val_predictions)

    return {
        'model': lr_model,
        'predictions': val_predictions,
        'errors': val_errors,
        'rmse': val_rmse
    }

# random forest model training
    # val_data: subject data for validation
def train_rf_model(
        train_data: pd.DataFrame,
        val_data: pd.DataFrame,
        features: list,
        target: str,
        params: dict = None,
        retrain: bool = False
) -> dict:
    """ Fit a RandomForestRegressor on train_data and score it on val_data.

    Args
    - train_data: DataFrame providing the `features` columns and the `target` column for fitting.
    - val_data: DataFrame providing `features`, `target`, 'height' and 'mass' for scoring.
    - features: list of column names used as predictors.
    - target: name of the column to predict.
    - params: keyword arguments for RandomForestRegressor
      (default: {'n_estimators': 250, 'random_state': 42}).
    - retrain: if True, the returned 'model' is a second forest fit on train_data + val_data.

    Returns
    - dict with 'model', validation 'predictions', absolute 'errors' and validation 'rmse'.
      Predictions and targets are both multiplied by height * mass * 9.81 before the errors
      are computed, whatever `target` is.

    Notes
    - The validation metrics ALWAYS come from a forest fit on train_data only. Previously,
      with retrain=True, the forest was fit on the validation rows and then scored on those
      same rows, which leaked the held-out data into the reported errors.
    - With retrain=True two forests are fit, so the call costs roughly twice as much.
    """
    # avoid a shared mutable default argument
    if params is None:
        params = {'n_estimators': 250, 'random_state': 42}

    # forest used for evaluation: never sees the validation rows
    rf_model = RandomForestRegressor(**params)                            # initialize random forest model
    rf_model.fit(train_data[features], train_data[target])

    # get validation predictions, error (RMSE) in original units
    val_predictions = rf_model.predict(val_data[features]) * val_data['height'] * val_data['mass'] * 9.81
    val_errors = abs(val_predictions - (val_data[target] * val_data['height'] * val_data['mass'] * 9.81))
    val_rmse = root_mean_squared_error(
        val_data[target] * val_data['height'] * val_data['mass'] * 9.81,
        val_predictions
    )

    # optionally refit on every available row; only the returned model changes,
    # the metrics above stay out-of-sample
    if retrain:
        retrain_data = pd.concat([train_data, val_data])
        rf_model = RandomForestRegressor(**params)
        rf_model.fit(retrain_data[features], retrain_data[target])

    return {
        'model': rf_model,
        'predictions': val_predictions,
        'errors': val_errors,
        'rmse': val_rmse
    }


In [None]:
# get list of all subjects (n = 150)
subject_list = model_data_scaled['subject_id'].unique()

# initialize all results storage
#   subject_results: subject_id -> dictionary returned by train_subject_models
#   model_errors / model_rmse: model name -> one entry per successfully trained subject
#   error_log: one record per subject whose training raised an exception
subject_results = {}
model_errors = {
    'baseline_lr': [],
    'baseline_rf': [],
    'eng_lr': [],
    'eng_rf': []
}
model_rmse = {
    'baseline_lr': [],
    'baseline_rf': [],
    'eng_lr': [],
    'eng_rf': []
}
error_log = []

# iterate through subjects (leave-one-subject-out)
for subject_id in subject_list:
    try:
        print(f"Training models for holdout subject {subject_id}...", end='\r', flush=True)
        
        # setup training/validation data for this held-out subject
        # .copy() gives train_subject_models independent frames: it adds pred_* columns,
        # which on a filtered slice triggers SettingWithCopyWarning / unreliable writes
        is_holdout = model_data_scaled['subject_id'] == subject_id
        train_data = model_data_scaled[~is_holdout].copy()
        val_data = model_data_scaled[is_holdout].copy()

        # train models holding out subject
        subject_summary = train_subject_models(train_data, val_data) 
        
        # store results
        subject_results[subject_id] = subject_summary
        for model_name, model_info in subject_summary.items():
            if model_info is not None:
                model_errors[model_name].append(model_info['errors'])
                model_rmse[model_name].append(model_info['rmse'])

        # log error updates:
        print(f"Holdout model for subject {subject_id} trained successfully.")

    except Exception as e:
        # keep going with the remaining subjects, but record and show what failed
        print(f"Error training model for subject {subject_id}: {e}")
        error_log.append({
            'subject_id': subject_id,
            'error': str(e)
        })
        traceback.print_exc()

Holdout model for subject 2609 trained successfully.
Holdout model for subject 2610 trained successfully.
Holdout model for subject 2611 trained successfully.
Holdout model for subject 2612 trained successfully.
Holdout model for subject 2613 trained successfully.
Holdout model for subject 2614 trained successfully.
Holdout model for subject 2616 trained successfully.
Holdout model for subject 2618 trained successfully.
Holdout model for subject 2619 trained successfully.
Holdout model for subject 2621 trained successfully.
Holdout model for subject 2622 trained successfully.
Holdout model for subject 2623 trained successfully.
Holdout model for subject 2624 trained successfully.
Holdout model for subject 2625 trained successfully.
Holdout model for subject 2627 trained successfully.
Holdout model for subject 2628 trained successfully.
Holdout model for subject 2629 trained successfully.
Holdout model for subject 2630 trained successfully.
Holdout model for subject 2631 trained success

$\textbf{Upload to S3}$

In [48]:
import pickle

In [54]:
# S3 UPLOADS
# subject-specific dictionaries: each subject's results are pickled to one local
# scratch file (overwritten on every iteration) and uploaded right away
local_pkl_path = 'subject_model_summary.pkl'
for subject_id, results in subject_results.items():
    # save to pickle in local directory
    with open(local_pkl_path, 'wb') as f:
        pickle.dump(results, f)

    # upload subject results to S3 under the subject's own prefix
    s3_dest = f'biomechanics/subjects/{subject_id}/ml/model_summary.pkl'
    aws_connection.s3.upload_file(local_pkl_path, aws_connection.bucket_name, s3_dest)

In [None]:
# results summaries: (object to pickle, local file name, S3 destination key)
summary_uploads = [
    (model_errors, 'model_errors.pkl', 'biomechanics/ml/modeling_summary/model_errors.pkl'),
    (model_rmse, 'model_rmse.pkl', 'biomechanics/ml/modeling_summary/model_rmse.pkl'),
]

# write every pickle locally first ...
for payload, local_name, _ in summary_uploads:
    with open(local_name, 'wb') as f:
        pickle.dump(payload, f)

# ... then upload them in the same order
for _, local_name, s3_key in summary_uploads:
    aws_connection.s3.upload_file(
        local_name,
        aws_connection.bucket_name,
        s3_key
    )


$\textbf{Sandbox: Data Summary}$

Averages are reported in publications as follows: 
- LHP mirrored to match RHP
- Ball tracking converted to metric system

In [43]:
from evt_model_functions import *

In [69]:
# mirror LHP columns to match RHP
cols_to_mirror = ['x0', 'vx0', 'ax0', 'rel_side', 'hb', 'hra']
summary_data = model_data.copy()      # copy so the unit conversion below never touches model_data
summary_data = mirror_columns(summary_data, cols_to_mirror).reset_index(drop=True)

# convert to metric system
conversion_factors = {
    'rel_speed': 0.44704,    # mph to m/s
    'rel_side': 0.3048,      # ft to m
    'rel_ht': 0.3048,        # ft to m
    'ax0': 0.3048,           # ft/s^2 to m/s^2
    'ay0': 0.3048,           # ft/s^2 to m/s^2
    'az0': 0.3048,           # ft/s^2 to m/s^2
}

# apply each conversion factor exactly once
# (the loop body used to be duplicated, so every column was scaled by factor ** 2)
for col, factor in conversion_factors.items():
    if col in summary_data.columns:
        summary_data[col] *= factor

# get avg and stdev for each column
summary_columns = [
    'height', 'mass', 'peak_value_normalized', 'peak_value', 'elbow_flex',
    'rel_speed', 'rel_side', 'rel_ht', 'spin_rate', 'spin_axis',
    'ax0', 'ay0', 'az0'
]
summary_data_avgs = summary_data.agg(
    {col: ['mean', 'std'] for col in summary_columns}
).reset_index()

# upload the summary data to S3
aws_connection.upload_to_s3(
    summary_data,
    'biomechanics/ml/modeling_datasets/model_data_converted.csv'
)
aws_connection.upload_to_s3(
    summary_data_avgs,
    'biomechanics/ml/modeling_summary/model_data_avgs.csv'
)

[AWS]: Uploaded object to s3://pitch-ml/biomechanics/ml/modeling_datasets/model_data_converted.csv
[AWS]: Uploaded object to s3://pitch-ml/biomechanics/ml/modeling_summary/model_data_avgs.csv


$\textbf{Close AWS Connection}$

In [58]:
# close AWS connection
# Run last: every cell that reads from or uploads to S3 needs the connection open.
# The saved output shows this closes the database connection and stops the SSH tunnel.
aws_connection.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
