In [1]:
import io
import numpy as np
import pandas as pd
from connections import AWS

$\textbf{Data Loading}$

In [2]:
""" INITIALIZE AWS CONNECTION """
# Create the shared AWS handle; every S3 load, RDS query and upload below goes through it.
aws_connection = AWS()
# Open the connection (the recorded output shows a port check followed by an RDS endpoint connection).
aws_connection.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


$\textit{Outputs}$

In [None]:
# Pull the full peaks table (the modeling outcomes) from S3.
outcomes = aws_connection.load_s3_object(
    'biomechanics/modeling_datasets/jra_peaks_all.csv',
    as_dataframe=True,
)

# Keep only rows not flagged as outliers, reduced to identifiers + peak time/value,
# and renumber the index from zero.
    # NOTE(review): the author's note cites 3,304 pitches, but the recorded preview of
    # outcomes_final ends at index 3302 (i.e. 3,303 rows) -- TODO confirm the current count
outcome_cols = ['subject_id', 'study_id', 'peak_time', 'peak_value']
is_inlier = outcomes['outlier_flag'] == 0
outcomes_final = outcomes.loc[is_inlier, outcome_cols].reset_index(drop=True)

In [112]:
# Preview the filtered outcomes table (identifiers, peak time and peak value per trial).
outcomes_final

Unnamed: 0,subject_id,study_id,peak_time,peak_value
0,2609,2609_01,1.0164,101.484332
1,2609,2609_03,1.0983,103.365228
2,2609,2609_04,1.0668,107.313274
3,2609,2609_05,1.1004,106.556224
4,2609,2609_06,1.0416,130.399054
...,...,...,...,...
3299,3055,3055_22,0.9807,161.702287
3300,3055,3055_23,0.9954,158.699661
3301,3055,3055_24,0.9303,164.004473
3302,3055,3055_25,0.9471,164.598229


$\textit{Inputs}$

Note that ball tracking data are in the RDS, not just S3.

In [None]:
""" BALL TRACKING DATA"""
# Both queries project the same aliased column set so the two result sets can be
# stacked row-wise; `from_rapsodo` (0/1) records which table a row came from.

# Trackman: handedness comes from the `pitcherthrows` label. Anything other than
# 'Right' (including NULL) is coded 0 by the ELSE branch.
trackman_query = """
    SELECT
    "study_id"
    , CASE 
        WHEN "pitcherthrows" = 'Right' THEN 1
        ELSE 0
    END AS pitcher_throws_rh
    , "taggedpitchtype" AS pitch_type
    , "relspeed" AS rel_speed
    , "vertrelangle" AS vra
    , "horzrelangle" AS hra
    , "spinrate" AS spin_rate
    , "spinaxis" AS spin_axis
    , "relheight" AS rel_ht
    , "relside" AS rel_side
    , "inducedvertbreak" AS ivb
    , "horzbreak" AS hb
    , "x0"
    , "y0"
    , "z0"
    , "vx0"
    , "vy0"
    , "vz0"
    , "ax0"
    , "ay0"
    , "az0"
    , 0 as from_rapsodo

FROM
    motus.ball.trackman
"""

# Rapsodo: this query has no handedness label, so it is inferred from the sign of
# `relside` (> 0 -> 1). Zero, negative and NULL values all fall through to 0.
    # NOTE(review): assumes positive relside means a right-handed release -- TODO confirm
rapsodo_query = """
    SELECT
    "study_id"
    , CASE 
        WHEN "relside" > 0 THEN 1
        ELSE 0
    END AS pitcher_throws_rh
    , "pitchtype" AS pitch_type
    , "relspeed" AS rel_speed
    , "vertrelangle" AS vra
    , "horzrelangle" AS hra
    , "spinrate" AS spin_rate
    , "spinaxis" AS spin_axis
    , "relheight" AS rel_ht
    , "relside" AS rel_side
    , "inducedvertbreak" AS ivb
    , "horzbreak" AS hb
    , "x0"
    , "y0"
    , "z0"
    , "vx0"
    , "vy0"
    , "vz0"
    , "ax0"
    , "ay0"
    , "az0"
    , 1 as from_rapsodo

FROM
    motus.ball.rapsodo
"""

# Run both queries against the database connection.
trackman = aws_connection.run_query(trackman_query)
rapsodo = aws_connection.run_query(rapsodo_query)

# Stack Trackman rows on top of Rapsodo rows under a fresh 0..n-1 index.
ball_tracking_data = pd.concat([trackman, rapsodo], axis=0, ignore_index=True)

In [None]:
""" IK DATA """
# Ball release time per trial; needed later to pick the IK frame at release.
ball_release_times = aws_connection.load_s3_object(
    'biomechanics/modeling_datasets/ball_release_times.csv',
    as_dataframe=True,
)

# List everything under the subjects prefix, then keep only the keys that
# contain the IK results file name.
subject_files = aws_connection.list_s3_objects(prefix='biomechanics/subjects/')
ik_results_files = list(filter(lambda key: 'ik_results.csv' in key, subject_files))

In [65]:
# Canonical IK column names; assigned positionally to every IK file loaded below,
# so each file must have exactly this many columns in this order.
ik_columns = [
    'time', 'arm_flex', 'arm_add', 'arm_rot', 'humerus_tx',
    'humerus_ty', 'humerus_tz', 'elbow_flex', 'pro_sup',
    'wrist_flex', 'wrist_dev'
]

# Tolerance (seconds) for approximately matching an IK frame's time to an event time.
time_eps = 0.001

# Build the per-trial lookups once, instead of re-scanning both frames for every file.
    # drop_duplicates keeps the first row per study_id -- the same row `.values[0]` selects
brt_by_study = (
    ball_release_times
    .drop_duplicates(subset='study_id')
    .set_index('study_id')['ball_release_time']
)
peak_time_by_study = (
    outcomes_final
    .drop_duplicates(subset='study_id')
    .set_index('study_id')['peak_time']
)

# iterate through and load IK files
ik_results = []  # IK rows at ball release (one slice per trial)
ik_peak = []     # IK rows at peak time (one slice per trial)
for file in ik_results_files:
    # study ID = file name up to the '_ik' suffix
    study_id = file.split('/')[-1].split('_ik')[0]

    # skip trials that were removed from (or never present in) the outcomes
    if study_id not in peak_time_by_study.index:
        continue

    # fail before the download, and with a message that names the trial,
    # rather than with a bare IndexError after it
    if study_id not in brt_by_study.index:
        raise KeyError(
            f"No ball release time found for study_id '{study_id}' (IK file: {file})"
        )

    # load and update columns
    ik_data = aws_connection.load_s3_object(file, as_dataframe=True)
    ik_data.columns = ik_columns
    ik_data.insert(0, 'study_id', study_id)

    # IK frame(s) within tolerance of ball release
    trial_brt = brt_by_study.at[study_id]
    ik_data_brt = ik_data[(ik_data['time'] - trial_brt).abs() < time_eps]

    # IK frame(s) within tolerance of the outcome's peak time
    peak_time = peak_time_by_study.at[study_id]
    ik_data_pt = ik_data[(ik_data['time'] - peak_time).abs() < time_eps]

    # append to results
        # NOTE(review): a trial with no frame inside the tolerance contributes zero rows here
    ik_results.append(ik_data_brt)
    ik_peak.append(ik_data_pt)

# concatenate all IK data together
ik_br_final = pd.concat(ik_results, axis=0).reset_index(drop=True)
ik_peak_final = pd.concat(ik_peak, axis=0).reset_index(drop=True)
    

In [79]:
""" SAVE TO S3 """
# Each intermediate table paired with its destination key, in upload order.
s3_exports = [
    (outcomes_final, 'biomechanics/modeling_datasets/final_outcomes.csv'),
    (ball_tracking_data, 'biomechanics/modeling_datasets/final_ball_tracking_data.csv'),
    (ik_br_final, 'biomechanics/modeling_datasets/final_ik_ball_release.csv'),
    (ik_peak_final, 'biomechanics/modeling_datasets/final_ik_peak_evt.csv'),
]

# Upload one table at a time through the shared connection.
for frame, s3_key in s3_exports:
    aws_connection.upload_to_s3(frame, s3_key)


[AWS]: Uploaded object to s3://pitch-ml/biomechanics/modeling_datasets/final_outcomes.csv
[AWS]: Uploaded object to s3://pitch-ml/biomechanics/modeling_datasets/final_ball_tracking_data.csv
[AWS]: Uploaded object to s3://pitch-ml/biomechanics/modeling_datasets/final_ik_ball_release.csv
[AWS]: Uploaded object to s3://pitch-ml/biomechanics/modeling_datasets/final_ik_peak_evt.csv


In [97]:
# Step 1: attach ball-tracking features to each outcome row.
    # left join --> every outcome row is kept; missing tracking data shows up as NaN
    # NOTE(review): neither join validates key uniqueness; repeated study_id values in a
    # right-hand frame would multiply outcome rows -- consider `validate=` if that matters
outcomes_with_ball = pd.merge(
    outcomes_final,
    ball_tracking_data,
    how='left',
    on='study_id',
)

# Step 2: attach the IK frame taken at ball release.
model_dev_data = pd.merge(
    outcomes_with_ball,
    ik_br_final,
    how='left',
    on='study_id',
)

# Clean: drop every row that has a missing value in any column, then renumber.
model_dev_clean = model_dev_data.dropna().reset_index(drop=True)

# Upload the cleaned (not yet scaled) frame; note the key is named "model_dev_raw".
aws_connection.upload_to_s3(
    model_dev_clean,
    'biomechanics/modeling_datasets/model_dev_raw.csv',
)


[AWS]: Uploaded object to s3://pitch-ml/biomechanics/modeling_datasets/model_dev_raw.csv


$\textbf{Scale Data}$

This is the final preprocessing step; it is handled here to keep the model development notebook concise.

In [95]:
from sklearn.preprocessing import StandardScaler

In [98]:
# Columns to standardize, grouped by source (order is the scaler's feature order).
ball_tracking_cols = [
    'rel_speed', 'vra', 'hra',
    'spin_rate', 'spin_axis', 'rel_ht',
    'rel_side', 'ivb', 'hb',
    'x0', 'y0', 'z0',
    'vx0', 'vy0', 'vz0',
    'ax0', 'ay0', 'az0',
]
    # NOTE(review): only these four IK columns are scaled; the arm_* and humerus_*
    # columns are left unscaled -- TODO confirm that is intentional
ik_cols = ['elbow_flex', 'pro_sup', 'wrist_flex', 'wrist_dev']
scale_cols = ball_tracking_cols + ik_cols

# Fit the scaler on the cleaned frame and transform the same columns in one call.
    # NOTE(review): the scaler is fit on every row of model_dev_clean; if a held-out
    # split is made downstream, its statistics are included in this fit
scaler = StandardScaler()
scaled_values = scaler.fit_transform(model_dev_clean[scale_cols])

# Write the scaled values into a copy so model_dev_clean itself stays unscaled.
model_dev_scaled = model_dev_clean.copy()
model_dev_scaled[scale_cols] = scaled_values

# upload scaled data to S3
aws_connection.upload_to_s3(
    model_dev_scaled,
    'biomechanics/modeling_datasets/model_dev_scaled.csv',
)

[AWS]: Uploaded object to s3://pitch-ml/biomechanics/modeling_datasets/model_dev_scaled.csv


$\textbf{Close AWS Connection}$

In [108]:
# close AWS connection
    # the recorded output shows the database connection closing and the SSH tunnel stopping
aws_connection.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
