In [1]:
import pandas as pd
from connections import AWS

$\textbf{JRA Postprocessing: Outlier Removal}$

- __50 outliers__ in total, plus __2__ noisy subjects
- ~8.7% of all trials were removed during outlier processing

In [2]:
""" INITIALIZE AWS CONNECTION """
# Build the project-local AWS helper and open its connection. Every later
# load/upload cell goes through this single `aws_connection` object.
# NOTE(review): judging by the logged output, connect() checks a local port and
# reaches an RDS endpoint — confirm against the `connections` module.
aws_connection = AWS()
aws_connection.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


In [3]:
# pull the peak summaries and the trial data from S3
# (original note: both are returned as DataFrames; the peaks table carries the outlier information)
all_peaks = aws_connection.load_s3_object('biomechanics/subjects/summary/jra_peaks_all.csv')
all_trials = aws_connection.load_s3_object('biomechanics/subjects/summary/jra_trials_all.csv')

# how many peaks were already flagged upstream (original note: ± 1.96 SD from subject mean)
num_outliers = len(all_peaks[all_peaks['outlier_flag'] == 1])                   # 229 outliers // 3650 trials

# attach the peak metadata to every trial row, then keep only rows whose trial is not flagged
all_trials_meta = pd.merge(all_trials, all_peaks, how='left', on='study_id')
all_trials_clean = all_trials_meta.loc[all_trials_meta['outlier_flag'] == 0]

In [52]:
""" OUTLIERS """
# Subject IDs whose pitches are all dropped in the removal cell.
BAD_SUBJECTS = [2726, 3023]         # --> remove all pitches for these subjects
# Accumulator for study IDs of individual bad pitches; the investigation cells
# below extend it one subject at a time.
# NOTE(review): re-running an investigation cell without re-running this cell
# appends the same IDs again. The later isin() filter is unaffected, but
# len(BAD_TRIALS) would then overcount.
BAD_TRIALS = []

$\textbf{Investigation}$

For each pitcher: generate a spaghetti plot, visually identify an outlier threshold, and append the trials beyond that threshold to `BAD_TRIALS` for removal.

In [53]:
import matplotlib.pyplot as plt

In [54]:
# plot for pitcher-specific spaghetti plots
def plot_subject_elbow_varus_torque(
        subject_id: int,
        df: pd.DataFrame
) -> None:
    """
    Draw one subject's elbow varus torque curves (one line per study_id) against normalized time.

    Reference lines are overlaid for the subject's mean peak, mean peak ± 1.96 SD,
    and the average normalized time at which the peak occurs. If the subject has
    no rows in `df`, a message is printed and nothing is plotted.

    Args:
        subject_id (int): Subject whose trials should be plotted.
        df (pd.DataFrame): Trial rows; the columns read here are `subject_id`, `study_id`,
            `normalized_time`, `elbow_varus_torque`, `peak_value`, `peak_was_negative`
            and `peak_normalized_time`.
    """
    rows = df.loc[df['subject_id'] == subject_id]
    if len(rows) == 0:
        print(f"No data found for subject_id {subject_id}")
        return

    fig, ax = plt.subplots(figsize=(10, 6))

    # one translucent line per trial
    for _study, trial in rows.groupby('study_id'):
        ax.plot(trial['normalized_time'], trial['elbow_varus_torque'], alpha=0.5)

    # flip the sign of the mean peak only when every row is marked as a negative peak
    sign = -1 if rows['peak_was_negative'].mean() == 1 else 1
    mean_peak = rows['peak_value'].mean() * sign
    std_peak = rows['peak_value'].std()
    upper_band = mean_peak + 1.96 * std_peak
    lower_band = mean_peak - 1.96 * std_peak

    ax.axhline(mean_peak, color='black', linestyle='--', label=f'Mean Peak: {mean_peak:.2f} Nm')
    ax.axhline(upper_band, color='red', linestyle='--', alpha=0.7, label=f'±1.96 SD: {std_peak:.2f} Nm')
    ax.axhline(lower_band, color='red', linestyle='--', alpha=0.7)

    avg_peak_time = rows['peak_normalized_time'].mean()
    ax.axvline(avg_peak_time, color='skyblue', linestyle='--', label=f'Avg Peak Normalized Time: {avg_peak_time:.2f}')

    ax.set_xlabel('Normalized Time', fontdict={'fontsize': 14})
    ax.set_ylabel('Elbow Varus Torque', fontdict={'fontsize': 14})
    ax.set_title(f'Elbow Varus Torque vs Normalized Time\nPitcher: {subject_id}')
    ax.legend()
    fig.tight_layout()
    plt.show()

# function to identify outliers based on threshold
def identify_outliers(
        df: pd.DataFrame,
        subject_id: int,
        threshold: float,
        greater_than: bool = False
) -> list:
    """
    Identifies outlier trials for one subject based on a specified torque threshold.

    For every `study_id` belonging to the subject, the trial's extreme
    `elbow_varus_torque` value (max when `greater_than` is True, otherwise min)
    is compared against `threshold`.

    Args:
        df (pd.DataFrame): DataFrame containing the elbow varus torque data; must have
            `subject_id`, `study_id` and `elbow_varus_torque` columns.
        subject_id (int): The ID of the subject to filter.
        threshold (float): The threshold value to identify outliers.
        greater_than (bool): If True, flags trials whose maximum exceeds the threshold;
            otherwise flags trials whose minimum falls below the threshold.

    Returns:
        list: Study IDs considered outliers. Empty when the subject has no rows in `df`
            or no trial crosses the threshold.
    """
    # Filter the frame that was passed in. The previous version ignored `df` and read
    # the notebook-global `all_trials_clean`, so any other frame was silently ignored.
    subject_rows = df[df['subject_id'] == subject_id]
    torque_by_trial = subject_rows.groupby('study_id')['elbow_varus_torque']

    if greater_than:
        extremes = torque_by_trial.max()
        flagged = extremes[extremes > threshold]
    else:
        extremes = torque_by_trial.min()
        flagged = extremes[extremes < threshold]

    return flagged.index.tolist()



In [55]:
# Subject 2640: any pitch that drops below -100 Nm is treated as an outlier.
# (visual check: plot_subject_elbow_varus_torque(2640, all_trials_clean))

# collect the offending study IDs and queue them for removal
outliers_2640 = identify_outliers(
    df=all_trials_clean,
    subject_id=2640,
    threshold=-100,
    greater_than=False,
)
BAD_TRIALS.extend(outliers_2640)

In [56]:
# Subject 2746: one pitch drops below -100 Nm.
# (visual check: plot_subject_elbow_varus_torque(2746, all_trials_clean))

# collect the offending study IDs and queue them for removal
outliers_2746 = identify_outliers(
    df=all_trials_clean,
    subject_id=2746,
    threshold=-100,
    greater_than=False,
)
BAD_TRIALS.extend(outliers_2746)

In [57]:
# Subject 2749: one pitch rises above 185 Nm.
# (visual check: plot_subject_elbow_varus_torque(2749, all_trials_clean))

# collect the offending study IDs and queue them for removal
outliers_2749 = identify_outliers(
    df=all_trials_clean,
    subject_id=2749,
    threshold=185,
    greater_than=True,
)
BAD_TRIALS.extend(outliers_2749)

In [58]:
# Subject 2951: bad pitches drop below -150.
# (visual check: plot_subject_elbow_varus_torque(2951, all_trials_clean))

# collect the offending study IDs and queue them for removal
outliers_2951 = identify_outliers(
    df=all_trials_clean,
    subject_id=2951,
    threshold=-150,
    greater_than=False,
)
BAD_TRIALS.extend(outliers_2951)

In [59]:
# Subject 2959: one pitch exceeds 100.
# (visual check: plot_subject_elbow_varus_torque(2959, all_trials_clean))

# collect the offending study IDs and queue them for removal
outliers_2959 = identify_outliers(
    df=all_trials_clean,
    subject_id=2959,
    threshold=100,
    greater_than=True,
)
BAD_TRIALS.extend(outliers_2959)

In [None]:
# Subject 2976: bad pitches (~ 2) drop below -130.
# (visual check: plot_subject_elbow_varus_torque(2976, all_trials_clean))
# NOTE(review): this cell has no execution count in the saved notebook, so it may not
# have run before the removal/upload cells — confirm these trials were actually excluded.

# collect the offending study IDs and queue them for removal
outliers_2976 = identify_outliers(
    df=all_trials_clean,
    subject_id=2976,
    threshold=-130,
    greater_than=False,
)
BAD_TRIALS.extend(outliers_2976)

In [70]:
# Subject 2984: bad pitches appear to rise above 190.
# (visual check: plot_subject_elbow_varus_torque(2984, all_trials_clean))

# collect the offending study IDs and queue them for removal
outliers_2984 = identify_outliers(
    df=all_trials_clean,
    subject_id=2984,
    threshold=190,
    greater_than=True,
)
BAD_TRIALS.extend(outliers_2984)

In [74]:
# Subject 2996: the bad pitch drops below -131.
# (visual check: plot_subject_elbow_varus_torque(2996, all_trials_clean))

# collect the offending study IDs and queue them for removal
outliers_2996 = identify_outliers(
    df=all_trials_clean,
    subject_id=2996,
    threshold=-131,
    greater_than=False,
)
BAD_TRIALS.extend(outliers_2996)

In [76]:
# Subject 3050: bad pitches rise above 110.
# (visual check: plot_subject_elbow_varus_torque(3050, all_trials_clean))

# collect the offending study IDs and queue them for removal
outliers_3050 = identify_outliers(
    df=all_trials_clean,
    subject_id=3050,
    threshold=110,
    greater_than=True,
)
BAD_TRIALS.extend(outliers_3050)

$\textbf{Outlier Removal}$

In [83]:
# build the postprocessed trials: drop every row that belongs to a bad trial or a bad
# subject, then renumber the index from zero (boolean indexing already yields a new frame)
all_trials_postprocessed = (
    all_trials_clean[
        ~all_trials_clean['study_id'].isin(BAD_TRIALS)
        & ~all_trials_clean['subject_id'].isin(BAD_SUBJECTS)
    ]
    .reset_index(drop=True)
)

# mark the same trials / subjects as outliers in the peaks table (modifies all_peaks in place)
all_peaks.loc[
    all_peaks['study_id'].isin(BAD_TRIALS) | all_peaks['subject_id'].isin(BAD_SUBJECTS),
    'outlier_flag'
] = 1

In [88]:
# re-upload to S3
# Writes the postprocessed trials and the peaks table (with its updated outlier_flag
# column) to S3. Both keys differ from the ones the data was loaded from, so the
# source files under biomechanics/subjects/summary/ are not overwritten here.
aws_connection.upload_to_s3(
    all_trials_postprocessed, 
    'biomechanics/modeling_datasets/jra_trials_all.csv'
)
# NOTE(review): the peaks table is written to a key named `modeling_datasets.csv` under
# `postprocessed_datasets/`, which does not mirror the trials key above — confirm this
# destination is intended and not a swapped folder/file name.
aws_connection.upload_to_s3(
    all_peaks, 
    'biomechanics/postprocessed_datasets/modeling_datasets.csv'
)

[AWS]: Uploaded object to s3://pitch-ml/biomechanics/modeling_datasets/jra_trials_all.csv
[AWS]: Uploaded object to s3://pitch-ml/biomechanics/postprocessed_datasets/modeling_datasets.csv


$\textbf{Close AWS Connection}$

In [89]:
# Release the connection opened at the top of the notebook; run this last, since
# the load/upload cells need the connection to be open.
aws_connection.close()

[AWS]: Database connection closed.
[AWS]: SSH tunnel stopped.
