In [2]:
import pickle
import pandas as pd
from connections import AWS

$\textbf{Feature Importances}$

Applies the permutation-based feature importance method to each subject-specific model to determine the most important ball flight features.

In [4]:
""" INITIALIZE AWS CONNECTION """
aws_connection = AWS()
aws_connection.connect()

[AWS]: Port 5433 is free.
[AWS]: Connected to RDS endpoint.


In [None]:
# load all data w/ model predictions
model_data = aws_connection.load_s3_object('biomechanics/ml/modeling_datasets/model_dev_raw.csv')
model_preds = aws_connection.load_s3_object('biomechanics/ml/modeling_summary/model_preds.csv')

# load subject info
subject_info = aws_connection.load_subject_info()
subject_ids = subject_info['subject_id'].unique()

# instantiate subject model storage
with open('all_subject_models.pkl', 'rb') as f:
    models = pickle.load(f)

In [None]:
# iterate through subject IDs and load models
for subject_id in subject_ids:
    # check if model already exists
    if subject_id in models.keys():
        continue
    
    # set model path
    model_path = f'biomechanics/subjects/{subject_id}/ml/model_summary.pkl'
    
    # check if model exists in S3
    try:
        aws_connection.s3.download_file(
            aws_connection.bucket_name, 
            model_path,
            f'subject_model.pkl'
        )
    except Exception as e:
        print(f'No model found for subject {subject_id}. Skipping...')
        continue

    with open('subject_model.pkl', 'rb') as f:
        model_summary = pickle.load(f)
        models[subject_id] = model_summary['baseline_rf']['model']

    print(f'Loaded model for subject {subject_id}.')

In [103]:
# save to all subject models
with open('all_subject_models.pkl', 'wb') as f:
    pickle.dump(models, f)

$\textbf{Permutations}$

In [104]:
import numpy as np
from sklearn.metrics import root_mean_squared_error

In [105]:
# set all ball flight features 
model_features = ['vra', 'hra', 'rel_side', 'rel_ht', 'spin_rate', 'spin_axis', 'rel_speed', 'ax0', 'ay0', 'az0']

In [106]:
# initialize importances, model features
importances = {}
num_pitches = {}
model_features = [
    'vra', 'hra', 
    'rel_side', 'rel_ht', 
    'spin_rate', 'spin_axis', 
    'rel_speed', 'ax0', 'ay0', 'az0'
]

# iterate through features
for f in model_features:
    # get data, initialize importance
    copy = model_data.merge(subject_info, on='subject_id').copy()
    importances[f] = []
    num_pitches[f] = []

    print(f'Computing importances for feature: {f}.')

    # iterate through subject IDs
    for subject_id in models.keys():
        # get subject data & model
        subject_data = copy[copy['subject_id'] == subject_id]
        num_pitches[f].append(subject_data.shape[0])
        peak_elbow_moment_rf = models[subject_id]
    
        # get orig. preds
        pred_orig = peak_elbow_moment_rf.predict(subject_data[peak_elbow_moment_rf.feature_names_in_])
        pred_orig_scaled = pred_orig * subject_data['mass'] * subject_data['height'] * 9.81

        # compute orig. rmse
        rmse_orig = root_mean_squared_error(
            y_true=subject_data['peak_value'],
            y_pred=pred_orig_scaled
        )

        # permute/shuffle 100 times
        rmse_permuted = []
        for i in range(100):
            # permute feature
            subject_data[f] = subject_data[f].sample(frac=1).reset_index(drop=True)

            # get new preds
            pred_permuted = peak_elbow_moment_rf.predict(subject_data[peak_elbow_moment_rf.feature_names_in_])
            pred_permuted_scaled = pred_permuted * subject_data['mass'] * subject_data['height'] * 9.81

            # compute permuted rmse
            new_error = root_mean_squared_error(
                y_true=subject_data['peak_value'],
                y_pred=pred_permuted_scaled
            )
            rmse_permuted.append(new_error - rmse_orig)

        # compute average delta rmse
        delta_rmse = np.mean(rmse_permuted)
        importances[f].append(delta_rmse)

Computing importances for feature: vra.
Computing importances for feature: hra.
Computing importances for feature: rel_side.
Computing importances for feature: rel_ht.
Computing importances for feature: spin_rate.
Computing importances for feature: spin_axis.
Computing importances for feature: rel_speed.
Computing importances for feature: ax0.
Computing importances for feature: ay0.
Computing importances for feature: az0.


In [107]:
# compute weighted average of importances based on number of pitches --> relative importance
wtd_importances = {}
for f in importances.keys():
    if np.array(num_pitches[f]).sum() == 0:
        wtd_importances[f] = 0
    else:
        wtd_importances[f] = np.average(importances[f], weights=num_pitches[f])

In [None]:
# create dataframe w/ importances
ft_importances = pd.DataFrame([wtd_importances]).T.rename(columns={0: 'delta_rmse'})

# compute normalized importances
ft_importances['relative_importance'] = ft_importances['delta_rmse'] / ft_importances['delta_rmse'].sum()
ft_importances.sort_values(by='relative_importance', ascending=False, inplace=True)

In [111]:
ft_importances

Unnamed: 0,delta_rmse,relative_importance
rel_speed,9.150737,0.243871
rel_side,7.839572,0.208928
rel_ht,4.746251,0.12649
spin_rate,4.340058,0.115664
spin_axis,3.990498,0.106348
az0,3.382758,0.090152
ax0,2.077948,0.055378
ay0,1.995034,0.053169
vra,0.0,0.0
hra,0.0,0.0


$\textbf{Close AWS Connections}$

In [112]:
aws_connection.close()

[AWS]: No active connection to close.
