In [74]:
from pybaseball import statcast, statcast_batter, playerid_lookup
import pandas as pd
import numpy as np

from datetime import datetime
from scipy.stats import multivariate_normal
from scipy.spatial.distance import mahalanobis

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the per-season clustering CSVs (2021 and 2022) from the working directory.
# NOTE(review): these frames are passed straight to add_diff_features further down,
# so they presumably already contain the first-pass features (inferred_axis,
# axis_diff, pitch_count, game_week) plus a ClusterLabel column — confirm.
data21_clus = pd.read_csv('Clustering2021.csv')
data22_clus = pd.read_csv('Clustering2022.csv')

# Dataset Building

## First Pass, New Features

In [68]:
# Define a function to fill NaN values within groups
def fillna_by_pitcher(df, cols):
    '''
    Description: Fills NA values in the given columns with that column's mean,
    computed over the rows present in df. Meant to be applied per group (e.g. via
    groupby(['player_name','pitch_type']).apply) so that each pitcher's pitch type
    is filled with its own mean.
    --------------------------------------------------------------------------------
    Inputs: df (DataFrame), cols (iterable of column labels in df)
    
    Returns: df
        The same DataFrame object with NA values in cols filled. A column that is
        entirely NA within df is left as NA, because its mean is NaN.
    '''
    
    for col in cols:
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): that form operates on a temporary
        # Series and does not update df when pandas Copy-on-Write is enabled.
        df[col] = df[col].fillna(df[col].mean())
    
    return df

In [69]:
def clean_train_data(df):
    '''
    Description: Cleans raw pitch-level data for training. Keeps only the relevant
    feature columns, removes non-pitches and pitches with a zero movement component,
    fills missing pitch metrics with the mean of that metric for the same pitcher
    and pitch type, drops rows still missing required values, encodes base
    occupancy as 0/1 and sorts pitches chronologically.
    --------------------------------------------------------------------------------
    Inputs: df (DataFrame containing at least the feature columns listed below)
    
    Returns: df_clean
        New cleaned DataFrame; the input df is not modified. Rows keep their
        original index labels.
    '''
    # Pitch types that are not real pitches (removed below)
    non_pitches = ['FA','PO']
    
    y = ['delta_run_exp']
    
    context_features = ['player_name','p_throws','batter','stand','pitch_type','pitch_number',
            'home_team','game_date','game_pk','at_bat_number',
            'balls','strikes', 'outs_when_up','on_3b', 'on_2b', 'on_1b']
    
    cont_features = ['release_speed','release_extension','effective_speed','release_spin_rate',
            'release_pos_x', 'release_pos_y', 'release_pos_z','spin_axis', 'pfx_x', 'pfx_z',
            'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot']
    
    features = y + context_features + cont_features
    
    # Columns whose NA values are filled with the pitcher/pitch-type mean
    fill_cols = ['release_speed','release_extension','effective_speed','release_spin_rate',
                 'release_pos_x','release_pos_y','release_pos_z','spin_axis']

    # Columns that must be present after filling; rows missing any are dropped
    na_cols = ['delta_run_exp','pitch_type','pitch_number','pfx_x','pfx_z',
               'release_pos_x', 'release_pos_y', 'release_pos_z', 
               'release_speed','release_extension','effective_speed','release_spin_rate',
               'spin_axis','sz_top', 'sz_bot']
    
    group_cols = ['player_name','pitch_type']
    base_cols = ['on_1b','on_2b','on_3b']
    
    # Keep feature columns only; remove pitchouts/non-pitches and pitches where
    # either movement component is exactly 0 (Statcast errors). The explicit copy
    # guarantees the assignments below never write into a view of the caller's df.
    keep_rows = (~df['pitch_type'].isin(non_pitches)) & (df['pfx_x'] != 0.0) & (df['pfx_z'] != 0.0)
    df_filt = df.loc[keep_rows, features].copy()
    
    # Rows without a pitcher or pitch type cannot be assigned to a group; groupby
    # skips NA keys by default, so drop them explicitly to keep that outcome.
    df_filt = df_filt.dropna(subset=group_cols)
    
    # Fill NA pitch metrics with the mean for the same pitcher and pitch type.
    # transform('mean') returns a frame aligned to df_filt's index, so the result
    # does not depend on how groupby.apply treats group keys / grouping columns,
    # which differs between pandas versions.
    group_means = df_filt.groupby(group_cols)[fill_cols].transform('mean')
    df_filt[fill_cols] = df_filt[fill_cols].fillna(group_means)
    
    df_clean = df_filt.dropna(subset=na_cols).copy()
    
    # Encode base occupancy as 0/1: the raw columns are NA when the base is empty
    df_clean[base_cols] = df_clean[base_cols].notna().astype(int)
    
    # Sort dataframe by pitches in chronological order, return
    df_clean = df_clean.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
    return df_clean

In [70]:
def add_new_features(df, season_start):
    '''
    Description: Adds new features in dataframe
        - inferred_axis: Inferred Spin Axis (SSW Effects), in degrees, computed from
          the movement components pfx_x and pfx_z
        - axis_diff: Difference of Observed and Inferred Spin Axis
          (spin_axis - inferred_axis)
        - pitch_count: Pitch # of outing for each outing per pitcher
        - game_week: Week of season (1-based) of game_date, counted in 7-day blocks
          from season_start
    --------------------------------------------------------------------------------
    Inputs: df (DataFrame), season_start (str, 'YYYY-MM-DD')
    
    Returns: df
        New DataFrame with the features added; the input df is not modified.
    
    Raises: ValueError if season_start is not in 'YYYY-MM-DD' format.
    '''
    
    # Validate season_start before doing any work
    start_date = datetime.strptime(season_start, '%Y-%m-%d').date()
    
    # Work on a copy so the caller's frame is not mutated (and no helper column
    # is left behind on it)
    df = df.copy()
    
    # inferred_axis: 180 / pi * atan(pfx_z / pfx_x) + 90 (where pfx_x is < 0, add 180 degrees.)
    df['inferred_axis'] = np.degrees(np.arctan(df['pfx_z'] / df['pfx_x'])) + 90
    df.loc[df['pfx_x'] < 0, 'inferred_axis'] += 180
    
    # axis_diff: spin_axis - inferred_axis
    df['axis_diff'] = df['spin_axis'] - df['inferred_axis']
    
    # Pitch Count: cumulative pitch number of outing for pitcher. The count is
    # computed on a chronologically sorted view and aligned back by index label,
    # so df is assumed to have a unique index.
    df['pitch_count'] = df.sort_values(
    ['game_date','game_pk','at_bat_number','pitch_number']).groupby(
    ['game_date','game_pk','player_name']).cumcount() + 1
    
    # game_week: whole days since season start, floor-divided into 7-day weeks.
    # Vectorised (no row-wise apply); normalize() discards any time-of-day so
    # only calendar dates are compared. Assumes game_date is timezone-naive.
    game_days = pd.to_datetime(df['game_date']).dt.normalize()
    df['game_week'] = (game_days - pd.Timestamp(start_date)).dt.days // 7 + 1
    
    return df

## Second Pass, New Features
Note: The first-pass features must be added before running the second pass.

In [14]:
# For "noise" around pitch trajectory, calculate a multivariate normal distribution
# for each unique pitch thrown for each pitcher over a season

# Note: Calculating a multivariate distribution for each pitch per game is
# extremely computationally intensive, and each distribution would be unstable
# because of the small sample of each pitch per game

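# NOTE(review): this note originally said axis_diff is not included for this
# iteration, but cont_feats in the function below does list axis_diff — confirm
# which is intended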

def multivariate_normal_distribution(x):
    '''
    Description: Applied to each group, builds a multivariate normal distribution
    from the group's continuous features, using their sample mean and covariance
    matrix
    --------------------------------------------------------------------------------
    Inputs: x (DataFrame holding the rows of one group; must contain the
        continuous feature columns listed below)
    
    Returns: mvn_dist, SciPy (frozen) multivariate normal distribution
    '''
    
    # Define all continuous features 
    cont_feats = ['release_speed','release_extension','effective_speed',
    'release_spin_rate','release_pos_x', 'release_pos_y', 'release_pos_z',
    'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis','inferred_axis','axis_diff']
    
    # Extract the continuous variables
    continuous_vars = x[cont_feats]
    
    # Mean vector of the group
    mean = continuous_vars.mean()
    
    # Covariance matrix, computed once. NaN entries (e.g. a group with a single
    # row has no defined sample covariance) are treated as 0.
    cov = continuous_vars.cov().fillna(0)
    
    # Small ridge on the diagonal keeps the matrix invertible downstream even
    # when features are collinear or the group is tiny
    cov_matrix = cov + (np.eye(cov.shape[0]) * 1e-6)
    
    # Create a multivariate normal distribution object
    mvn_dist = multivariate_normal(mean=mean, cov=cov_matrix, allow_singular=True)
    
    return mvn_dist

In [15]:
def calc_mahalanobis(x):
    '''
    Description: Calculates mahalanobis distance of each pitch's continuous features
    from the center of its distribution, using the inverse covariance matrix of
    that distribution
    --------------------------------------------------------------------------------
    Inputs: x (type Series)
        All entries except the last are the pitch's continuous feature values, in
        the same order as the distribution's dimensions; the last entry is the
        distribution object (exposing .mean and .cov)
    
    Returns: mahalanobis_distance (type float)
    '''
    # Select by position explicitly: on a label-indexed Series, x[-1] / x[:-1]
    # are label lookups in current pandas, not "last element" / "all but last".
    distribution = x.iloc[-1]
    
    # The row mixes floats with the distribution object, so it is object dtype;
    # cast the feature part to a plain float vector
    data = np.asarray(x.iloc[:-1], dtype=float)
    
    # Calculates distance
    # NOTE: the covariance is inverted on every call; when applied row-wise this
    # repeats the same inversion for every pitch sharing a distribution.
    mahalanobis_distance = mahalanobis(data, distribution.mean, np.linalg.inv(distribution.cov))
    return mahalanobis_distance

In [77]:
def add_diff_features(data):
    '''
    Description: Adds, for every pitch, velocity and movement differentials
    relative to the pitcher's primary pitch (per game and batter handedness), plus
    a mahalanobis-distance "noise" feature measured against the pitcher's
    season-level distribution for that pitch type
    --------------------------------------------------------------------------------
    Inputs: data (DataFrame with first-pass features already added)
    
    Returns: full_data 
        DataFrame with movement/velocity differentials and 'mahalanobis' added, and
        the contextual columns used only for grouping/sorting removed
    '''
    
    # Key columns identifying one outing against one batter handedness
    outing_keys = ['game_date','game_pk','stand','player_name']
    fastball_types = ['FC','SI','FF']
    
    def _most_thrown(pitch_types):
        # Most frequent fastball type if any fastball was thrown, otherwise the
        # most frequent pitch type overall
        if any(pitch_types.isin(fastball_types)):
            return pitch_types[pitch_types.isin(fastball_types)].value_counts().idxmax()
        return pitch_types.value_counts().idxmax()
    
    # One row per outing/handedness holding the primary pitch type
    primary_fb = data.groupby(outing_keys).agg(
        {'pitch_type': _most_thrown}).rename(columns={'pitch_type':'primary_pitch'})

    # Keep only the pitches that ARE the primary pitch of their outing
    primary_fb_data = data.merge(primary_fb.reset_index(),
                                 left_on=outing_keys + ['pitch_type'],
                                 right_on=outing_keys + ['primary_pitch'],
                                 how='inner')

    # Velocity / movement variables to compare against the primary pitch
    velo_mvt_cols = ['release_speed','release_spin_rate','pfx_x', 'pfx_z', 
                     'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis', 'inferred_axis','axis_diff']
    mean_cols = [col + '_mean' for col in velo_mvt_cols]

    # Per-outing mean of each variable over the primary-pitch rows
    mean_aggs = dict.fromkeys(velo_mvt_cols, 'mean')
    mean_names = dict(zip(velo_mvt_cols, mean_cols))
    primary_fb_means = primary_fb_data.groupby(outing_keys + ['pitch_type']).agg(mean_aggs)
    primary_fb_means = primary_fb_means.rename(columns=mean_names).reset_index()
    primary_fb_means = primary_fb_means.rename(columns={'pitch_type':'primary_pitch'})

    # Attach the primary-pitch means to every pitch of the same outing
    data_merged = data.merge(primary_fb_means, on=outing_keys, how='inner')

    # Differential = pitch value minus primary-pitch mean. Two columns carry
    # short legacy names; all others are '<column>_diff'.
    special_names = {'release_speed': 'velo_diff', 'release_spin_rate': 'spin_rate_diff'}
    for col in velo_mvt_cols:
        diff_col = special_names.get(col, col + '_diff')
        data_merged[diff_col] = data_merged[col] - data_merged[col + '_mean']

    # The helper columns are no longer needed once the differentials exist
    data_merged = data_merged.drop(columns=['primary_pitch'] + mean_cols)
    
    
    cont_feats = ['release_speed','release_extension','effective_speed',
    'release_spin_rate','release_pos_x', 'release_pos_y', 'release_pos_z',
    'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis','inferred_axis','axis_diff']
    
    # One multivariate normal per pitcher and pitch type, fitted on all rows in data
    pitch_groups = data.groupby(['player_name','pitch_type'])
    pitch_noise_groups = pitch_groups.apply(multivariate_normal_distribution)
    pitch_noise_groups = pitch_noise_groups.reset_index(name = 'MV_Dist')

    # Give every pitch a reference to the distribution of its pitcher/pitch type
    full_data = data_merged.merge(pitch_noise_groups, on = ['player_name','pitch_type'], how = 'inner')

    # Row-wise mahalanobis distance of each pitch's continuous features from the
    # center of its distribution (distribution object must be the last column)
    noise_inputs = full_data[cont_feats + ['MV_Dist']]
    full_data['mahalanobis'] = noise_inputs.apply(calc_mahalanobis, axis = 1)

    # Remove the distribution objects and the contextual columns that were only
    # needed for grouping/sorting
    context_only = ['MV_Dist','game_pk', 'player_name', 'batter', 'game_date', 'at_bat_number', 'pitch_number']
    full_data = full_data.drop(columns=context_only)
   
    return full_data

### Build Training Datasets

In [60]:
# Add the second-pass features (differentials + mahalanobis) to each season
# separately; add_diff_features fits its distributions on the frame it is given,
# so each season is processed on its own data.
data21_train = add_diff_features(data21_clus)
data22_train = add_diff_features(data22_clus)

# Stack both seasons. Original row labels are kept (no ignore_index), so index
# values can repeat across the two seasons.
training_set = pd.concat([data21_train,data22_train])

# Display the final feature list
training_set.columns

Index(['delta_run_exp', 'p_throws', 'stand', 'home_team', 'balls', 'strikes',
       'outs_when_up', 'on_3b', 'on_2b', 'on_1b', 'release_speed',
       'release_extension', 'effective_speed', 'release_spin_rate',
       'release_pos_x', 'release_pos_y', 'release_pos_z', 'spin_axis', 'pfx_x',
       'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'inferred_axis', 'axis_diff', 'pitch_count', 'game_week',
       'ClusterLabel', 'velo_diff', 'spin_rate_diff', 'pfx_x_diff',
       'pfx_z_diff', 'vx0_diff', 'vy0_diff', 'vz0_diff', 'ax_diff', 'ay_diff',
       'az_diff', 'spin_axis_diff', 'inferred_axis_diff', 'axis_diff_diff',
       'mahalanobis'],
      dtype='object')

In [61]:
# Persist the combined training set to the working directory, without the index column
training_set.to_csv('TrainingData.csv',index=False)

# Full Test Run, Dataset Building

In [66]:
# Read the 2021 Statcast export, then keep pitches dated 2021-04-01 through
# 2021-05-01 (both bounds inclusive). game_date is compared against the bound
# strings directly.
test_df = pd.read_csv('statcast21.csv')
test_df_april = test_df[test_df['game_date'].between('2021-04-01', '2021-05-01')]

In [78]:
# End-to-end run of the dataset-building pipeline on the April sample:
# clean -> first-pass features (season start 2021-04-01) -> second-pass features
test_df_april_clean = clean_train_data(test_df_april)
test_df_clean = add_new_features(test_df_april_clean,'2021-04-01')
test_df_train = add_diff_features(test_df_clean)
# Display the resulting frame
test_df_train

Unnamed: 0,delta_run_exp,p_throws,stand,pitch_type,home_team,balls,strikes,outs_when_up,on_3b,on_2b,...,vx0_diff,vy0_diff,vz0_diff,ax_diff,ay_diff,az_diff,spin_axis_diff,inferred_axis_diff,axis_diff_diff,mahalanobis
0,0.038,R,R,FF,COL,0,0,0,0,0,...,-0.001298,0.172731,3.286949,1.689178,-0.336647,1.463105,8.304467,-9.466697,17.771164,3.945964
1,-0.049,R,R,FF,COL,1,0,0,0,0,...,0.463805,-0.503070,-0.328365,2.251365,-0.423340,0.511554,-10.578109,-10.150314,-0.427794,8.599879
2,0.052,R,R,FF,COL,1,1,0,0,0,...,-1.935087,-1.302337,1.729274,0.325825,-0.096784,0.100598,2.421891,0.019912,2.401980,2.852595
3,0.113,R,R,FF,COL,2,1,0,0,0,...,-0.012811,-0.228170,3.261179,4.846520,-1.007520,-2.789695,8.304467,-20.967229,29.271697,4.869120
4,-0.078,R,R,FF,COL,3,1,0,0,0,...,-0.326409,-0.997420,0.591183,3.344360,-1.030708,0.165578,6.421891,-14.827926,21.249818,3.740781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115132,0.158,L,R,CH,MIL,2,0,0,0,1,...,-0.448591,10.879848,0.720610,5.490410,-5.386335,-12.111464,-22.214286,-35.301475,13.087190,1.500000
115133,0.128,L,R,CH,MIL,0,0,1,1,1,...,0.014813,7.865609,-1.117847,6.631185,-3.136536,-9.595460,-23.214286,-32.019760,8.805475,1.500000
115134,-0.101,L,R,CH,MIL,1,0,0,0,1,...,0.457304,12.131015,2.876597,4.449641,-5.203280,-12.189709,-24.214286,-32.502641,8.288355,1.500000
115135,0.097,L,R,CH,MIL,1,1,0,0,1,...,-0.091993,11.836214,0.468508,3.646748,-6.570178,-11.548592,-29.214286,-30.090731,0.876445,1.500000


In [88]:
# List the columns produced by the full pipeline run above
print(test_df_train.columns)

Index(['delta_run_exp', 'p_throws', 'stand', 'pitch_type', 'home_team',
       'balls', 'strikes', 'outs_when_up', 'on_3b', 'on_2b', 'on_1b',
       'release_speed', 'release_extension', 'effective_speed',
       'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
       'spin_axis', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
       'sz_top', 'sz_bot', 'inferred_axis', 'axis_diff', 'pitch_count',
       'game_week', 'velo_diff', 'spin_rate_diff', 'pfx_x_diff', 'pfx_z_diff',
       'vx0_diff', 'vy0_diff', 'vz0_diff', 'ax_diff', 'ay_diff', 'az_diff',
       'spin_axis_diff', 'inferred_axis_diff', 'axis_diff_diff',
       'mahalanobis'],
      dtype='object')
