# Imports

In [2]:
from pybaseball import statcast, statcast_batter, playerid_lookup
import pandas as pd
import numpy as np
import math

from datetime import datetime
from scipy.stats import multivariate_normal
from scipy.spatial.distance import mahalanobis

import warnings
warnings.filterwarnings('ignore')

In [None]:
#data21_clus = pd.read_csv('Data/ClusteredData/Clustered2021.csv', low_memory = False)
#data22_clus = pd.read_csv('Data/ClusteredData/Clustered2022.csv', low_memory = False)
#data21_clus.shape, data22_clus.shape

# Dataset Building

## First Pass, New Features

In [5]:
# Define a function to fill NaN values within groups
def fillna_by_pitcher(df, cols):
    '''
    Description: Fills NA values (pitch metrics), applied by pitch type per pitcher
    --------------------------------------------------------------------------------
    Inputs: df, cols
    
    Returns: df
        NA columns filled
    '''
    
    # For each column, take mean of column within dataframe, fill NA values with mean
    for i in cols:
        mean = df[i].mean()
        df[i].fillna(mean,inplace = True)
    
    return df

In [6]:
def clean_train_data(df):
    '''
    Description: Cleans training data, filters dataframe for relevant features, 
    removes non-pitches and fills in NA values for each unique pitch for all pitchers
    --------------------------------------------------------------------------------
    Inputs: df
    
    Returns: df_clean
        Cleaned input df
    '''
    # Define relevant feature columns, values to remove, columns with NA values to fill
    non_pitches = ['FA','PO','EP','CS']
    
    y = ['delta_run_exp']
    
    y_cats = ['description','bb_type']
    
    context_features = ['game_pk','game_date','player_name','batter', 'pitch_type',
                        'at_bat_number','pitch_number',
                        'home_team','p_throws','stand',
                        'balls','strikes','sz_top', 'sz_bot']
    # do not inlcude baserunners for GIDPs
    
    cont_features = ['release_speed','release_extension','effective_speed','release_spin_rate',
            'release_pos_x', 'release_pos_y', 'release_pos_z','spin_axis', 'pfx_x', 'pfx_z',
            'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'plate_x', 'plate_z',
            'launch_speed','launch_angle', 'hc_x','hc_y']
    
    
    features = y + y_cats + context_features + cont_features
    
    # Drop later: game_pk, player_name, batter, game_date, at_bat_number, pitch_number

    # Filter Dataframe for features
    df = df[features]
    
    # Remove pitchouts/non-pitches, pitches with 0 movement (Statcast errors)
    print("Number of Statcast Errors Removed:",
        df[(df.pitch_type.isin(non_pitches)) | ((df.pfx_x == 0.0) & (df.pfx_z == 0.0))].shape[0])
    
    df_filt = df[(~df.pitch_type.isin(non_pitches)) & ((df.pfx_x != 0.0) & (df.pfx_z != 0.0))]
    """
    # Define columns to fill or drop if NA
    fill_cols = ['release_speed','release_extension','effective_speed','release_spin_rate',
                 'release_pos_x','release_pos_y','release_pos_z','spin_axis']

    fill_dtypes = {'release_speed': 'float64',
                   'release_extension': 'float64',
                   'effective_speed': 'float64',
                   'release_spin_rate': 'int64',
                 'release_pos_x': 'float64',
                   'release_pos_y': 'float64',
                   'release_pos_z': 'float64',
                   'spin_axis':'int64'} 
    
    
    
    # Fill in NA values for each pitch and pitcher with mean of each column for each unique
    

    df_filled = df_filt.groupby(['player_name','pitch_type']).apply(
        fillna_by_pitcher, cols = fill_cols, types = fill_dtypes)
    """

    na_cols = ['delta_run_exp','pitch_number','pfx_x','pfx_z',
               'release_pos_x', 'release_pos_y', 'release_pos_z', 
               'release_speed','release_extension','effective_speed','release_spin_rate',
               'spin_axis','sz_top', 'sz_bot']
    
    df_clean = df_filt.dropna(subset=na_cols)
    
    # get rid of bad values of description/bb_type
    df_clean = df_clean[(~((df_clean.description == 'hit_into_play') & (df_clean.bb_type.isna()))) | 
                        ( (df_clean.bb_type.notna()) & 
                          (df_clean.launch_speed.notna()) & (df_clean.launch_angle.notna()) &
                         (df_clean.hc_x.notna()) & (df_clean.hc_y.notna()) 
                         )  
                       ]
    
    df_clean = df_clean.replace('foul_tip','swinging_strike').replace(
    'swinging_strike_blocked','swinging_strike').replace('blocked_ball','ball').replace(
    'missed_bunt','swinging_strike').replace('bunt_foul_tip','swinging_strike').replace(
        'foul_bunt','foul')
    
    #df_clean = df_clean[~((df_clean.description == 'hit_into_play') & (df_clean.bb_type.isna()))]
    
    # OHC Base variables to 0 and 1
    #df_clean[['on_1b','on_2b','on_3b']] = df_clean[['on_1b','on_2b','on_3b']].notna().astype(int)

    # Sort dataframe by pitches in chronological order, return
    df_clean = df_clean.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
    return df_clean

In [7]:
def add_new_features(df, season_start):
    '''
    Description: Adds new features in dataframe
        - inferred_axis: Inferred Spin Axis (SSW Effects)
        - axis_diff: Difference of Inferred and Observed Spin Axis
        - game_week: Change game date to week of season depending on start date of season
        - pitch_count: Pitch # of outing for each outing per pitcher 
    --------------------------------------------------------------------------------
    Inputs: df, season_start (str)
    
    Returns: df
        Dataframe with new features added
    '''
    
    # inferred_axis: 180 / pi * atan(pfx_z / pfx_x) + 90 (where pfx_x is < 0, add 180 degrees.)
    df['inferred_axis'] = np.degrees(np.arctan(df['pfx_z'] / df['pfx_x'])) + 90
    df.loc[df['pfx_x'] < 0, 'inferred_axis'] += 180
    df['axis_diff'] = df['spin_axis'] - df['inferred_axis']
    
    # axis_diff: spin_axis - inferred_axis
    
    # Pitch Count: Cumulative pitch number of outing for pitcher
    df['pitch_count'] = df.sort_values(
    ['game_date','game_pk','at_bat_number','pitch_number']).groupby(
    ['game_date','game_pk','player_name']).cumcount() + 1
    
    
    # Transform hit coordinates
    df['hc_x'] = df['hc_x'] - 130
    df['hc_y'] = 210 - df['hc_y']
    
    # Add spray angle
    df['spray_angle'] = df.apply(lambda row: math.degrees(math.atan2(row['hc_y'], row['hc_x'])), axis=1)
    
    # Create game_week column, where week of season is taken from game_date in Savant
    """
    start_date = datetime.strptime(season_start, '%Y-%m-%d').date()
    
    df['datetime'] = pd.to_datetime(df['game_date'])
    df['game_week'] = df.apply(lambda x: ((x['datetime'].date() - start_date).days // 7) + 1, axis = 1)
    df = df.drop('datetime',axis=1)
    """
    return df

## Second Pass, New Features
Note: Need first pass of new features before added second pass

In [8]:
# For "noise" around pitch trajectory, calculate a multivariate normal distribution
# for each unique pitch thrown for each pitcher over a season

# Note: Calculating multivariate distributions for eachn pitch per game is 
# both extremely computationally intensive, and each distribution unstable due to small samples
# of each pitch per game

# Note: Does not include axis_diff for this iteration

def multivariate_normal_distribution(x):
    '''
    Description: Applied to each group, calculate multivariate normal distribution
    for each row's continuous features with mean and covariance matrix
    --------------------------------------------------------------------------------
    Inputs: x (dataframe row, Series)
    
    Returns: mvn_dist, SciPy multivariate normal distribution
    '''
    
    # Define all continuous features 
    cont_feats = ['release_speed','release_extension','effective_speed',
    'release_spin_rate','release_pos_x', 'release_pos_y', 'release_pos_z',
    'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis','inferred_axis','axis_diff']
    
    
    # Extract the continuous variables
    continuous_vars = x[cont_feats]  
    
    # Calculate the mean and covariance matrix for the continuous variables
    mean = continuous_vars.mean()
    cov_matrix = continuous_vars.cov().fillna(0) + (np.eye(continuous_vars.cov().shape[0]) * 1e-6)
    
    # Create a multivariate normal distribution object
    mvn_dist = multivariate_normal(mean=mean, cov=cov_matrix, allow_singular=True)
    
    return mvn_dist

In [9]:
def calc_mahalanobis(x):
    '''
    Description: Calculates mahalanobis distance of each pitch's continuous features
    from center, inverse covariance matrix of distribution
    --------------------------------------------------------------------------------
    Inputs: x (type Series)
    
    Returns: mahalanobis_distance (type float)
    '''
    # Defines distribution, continuous features
    distribution = x[-1]
    data = np.array(x[:-1])
    
    # Calculates distance
    mahalanobis_distance= mahalanobis(data, distribution.mean, np.linalg.inv(distribution.cov))
    return mahalanobis_distance

In [10]:
def add_diff_features(data):
    '''
    Description: Adds features of movement and velocity differentials for each pitch
    based on primary pitch per batter handedness per outing
    --------------------------------------------------------------------------------
    Inputs: df
    
    Returns: full_data 
        DataFrame with movement, velocity differentials and mahalanobis distance "noise"
        variables added
    '''
    
    # Groups data by each game_date and pitcher, finds primary fastball (or primary pitch if no fastball)
    primary_fb = data.groupby(['game_date','game_pk','stand','player_name']).agg({
        'pitch_type': lambda x: x[x.isin(['FC','SI','FF'])].value_counts().idxmax() 
        if any(x.isin(['FC','SI','FF'])) 
        else x.value_counts().idxmax()
    }).rename(columns={'pitch_type':'primary_pitch'})


    # Merges training data with primary fastball, defines new columns of primary_pitch
    primary_fb_data = data.merge(primary_fb.reset_index(), 
                                 left_on=['game_date','game_pk','stand','player_name','pitch_type'], 
                                 right_on=['game_date','game_pk','stand','player_name','primary_pitch'], 
                                 how='inner')

    # Define velocity, movement variables to calculate differentials from primary pitch
    # Add release point means later?
    velo_mvt_cols = ['release_speed','release_spin_rate', 
                     'release_pos_x', 'release_pos_y', 'release_pos_z',
                     'pfx_x', 'pfx_z', 
                     'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis', 'inferred_axis','axis_diff']

    # Calculates mean for velocity, movement variables for each primary pitch for each pitcher per outing
    primary_fb_means = primary_fb_data.groupby(['game_date','game_pk','stand','player_name','pitch_type']).agg({
        i: 'mean' for i in velo_mvt_cols}).rename(columns = {
        i: i + '_mean' for i in velo_mvt_cols
    }).reset_index()
    primary_fb_means = primary_fb_means.rename(columns={'pitch_type':'primary_pitch'})

    # Merges training data with primary pitch mean data
    data_merged = data.merge(primary_fb_means, 
                             on = ['game_date','game_pk','stand','player_name'], how = 'inner')

    # Calculates all differentials for velocity and movenent profiles
    data_merged['velo_diff'] = data_merged['release_speed'] - data_merged['release_speed_mean']
    data_merged['spin_rate_diff'] = data_merged['release_spin_rate'] - data_merged['release_spin_rate_mean']
    data_merged['release_pos_x_diff'] = data_merged['release_pos_x'] - data_merged['release_pos_x_mean']
    data_merged['release_pos_y_diff'] = data_merged['release_pos_y'] - data_merged['release_pos_y_mean']
    data_merged['release_pos_z_diff'] = data_merged['release_pos_z'] - data_merged['release_pos_z_mean']
    data_merged['pfx_x_diff'] = data_merged['pfx_x'] - data_merged['pfx_x_mean']
    data_merged['pfx_z_diff'] = data_merged['pfx_z'] - data_merged['pfx_z_mean']
    data_merged['vx0_diff'] = data_merged['vx0'] - data_merged['vx0_mean']
    data_merged['vy0_diff'] = data_merged['vy0'] - data_merged['vy0_mean']
    data_merged['vz0_diff'] = data_merged['vz0'] - data_merged['vz0_mean']
    data_merged['ax_diff'] = data_merged['ax'] - data_merged['ax_mean']
    data_merged['ay_diff'] = data_merged['ay'] - data_merged['ay_mean']
    data_merged['az_diff'] = data_merged['az'] - data_merged['az_mean']
    data_merged['spin_axis_diff'] = data_merged['spin_axis'] - data_merged['spin_axis_mean']
    data_merged['inferred_axis_diff'] = data_merged['inferred_axis'] - data_merged['inferred_axis_mean']
    data_merged['axis_diff_diff'] = data_merged['axis_diff'] - data_merged['axis_diff_mean']

    # Drops all primary pitch velocity, movement mean columns
    data_merged = data_merged.drop(['primary_pitch'] + [i + '_mean' for i in velo_mvt_cols],axis = 1)
    
    cont_feats = ['release_speed','release_extension','effective_speed',
    'release_spin_rate','release_pos_x', 'release_pos_y', 'release_pos_z',
    'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 
                  'spin_axis','inferred_axis','axis_diff']
    
    # Applies multivariate normal to all unique pitches for each pitcher per year(> 3000 pitches per year)
    pitch_noise_groups = data.groupby(['player_name','pitch_type']).apply(
        multivariate_normal_distribution).reset_index(name = 'MV_Dist')

    # Merges dataframes of training data, dataframe with multivariate distributions,
    # so each unique pitch's distribution included in column for each pitch in training data
    full_data = data_merged.merge(pitch_noise_groups, on = ['player_name','pitch_type'], how = 'inner')

    # Calculate mahalanobis distance for all unique pitch's continuous features based on 
    # center and inverse covariance matrix of each pitch's multivariate distribution
    full_data['mahalanobis'] = full_data[cont_feats + ['MV_Dist']].apply(
        calc_mahalanobis, axis = 1)

    # Drops all multivariate normal distributions, contextual features used for grouping/sorting
    full_data = full_data.drop(
        ['MV_Dist','game_pk', 'player_name', 'batter', 'game_date', 'at_bat_number', 'pitch_number'],axis = 1)
    
    ord_cols = ['delta_run_exp', 'description', 'bb_type','p_throws', 'stand',
       'home_team', 'balls', 'strikes', 'pitch_count', #'game_week',
       'pitch_type', 'plate_x','plate_z',
       'release_speed', 'release_extension', 'effective_speed',
       'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
       'spin_axis', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
       'sz_top', 'sz_bot', 'inferred_axis', 'axis_diff', 'velo_diff',
        'release_pos_x_diff', 'release_pos_y_diff', 'release_pos_z_diff',
       'spin_rate_diff', 'pfx_x_diff', 'pfx_z_diff', 'vx0_diff', 'vy0_diff',
       'vz0_diff', 'ax_diff', 'ay_diff', 'az_diff', 'spin_axis_diff',
       'inferred_axis_diff', 'axis_diff_diff', 'mahalanobis', 'launch_speed','launch_angle',
        'hc_x', 'hc_y', "spray_angle"]
    
    full_data = full_data[ord_cols]
    return full_data

In [11]:
# Re-order training dataset, write to csv file
# Training set has no context-neutral run values
#training_set = training_set[ord_cols].reset_index(drop=True)
#training_set.to_csv('TrainingDataAll.csv',index=False)

# Full Test Run, Dataset Building

In [9]:
# Test: Import dataset of 2021 statcast data
statcast21 = pd.read_csv('Data/Statcast/statcast21.csv')
statcast22 = pd.read_csv('Data/Statcast/statcast22.csv')

#test_df_april = test_df[(test_df.game_date >= '2021-04-01') & (test_df.game_date <= '2021-05-01')]

In [12]:
def build_train_dataset(data, start_date):
    # Clean Data
    clean_data = clean_train_data(data)

    print("Adding new features...")
    clean_data = add_new_features(clean_data,start_date)

    print('Adding Differentials, Noise Variables...')
    data_train = add_diff_features(clean_data)
    return data_train

In [None]:
# Clean data, add additional features, pitch "noise" features
statcast21_clean = clean_train_data(statcast21)

print("Adding new features...")
statcast21_clean = add_new_features(statcast21_clean,'2021-04-01')

print('Adding Differentials, Noise Variables...')
statcast21_train = add_diff_features(statcast21_clean)
statcast21_train

In [None]:
# Clean data, add additional features, pitch "noise" features
statcast22_clean = clean_train_data(statcast22)

print("Adding new features...")
statcast22_clean = add_new_features(statcast22_clean,'2022-03-31')

print('Adding Differentials, Noise Variables...')
statcast22_train = add_diff_features(statcast22_clean)
statcast22_train

In [None]:
statcast_train = pd.concat([statcast21_train,statcast22_train])

In [None]:
statcast_train.to_csv('statcast_train.csv',index=False)

In [None]:
statcast_train.shape, statcast_train.columns

# Create Target Variables

In [10]:
# Import training data (all)
training_data = pd.read_csv('statcast_train.csv', low_memory=False)

In [11]:
training_data.shape, training_data.columns

((1387824, 54),
 Index(['delta_run_exp', 'description', 'bb_type', 'p_throws', 'stand',
        'home_team', 'balls', 'strikes', 'pitch_count', 'pitch_type', 'plate_x',
        'plate_z', 'release_speed', 'release_extension', 'effective_speed',
        'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
        'spin_axis', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
        'sz_top', 'sz_bot', 'inferred_axis', 'axis_diff', 'velo_diff',
        'release_pos_x_diff', 'release_pos_y_diff', 'release_pos_z_diff',
        'spin_rate_diff', 'pfx_x_diff', 'pfx_z_diff', 'vx0_diff', 'vy0_diff',
        'vz0_diff', 'ax_diff', 'ay_diff', 'az_diff', 'spin_axis_diff',
        'inferred_axis_diff', 'axis_diff_diff', 'mahalanobis', 'launch_speed',
        'launch_angle', 'hc_x', 'hc_y', 'spray_angle'],
       dtype='object'))

In [13]:
def add_id_cols(row):
    '''
    Description: Add swing/no-swing, contact/swinging-strike, foul/fair event columns for each  
    row based on description values in pitch data
    --------------------------------------------------------------------------------
    Inputs: row
    
    Returns: row 
        DataFrame row with added swing, contact, foul event columns I.e. '<90_GB, 100_105_FB, >105_LD, etc.'
    '''
    
    # Define, non-swing, contact, swstrikes, foul events
    non_swings = ['ball','called_strike','hit_by_pitch','blocked_ball']
    contact_events = ['foul', 'hit_into_play','foul_bunt']  
    sw_strikes = ['swinging_strike','foul_tip','bunt_foul_tip', 'missed_bunt', 'swinging_strike_blocked']
    foul_events = ['foul', 'foul_bunt']  
    
    # If a non-swing event, create no swing column
    if row['description'] in non_swings:
        # no-swing value
        swing_event = 'no_swing'
        
        # Standardize description for context-neutral run values
        #row['description'] = row['description'].replace('blocked_ball', 'ball')
        #row['description'] = row['description'].replace('called_strike', 'strike')
    else:
        # swing value
        swing_event = 'swing'

    # If a swinging strike or contact event, create contact event column
    if row['description'] in sw_strikes:
        # swinging_strike value, standardize description
        contact_event = 'swinging_strike'
        
        #row['description'] = 'strike'
    elif row['description'] in contact_events:
        # Contact
        contact_event = 'contact'
    else:
        # If neither swinging strike nor contact, i.e. ball, HBP
        contact_event = np.nan
        
    # If contact is made, create foul/fair event column
    if row['description'] in foul_events:
        # Foul Event, standardize description to foul balls only
        foul_event = 'foul'
        
        #row['description'] = row['description'].replace('foul_bunt', 'foul')
    elif row['description'] == 'hit_into_play':
        # Fair Event
        foul_event = 'fair'
    else:
        # If no contact is made
        foul_event = np.nan

    # Add all events to series, return row with series appended
    events = pd.Series({'Swing_Event': swing_event, 
                        'Contact_Event': contact_event,
                        'Foul_Event': foul_event})
    return row.append(events)

In [14]:
def add_la_ev_bucket(row):
    '''
    Description: Add la/ev bucket column value for each row based on bb_type, 
    launch_speed values in pitch data
    --------------------------------------------------------------------------------
    Inputs: row
    
    Returns: row 
        DataFrame row with added category str of bb_type/la bucket + ev_bucket
    '''
    
    # Define batted-ball types
    gb = row['bb_type'] == 'ground_ball'
    ld = row['bb_type'] == 'line_drive'
    fb = row['bb_type'] == 'fly_ball'
    pu = row['bb_type'] == 'popup'
    
    # Define EV bucket ranges (< 90, 90 < x < 95, etc.)
    less_90 = row['launch_speed'] < 90.0
    betw_90_95 = row['launch_speed'] >= 90.0 and row['launch_speed'] < 95.0
    betw_95_100 = row['launch_speed'] >= 95.0 and row['launch_speed'] < 100.0
    betw_100_105 = row['launch_speed'] >= 100.0 and row['launch_speed'] < 105.0
    greater_105 = row['launch_speed'] >= 105.0
    
    # If gb, lb, fb, pp batted-ball type
    if gb:
        # Else-IF logic for each EV bucket
        if less_90:
            category = '<90_GB'
        elif betw_90_95:
            category = '90_95_GB'
        elif betw_95_100:
            category = '95_100_GB'
        elif betw_100_105:
            category = '100_105_GB'
        elif greater_105:
             category = '>105_GB'
        else:
            category = np.nan
    elif ld:
        if less_90:
            category = '<90_LD'
        elif betw_90_95:
            category = '90_95_LD'
        elif betw_95_100:
            category = '95_100_LD'
        elif betw_100_105:
            category = '100_105_LD'
        elif greater_105:
             category = '>105_LD'
        else:
            category = np.nan
    elif fb:
        if less_90:
            category = '<90_FB'
        elif betw_90_95:
            category = '90_95_FB'
        elif betw_95_100:
            category = '95_100_FB'
        elif betw_100_105:
            category = '100_105_FB'
        elif greater_105:
             category = '>105_FB'
        else:
            category = np.nan
    elif pu:
        if less_90:
            category = '<90_PU'
        elif betw_90_95:
            category = '90_95_PU'
        elif betw_95_100:
            category = '95_100_PU'
        elif betw_100_105:
            category = '100_105_PU'
        elif greater_105:
             category = '>105_PU'
        else:
            category = np.nan
    else:
        category = np.nan
        
    #print(category)
    
    # Define series for la/ev bucket, append new column to row
    la_ev_bucket = pd.Series({'LA_EV': category})
    return row.append(la_ev_bucket)

In [15]:
def add_spray_bucket(row):
    """
    
    """
    # Define conditions and corresponding values for the new column
    conditions = [
        (row['stand'] == 'R') & (row['bb_type'] == 'ground_ball') & (row['spray_angle'] <= 75),
        (row['stand'] == 'R') & (row['bb_type'] == 'ground_ball') & (row['spray_angle'] > 75) & (row['spray_angle'] < 105),
        (row['stand'] == 'R') & (row['bb_type'] == 'ground_ball') & (row['spray_angle'] >= 105),

        (row['stand'] == 'R') & (row['bb_type'] == 'line_drive') & (row['spray_angle'] <= 75),
        (row['stand'] == 'R') & (row['bb_type'] == 'line_drive') & (row['spray_angle'] > 75) & (row['spray_angle'] < 105),
        (row['stand'] == 'R') & (row['bb_type'] == 'line_drive') & (row['spray_angle'] >= 105),

        (row['stand'] == 'R') & (row['bb_type'] == 'fly_ball') & (row['spray_angle'] <= 75),
        (row['stand'] == 'R') & (row['bb_type'] == 'fly_ball') & (row['spray_angle'] > 75) & (row['spray_angle'] < 105),
        (row['stand'] == 'R') & (row['bb_type'] == 'fly_ball') & (row['spray_angle'] >= 105),

        (row['stand'] == 'R') & (row['bb_type'] == 'popup') & (row['spray_angle'] <= 75),
        (row['stand'] == 'R') & (row['bb_type'] == 'popup') & (row['spray_angle'] > 75) & (row['spray_angle'] < 105),
        (row['stand'] == 'R') & (row['bb_type'] == 'popup') & (row['spray_angle'] >= 105),

        (row['stand'] == 'L') & (row['bb_type'] == 'ground_ball') & (row['spray_angle'] >= 105),
        (row['stand'] == 'L') & (row['bb_type'] == 'ground_ball') & (row['spray_angle'] > 75) & (row['spray_angle'] < 105),
        (row['stand'] == 'L') & (row['bb_type'] == 'ground_ball') & (row['spray_angle'] <= 75),

        (row['stand'] == 'L') & (row['bb_type'] == 'line_drive') & (row['spray_angle'] >= 105),
        (row['stand'] == 'L') & (row['bb_type'] == 'line_drive') & (row['spray_angle'] > 75) & (row['spray_angle'] < 105),
        (row['stand'] == 'L') & (row['bb_type'] == 'line_drive') & (row['spray_angle'] <= 75),

        (row['stand'] == 'L') & (row['bb_type'] == 'fly_ball') & (row['spray_angle'] >= 105),
        (row['stand'] == 'L') & (row['bb_type'] == 'fly_ball') & (row['spray_angle'] > 75) & (row['spray_angle'] < 105),
        (row['stand'] == 'L') & (row['bb_type'] == 'fly_ball') & (row['spray_angle'] <= 75),

        (row['stand'] == 'L') & (row['bb_type'] == 'popup') & (row['spray_angle'] <= 75),
        (row['stand'] == 'L') & (row['bb_type'] == 'popup') & (row['spray_angle'] > 75) & (row['spray_angle'] < 105),
        (row['stand'] == 'L') & (row['bb_type'] == 'popup') & (row['spray_angle'] >= 105),
    ]

    choices = ['Oppo_GB', 'Cent_GB', 'Pull_GB', 
               'Oppo_LD', 'Cent_LD', 'Pull_LD', 
               'Oppo_FB', 'Cent_FB', 'Pull_FB',
               'Oppo_PU', 'Cent_PU', 'Pull_PU',
              'Oppo_GB', 'Cent_GB', 'Pull_GB', 
               'Oppo_LD', 'Cent_LD', 'Pull_LD', 
               'Oppo_FB', 'Cent_FB', 'Pull_FB',
              'Oppo_PU', 'Cent_PU', 'Pull_PU']

    category = np.select(conditions, choices, default=np.nan)
    
    # Define series for la/ev bucket, append new column to row
    spray_bucket = pd.Series({'Spray_Bucket': category})
    return row.append(spray_bucket)

In [15]:
# Add columns for contact event, foul/fair, in-play event, la/ev bucket, spray bucket
print('Adding ID Columns...')
train_data_id = training_data.apply(add_id_cols, axis=1)
print('Adding LA EV Buckets...')
train_data_la_ev = train_data_id.apply(add_la_ev_bucket, axis=1)
print("Adding Spray Bucket...")
train_data_labelled = train_data_la_ev.apply(add_spray_bucket, axis=1)

train_data_labelled

Adding ID Columns...
Adding LA EV Buckets...
Adding Spray Bucket...


Unnamed: 0,delta_run_exp,description,bb_type,p_throws,stand,home_team,balls,strikes,pitch_count,pitch_type,...,launch_speed,launch_angle,hc_x,hc_y,spray_angle,Swing_Event,Contact_Event,Foul_Event,LA_EV,Spray_Bucket
0,-0.049,foul,,R,R,COL,1,0,1,FF,...,77.7,39.0,,,,swing,contact,foul,,
1,0.052,ball,,R,R,COL,1,1,2,FF,...,,,,,,no_swing,,,,
2,-0.078,called_strike,,R,R,COL,3,1,3,FF,...,,,,,,no_swing,,,,
3,-0.114,foul,,R,R,COL,0,1,7,FF,...,74.5,-23.0,,,,swing,contact,foul,,
4,-0.038,foul,,R,R,COL,0,0,21,FF,...,76.5,17.0,,,,swing,contact,foul,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387819,0.023,ball,,R,L,HOU,0,0,5,CU,...,,,,,,no_swing,,,,
1387820,0.032,ball,,R,L,HOU,0,0,29,CH,...,,,,,,no_swing,,,,
1387821,0.044,ball,,R,L,HOU,1,0,30,CH,...,,,,,,no_swing,,,,
1387822,-0.066,foul,,R,L,HOU,0,0,36,CH,...,107.6,22.0,,,,swing,contact,foul,,


In [17]:
train_data_labelled.shape, train_data_labelled.columns

((1387824, 59),
 Index(['delta_run_exp', 'description', 'bb_type', 'p_throws', 'stand',
        'home_team', 'balls', 'strikes', 'pitch_count', 'pitch_type', 'plate_x',
        'plate_z', 'release_speed', 'release_extension', 'effective_speed',
        'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
        'spin_axis', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
        'sz_top', 'sz_bot', 'inferred_axis', 'axis_diff', 'velo_diff',
        'release_pos_x_diff', 'release_pos_y_diff', 'release_pos_z_diff',
        'spin_rate_diff', 'pfx_x_diff', 'pfx_z_diff', 'vx0_diff', 'vy0_diff',
        'vz0_diff', 'ax_diff', 'ay_diff', 'az_diff', 'spin_axis_diff',
        'inferred_axis_diff', 'axis_diff_diff', 'mahalanobis', 'launch_speed',
        'launch_angle', 'hc_x', 'hc_y', 'spray_angle', 'Swing_Event',
        'Contact_Event', 'Foul_Event', 'LA_EV', 'Spray_Bucket'],
       dtype='object'))

In [16]:
# save to csv file, (w/o context-neutral run values)
#train_data_labelled.to_csv('TrainingDataLabelled.csv',index=False)
train_data_labelled.to_csv('TrainDataNonClustered.csv',index=False)

 # Cluster Pitches

In [None]:
# Do not cluster if a pitcher only has so many pitches in total/per pitch

# Splitting Data by Sub-Model

Pitch Type
- Fastballs
- Offspeed
- Breaking Balls

Events
- Swing/No Swing Events
- Contact/No Contact Events
- Foul/Fair Ball Events

In [2]:
#all_data = pd.read_csv('TrainingDataLabelled.csv', low_memory=False)
all_data = pd.read_csv('TrainDataNonClustered.csv', low_memory=False)

In [3]:
all_data

Unnamed: 0,delta_run_exp,description,bb_type,p_throws,stand,home_team,balls,strikes,pitch_count,pitch_type,...,launch_speed,launch_angle,hc_x,hc_y,spray_angle,Swing_Event,Contact_Event,Foul_Event,LA_EV,Spray_Bucket
0,-0.049,foul,,R,R,COL,1,0,1,FF,...,77.7,39.0,,,,swing,contact,foul,,
1,0.052,ball,,R,R,COL,1,1,2,FF,...,,,,,,no_swing,,,,
2,-0.078,called_strike,,R,R,COL,3,1,3,FF,...,,,,,,no_swing,,,,
3,-0.114,foul,,R,R,COL,0,1,7,FF,...,74.5,-23.0,,,,swing,contact,foul,,
4,-0.038,foul,,R,R,COL,0,0,21,FF,...,76.5,17.0,,,,swing,contact,foul,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387819,0.023,ball,,R,L,HOU,0,0,5,CU,...,,,,,,no_swing,,,,
1387820,0.032,ball,,R,L,HOU,0,0,29,CH,...,,,,,,no_swing,,,,
1387821,0.044,ball,,R,L,HOU,1,0,30,CH,...,,,,,,no_swing,,,,
1387822,-0.066,foul,,R,L,HOU,0,0,36,CH,...,107.6,22.0,,,,swing,contact,foul,,


In [19]:
# Define irrelevant columns to drop for each dataset/model combination
# I.e. in-play dataset/model do not need swing, contact, or foul event columns

swing_drop_cols = ['delta_run_exp','bb_type','description','launch_speed',
             'launch_angle', 'hc_x', 'hc_y','Contact_Event','Foul_Event','LA_EV','Spray_Bucket']

no_swing_drop_cols = ['delta_run_exp','bb_type','launch_speed',
             'launch_angle','hc_x', 'hc_y','Swing_Event','Contact_Event',
                      'Foul_Event','LA_EV','Spray_Bucket']

contact_drop_cols = ['delta_run_exp','bb_type','description','launch_speed',
             'launch_angle','hc_x', 'hc_y','Swing_Event','Foul_Event','LA_EV','Spray_Bucket']

foul_drop_cols = ['delta_run_exp','bb_type','description','launch_speed',
             'launch_angle','hc_x', 'hc_y','Swing_Event','Contact_Event','LA_EV','Spray_Bucket']

in_play1_drop_cols = ['delta_run_exp','bb_type','description','launch_speed',
             'launch_angle','hc_x', 'hc_y','Swing_Event','Contact_Event',
                      'Foul_Event','Spray_Bucket']

in_play2_drop_cols = ['delta_run_exp','bb_type','description','launch_speed',
             'launch_angle','hc_x', 'hc_y','Swing_Event','Contact_Event','Foul_Event','LA_EV']

In [5]:
# Split data by pitch class, create datasets for each sub-model for each class 
fastballs = all_data[all_data.pitch_type.isin(['FF','SI','FC'])]
offspeeds = all_data[all_data.pitch_type.isin(['CH','FS','KN','FO'])]
breaking_balls = all_data[all_data.pitch_type.isin(['SL','KC','CU','ST','SV'])]

In [47]:
stuff_cols = ['p_throws', 'stand', 'home_team', 'pitch_count',
              'release_speed', 'release_extension', 'effective_speed',
              'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
              'spin_axis', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
              'inferred_axis', 'axis_diff', 'velo_diff',
              'release_pos_x_diff', 'release_pos_y_diff', 'release_pos_z_diff',
              'spin_rate_diff', 'pfx_x_diff', 'pfx_z_diff', 'vx0_diff', 'vy0_diff',
              'vz0_diff', 'ax_diff', 'ay_diff', 'az_diff', 'spin_axis_diff',
              'inferred_axis_diff', 'axis_diff_diff', 'mahalanobis']

location_cols = [ 'p_throws', 'stand', 'home_team', 'balls', 'strikes', 
                 'pitch_count', 'pitch_type',
                 'sz_top', 'sz_bot', 'plate_x','plate_z']

overall_cols = ['p_throws', 'stand','home_team', 'pitch_count', 
                'pitch_type', 'sz_top', 'sz_bot','plate_x', 'plate_z',
                'release_speed', 'release_extension', 'effective_speed',
                'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
                'spin_axis', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
                'inferred_axis', 'axis_diff', 'velo_diff',
                'release_pos_x_diff', 'release_pos_y_diff', 'release_pos_z_diff',
                'spin_rate_diff', 'pfx_x_diff', 'pfx_z_diff', 'vx0_diff', 'vy0_diff',
                'vz0_diff', 'ax_diff', 'ay_diff', 'az_diff', 'spin_axis_diff',
                'inferred_axis_diff', 'axis_diff_diff', 'mahalanobis']

## Build Datasets

In [48]:
def build_stuff_datasets(pitch_df, cols = stuff_cols):
    
    contact = pitch_df[pitch_df.Contact_Event.notna()][cols + ['Contact_Event']]
    #.drop(contact_drop_cols,axis=1)

    foul = pitch_df[pitch_df.Foul_Event.notna()][cols + ['Foul_Event']]
    #.drop(foul_drop_cols,axis=1)
    
    in_play1 = pitch_df[pitch_df.Foul_Event == 'fair'][cols + ['LA_EV']]
    #.drop(in_play1_drop_cols,axis=1)
    
    in_play2 = pitch_df[pitch_df.Foul_Event == 'fair'][cols + ['Spray_Bucket']]
    #.drop(in_play2_drop_cols,axis=1)
    
    return contact, foul, in_play1, in_play2

def build_location_datasets(pitch_df, cols = location_cols):
    swing = pitch_df[pitch_df.Swing_Event.notna()][cols + ['Swing_Event']]
    #.drop(contact_drop_cols,axis=1)
    
    no_swing = pitch_df[pitch_df.Swing_Event == 'no_swing'][cols + ['description']].rename(
        columns = {'description': 'No_Swing_Event'})
    
    contact = pitch_df[pitch_df.Contact_Event.notna()][cols + ['Contact_Event']]
    #.drop(contact_drop_cols,axis=1)

    foul = pitch_df[pitch_df.Foul_Event.notna()][cols + ['Foul_Event']]
    #.drop(foul_drop_cols,axis=1)
    
    in_play1 = pitch_df[pitch_df.Foul_Event == 'fair'][cols + ['LA_EV']]
    #.drop(in_play1_drop_cols,axis=1)
    
    in_play2 = pitch_df[pitch_df.Foul_Event == 'fair'][cols + ['Spray_Bucket']]
    
    return swing, no_swing, contact, foul, in_play1, in_play2
    
    
def overall_datasets(pitch_df, cols = overall_cols):
    
    swing = pitch_df[pitch_df.Swing_Event.notna()][cols + ['Swing_Event']]
    #.drop(contact_drop_cols,axis=1)
    
    no_swing = pitch_df[pitch_df.Swing_Event == 'no_swing'][cols + ['description']].rename(
        columns = {'description': 'No_Swing_Event'})
    
    contact = pitch_df[pitch_df.Contact_Event.notna()][cols + ['Contact_Event']]
    #.drop(contact_drop_cols,axis=1)

    foul = pitch_df[pitch_df.Foul_Event.notna()][cols + ['Foul_Event']]
    #.drop(foul_drop_cols,axis=1)
    
    in_play1 = pitch_df[pitch_df.Foul_Event == 'fair'][cols + ['LA_EV']]
    #.drop(in_play1_drop_cols,axis=1)
    
    in_play2 = pitch_df[pitch_df.Foul_Event == 'fair'][cols + ['Spray_Bucket']]
    #.drop(in_play2_drop_cols,axis=1)
    
    return swing, no_swing, contact, foul, in_play1, in_play2

# Stuff

In [12]:
fb_contact, fb_foul, fb_in_play1, fb_in_play2 = build_stuff_datasets(fastballs)

In [13]:
fb_contact.to_csv('Data/Models/FB Models/Stuff_FB_Contact.csv',index = False)
fb_foul.to_csv('Data/Models/FB Models/Stuff_FB_Foul.csv',index = False)
fb_in_play1.to_csv('Data/Models/FB Models/Stuff_FB_InPlay1.csv',index = False)
fb_in_play2.to_csv('Data/Models/FB Models/Stuff_FB_InPlay2.csv',index = False)

In [14]:
os_contact, os_foul, os_in_play1, os_in_play2 = build_stuff_datasets(offspeeds)

In [15]:
os_contact.to_csv('Data/Models/OS Models/Stuff_OS_Contact.csv',index = False)
os_foul.to_csv('Data/Models/OS Models/Stuff_OS_Foul.csv',index = False)
os_in_play1.to_csv('Data/Models/OS Models/Stuff_OS_InPlay1.csv',index = False)
os_in_play2.to_csv('Data/Models/OS Models/Stuff_OS_InPlay2.csv',index = False)

In [16]:
brba_contact, brba_foul, brba_in_play1, brba_in_play2 = build_stuff_datasets(breaking_balls)

In [17]:
brba_contact.to_csv('Data/Models/BrBa Models/Stuff_BrBa_Contact.csv',index = False)
brba_foul.to_csv('Data/Models/BrBa Models/Stuff_BrBa_Foul.csv',index = False)
brba_in_play1.to_csv('Data/Models/BrBa Models/Stuff_BrBa_InPlay1.csv',index = False)
brba_in_play2.to_csv('Data/Models/BrBa Models/Stuff_BrBa_InPlay2.csv',index = False)

# Location

In [94]:
fb_swing, fb_no_swing, fb_contact, fb_foul, fb_in_play1, fb_in_play2 = build_location_datasets(
    fastballs)

In [95]:
fb_swing.to_csv('Data/Models/FB Models/Location_FB_Swing.csv',index = False)
fb_no_swing.to_csv('Data/Models/FB Models/Location_FB_NoSwing.csv',index = False)
fb_contact.to_csv('Data/Models/FB Models/Location_FB_Contact.csv',index = False)
fb_foul.to_csv('Data/Models/FB Models/Location_FB_Foul.csv',index = False)
fb_in_play1.to_csv('Data/Models/FB Models/Location_FB_InPlay1.csv',index = False)
fb_in_play2.to_csv('Data/Models/FB Models/Location_FB_InPlay2.csv',index = False)

In [96]:
os_swing, os_no_swing, os_contact, os_foul, os_in_play1, os_in_play2 = build_location_datasets(
    offspeeds)

In [97]:
os_swing.to_csv('Data/Models/OS Models/Location_OS_Swing.csv',index = False)
os_no_swing.to_csv('Data/Models/OS Models/Location_OS_NoSwing.csv',index = False)
os_contact.to_csv('Data/Models/OS Models/Location_OS_Contact.csv',index = False)
os_foul.to_csv('Data/Models/OS Models/Location_OS_Foul.csv',index = False)
os_in_play1.to_csv('Data/Models/OS Models/Location_OS_InPlay1.csv',index = False)
os_in_play2.to_csv('Data/Models/OS Models/Location_OS_InPlay2.csv',index = False)

In [98]:
brba_swing, brba_no_swing, brba_contact, brba_foul, brba_in_play1, brba_in_play2 = build_location_datasets(
    breaking_balls)

In [99]:
brba_swing.to_csv('Data/Models/BrBa Models/Location_BrBa_Swing.csv',index = False)
brba_no_swing.to_csv('Data/Models/BrBa Models/Location_BrBa_NoSwing.csv',index = False)
brba_contact.to_csv('Data/Models/BrBa Models/Location_BrBa_Contact.csv',index = False)
brba_foul.to_csv('Data/Models/BrBa Models/Location_BrBa_Foul.csv',index = False)
brba_in_play1.to_csv('Data/Models/BrBa Models/Location_BrBa_InPlay1.csv',index = False)
brba_in_play2.to_csv('Data/Models/BrBa Models/Location_BrBa_InPlay2.csv',index = False)

# Overall

In [100]:
fb_swing, fb_no_swing, fb_contact, fb_foul, fb_in_play1, fb_in_play2 = overall_datasets(
    fastballs)

In [101]:
fb_swing.to_csv('Data/Models/FB Models/Overall_FB_Swing.csv',index = False)
fb_no_swing.to_csv('Data/Models/FB Models/Overall_FB_NoSwing.csv',index = False)
fb_contact.to_csv('Data/Models/FB Models/Overall_FB_Contact.csv',index = False)
fb_foul.to_csv('Data/Models/FB Models/Overall_FB_Foul.csv',index = False)
fb_in_play1.to_csv('Data/Models/FB Models/Overall_FB_InPlay1.csv',index = False)
fb_in_play2.to_csv('Data/Models/FB Models/Overall_FB_InPlay2.csv',index = False)

In [102]:
os_swing, os_no_swing, os_contact, os_foul, os_in_play1, os_in_play2 = overall_datasets(
    offspeeds)

In [103]:
os_swing.to_csv('Data/Models/OS Models/Overall_OS_Swing.csv',index = False)
os_no_swing.to_csv('Data/Models/OS Models/Overall_OS_NoSwing.csv',index = False)
os_contact.to_csv('Data/Models/OS Models/Overall_OS_Contact.csv',index = False)
os_foul.to_csv('Data/Models/OS Models/Overall_OS_Foul.csv',index = False)
os_in_play1.to_csv('Data/Models/OS Models/Overall_OS_InPlay1.csv',index = False)
os_in_play2.to_csv('Data/Models/OS Models/Overall_OS_InPlay2.csv',index = False)

In [104]:
brba_swing, brba_no_swing, brba_contact, brba_foul, brba_in_play1, brba_in_play2 = overall_datasets(
    breaking_balls)

In [105]:
brba_swing.to_csv('Data/Models/BrBa Models/Overall_BrBa_Swing.csv',index = False)
brba_no_swing.to_csv('Data/Models/BrBa Models/Overall_BrBa_NoSwing.csv',index = False)
brba_contact.to_csv('Data/Models/BrBa Models/Overall_BrBa_Contact.csv',index = False)
brba_foul.to_csv('Data/Models/BrBa Models/Overall_BrBa_Foul.csv',index = False)
brba_in_play1.to_csv('Data/Models/BrBa Models/Overall_BrBa_InPlay1.csv',index = False)
brba_in_play2.to_csv('Data/Models/BrBa Models/Overall_BrBa_InPlay2.csv',index = False)

# Validation Set

In [3]:
val_data = pd.read_csv('Data/Statcast/statcast23.csv')

In [16]:
val_df_clean = build_train_dataset(val_data, '2023-03-30')

Number of Statcast Errors Removed: 1767
Adding new features...
Adding Differentials, Noise Variables...


In [17]:
val_df_clean

Unnamed: 0,delta_run_exp,description,bb_type,p_throws,stand,home_team,balls,strikes,pitch_count,pitch_type,...,az_diff,spin_axis_diff,inferred_axis_diff,axis_diff_diff,mahalanobis,launch_speed,launch_angle,hc_x,hc_y,spray_angle
0,0.036,ball,,R,L,SEA,0,0,1,FF,...,0.577141,6.647059,3.994374,2.652685,4.281945,,,,,
1,0.135,ball,,R,L,SEA,2,0,3,FF,...,-0.101199,2.647059,-6.392367,9.039426,5.421181,,,,,
2,-0.078,called_strike,,R,L,SEA,3,0,4,FF,...,-1.023014,-1.352941,7.511476,-8.864417,4.921815,,,,,
3,-0.023,called_strike,,R,L,SEA,0,1,11,FF,...,3.124875,-4.352941,-3.367327,-0.985615,4.611812,,,,,
4,-0.026,foul,,R,L,SEA,0,0,20,FF,...,-2.600233,-15.352941,12.105358,-27.458299,4.483095,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706795,-0.055,called_strike,,R,R,MIL,1,1,24,ST,...,-2.862843,-101.000000,-27.742495,-73.257505,2.846030,,,,,
706796,-0.173,hit_into_play,ground_ball,R,R,MIL,1,2,25,ST,...,1.088593,-66.000000,-15.478943,-50.521057,2.846014,74.2,8.0,-5.32,70.62,94.308112
706797,-0.017,swinging_strike,,R,R,MIL,0,0,31,ST,...,-1.759976,-77.000000,-23.854877,-53.145123,2.846019,,,,,
706798,-0.023,called_strike,,R,R,MIL,0,1,32,ST,...,-3.268574,-68.000000,-32.462101,-35.537899,2.846036,,,,,


In [18]:
# Add columns for contact event, foul/fair, in-play event, la/ev bucket, spray bucket
print('Adding ID Columns...')
val_df_id = val_df_clean.apply(add_id_cols, axis=1)
print('Adding LA EV Buckets...')
val_df_la_ev = val_df_id.apply(add_la_ev_bucket, axis=1)
print("Adding Spray Bucket...")
val_df_labelled = val_df_la_ev.apply(add_spray_bucket, axis=1)

val_df_labelled

Adding ID Columns...
Adding LA EV Buckets...
Adding Spray Bucket...


Unnamed: 0,delta_run_exp,description,bb_type,p_throws,stand,home_team,balls,strikes,pitch_count,pitch_type,...,launch_speed,launch_angle,hc_x,hc_y,spray_angle,Swing_Event,Contact_Event,Foul_Event,LA_EV,Spray_Bucket
0,0.036,ball,,R,L,SEA,0,0,1,FF,...,,,,,,no_swing,,,,
1,0.135,ball,,R,L,SEA,2,0,3,FF,...,,,,,,no_swing,,,,
2,-0.078,called_strike,,R,L,SEA,3,0,4,FF,...,,,,,,no_swing,,,,
3,-0.023,called_strike,,R,L,SEA,0,1,11,FF,...,,,,,,no_swing,,,,
4,-0.026,foul,,R,L,SEA,0,0,20,FF,...,,,,,,swing,contact,foul,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706795,-0.055,called_strike,,R,R,MIL,1,1,24,ST,...,,,,,,no_swing,,,,
706796,-0.173,hit_into_play,ground_ball,R,R,MIL,1,2,25,ST,...,74.2,8.0,-5.32,70.62,94.308112,swing,contact,fair,<90_GB,Cent_GB
706797,-0.017,swinging_strike,,R,R,MIL,0,0,31,ST,...,,,,,,swing,swinging_strike,,,
706798,-0.023,called_strike,,R,R,MIL,0,1,32,ST,...,,,,,,no_swing,,,,


In [28]:
# Split data by pitch class, create datasets for each sub-model for each class 
val_fb = val_df_labelled[val_df_labelled.pitch_type.isin(['FF','SI','FC'])]
val_os = val_df_labelled[val_df_labelled.pitch_type.isin(['CH','FS','KN','FO'])]
val_brba = val_df_labelled[val_df_labelled.pitch_type.isin(['SL','KC','CU','ST','SV'])]

## Stuff

In [37]:
fb_contact, fb_foul, fb_in_play1, fb_in_play2 = build_stuff_datasets(val_fb)

fb_contact.to_csv('Data/Models/Val Data/Stuff_FB_Contact.csv',index = False)
fb_foul.to_csv('Data/Models/Val Data/Stuff_FB_Foul.csv',index = False)
fb_in_play1.to_csv('Data/Models/Val Data/Stuff_FB_InPlay1.csv',index = False)
fb_in_play2.to_csv('Data/Models/Val Data/Stuff_FB_InPlay2.csv',index = False)

In [38]:
os_contact, os_foul, os_in_play1, os_in_play2 = build_stuff_datasets(val_os)

os_contact.to_csv('Data/Models/Val Data/Stuff_OS_Contact.csv',index = False)
os_foul.to_csv('Data/Models/Val Data/Stuff_OS_Foul.csv',index = False)
os_in_play1.to_csv('Data/Models/Val Data/Stuff_OS_InPlay1.csv',index = False)
os_in_play2.to_csv('Data/Models/Val Data/Stuff_OS_InPlay2.csv',index = False)

In [39]:
brba_contact, brba_foul, brba_in_play1, brba_in_play2 = build_stuff_datasets(val_brba)

brba_contact.to_csv('Data/Models/Val Data/Stuff_BrBa_Contact.csv',index = False)
brba_foul.to_csv('Data/Models/Val Data/Stuff_BrBa_Foul.csv',index = False)
brba_in_play1.to_csv('Data/Models/Val Data/Stuff_BrBa_InPlay1.csv',index = False)
brba_in_play2.to_csv('Data/Models/Val Data/Stuff_BrBa_InPlay2.csv',index = False)

## Locations

In [40]:
fb_swing, fb_no_swing, fb_contact, fb_foul, fb_in_play1, fb_in_play2 = build_location_datasets(
    val_fb)

fb_swing.to_csv('Data/Models/Val Data/Location_FB_Swing.csv',index = False)
fb_no_swing.to_csv('Data/Models/Val Data/Location_FB_NoSwing.csv',index = False)
fb_contact.to_csv('Data/Models/Val Data/Location_FB_Contact.csv',index = False)
fb_foul.to_csv('Data/Models/Val Data/Location_FB_Foul.csv',index = False)
fb_in_play1.to_csv('Data/Models/Val Data/Location_FB_InPlay1.csv',index = False)
fb_in_play2.to_csv('Data/Models/Val Data/Location_FB_InPlay2.csv',index = False)

In [41]:
os_swing, os_no_swing, os_contact, os_foul, os_in_play1, os_in_play2 = build_location_datasets(
    val_os)

os_swing.to_csv('Data/Models/Val Data/Location_OS_Swing.csv',index = False)
os_no_swing.to_csv('Data/Models/Val Data/Location_OS_NoSwing.csv',index = False)
os_contact.to_csv('Data/Models/Val Data/Location_OS_Contact.csv',index = False)
os_foul.to_csv('Data/Models/Val Data/Location_OS_Foul.csv',index = False)
os_in_play1.to_csv('Data/Models/Val Data/Location_OS_InPlay1.csv',index = False)
os_in_play2.to_csv('Data/Models/Val Data/Location_OS_InPlay2.csv',index = False)

In [42]:
brba_swing, brba_no_swing, brba_contact, brba_foul, brba_in_play1, brba_in_play2 = build_location_datasets(
    val_brba)

brba_swing.to_csv('Data/Models/Val Data/Location_BrBa_Swing.csv',index = False)
brba_no_swing.to_csv('Data/Models/Val Data/Location_BrBa_NoSwing.csv',index = False)
brba_contact.to_csv('Data/Models/Val Data/Location_BrBa_Contact.csv',index = False)
brba_foul.to_csv('Data/Models/Val Data/Location_BrBa_Foul.csv',index = False)
brba_in_play1.to_csv('Data/Models/Val Data/Location_BrBa_InPlay1.csv',index = False)
brba_in_play2.to_csv('Data/Models/Val Data/Location_BrBa_InPlay2.csv',index = False)

## Overall

In [43]:
fb_swing, fb_no_swing, fb_contact, fb_foul, fb_in_play1, fb_in_play2 = overall_datasets(
    val_fb)

fb_swing.to_csv('Data/Models/Val Data/Overall_FB_Swing.csv',index = False)
fb_no_swing.to_csv('Data/Models/Val Data/Overall_FB_NoSwing.csv',index = False)
fb_contact.to_csv('Data/Models/Val Data/Overall_FB_Contact.csv',index = False)
fb_foul.to_csv('Data/Models/Val Data/Overall_FB_Foul.csv',index = False)
fb_in_play1.to_csv('Data/Models/Val Data/Overall_FB_InPlay1.csv',index = False)
fb_in_play2.to_csv('Data/Models/Val Data/Location_FB_InPlay2.csv',index = False)

In [44]:
os_swing, os_no_swing, os_contact, os_foul, os_in_play1, os_in_play2 = overall_datasets(
    val_os)

os_swing.to_csv('Data/Models/Val Data/Overall_OS_Swing.csv',index = False)
os_no_swing.to_csv('Data/Models/Val Data/Overall_OS_NoSwing.csv',index = False)
os_contact.to_csv('Data/Models/Val Data/Overall_OS_Contact.csv',index = False)
os_foul.to_csv('Data/Models/Val Data/Overall_OS_Foul.csv',index = False)
os_in_play1.to_csv('Data/Models/Val Data/Overall_OS_InPlay1.csv',index = False)
os_in_play2.to_csv('Data/Models/Val Data/Overall_OS_InPlay2.csv',index = False)

In [45]:
brba_swing, brba_no_swing, brba_contact, brba_foul, brba_in_play1, brba_in_play2 = overall_datasets(
    val_brba)

brba_swing.to_csv('Data/Models/Val Data/Overall_BrBa_Swing.csv',index = False)
brba_no_swing.to_csv('Data/Models/Val Data/Overall_BrBa_NoSwing.csv',index = False)
brba_contact.to_csv('Data/Models/Val Data/Overall_BrBa_Contact.csv',index = False)
brba_foul.to_csv('Data/Models/Val Data/Overall_BrBa_Foul.csv',index = False)
brba_in_play1.to_csv('Data/Models/Val Data/Overall_BrBa_InPlay1.csv',index = False)
brba_in_play2.to_csv('Data/Models/Val Data/Overall_BrBa_InPlay2.csv',index = False)

In [69]:
val_data[(val_data.home_team == 'CHC') & (val_data.release_speed == 91.5) & (val_data.p_throws == 'R') & 
        (val_data.release_pos_x == -1.86) & (val_data.release_pos_y == 53.9)]

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
624464,FC,2023-04-22,91.5,-1.86,5.3,"May, Dustin",641355,669160,,swinging_strike,...,3,3,1,1,3,Infield shade,Standard,200.0,0.0,-0.054
