# Imports

In [None]:
from pybaseball import statcast, statcast_batter, playerid_lookup
import pandas as pd
import numpy as np

from datetime import datetime
from scipy.stats import multivariate_normal
from scipy.spatial.distance import mahalanobis

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the 2021 and 2022 clustered pitch datasets from local CSV files.
# low_memory=False asks pandas to parse each file in a single pass, which avoids
# mixed-dtype inference on wide files.
data21_clus = pd.read_csv('Data/ClusteredData/Clustered2021.csv', low_memory = False)
data22_clus = pd.read_csv('Data/ClusteredData/Clustered2022.csv', low_memory = False)

# Dataset Building

## First Pass, New Features

In [None]:
# Define a function to fill NaN values within groups
def fillna_by_pitcher(df, cols):
    '''
    Description: Fills NA values in the given columns with the mean of that column,
    computed over the rows of the dataframe that is passed in. When used through
    groupby(['player_name','pitch_type']).apply(...), each call receives one
    pitcher / pitch-type group, so the fill value is that group's mean.
    --------------------------------------------------------------------------------
    Inputs: df, cols
        df: DataFrame (or groupby group) holding the columns to fill
        cols: iterable of column names whose NA values should be filled
    
    Returns: df
        The same dataframe object, with the NA values of `cols` filled. A column
        that is entirely NA has a NA mean and is therefore left unchanged.
    '''
    
    for col in cols:
        # Assign the filled column back rather than calling fillna(inplace=True) on
        # df[col]: the in-place form works on a temporary Series and is a silent
        # no-op when pandas copy-on-write is active.
        df[col] = df[col].fillna(df[col].mean())
    
    return df

In [None]:
def clean_train_data(df):
    '''
    Description: Cleans training data: keeps only the relevant feature columns,
    removes non-pitches and pitches with a zero movement component, fills missing
    pitch metrics with the mean of the same pitch type for the same pitcher, drops
    rows that are still incomplete, standardizes pitch descriptions and sorts the
    pitches chronologically.
    --------------------------------------------------------------------------------
    Inputs: df
        Raw pitch-level DataFrame containing every column listed in `features`
        below (a KeyError is raised if one is missing)
    
    Returns: df_clean
        Cleaned copy of the input, sorted by game_date, game_pk, at_bat_number and
        pitch_number. The original row index labels are preserved.
    '''
    # Pitch-type codes that are removed as pitchouts/non-pitches
    non_pitches = ['FA','PO','KN','EP']
    
    y = ['delta_run_exp']
    
    y_cats = ['description','bb_type']
    
    # Note: game_pk, player_name, batter and game_date are only needed for
    # grouping/sorting; add_diff_features drops them again at the end
    context_features = ['player_name','p_throws','batter','stand','pitch_type','pitch_number',
            'home_team','game_date','game_pk','at_bat_number',
            'balls','strikes']
    
    cont_features = ['release_speed','release_extension','effective_speed','release_spin_rate',
            'release_pos_x', 'release_pos_y', 'release_pos_z','spin_axis', 'pfx_x', 'pfx_z',
            'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot','launch_speed','launch_angle']
    
    features = y + y_cats + context_features + cont_features
    
    # Each unique pitch of each pitcher forms one group for the mean fill
    group_keys = ['player_name','pitch_type']
    
    # Columns whose NA values are filled with the group mean
    fill_cols = ['release_speed','release_extension','effective_speed','release_spin_rate',
                 'release_pos_x','release_pos_y','release_pos_z','spin_axis']

    # Columns that must be present after filling; rows missing any of them are dropped
    na_cols = ['delta_run_exp','pitch_type','pitch_number','pfx_x','pfx_z',
               'release_pos_x', 'release_pos_y', 'release_pos_z', 
               'release_speed','release_extension','effective_speed','release_spin_rate',
               'spin_axis','sz_top', 'sz_bot']
    
    # Description values that are collapsed into a coarser category
    description_map = {
        'foul_tip': 'swinging_strike',
        'swinging_strike_blocked': 'swinging_strike',
        'blocked_ball': 'ball',
        'missed_bunt': 'swinging_strike',
        'bunt_foul_tip': 'swinging_strike',
        'foul_bunt': 'foul',
    }
    
    # Filter Dataframe for features
    df = df[features]
    
    # Remove pitchouts/non-pitches and any pitch where either movement component
    # is exactly 0 (Statcast errors)
    is_real_pitch = ~df['pitch_type'].isin(non_pitches)
    has_movement = (df['pfx_x'] != 0.0) & (df['pfx_z'] != 0.0)
    
    # Rows without a pitcher or pitch type cannot belong to a group and are removed;
    # .copy() makes the column assignment below write to an independent frame
    df_filt = df[is_real_pitch & has_movement].dropna(subset=group_keys).copy()
    
    # Fill NA pitch metrics with the mean of the same pitch for the same pitcher.
    # transform('mean') returns the group means aligned to the original rows, so the
    # group keys stay ordinary columns and never end up in the index.
    group_means = df_filt.groupby(group_keys)[fill_cols].transform('mean')
    df_filt[fill_cols] = df_filt[fill_cols].fillna(group_means)
    
    df_clean = df_filt.dropna(subset=na_cols)
    
    # A pitch with a batted-ball type must also have launch speed and launch angle
    has_contact_quality = df_clean['launch_speed'].notna() & df_clean['launch_angle'].notna()
    df_clean = df_clean[df_clean['bb_type'].isna() | has_contact_quality].copy()
    
    # Standardize descriptions (only the description column is touched)
    df_clean['description'] = df_clean['description'].replace(description_map)
        
    # A ball hit into play must have a batted-ball type
    unlabelled_in_play = (df_clean['description'] == 'hit_into_play') & df_clean['bb_type'].isna()
    df_clean = df_clean[~unlabelled_in_play]
    
    # OHC Base variables to 0 and 1
    #df_clean[['on_1b','on_2b','on_3b']] = df_clean[['on_1b','on_2b','on_3b']].notna().astype(int)
    
    # Sort dataframe by pitches in chronological order, return
    df_clean = df_clean.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
    return df_clean

In [None]:
def add_new_features(df, season_start):
    '''
    Description: Adds new features in dataframe
        - inferred_axis: Inferred Spin Axis (SSW Effects)
        - axis_diff: Difference of Inferred and Observed Spin Axis
        - game_week: Change game date to week of season depending on start date of season
        - pitch_count: Pitch # of outing for each outing per pitcher 
    --------------------------------------------------------------------------------
    Inputs: df, season_start (str)
        df: pitch-level DataFrame with pfx_x, pfx_z, spin_axis, game_date, game_pk,
            at_bat_number, pitch_number and player_name columns
        season_start: first day of the season in 'YYYY-MM-DD' format (a ValueError
            is raised for any other format)
    
    Returns: df
        Copy of the input dataframe with new features added; the input dataframe
        itself is left untouched
    '''
    
    # Validate the season start before doing any work
    start_date = datetime.strptime(season_start, '%Y-%m-%d').date()
    
    # Work on a copy so no feature or helper column leaks into the caller's frame
    df = df.copy()
    
    # inferred_axis: 180 / pi * atan(pfx_z / pfx_x) + 90 (where pfx_x is < 0, add 180 degrees.)
    df['inferred_axis'] = np.degrees(np.arctan(df['pfx_z'] / df['pfx_x'])) + 90
    df.loc[df['pfx_x'] < 0, 'inferred_axis'] += 180
    
    # axis_diff: spin_axis - inferred_axis
    df['axis_diff'] = df['spin_axis'] - df['inferred_axis']
    
    # Pitch Count: Cumulative pitch number of outing for pitcher. cumcount() is
    # computed on the chronologically sorted rows and aligned back by index label.
    chronological = df.sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
    df['pitch_count'] = chronological.groupby(
        ['game_date','game_pk','player_name']).cumcount() + 1
    
    # game_week: 1-based week of the season, counted in whole 7-day blocks from
    # season_start. normalize() discards any time-of-day component of game_date.
    game_days = pd.to_datetime(df['game_date']).dt.normalize()
    df['game_week'] = (game_days - pd.Timestamp(start_date)).dt.days // 7 + 1
    
    return df

## Second Pass, New Features
Note: The first-pass features must be added before the second-pass features can be computed.

In [None]:
# For "noise" around pitch trajectory, calculate a multivariate normal distribution
# for each unique pitch thrown for each pitcher over a season

# Note: Calculating a multivariate distribution for each pitch per game is
# extremely computationally intensive, and each distribution would be unstable
# because of the small sample of each pitch per game

# Note: Does not include axis_diff for this iteration

def multivariate_normal_distribution(x):
    '''
    Description: Applied to each group, fits a multivariate normal distribution to
    the group's continuous pitch features using their mean and covariance matrix
    --------------------------------------------------------------------------------
    Inputs: x (DataFrame holding the rows of one group)
    
    Returns: SciPy frozen multivariate normal distribution
    '''
    
    # Continuous features the distribution is fitted on
    feature_names = ['release_speed','release_extension','effective_speed',
    'release_spin_rate','release_pos_x', 'release_pos_y', 'release_pos_z',
    'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis','inferred_axis','axis_diff']
    
    features = x[feature_names]
    
    # Centre of the distribution
    center = features.mean()
    
    # Sample covariance; undefined (NaN) entries become 0 and a small value is
    # added to the diagonal
    sample_cov = features.cov()
    ridge = np.eye(sample_cov.shape[0]) * 1e-6
    regularised_cov = sample_cov.fillna(0) + ridge
    
    return multivariate_normal(mean=center, cov=regularised_cov, allow_singular=True)

In [None]:
def calc_mahalanobis(x):
    '''
    Description: Calculates mahalanobis distance of each pitch's continuous features
    from center, inverse covariance matrix of distribution
    --------------------------------------------------------------------------------
    Inputs: x (type Series)
        Row whose LAST element is the distribution object (exposing `.mean` and
        `.cov`) and whose preceding elements are the continuous feature values,
        in the same order as the distribution's dimensions
    
    Returns: mahalanobis_distance (type float)
    '''
    # Select by position with .iloc: the row is indexed by column labels, so plain
    # x[-1] / x[:-1] would be treated as label lookups by current pandas
    distribution = x.iloc[-1]
    
    # The row holds an object next to the numbers, so its dtype is object;
    # convert the feature values to a float array explicitly
    data = x.iloc[:-1].to_numpy(dtype=float)
    
    # Calculates distance
    inverse_cov = np.linalg.inv(distribution.cov)
    mahalanobis_distance = mahalanobis(data, distribution.mean, inverse_cov)
    return mahalanobis_distance

In [None]:
def add_diff_features(data):
    '''
    Description: Adds features of movement and velocity differentials for each pitch
    based on primary pitch per batter handedness per outing, plus a mahalanobis
    distance "noise" feature per pitch
    --------------------------------------------------------------------------------
    Inputs: data
        Pitch-level DataFrame that already contains the first-pass features
        (inferred_axis, axis_diff, pitch_count, game_week)
    
    Returns: full_data 
        DataFrame with movement, velocity differentials and mahalanobis distance "noise"
        variables added, restricted to and ordered by the final model columns
    '''
    
    # One outing split by batter handedness
    outing_keys = ['game_date','game_pk','stand','player_name']
    fastball_types = ['FC','SI','FF']

    def most_used_pitch(pitch_types):
        # Most frequent fastball of the group, or the most frequent pitch of any
        # type when no fastball was thrown
        fastballs = pitch_types[pitch_types.isin(fastball_types)]
        candidates = pitch_types if fastballs.empty else fastballs
        return candidates.value_counts().idxmax()

    # Primary pitch for every outing / batter-handedness combination
    primary_fb = data.groupby(outing_keys).agg({'pitch_type': most_used_pitch})
    primary_fb = primary_fb.rename(columns={'pitch_type':'primary_pitch'}).reset_index()

    # Keep only the pitches that are the primary pitch of their group
    primary_fb_data = data.merge(primary_fb,
                                 left_on=outing_keys + ['pitch_type'],
                                 right_on=outing_keys + ['primary_pitch'],
                                 how='inner')

    # Velocity / movement variables that get a differential from the primary pitch
    velo_mvt_cols = ['release_speed','release_spin_rate','pfx_x', 'pfx_z', 
                     'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis', 'inferred_axis','axis_diff']
    mean_cols = [col + '_mean' for col in velo_mvt_cols]

    # Mean of each variable over the primary pitches of each group
    primary_fb_means = primary_fb_data.groupby(outing_keys + ['pitch_type'])[velo_mvt_cols].mean()
    primary_fb_means = primary_fb_means.add_suffix('_mean').reset_index()
    primary_fb_means = primary_fb_means.rename(columns={'pitch_type':'primary_pitch'})

    # Attach the primary-pitch means to every pitch of the same group
    data_merged = data.merge(primary_fb_means, on = outing_keys, how = 'inner')

    # Differential column name -> variable it is derived from
    diff_sources = {
        'velo_diff': 'release_speed',
        'spin_rate_diff': 'release_spin_rate',
        'pfx_x_diff': 'pfx_x',
        'pfx_z_diff': 'pfx_z',
        'vx0_diff': 'vx0',
        'vy0_diff': 'vy0',
        'vz0_diff': 'vz0',
        'ax_diff': 'ax',
        'ay_diff': 'ay',
        'az_diff': 'az',
        'spin_axis_diff': 'spin_axis',
        'inferred_axis_diff': 'inferred_axis',
        'axis_diff_diff': 'axis_diff',
    }
    for diff_name, source in diff_sources.items():
        data_merged[diff_name] = data_merged[source] - data_merged[source + '_mean']

    # The primary pitch label and its means are no longer needed
    data_merged = data_merged.drop(columns=['primary_pitch'] + mean_cols)
    
    cont_feats = ['release_speed','release_extension','effective_speed',
    'release_spin_rate','release_pos_x', 'release_pos_y', 'release_pos_z',
    'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'spin_axis','inferred_axis','axis_diff']
    
    # One multivariate normal distribution per unique pitch of each pitcher
    noise_by_pitch = data.groupby(['player_name','pitch_type']).apply(
        multivariate_normal_distribution)
    pitch_noise_groups = noise_by_pitch.reset_index(name = 'MV_Dist')

    # Give every pitch the distribution of its pitcher / pitch type
    full_data = data_merged.merge(pitch_noise_groups, on = ['player_name','pitch_type'], how = 'inner')

    # Distance of every pitch from the centre of its own distribution
    noise_inputs = full_data[cont_feats + ['MV_Dist']]
    full_data['mahalanobis'] = noise_inputs.apply(calc_mahalanobis, axis = 1)

    # Remove the distribution objects and the context columns that were only
    # needed for grouping and sorting
    grouping_only_cols = ['MV_Dist','game_pk', 'player_name', 'batter', 'game_date',
                          'at_bat_number', 'pitch_number']
    full_data = full_data.drop(columns=grouping_only_cols)
    
    # Final column selection and order
    ord_cols = ['delta_run_exp', 'description', 'bb_type','p_throws', 'stand',
       'home_team', 'balls', 'strikes', 'pitch_count', 'game_week',
       'pitch_type',
       'release_speed', 'release_extension', 'effective_speed',
       'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
       'spin_axis', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
       'sz_top', 'sz_bot', 'inferred_axis', 'axis_diff', 'velo_diff',
       'spin_rate_diff', 'pfx_x_diff', 'pfx_z_diff', 'vx0_diff', 'vy0_diff',
       'vz0_diff', 'ax_diff', 'ay_diff', 'az_diff', 'spin_axis_diff',
       'inferred_axis_diff', 'axis_diff_diff', 'mahalanobis', 'launch_speed','launch_angle']
    
    return full_data[ord_cols]

### Build Training Datasets

In [None]:
# Load the raw 2021 and 2022 Statcast exports from local CSV files.
# NOTE(review): in this section test21/test22 are only referenced by the disabled
# (string-wrapped) merge cells that follow — confirm they are still needed.
test21 = pd.read_csv('Data/Statcast/statcast21.csv', low_memory = False)
test22 = pd.read_csv('Data/Statcast/statcast22.csv', low_memory = False)

In [None]:
# Disabled cell: the code is wrapped in a string literal, so none of it executes.
# It defines the join keys (merge_cols) and the base-runner columns (merged_cols)
# used by the other disabled merge cells.
'''
merge_cols = ['delta_run_exp', 'p_throws','batter', 'stand', 'pitch_number',
       'home_team', 'game_date', 'game_pk', 'at_bat_number', 'balls',
       'strikes','player_name', 'events', 'description','launch_speed', 'launch_angle','bb_type']

merged_cols = ['on_1b','on_2b','on_3b']
'''

In [None]:
# Disabled cell (string literal, not executed): would restrict the raw Statcast
# frames to the join keys plus the base-runner columns.
'''
test21_filt = test21[merge_cols + merged_cols]
test22_filt = test22[merge_cols + merged_cols]
'''

In [None]:
# Disabled cell (string literal, not executed): would inner-join the base-runner
# columns onto the clustered datasets using merge_cols as keys.
'''
data21_clus = data21_clus.merge(test21_filt, on =merge_cols, how = 'inner')
data22_clus = data22_clus.merge(test22_filt, on =merge_cols, how = 'inner')
'''

In [None]:
# Add pitch differentials and the mahalanobis "noise" feature to each season
data21_train = add_diff_features(data21_clus)
data22_train = add_diff_features(data22_clus)

# Build training dataset by stacking both seasons (row index labels are kept
# as-is here, so they may repeat across seasons)
training_set = pd.concat([data21_train,data22_train])
# Notebook display: column names and number of columns of the combined dataset
training_set.columns, len(training_set.columns)

In [None]:
# Final column order of the training dataset: targets, context columns, raw pitch
# metrics, engineered features, then launch speed/angle. This repeats the
# `ord_cols` list that add_diff_features applies before returning.
ord_cols = ['delta_run_exp', 'description', 'bb_type','p_throws', 'stand',
       'home_team', 'balls', 'strikes', 'pitch_count', 'game_week',
       'pitch_type', 
       'release_speed', 'release_extension', 'effective_speed',
       'release_spin_rate', 'release_pos_x', 'release_pos_y', 'release_pos_z',
       'spin_axis', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
       'sz_top', 'sz_bot', 'inferred_axis', 'axis_diff', 'velo_diff',
       'spin_rate_diff', 'pfx_x_diff', 'pfx_z_diff', 'vx0_diff', 'vy0_diff',
       'vz0_diff', 'ax_diff', 'ay_diff', 'az_diff', 'spin_axis_diff',
       'inferred_axis_diff', 'axis_diff_diff', 'mahalanobis', 'launch_speed','launch_angle']

# Notebook display: number of columns in the final ordering
len(ord_cols)

In [None]:
# Re-order training dataset, write to csv file
# Training set has no context-neutral run values
# reset_index(drop=True) replaces the index labels carried over from the
# per-season frames with a fresh 0..n-1 index
training_set = training_set[ord_cols].reset_index(drop=True)
training_set.to_csv('TrainingDataAll.csv',index=False)

# Full Test Run, Dataset Building

In [None]:
# Test: Import dataset of 2021 statcast data
test_df = pd.read_csv('Data/Statcast/statcast21.csv')
# Keep games from 2021-04-01 through 2021-05-01 (both bounds inclusive).
# NOTE(review): game_date is compared against string literals, which presumably
# relies on it being read as ISO 'YYYY-MM-DD' text — confirm.
test_df_april = test_df[(test_df.game_date >= '2021-04-01') & (test_df.game_date <= '2021-05-01')]

In [None]:
# Clean data, add additional features, pitch "noise" features
# The test run uses the April subset built in the previous cell, not the full season
test_df_april_clean = clean_train_data(test_df_april)
test_df_clean = add_new_features(test_df_april_clean,'2021-04-01')
test_df_train = add_diff_features(test_df_clean)
# Notebook display of the resulting test dataset
test_df_train

# Create Target Variables

In [None]:
# Import training data (all): the combined dataset written to
# 'TrainingDataAll.csv' in the dataset-building section
training_data = pd.read_csv('TrainingDataAll.csv', low_memory=False)

In [None]:
def add_id_cols(row):
    '''
    Description: Add swing/no-swing, contact/swinging-strike, foul/fair event columns for each  
    row based on description values in pitch data, and standardize the description
    --------------------------------------------------------------------------------
    Inputs: row
        Series with a 'description' entry (one pitch)
    
    Returns: row 
        New Series: the input row with a standardized 'description' and with
        'Swing_Event', 'Contact_Event' and 'Foul_Event' entries appended. Events
        that do not apply to the pitch are NaN. The input row is not modified.
    '''
    
    # Define, non-swing, contact, swstrikes, foul events
    non_swings = ['ball','called_strike','hit_by_pitch','blocked_ball']
    contact_events = ['foul', 'hit_into_play','foul_bunt']  
    sw_strikes = ['swinging_strike','foul_tip','bunt_foul_tip', 'missed_bunt', 'swinging_strike_blocked']
    foul_events = ['foul', 'foul_bunt']  
    
    # Standardized descriptions of non-swing events (context-neutral run values)
    non_swing_names = {'blocked_ball': 'ball', 'called_strike': 'strike'}
    
    # Work on a copy so the caller's row keeps its original description
    row = row.copy()
    description = row['description']
    
    # Swing / no swing
    if description in non_swings:
        swing_event = 'no_swing'
        description = non_swing_names.get(description, description)
    else:
        swing_event = 'swing'

    # Swinging strike / contact (NaN when neither, i.e. ball, HBP, called strike)
    if description in sw_strikes:
        contact_event = 'swinging_strike'
        description = 'strike'
    elif description in contact_events:
        contact_event = 'contact'
    else:
        contact_event = np.nan
        
    # Foul / fair (NaN when no contact is made)
    if description in foul_events:
        foul_event = 'foul'
        # Bunt fouls are reported as plain fouls
        description = 'foul'
    elif description == 'hit_into_play':
        foul_event = 'fair'
    else:
        foul_event = np.nan

    row['description'] = description

    # Add all events to series, return row with series appended.
    # pd.concat is used because Series.append no longer exists in pandas 2.x.
    events = pd.Series({'Swing_Event': swing_event, 'Contact_Event': contact_event,'Foul_Event': foul_event})
    return pd.concat([row, events])

In [None]:
def add_la_ev_bucket(row):
    '''
    Description: Add la/ev bucket column value for each row based on bb_type, 
    launch_speed values in pitch data
    --------------------------------------------------------------------------------
    Inputs: row
        Series with 'bb_type' and 'launch_speed' entries (one pitch)
    
    Returns: row 
        New Series: the input row with an 'LA_EV' entry appended, I.e. '<90_GB',
        '100_105_FB', '>105_LD', etc. 'LA_EV' is NaN when bb_type is not one of
        the four batted-ball types or when launch_speed is missing.
    '''
    
    # Batted-ball type -> suffix of the bucket label
    bb_suffixes = {'ground_ball': 'GB', 'line_drive': 'LD', 'fly_ball': 'FB', 'popup': 'PU'}
    
    # EV buckets as (exclusive upper bound, label), in ascending order;
    # anything at or above the last bound falls in the '>105' bucket
    ev_buckets = [(90.0, '<90'), (95.0, '90_95'), (100.0, '95_100'), (105.0, '100_105')]
    
    bb_type = row['bb_type']
    launch_speed = row['launch_speed']
    
    # Default when no bucket applies; this also covers a batted ball without a
    # recorded launch speed, which previously left the category undefined
    category = np.nan
    
    suffix = bb_suffixes.get(bb_type) if isinstance(bb_type, str) else None
    if suffix is not None and not pd.isna(launch_speed):
        ev_label = next(
            (label for upper, label in ev_buckets if launch_speed < upper), '>105')
        category = ev_label + '_' + suffix
    
    # Define series for la/ev bucket, append new column to row.
    # pd.concat is used because Series.append no longer exists in pandas 2.x.
    la_ev_bucket = pd.Series({'LA_EV': category})
    return pd.concat([row, la_ev_bucket])

In [None]:
# Add columns for swing event, contact event, foul/fair event and la/ev bucket.
# Both functions are applied row by row (axis=1): add_id_cols first, then
# add_la_ev_bucket on its result.
train_data_labelled = training_data.apply(add_id_cols, axis=1).apply(add_la_ev_bucket, axis=1)
# Notebook display of the labelled dataset
train_data_labelled

In [None]:
# Save the labelled training data to a csv file (w/o context-neutral run values);
# index=False leaves the row index out of the file
train_data_labelled.to_csv('TrainingDataLabelled.csv',index=False)

# Splitting Data by Sub-Model

Pitch Type
- Fastballs
- Offspeed
- Breaking Balls

Events
- Swing/No Swing Events
- Contact/No Contact Events
- Foul/Fair Ball Events

In [None]:
# Load the labelled training data that is split into sub-model datasets below
all_data = pd.read_csv('TrainingDataLabelled.csv', low_memory=False)

In [None]:
# Columns that are irrelevant for each dataset/model combination and get dropped.
# Every sub-model keeps exactly one label column as its target, e.g. the in-play
# dataset/model keeps LA_EV and drops the swing, contact and foul event columns.

# Outcome columns that are not used as model inputs
_outcome_cols = ['delta_run_exp','bb_type','description','launch_speed','launch_angle']

# Label columns created by add_id_cols / add_la_ev_bucket
_label_cols = ['Swing_Event','Contact_Event','Foul_Event','LA_EV']

# Target: Swing_Event
swing_drop_cols = _outcome_cols + [col for col in _label_cols if col != 'Swing_Event']

# Target: description (kept); every label column is dropped
no_swing_drop_cols = [col for col in _outcome_cols if col != 'description'] + _label_cols

# Target: Contact_Event
contact_drop_cols = _outcome_cols + [col for col in _label_cols if col != 'Contact_Event']

# Target: Foul_Event
foul_drop_cols = _outcome_cols + [col for col in _label_cols if col != 'Foul_Event']

# Target: LA_EV
in_play_drop_cols = _outcome_cols + [col for col in _label_cols if col != 'LA_EV']

In [None]:
# Split the labelled data into the three pitch classes; each class gets its own
# set of sub-model datasets
_pitch_types = all_data['pitch_type']
fastballs = all_data.loc[_pitch_types.isin(['FF','SI','FC'])]
offspeeds = all_data.loc[_pitch_types.isin(['CH','FS'])]
breaking_balls = all_data.loc[_pitch_types.isin(['SL','KC','CU','ST','SV','CS'])]

In [None]:
# Fastball sub-model datasets (the swing / no-swing datasets are currently disabled)
#fastballs_swing = fastballs.drop(swing_drop_cols,axis=1)
#fastballs_no_swing = fastballs[fastballs.Swing_Event == 'no_swing'].drop(no_swing_drop_cols,axis=1)

# Contact model: pitches with a contact / swinging-strike label
fastballs_contact = fastballs.loc[fastballs['Contact_Event'].notna()].drop(columns=contact_drop_cols)
# Foul model: pitches with a foul / fair label
fastballs_foul = fastballs.loc[fastballs['Foul_Event'].notna()].drop(columns=foul_drop_cols)
# In-play model: fair balls only
fastballs_in_play = fastballs.loc[fastballs['Foul_Event'] == 'fair'].drop(columns=in_play_drop_cols)

# Write one csv file per sub-model
fastballs_contact.to_csv('Data/Models/Contact_Models/Stuff_FB_Contact.csv', index=False)
fastballs_foul.to_csv('Data/Models/Foul_Models/Stuff_FB_Foul.csv', index=False)
fastballs_in_play.to_csv('Data/Models/In_Play_Models/Stuff_FB_InPlay.csv', index=False)

In [None]:
# Offspeed sub-model datasets (the swing / no-swing datasets are currently disabled)
#offspeeds_swing = offspeeds.drop(swing_drop_cols,axis=1)
#offspeeds_no_swing = offspeeds[offspeeds.Swing_Event == 'no_swing'].drop(no_swing_drop_cols,axis=1)

# Contact model: pitches with a contact / swinging-strike label
offspeeds_contact = offspeeds.loc[offspeeds['Contact_Event'].notna()].drop(columns=contact_drop_cols)
# Foul model: pitches with a foul / fair label
offspeeds_foul = offspeeds.loc[offspeeds['Foul_Event'].notna()].drop(columns=foul_drop_cols)
# In-play model: fair balls only
offspeeds_in_play = offspeeds.loc[offspeeds['Foul_Event'] == 'fair'].drop(columns=in_play_drop_cols)

# Write one csv file per sub-model
offspeeds_contact.to_csv('Data/Models/Contact_Models/Stuff_OS_Contact.csv', index=False)
offspeeds_foul.to_csv('Data/Models/Foul_Models/Stuff_OS_Foul.csv', index=False)
offspeeds_in_play.to_csv('Data/Models/In_Play_Models/Stuff_OS_InPlay.csv', index=False)

In [None]:
# Breaking-ball sub-model datasets (the swing / no-swing datasets are currently disabled)
#breaking_balls_swing = breaking_balls.drop(swing_drop_cols,axis=1)
#breaking_balls_no_swing = breaking_balls[breaking_balls.Swing_Event == 'no_swing'].drop(no_swing_drop_cols,axis=1)

# Contact model: pitches with a contact / swinging-strike label
breaking_balls_contact = breaking_balls.loc[
    breaking_balls['Contact_Event'].notna()].drop(columns=contact_drop_cols)
# Foul model: pitches with a foul / fair label
breaking_balls_foul = breaking_balls.loc[
    breaking_balls['Foul_Event'].notna()].drop(columns=foul_drop_cols)
# In-play model: fair balls only
breaking_balls_in_play = breaking_balls.loc[
    breaking_balls['Foul_Event'] == 'fair'].drop(columns=in_play_drop_cols)

# Write one csv file per sub-model
breaking_balls_contact.to_csv('Data/Models/Contact_Models/Stuff_BrBall_Contact.csv', index=False)
breaking_balls_foul.to_csv('Data/Models/Foul_Models/Stuff_BrBall_Foul.csv', index=False)
breaking_balls_in_play.to_csv('Data/Models/In_Play_Models/Stuff_BrBall_InPlay.csv', index=False)