In [1]:
import regex as re
import ast
import math
import os

import pandas as pd
import numpy as np
from pandasql import sqldf
import warnings
warnings.simplefilter(action="ignore")

baseball_path = r"C:\Users\james\Documents\MLB\Data"

In [2]:
# At bat events
def create_events(df):
    df['eventsModel'] = np.where(df['event'] == 'Strikeout', "so", "")
    df['eventsModel'] = np.where(df['event'] == 'Strikeout Double Play', "so", df['eventsModel'])

    df['eventsModel'] = np.where(df['event'] == 'Groundout', "go", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Fielders Choice', "go", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Double Play', "go", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Grounded Into DP', "go", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Triple Play', "go", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Field Error', "go", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Forceout', "go", df['eventsModel'])

    df['eventsModel'] = np.where(df['event'] == 'Lineout', "lo", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Bunt Lineout', "lo", df['eventsModel'])

    df['eventsModel'] = np.where(df['event'] == 'Flyout', "fo", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Sac Fly', "fo", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Sac Fly Double Play', "fo", df['eventsModel'])

    df['eventsModel'] = np.where(df['event'] == 'Pop Out', "po", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Bunt Pop Out', "po", df['eventsModel'])

    df['eventsModel'] = np.where(df['event'] == 'Hit By Pitch', "hbp", df['eventsModel'])

    df['eventsModel'] = np.where(df['event'] == 'Walk', "bb", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Intent Walk', "bb", df['eventsModel'])

    df['eventsModel'] = np.where(df['event'] == 'Single', "b1", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Double', "b2", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Triple', "b3", df['eventsModel'])
    df['eventsModel'] = np.where(df['event'] == 'Home Run', "hr", df['eventsModel'])

    df['eventsModel'] = np.where(df['eventsModel'] == "", "Cut", df['eventsModel'])
    
    return df

In [3]:
# Calculate wind vectors
# Note: 2 is to centerfield, 6 is from centerfield, clockwise
# Note: y vector is positive to centerfield, negative from centerfield
# Note: x vector is positive from left to right, negatives from right to left
# Assumption is wind is blowing in 8 cardinal directions, so we can use simple right isosceles triangles
def y_vect(df):
    wind_speed = df['windSpeed']
    angled = df['windSpeed'] / 2 * math.sqrt(2)
    
    if df['windDirection'] == "Out To CF": 
        y_vect = wind_speed
    elif df['windDirection'] == "Out To RF": 
        y_vect = angled
    elif df['windDirection'] == "L To R": 
        y_vect = 0
    elif df['windDirection'] == "In From LF": 
        y_vect = angled * -1
    elif df['windDirection'] == "In From CF": 
        y_vect = wind_speed * - 1
    elif df['windDirection'] == "In From RF": 
        y_vect = angled * -1
    elif df['windDirection'] == "R To L": 
        y_vect = 0
    elif df['windDirection'] == "Out To LF": 
        y_vect = angled
    else:
        y_vect = 0
        
    return y_vect

def x_vect(df):
    wind_speed = df['windSpeed']
    angled = df['windSpeed'] / 2 * math.sqrt(2)
    
    if df['windDirection'] == "L To R": 
        x_vect = wind_speed
    elif df['windDirection'] == "In From LF": 
        x_vect = angled
    elif df['windDirection'] == "In From CF": 
        x_vect = 0
    elif df['windDirection'] == "In From RF": 
        x_vect = angled * -1
    elif df['windDirection'] == "R To L": 
        x_vect = wind_speed * - 1
    elif df['windDirection'] == "Out To LF": 
        x_vect = angled * -1
    elif df['windDirection'] == "Out To CF": 
        x_vect = 0
    elif df['windDirection'] == "Out To RF": 
        x_vect = angled
    else:
        x_vect = 0
        
    return x_vect

In [4]:
# Creates weather variables
def clean_weather(df):
    # Separate weather into temperature and weather type
    df[['temperature', 'weather']] = df['weather'].str.split(", ", expand=True)
    df['temperature'] = df['temperature'].str.replace(" degrees", "").astype('int')
    # Separate wind into speed and direction
    df[['windSpeed', 'windDirection']] = df['wind'].str.split(", ", expand=True)
    df['windSpeed'].fillna("0 mph", inplace=True)
    df['windSpeed'] = df['windSpeed'].str.replace(" mph", "")
    df['windSpeed'] = pd.to_numeric(df['windSpeed'], errors='coerce')
    df['windSpeed'].fillna(0, inplace=True)
    df['windDirection'].fillna('L to R', inplace=True)
    df['windSpeed'].unique()
    df['windDirection'] = df['windDirection'].str.replace(".", "")
    # Calculate vectors
    df['x_vect'] = df.apply(x_vect, axis=1)
    df['y_vect'] = df.apply(y_vect, axis=1)
    
    return df

In [5]:
# This turns several variables, including events, venues, hands, and bases into dummies
def create_dummies(df):
    event_dummies = pd.get_dummies(df['eventsModel'])
    venue_dummies = pd.get_dummies(df['venue_id'], prefix='venue')
    pitcher_dummies = pd.get_dummies(df['pitchHand'], prefix='p')
    batter_dummies = pd.get_dummies(df['batSide'], prefix='b')
    year_dummies = pd.get_dummies(df['year'], prefix='year')
    
    venue_list = venue_dummies.columns.tolist()
    year_list = year_dummies.columns.tolist()
    dummy_list = venue_list + year_list
    
    df = pd.concat([df, event_dummies, venue_dummies, pitcher_dummies, batter_dummies, year_dummies], axis=1)
    
    df['preOnFirst'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnFirst'].shift(1)
    df['preOnSecond'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnSecond'].shift(1)
    df['preOnThird'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnThird'].shift(1)
    
    df['onFirst'] = df['preOnFirst'].notnull().astype('int')
    df['onSecond'] = df['preOnSecond'].notnull().astype('int')
    df['onThird'] = df['preOnThird'].notnull().astype('int')
    
    df['top'] = np.where(df['halfInning'] == "top", 1, 0)
    
    df['pa'] = np.where(df['eventsModel'] != "Cut", 1, 0)
    df['ab'] = df['pa'] - df['hbp']
    
    return df, dummy_list

In [6]:
def statcast(df):
    statcast = df.copy()
    # Hard hit dummy
    statcast['hard_hit'] = (statcast['hardness'].str.contains('hard')).astype('int')
    
    def find_max(lst):
        if lst:
            return max(lst)
        else:
            return 0
    
    # Max pitch speed
    statcast['startSpeeds'] = statcast['startSpeeds'].apply(lambda x: ast.literal_eval(x))
    statcast['maxSpeed'] = statcast['startSpeeds'].apply(find_max)
    # Have to drop, can't take lists
    statcast.drop(columns={'startSpeeds'}, inplace=True)
    
    # Max spin rate
    statcast['spinRates'] = statcast['spinRates'].apply(lambda x: ast.literal_eval(x))
    statcast['maxSpin'] = statcast['spinRates'].apply(find_max)
    # Have to drop, can't take lists
    statcast.drop(columns={'spinRates'}, inplace=True)
    
    # Launch speeds
    statcast['launchSpeeds'] = statcast['launchSpeeds'].str.replace("[", "")
    statcast['launchSpeeds'] = statcast['launchSpeeds'].str.replace("]", "")
    statcast['launchSpeed'] = (statcast['launchSpeeds']).astype('float', errors='ignore')
    statcast['launchSpeed'] = pd.to_numeric(statcast['launchSpeed'])
    
    # Launch angle
    statcast['launchAngles'] = statcast['launchAngles'].str.replace("[", "")
    statcast['launchAngles'] = statcast['launchAngles'].str.replace("]", "")
    statcast['launchAngle'] = (statcast['launchAngles']).astype('float', errors='ignore')
    statcast['launchAngle'] = pd.to_numeric(statcast['launchAngle'])
        
    # Total distances
    statcast['totalDistances'] = statcast['totalDistances'].str.replace("[", "")
    statcast['totalDistances'] = statcast['totalDistances'].str.replace("]", "")
    statcast['totalDistance'] = (statcast['totalDistances']).astype('float', errors='ignore')
    statcast['totalDistance'] = pd.to_numeric(statcast['totalDistance'])
    
    # Coordinates of batted ball
    statcast['coord'] = statcast['coord'].str.replace("[", "")
    statcast['coord'] = statcast['coord'].str.replace("]", "")    
    statcast[['x', 'y']] = statcast['coord'].str.split(",", expand=True)
    statcast['x'] = pd.to_numeric(statcast['x'])
    statcast['y'] = pd.to_numeric(statcast['y'])
    
    statcast['spray_angle'] = np.arctan((statcast['x']-125.42)/(198.27-statcast['y'])) * 180/np.pi * 0.75
    statcast['to_left'] = (statcast['spray_angle'] < -15).astype('int')
    statcast['to_middle'] = ((statcast['spray_angle'] >= -15) & (statcast['spray_angle'] <= 15)).astype('int')
    statcast['to_right'] = (statcast['spray_angle'] > 15).astype('int')

    
    return statcast

In [7]:
# This will return a dataframe that can eventually be used as the model input. Has pitcher vs hitter stats, specific to hand
def rolling_pas(df, pa_num):
    stat_list = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'hard_hit', 'to_left', 'to_middle', 'to_right', 'pa', 'ab']
    max_list = ['totalDistance', 'maxSpeed', 'maxSpin']
        
    df['pa_num'] = df.index
    
    batter_stats = []
    pitcher_stats = []
    batter_stats2 = []
    pitcher_stats2 = []

    for stat in stat_list:
        batter_stat = stat + "_b"
        pitcher_stat = stat + "_p"
        batter_stats.append(batter_stat)
        pitcher_stats.append(pitcher_stat)
        
    for stat in max_list:
        batter_stat = stat + "_b"
        pitcher_stat = stat + "_p"
        batter_stats2.append(batter_stat)
        pitcher_stats2.append(pitcher_stat)

    df[batter_stats] = df.groupby(['batter', 'pitchHand'])[stat_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
    df[batter_stats2] = df.groupby(['batter', 'pitchHand'])[max_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).max())
      
    df[pitcher_stats] = df.groupby(['pitcher', 'batSide'])[stat_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
    df[pitcher_stats2] = df.groupby(['pitcher', 'batSide'])[max_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).max())

    
    df.sort_values(['pa_num'], axis=0, ascending=True, inplace=True)

    # wOBA - using 2022 values throughout
    df['woba_b'] = (0.690 * df['bb_b']) + (0.721 * df['hbp_b']) + (0.885 * df['b1_b']) + (1.262 * df['b2_b']) + (1.601 * df['b3_b']) + (2.070 * df['hr_b'])
    df['woba_p'] = (0.690 * df['bb_p']) + (0.721 * df['hbp_p']) + (0.885 * df['b1_p']) + (1.262 * df['b2_p']) + (1.601 * df['b3_p']) + (2.070 * df['hr_p'])
    
    # Slugging
    df['slg_b'] = (1 * df['b1_b']) + (2 * df['b2_b']) + (3 * df['b3_b']) + (4 * df['hr_b'])
    df['slg_b'] = df['slg_b'] / df['ab_b']
    df['slg_p'] = (1 * df['b1_p']) + (2 * df['b2_p']) + (3 * df['b3_p']) + (4 * df['hr_p'])
    df['slg_p'] = df['slg_p'] / df['ab_p']

    # OBP    
    df['obp_b'] = df[['b1_b', 'b2_b', 'b3_b', 'hr_b', 'bb_b', 'hbp_b']].sum(axis=1)
    df['obp_p'] = df[['b1_p', 'b2_p', 'b3_p', 'hr_p', 'bb_p', 'hbp_p']].sum(axis=1)

    # Calculate rates
    stat_short = ['so', 'b1', 'b2', 'b3', 'hr', 'bb', 'hbp', 'lo', 'po', 'go', 'fo', 'woba', 'obp', 'hard_hit', 'to_left', 'to_middle', 'to_right']
    for stat in stat_short:
        batter_stat = stat + "_b"
        pitcher_stat = stat + "_p"  
        df[batter_stat] = df[batter_stat] / df['pa_b']
        df[pitcher_stat] = df[pitcher_stat] / df['pa_p']
        
    batter_stats = batter_stats + batter_stats2
    pitcher_stats = pitcher_stats + pitcher_stats2
        
    df.sort_values('pa_num', inplace=True)
    
                 
    return df, batter_stats, pitcher_stats

In [21]:
df_list = []
for year in range(2023, 2024):
    filename = "Play" + str(year) + ".csv"
    df = pd.read_csv(os.path.join(baseball_path, "A3. Raw API", filename))
    
    df['year'] = year
    # Only keep one observation per PA (don't keep each runner)
    df.drop_duplicates(['gamePk', 'atBatIndex'], keep='first', inplace=True, ignore_index=True)

    df = create_events(df)
    df = clean_weather(df)
    df, dummy_list = create_dummies(df)
    df = statcast(df)
    df['game_date'] = df['game_date'].str.replace("-", "")
    df['pitcher_outs'] = (df['inning'] - 1) * 3 + (df['outs'])
    df['start'] = (df['inning'] == 1).astype('int')

    df['pitcher_outs'] = df.groupby(['pitcher', 'gamePk'])['pitcher_outs'].transform('max')
    df['start'] = df.groupby(['pitcher', 'gamePk'])['start'].transform('max')
    
    # Determine score before PA
    df['preAwayScore'] = df.groupby(['gamePk', 'inning', 'halfInning'])['awayScore'].shift(1)
    df['preHomeScore'] = df.groupby(['gamePk', 'inning', 'halfInning'])['homeScore'].shift(1)
    
    df['preAwayScore'].fillna(df['awayScore'], inplace=True)
    df['preHomeScore'].fillna(df['homeScore'], inplace=True)
    
    # Calculate score differential
    df['score_diff'] = np.where(df['top'] == 1, df['preAwayScore'] - df['preHomeScore'], df['preHomeScore'] - df['preAwayScore'])
    
    df_list.append(df)
    
all_years = pd.concat(df_list, axis=0)
all_years.reset_index(inplace=True)
all_years.drop(columns={'Unnamed: 0', 'index'}, inplace=True)
all_years.rename(columns={'level_0':'index'}, inplace=True)
all_years

Unnamed: 0,index,atBatIndex,inning,halfInning,outs,type,id,event,eventType,description,...,x,y,spray_angle,to_left,to_middle,to_right,pitcher_outs,preAwayScore,preHomeScore,score_diff
0,0,0,1,top,0,atBat,660670,Single,single,Ronald Acuna Jr. singles on a sharp line driv...,...,201.3,110.5,30.633355,0,0,1,9,0.0,0.0,0.0
1,1,1,1,top,1,atBat,660670,Strikeout,strikeout,Matt Olson strikes out swinging.,...,,,,0,0,0,9,0.0,0.0,0.0
2,2,2,1,top,1,atBat,663586,Walk,walk,Austin Riley walks.,...,,,,0,0,0,9,0.0,0.0,0.0
3,3,3,1,top,2,atBat,645277,Flyout,field_out,Ozzie Albies flies out sharply to right fielde...,...,184.7,65.2,18.009006,0,0,1,9,0.0,0.0,0.0
4,4,4,1,top,3,atBat,669221,Strikeout,strikeout,Sean Murphy strikes out swinging.,...,,,,0,0,0,9,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14421,14421,72,10,top,3,atBat,669003,Groundout,field_out,"Garrett Mitchell grounds out sharply, second b...",...,143.6,156.1,17.491096,0,0,1,30,4.0,3.0,1.0
14422,14422,73,10,bottom,1,atBat,641525,Fielders Choice Out,fielders_choice_out,Jake Cronenworth reaches on a fielder's choice...,...,123.2,147.0,-1.859526,0,1,0,30,4.0,3.0,-1.0
14423,14423,74,10,bottom,2,atBat,673490,Lineout,field_out,Ha-Seong Kim lines out to center fielder Garre...,...,138.9,68.4,4.444394,0,1,0,30,4.0,3.0,-1.0
14424,14424,75,10,bottom,2,atBat,640492,Single,single,Jose Azocar singles on a ground ball to third ...,...,104.0,159.6,-21.737136,1,0,0,30,4.0,3.0,-1.0


In [22]:
df = all_years.copy()
df = df.query('eventsModel != "Cut"').reset_index(drop=True)

# Calculate advanced stats
# ISO
df['iso'] = df['b2'] * 1 + df['b3'] * 2 + df['hr'] * 3
df['iso'] = np.where(df['ab'] == 0, np.nan, df['iso']) # denominator is ab
# SLG
df['slg'] = df['b1'] * 1 + df['b2'] * 2 + df['b3'] * 3 + df['hr'] * 4 
df['slg'] = np.where(df['ab'] == 0, np.nan, df['slg']) # denominator is ab
# wOBA
df['woba'] = df['bb'] * 0.690 + df['hbp'] * 0.721 + df['b1'] * 0.885 + df['b2'] * 1.262 + df['b3'] * 1.601 + df['hr'] * 2.070 
# denominator is PA - IBB, but I think I'm ignoring IBBs for now


df

Unnamed: 0,index,atBatIndex,inning,halfInning,outs,type,id,event,eventType,description,...,to_left,to_middle,to_right,pitcher_outs,preAwayScore,preHomeScore,score_diff,iso,slg,woba
0,0,0,1,top,0,atBat,660670,Single,single,Ronald Acuna Jr. singles on a sharp line driv...,...,0,0,1,9,0.0,0.0,0.0,0.0,1.0,0.885
1,1,1,1,top,1,atBat,660670,Strikeout,strikeout,Matt Olson strikes out swinging.,...,0,0,0,9,0.0,0.0,0.0,0.0,0.0,0.000
2,2,2,1,top,1,atBat,663586,Walk,walk,Austin Riley walks.,...,0,0,0,9,0.0,0.0,0.0,0.0,0.0,0.690
3,3,3,1,top,2,atBat,645277,Flyout,field_out,Ozzie Albies flies out sharply to right fielde...,...,0,0,1,9,0.0,0.0,0.0,0.0,0.0,0.000
4,4,4,1,top,3,atBat,669221,Strikeout,strikeout,Sean Murphy strikes out swinging.,...,0,0,0,9,0.0,0.0,0.0,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14321,14420,71,10,top,2,atBat,661388,Strikeout,strikeout,William Contreras strikes out swinging.,...,0,0,0,30,4.0,3.0,1.0,0.0,0.0,0.000
14322,14421,72,10,top,3,atBat,669003,Groundout,field_out,"Garrett Mitchell grounds out sharply, second b...",...,0,0,1,30,4.0,3.0,1.0,0.0,0.0,0.000
14323,14423,74,10,bottom,2,atBat,673490,Lineout,field_out,Ha-Seong Kim lines out to center fielder Garre...,...,0,1,0,30,4.0,3.0,-1.0,0.0,0.0,0.000
14324,14424,75,10,bottom,2,atBat,640492,Single,single,Jose Azocar singles on a ground ball to third ...,...,1,0,0,30,4.0,3.0,-1.0,0.0,1.0,0.885


In [48]:
# Define the dictionary of statistics and aggregation functions
stat_dict = {
    'b1': 'mean',
    'b2': 'mean',
    'b3': 'mean',
    'hr': 'mean',
    'hbp': 'mean',
    'bb': 'mean',
    'so': 'mean',
    'fo': 'mean',
    'go': 'mean',
    'lo': 'mean',
    'po': 'mean',
    'iso': 'mean',
    'slg': 'mean',
    'woba': 'mean',
    'to_left': 'mean',
    'to_middle': 'mean',
    'to_right': 'mean',
    'hard_hit': 'mean',
    'totalDistance': 'max',
    'launchSpeed': 'max',
    'maxSpeed': 'max',
    'maxSpin': 'max',
    'ab': 'sum',
    'pa': 'sum'
}


# Calculates rolling stats using stats and aggregation method in stat_dict, a window of window PAs, position (batter or pitcher), 
# desired variable name suffix, and a boolean for if you want all stats or separated by hand
def rolling_stats(stat_dict, window, position, suffix, all=False):
    
    stat_list = ['b1', 'b2', 'b3', 'hr', 'hbp', 'bb', 'so', 'fo', 'go', 'lo', 'po', 'iso', 'slg', 'woba', 'to_left', 'to_middle', 'to_right', 'hard_hit', 'totalDistance', 'launchSpeed', 'maxSpeed', 'maxSpin', 'ab', 'pa']
    
    output_list = []
    for stat in stat_list:
        output = stat + suffix
        output_list.append(output)
        
        
    # Determine whether to use pitchHand or batSide (opponent's hand)
    if position == "batter":
        opp_hand = "pitchHand"
    else:
        opp_hand = "batSide"
    
    # If you don't want all stats, you want them separated by hand
    if all == False:
        # Use the rolling method to perform the aggregations, specify hand, shift by 1, minimum of 1 period, aggregate using stat_dict
        df[output_list] = df.query(f'{opp_hand} == "L"').groupby(position)[stat_list].transform(lambda x: x.shift().rolling(window=window, min_periods=1).agg(stat_dict))      
        df[output_list] = df.query(f'{opp_hand} == "R"').groupby(position)[stat_list].transform(lambda x: x.shift().rolling(window=window, min_periods=1).agg(stat_dict))

        
        # Will be a dataframe the size of original dataframe in the same order as the original dataframe that can be neatly merged onto it
        # pa_df = pd.concat([vl_df, vr_df], axis=0)
        
    # You want them for all pitcher hands
    else:
        # Use the rolling method to perform the aggregations, shift by 1, minimum of 1 period, aggregate using stat_dict
        df[output_list] = df.query(f'{opp_hand} == "L"').groupby(position)[stat_list].transform(lambda x: x.shift().rolling(window=window, min_periods=1).agg(stat_dict))

        # Add the suffix to the column names
        # pa_df.columns = [col + suffix for col in pa_df.columns]

    return pa_df

### Batters
# Short period of time by pitcher hand
batter_short = rolling_stats(stat_dict, 3, "batter", "_b")
# Long period of time by pitcher hand
batter_long = rolling_stats(stat_dict, 10, "batter", "_b_long")
# Long period of time, all pitcher hands
batter_all = rolling_stats(stat_dict, 10, "batter", "_b_all", all=True)

### Pitchers
# Short period of time by batter hand
pitcher_short = rolling_stats(stat_dict, 3, "pitcher", "_p")
# Long period of time, by batter hand
pitcher_long = rolling_stats(stat_dict, 10, "pitcher", "_p_long")
# Long period of time, all batter hands
pitcher_all = rolling_stats(stat_dict, 10, "pitcher", "_p_all", all=True)

# Merge all of these back onto the original dataframe
merged_df = pd.concat([df, batter_short, batter_long, batter_all, pitcher_short, pitcher_long, pitcher_all], axis=1)

ValueError: transform must return a scalar value for each group

In [None]:
merged_df.query('batterName == "Aaron Judge"').query('pitchHand == "R"')[['hr', 'hr_b', 'hr_b_long', 'pa_b', 'pa_b_long', 'eventsModel']].tail(60)

In [12]:
# Inputs
batter_stats = ['b1_b','b2_b','b3_b','hr_b','hbp_b','bb_b','so_b','fo_b','go_b','lo_b','po_b',
                'iso_b','slg_b','woba_b','to_left_b','to_middle_b','to_right_b',
                'hard_hit_b','totalDistance_b','launchSpeed_b','maxSpeed_b','maxSpin_b','ab_b','pa_b',
                'b1_b_long','b2_b_long','b3_b_long','hr_b_long','hbp_b_long','bb_b_long','so_b_long','fo_b_long','go_b_long','lo_b_long','po_b_long',
                'iso_b_long','slg_b_long','woba_b_long','to_left_b_long','to_middle_b_long','to_right_b_long',
                'hard_hit_b_long','totalDistance_b_long','launchSpeed_b_long','maxSpeed_b_long','maxSpin_b_long','ab_b_long','pa_b_long']

pitcher_stats = ['b1_p','b2_p','b3_p','hr_p','hbp_p','bb_p','so_p','fo_p','go_p','lo_p','po_p',
                'iso_p','slg_p','woba_p','to_left_p','to_middle_p','to_right_p',
                'hard_hit_p','totalDistance_p','launchSpeed_p','maxSpeed_p','maxSpin_p','ab_p','pa_p',
                'b1_p_long','b2_p_long','b3_p_long','hr_p_long','hbp_p_long','bb_p_long','so_p_long','fo_p_long','go_p_long','lo_p_long','po_p_long',
                'iso_p_long','slg_p_long','woba_p_long','to_left_p_long','to_middle_p_long','to_right_p_long',
                'hard_hit_p_long','totalDistance_p_long','launchSpeed_p_long','maxSpeed_p_long','maxSpin_p_long','ab_p_long','pa_p_long']

venues = ['venue_1', 'venue_2', 'venue_3', 'venue_4', 'venue_5', 'venue_7', 'venue_10', 'venue_12', 'venue_13', 'venue_14', 'venue_15',
 'venue_16', 'venue_17', 'venue_19', 'venue_22', 'venue_31', 'venue_32', 'venue_680', 'venue_2392', 'venue_2394', 'venue_2395', 'venue_2535',
 'venue_2536', 'venue_2602', 'venue_2680', 'venue_2681', 'venue_2701', 'venue_2735', 'venue_2756', 'venue_2889', 'venue_3289', 'venue_3309',
 'venue_3312', 'venue_3313', 'venue_4169', 'venue_4705', 'venue_5010', 'venue_5325', 'venue_5365', 'venue_5381', 'venue_5445']

years = ['year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019', 'year_2020', 'year_2021', 'year_2022', 'year_2023']

other_list = ['p_L','b_L','x_vect','y_vect','temperature','onFirst','onSecond','onThird','inning','top','score_diff']
# Doesn't have imp_b and imp_p from Model - PAs: nbd, but just know that. Might drop it from there.
x_list = batter_stats + pitcher_stats + venues + years + other_list 

In [13]:
keep_list = x_list + ['eventsModel', 'batter', 'batterName', 'batSide', 'pitcher', 'pitcherName', 'pitchHand']
merged_df = merged_df[keep_list]

In [14]:
# Going to need to make Batters and Pitchers files for use as inputs. Don't need for training/testing the PA model though.
# Would need steals? average outs for pitchers, starts #

In [15]:
merged_df.to_csv(os.path.join(baseball_path, "Inputs", "New Sample.csv"))