In [1]:
%run "U1. Imports.ipynb"
%run "U2. Utilities.ipynb"
%run "U3. Classes.ipynb"
%run "D3. Simulation Functions.ipynb"

baseball_path = r'C:\Users\james\Documents\MLB\Database'

Code was last run on: 2023-10-24


In [2]:
db_path = r'C:\Users\james\Documents\MLB\Database\MLBDB.db'
engine = create_engine(f'sqlite:///{db_path}')

In [3]:
# Dataset 

In [4]:
def dataset(engine, start_year, end_year):
    # Initialize an empty dataframe to store the results
    df = pd.DataFrame()
    
    # Iterate through the range of years
    for year in range(start_year, end_year + 1):
        # Define table names for Stats API and Statcast for the current year
        statsapi_table = f'Stats API {year}'
        statcast_table = f'Statcast {year}'
        
        # Load tables from the database for the current year
        statsapi_df = pd.read_sql_table(statsapi_table, engine)
        statcast_df = pd.read_sql_table(statcast_table, engine)
        
        # Merge the two dataframes based on 'gamePk' and 'atBatIndex'
        merged_df = pd.merge(statsapi_df, statcast_df, on=['gamePk', 'atBatIndex'], how='left')
        
        # Append the merged dataframe to the result dataframe
        df = df.append(merged_df, ignore_index=True)
        
        # Sort
        df.sort_values(['game_date', 'gamePk', 'atBatIndex'], inplace=True)

        # Only keep one observation per at bat
        df.drop_duplicates(['gamePk', 'atBatIndex'], keep='last', inplace=True)
    
    # Return the combined dataframe
    return df

In [5]:
df = dataset(engine, 2023, 2023)

In [6]:
# Wind

In [7]:
# Positive to centerfield, negative from centerfield
def y_vect(df):
    wind_speed = df['windSpeed']
    angled = df['windSpeed'] / 2 * math.sqrt(2)
    
    if df['windDirection'] == "Out To CF": 
        y_vect = wind_speed
    elif df['windDirection'] == "Out To RF": 
        y_vect = angled
    elif df['windDirection'] == "L To R": 
        y_vect = 0
    elif df['windDirection'] == "In From LF": 
        y_vect = angled * -1
    elif df['windDirection'] == "In From CF": 
        y_vect = wind_speed * - 1
    elif df['windDirection'] == "In From RF": 
        y_vect = angled * -1
    elif df['windDirection'] == "R To L": 
        y_vect = 0
    elif df['windDirection'] == "Out To LF": 
        y_vect = angled
    else:
        y_vect = 0
        
    return y_vect

# Positive from left to right, negative from right to left
def x_vect(df):
    wind_speed = df['windSpeed']
    angled = df['windSpeed'] / 2 * math.sqrt(2)
    
    if df['windDirection'] == "L To R": 
        x_vect = wind_speed
    elif df['windDirection'] == "In From LF": 
        x_vect = angled
    elif df['windDirection'] == "In From CF": 
        x_vect = 0
    elif df['windDirection'] == "In From RF": 
        x_vect = angled * -1
    elif df['windDirection'] == "R To L": 
        x_vect = wind_speed * - 1
    elif df['windDirection'] == "Out To LF": 
        x_vect = angled * -1
    elif df['windDirection'] == "Out To CF": 
        x_vect = 0
    elif df['windDirection'] == "Out To RF": 
        x_vect = angled
    else:
        x_vect = 0
        
    return x_vect

# 2 is to centerfield, 6 is from centerfield, clockwise
# Assumption is wind is blowing in 8 cardinal directions, so we can use simple right isosceles triangles

In [8]:
def clean_weather(df):
    # Separate weather into temperature and weather type
    df[['temperature', 'weather']] = df['weather'].str.split(", ", expand=True)
    df['temperature'] = df['temperature'].str.replace(" degrees", "").astype('int')
    # Separate wind into speed and direction
    df[['windSpeed', 'windDirection']] = df['wind'].str.split(", ", expand=True)
    df['windSpeed'].fillna("0 mph", inplace=True)
    df['windSpeed'] = df['windSpeed'].str.replace(" mph", "")
    df['windSpeed'] = pd.to_numeric(df['windSpeed'], errors='coerce')
    df['windSpeed'].fillna(0, inplace=True)
    df['windDirection'].fillna('L to R', inplace=True)
    df['windSpeed'].unique()
    df['windDirection'] = df['windDirection'].str.replace(".", "")
    # Calculate vectors
    df['x_vect'] = df.apply(x_vect, axis=1)
    df['y_vect'] = df.apply(y_vect, axis=1)
    
    return df

In [9]:
# Assign play categories to full descriptions
def create_events(df):
    event_mapping = {
        'Strikeout': 'so',
        'Strikeout Double Play': 'so',
        'Groundout': 'go',
        'Fielders Choice': 'go',
        'Double Play': 'go',
        'Grounded Into DP': 'go',
        'Triple Play': 'go',
        'Field Error': 'go',
        'Forceout': 'go',
        'Lineout': 'lo',
        'Bunt Lineout': 'lo',
        'Flyout': 'fo',
        'Sac Fly': 'fo',
        'Sac Fly Double Play': 'fo',
        'Pop Out': 'po',
        'Bunt Pop Out': 'po',
        'Hit By Pitch': 'hbp',
        'Walk': 'bb',
        'Intent Walk': 'bb',
        'Single': 'b1',
        'Double': 'b2',
        'Triple': 'b3',
        'Home Run': 'hr'
    }

    df['eventsModel'] = df['event'].map(event_mapping).fillna('Cut')
    return df

In [10]:
df2 = create_events(df)
# df2.head(1)

In [11]:
# This turns several variables, including events, venues, hands, and bases into dummies
def create_dummies(df):    
    # Events
    event_dummies = pd.get_dummies(df['eventsModel'])
    # Venues
    venue_dummies = pd.get_dummies(df['venue_id'], prefix='venue')
    # Hands
    pitcher_dummies = pd.get_dummies(df['pitchHand'], prefix='p')
    batter_dummies = pd.get_dummies(df['batSide'], prefix='b')
    # Years
    df['year'] = df['game_date'].str[:4]
    year_dummies = pd.get_dummies(df['year'], prefix='year')
    
    # Create lists of dummies
    venue_list = venue_dummies.columns.tolist()
    year_list = year_dummies.columns.tolist()
    dummy_list = venue_list + year_list
    
    # Add dummies to dataframe
    df = pd.concat([df, event_dummies, venue_dummies, pitcher_dummies, batter_dummies, year_dummies], axis=1)
    
    # Create dummy for runners on base
    df['preOnFirst'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnFirst'].shift(1)
    df['preOnSecond'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnSecond'].shift(1)
    df['preOnThird'] = df.groupby(['gamePk', 'inning', 'halfInning'])['postOnThird'].shift(1)
    
    df['onFirst'] = df['preOnFirst'].apply(lambda x: 1 if isinstance(x, str) and 'id' in x else 0)
    df['onSecond'] = df['preOnSecond'].apply(lambda x: 1 if isinstance(x, str) and 'id' in x else 0)
    df['onThird'] = df['preOnThird'].apply(lambda x: 1 if isinstance(x, str) and 'id' in x else 0)
    
    # Top of the inning dummy
    df['top'] = np.where(df['halfInning'] == "top", 1, 0)
    
    # Convert to numeric
    df['awayScore'] = df['awayScore'].astype('int')
    df['homeScore'] = df['homeScore'].astype('int')
    
    # Determine score before PA
    df['preAwayScore'] = df.groupby(['gamePk', 'inning', 'halfInning'])['awayScore'].shift(1)
    df['preHomeScore'] = df.groupby(['gamePk', 'inning', 'halfInning'])['homeScore'].shift(1)
    
    df['preAwayScore'].fillna(df['awayScore'], inplace=True)
    df['preHomeScore'].fillna(df['homeScore'], inplace=True)
    
    
    # Calculate differential
    df['score_diff'] = np.where(df['top'] == 1, df['preAwayScore'] - df['preHomeScore'], df['preHomeScore'] - df['preAwayScore'])

    # Create compatible date variable
    df['date'] = df['game_date'].str.replace('-', '')
    
    # Calculate PAs and ABs
    df['pa'] = np.where(df['eventsModel'] != "Cut", 1, 0)
    df['ab'] = df['pa'] - df['hbp'] - df['bb']
    
    return df

In [12]:
# # Add dummy variables
# # Do you need dummy list?!?!?!?!!?????????????
df3 = create_dummies(df2)
df3.head(1)

Unnamed: 0,atBatIndex,inning,halfInning,outs,type,id,event,eventType,description,rbi,awayScore,homeScore,batter,batterName,batSide,pitcher,pitcherName,pitchHand,postOnFirst,postOnSecond,postOnThird,start,end,movementReason,gamePk,weather,wind,venue,date,away_name,home_name,game_date,game_type_x,venue_id,pitch_number,pitch_name,game_type_y,hc_x,hc_y,hit_location,hit_distance_sc,launch_speed,launch_angle,launch_speed_angle,woba_value,woba_denom,estimated_ba_using_speedangle,estimated_woba_using_speedangle,iso_value,babip_value,maxSpeed,maxSpin,eventsModel,year,Cut,b1,b2,b3,bb,fo,go,hbp,hr,lo,po,so,venue_1,venue_10,venue_12,venue_14,venue_15,venue_17,venue_19,venue_2,venue_22,venue_2392,venue_2394,venue_2395,venue_2602,venue_2680,venue_2681,venue_2735,venue_2889,venue_3,venue_31,venue_32,venue_3289,venue_3309,venue_3312,venue_3313,venue_4,venue_4169,venue_4705,venue_5,venue_5325,venue_5340,venue_5381,venue_680,venue_7,p_L,p_R,b_L,b_R,year_2023,preOnFirst,preOnSecond,preOnThird,onFirst,onSecond,onThird,top,preAwayScore,preHomeScore,score_diff,pa,ab
1516,0,1,top,1,atBat,680757,Pop Out,field_out,Steven Kwan pops out to third baseman Eugenio Suarez in foul territory.,0,0,0,680757,Steven Kwan,L,622491,Luis Castillo,R,,,,,,,718767,"54 degrees, Partly Cloudy.","11 mph, Out To LF.",T-Mobile Park.,20230330,Cleveland Guardians,Seattle Mariners,2023-03-30,R,680,6,Sinker,R,89.84,169.11,5,120,70.4,70,3,0.0,1,0.003,0.003,0,0,94.9,2399,po,2023,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,,,,0,0,0,1,0.0,0.0,0.0,1,1


In [13]:
# def find_max(lst):
#     return max(lst) if lst else 0

def clean_statcast(df):
    # Convert variables to numeric
    df['launch_speed'] = pd.to_numeric(df['launch_speed'], errors='coerce')
    df['launch_speed_angle'] = pd.to_numeric(df['launch_speed_angle'], errors='coerce')
    df['hc_x'] = pd.to_numeric(df['hc_x'], errors='coerce')
    df['hc_y'] = pd.to_numeric(df['hc_y'], errors='coerce')
    
    # Hard hit dummy
    df['hard_hit'] = (df['launch_speed'] >= 95).astype('int')
    
    # Barrel dummy
    df['barrel'] = (df['launch_speed_angle'] == 6).astype('int')

    # Spray 
    df['spray_angle'] = np.arctan((df['hc_x'] - 125.42) / (198.27 - df['hc_y'])) * 180 / np.pi * 0.75
    df['to_left'] = (df['spray_angle'] < -15).astype('int')
    df['to_middle'] = ((df['spray_angle'] >= -15) & (df['spray_angle'] <= 15)).astype('int')
    df['to_right'] = (df['spray_angle'] > 15).astype('int')

    return df

In [14]:
# df4 = clean_statcast(df3)
# df4.head(1)

In [15]:
def read_park_factors():
    # Read in park factors
    park_factors = pd.read_sql_table('Statcast Park Factors', engine)
    
    # Clean
    park_factors['Team'] = park_factors['Team'].str.strip()
  
    # Read in team_map 
    team_map = pd.read_sql_table('Team Map', engine)
    
    # Merge with team map to get venue ID
    park_factors = park_factors.merge(team_map[['FANGRAPHSTEAM', 'VENUE_ID']], left_on='Team', right_on='FANGRAPHSTEAM', how='inner')
    park_factors.rename(columns={'VENUE_ID':'venue_id'}, inplace=True)
    
    # Keep relevant variables
    park_factors = park_factors[['venue_id', 'batSide', 'Park Factor', '1B', '2B', '3B', 'HR', 'BB', 'SO']]
    
    # Convert to mean of 1, not 100
    factor_list = ['Park Factor', '1B', '2B', '3B', 'HR', 'BB', 'SO']
    for factor in factor_list:
        park_factors[factor] = park_factors[factor] / 100
        
    # Convert to numeric
    park_factors['venue_id'] = park_factors['venue_id'].astype('str')
    
    # Sort
    park_factors.sort_values(['venue_id', 'batSide'], inplace=True)
    
    return park_factors

In [16]:
def park_adjustments(df):   
    # Read in park factors
    park_factors = read_park_factors()
    
    # Merge with park factors
    df = df.merge(park_factors, on=['venue_id', 'batSide'], how='left')
    
    # Old/other parks get all 1s
    df[['Park Factor', '1B', '2B', '3B', 'HR', 'BB', 'SO']].fillna(1, inplace=True)
    
    # Adjust stats by park factor
    df['b1'] = df['b1'] / df['1B']
    df['b2'] = df['b2'] / df['2B']
    df['b3'] = df['b3'] / df['3B']
    df['hr'] = df['hr'] / df['HR']
    df['bb'] = df['bb'] / df['BB']
    df['so'] = df['so'] / df['SO']
    
    return df

In [17]:
# df5 = park_adjustments(df4)
# df5.head(1)

In [18]:
# This will return a dataframe that can eventually be used as the model input. Has pitcher vs hitter stats, specific to hand
def rolling_pas(df, pa_num):
    # Note: batter_avg_short will work even when pa_num refers to the "long" period. Suffix will be added in post.
    # Rename for compatibility purposes
    df.rename(columns={'hit_distance_sc':'totalDistance', 'launch_speed':'launchSpeed'}, inplace=True)
    
    # Number PAs
    df['pa_num'] = df.index

    # Convert to numeric and fill with 0s
    combined_list = avg_list + max_list
    for col in combined_list:
        # Check if the column is not numeric
        if not pd.api.types.is_numeric_dtype(df[col]):
            # Convert the non-numeric column to numeric and fill missing values with 0
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    ### Batter stats 
    # Stats for which you want the average (will divide later)
    df[batter_avg_short] = df.groupby(['batter', 'pitchHand'])[avg_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
    # Stats for which you want the maximum
    df[batter_max_short] = df.groupby(['batter', 'pitchHand'])[max_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).max())
    # Stats for which you just want the sum (won't divide later)
    df[['ab_b', 'pa_b']] = df.groupby(['batter', 'pitchHand'])[['ab', 'pa']].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
                
    ### Pitcher stats
    # Stats for which you want the average
    df[pitcher_avg_short] = df.groupby(['pitcher', 'batSide'])[avg_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
    # Stats for which you want the maximum
    df[pitcher_max_short] = df.groupby(['pitcher', 'batSide'])[max_list].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).max())
    # Stats for which you just want the sum (won't divide later)
    df[['ab_p', 'pa_p']] = df.groupby(['batter', 'pitchHand'])[['ab', 'pa']].transform(lambda x: x.shift().rolling(pa_num, min_periods=1).sum())
                
    # Sort by PA number
    df.sort_values(['pa_num'], axis=0, ascending=True, inplace=True)

    ### Advanced stats
    # wOBA - using 2022 values throughout
    df['woba_b'] = (0.690 * df['bb_b']) + (0.721 * df['hbp_b']) + (0.885 * df['b1_b']) + (1.262 * df['b2_b']) + (1.601 * df['b3_b']) + (2.070 * df['hr_b'])
    df['woba_p'] = (0.690 * df['bb_p']) + (0.721 * df['hbp_p']) + (0.885 * df['b1_p']) + (1.262 * df['b2_p']) + (1.601 * df['b3_p']) + (2.070 * df['hr_p'])
    
    # Slugging
    df['slg_b'] = (1 * df['b1_b']) + (2 * df['b2_b']) + (3 * df['b3_b']) + (4 * df['hr_b'])
    df['slg_b'] = df['slg_b'] / df['ab_b']
    df['slg_p'] = (1 * df['b1_p']) + (2 * df['b2_p']) + (3 * df['b3_p']) + (4 * df['hr_p'])
    df['slg_p'] = df['slg_p'] / df['ab_p']

    # OBP    
    df['obp_b'] = df[['b1_b', 'b2_b', 'b3_b', 'hr_b', 'bb_b', 'hbp_b']].sum(axis=1)
    df['obp_p'] = df[['b1_p', 'b2_p', 'b3_p', 'hr_p', 'bb_p', 'hbp_p']].sum(axis=1)
    
    # ISO
    df['iso_b'] = df['b2_b'] * 1 + df['b3_b'] * 2 + df['hr_b'] * 3
    df['iso_p'] = df['b2_p'] * 1 + df['b3_p'] * 2 + df['hr_p'] * 3

    
    
    # Calculate averages (they were sums before)
    # Batters
    for stat in batter_avg_short:
        df[stat] = df[stat] / df['pa_b']
    for stat in batter_calc_short:
        df[stat] = df[stat] / df['pa_b']
    
    # Pitchers
    for stat in pitcher_avg_short:
        df[stat] = df[stat] / df['pa_p']
    for stat in pitcher_calc_short:
        df[stat] = df[stat] / df['pa_p']
    
    # Sort values
    df.sort_values('pa_num', inplace=True)
               
        
    return df

In [19]:
# df6 = rolling_pas(df5, 50)

In [20]:
# df7 = rolling_pas(df5, 200)
# df7[batter_avg_short].tail()

In [21]:
def create_inputs(start_year, end_year, short=50, long=300):
    # Read in raw data
    df = dataset(engine, start_year, end_year)
    # Clean weather
    df2 = clean_weather(df)
    # Create PA events 
    df3 = create_events(df2)
    # Create dummy variables 
    df4 = create_dummies(df3)
    # Create Statcast variables
    df5 = clean_statcast(df4)
    # Adjust for park factors
    df6 = park_adjustments(df5)
    
    ### Rolling stats
    # Short
    df_short = rolling_pas(df6, short)
    # Long
    df_long = rolling_pas(df6, long)
    # We only need the rolling stats 
    rolling_stats_short = batter_stats_short + pitcher_stats_short
    df_long = df_long[rolling_stats_short]
    df_long = df_long.add_suffix("_long")
    
    # Dataset
    complete_dataset = pd.concat([df_short, df_long], axis=1)
    
    
    return complete_dataset

In [22]:
complete_dataset = create_inputs(2015, 2023, 50, 300)

In [23]:
complete_dataset.tail()

Unnamed: 0,atBatIndex,inning,halfInning,outs,type,id,event,eventType,description,rbi,awayScore,homeScore,batter,batterName,batSide,pitcher,pitcherName,pitchHand,postOnFirst,postOnSecond,postOnThird,start,end,movementReason,gamePk,weather,wind,venue,date,away_name,home_name,game_date,game_type_x,venue_id,pitch_number,pitch_name,game_type_y,hc_x,hc_y,hit_location,totalDistance,launchSpeed,launch_angle,launch_speed_angle,woba_value,woba_denom,estimated_ba_using_speedangle,estimated_woba_using_speedangle,iso_value,babip_value,maxSpeed,maxSpin,temperature,windSpeed,windDirection,x_vect,y_vect,eventsModel,year,Cut,b1,b2,b3,bb,fo,go,hbp,hr,lo,po,so,venue_1,venue_2,venue_3,venue_4,venue_5,venue_7,venue_10,venue_12,venue_13,venue_14,venue_15,venue_16,venue_17,venue_19,venue_22,venue_24,venue_31,venue_32,venue_680,venue_2392,venue_2394,venue_2395,venue_2500,venue_2503,venue_2504,venue_2507,venue_2508,venue_2511,venue_2513,venue_2514,venue_2516,venue_2518,venue_2520,venue_2523,venue_2526,venue_2529,venue_2530,venue_2532,venue_2534,venue_2535,venue_2536,venue_2542,venue_2602,venue_2603,venue_2680,venue_2681,venue_2700,venue_2701,venue_2722,venue_2723,venue_2724,venue_2735,venue_2756,venue_2766,venue_2769,venue_2781,venue_2852,venue_2856,venue_2858,venue_2861,venue_2862,venue_2889,venue_3289,venue_3309,venue_3312,venue_3313,venue_3809,venue_3834,venue_4169,venue_4249,venue_4309,venue_4510,venue_4629,venue_4669,venue_4670,venue_4705,venue_4960,venue_5000,venue_5010,venue_5315,venue_5325,venue_5365,venue_5380,venue_5381,venue_5445,venue_1.1,venue_10.1,venue_12.1,venue_14.1,venue_15.1,venue_17.1,venue_19.1,venue_2.1,venue_22.1,venue_2392.1,venue_2394.1,venue_2395.1,venue_2602.1,venue_2680.1,venue_2681.1,venue_2735.1,venue_2889.1,venue_3.1,venue_31.1,venue_32.1,venue_3289.1,venue_3309.1,venue_3312.1,venue_3313.1,venue_4.1,venue_4169.1,venue_4705.1,venue_5.1,venue_5325.1,venue_5340,venue_5381.1,venue_680.1,venue_7.1,p_L,p_R,b_L,b_R,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,preOnFirst,preOnSecond,preOnThird,onFirst,onSecond,onThird,top,preAwayScore,preHomeScore,score_diff,pa,ab,hard_hit,barrel,spray_angle,to_left,to_middle,to_right,Park Factor,1B,2B,3B,HR,BB,SO,pa_num,b1_b,b2_b,b3_b,hr_b,bb_b,hbp_b,so_b,fo_b,go_b,lo_b,po_b,estimated_woba_using_speedangle_b,to_left_b,to_middle_b,to_right_b,hard_hit_b,barrel_b,totalDistance_b,maxSpeed_b,maxSpin_b,launchSpeed_b,ab_b,pa_b,b1_p,b2_p,b3_p,hr_p,bb_p,hbp_p,so_p,fo_p,go_p,lo_p,po_p,estimated_woba_using_speedangle_p,to_left_p,to_middle_p,to_right_p,hard_hit_p,barrel_p,totalDistance_p,maxSpeed_p,maxSpin_p,launchSpeed_p,ab_p,pa_p,woba_b,woba_p,slg_b,slg_p,obp_b,obp_p,iso_b,iso_p,b1_b_long,b2_b_long,b3_b_long,hr_b_long,bb_b_long,hbp_b_long,so_b_long,fo_b_long,go_b_long,lo_b_long,po_b_long,estimated_woba_using_speedangle_b_long,to_left_b_long,to_middle_b_long,to_right_b_long,hard_hit_b_long,barrel_b_long,iso_b_long,slg_b_long,obp_b_long,woba_b_long,totalDistance_b_long,maxSpeed_b_long,maxSpin_b_long,launchSpeed_b_long,ab_b_long,pa_b_long,b1_p_long,b2_p_long,b3_p_long,hr_p_long,bb_p_long,hbp_p_long,so_p_long,fo_p_long,go_p_long,lo_p_long,po_p_long,estimated_woba_using_speedangle_p_long,to_left_p_long,to_middle_p_long,to_right_p_long,hard_hit_p_long,barrel_p_long,iso_p_long,slg_p_long,obp_p_long,woba_p_long,totalDistance_p_long,maxSpeed_p_long,maxSpin_p_long,launchSpeed_p_long,ab_p_long,pa_p_long
1597591,71,9,bottom,1,atBat,592206,Flyout,field_out,Nick Castellanos flies out to right fielder Corbin Carroll.,0,4,2,592206,Nick Castellanos,R,623149,Paul Sewald,R,,,,,,,748537,Clear.,"5 mph, R To L.",Citizens Bank Park.,20231024,Arizona Diamondbacks,Philadelphia Phillies,2023-10-24,L,2681,,,,,,,0.0,,,,,,,0.0,,,0.0,0.0,59,5,R To L,-5.0,0.0,fo,2023,0,0.0,0.0,0.0,0.0,1,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,,,,0,0,0,0,,,,1,1,0,0,,0,0,0,1.01,1.0,1.03,0.81,1.08,0.95,1.02,1597591,0.104767,0.019429,0.0,0.06069,0.041683,0.003367,0.306394,0.127946,0.212121,0.077441,0.030303,0.276559,0.272727,0.212121,0.144781,0.262626,0.074074,454.0,102.2,3257.0,112.4,284.0,297.0,0.055693,0.02624,0.0,0.007003,0.030347,0.010101,0.152702,0.070707,0.070707,0.030303,0.023569,0.091943,0.107744,0.124579,0.050505,0.080808,0.016835,424.0,95.7,2997.0,107.6,284.0,297.0,0.274055,0.125121,0.001361,0.00048,0.229935,0.129384,0.201498,0.047248,0.104767,0.019429,0.0,0.06069,0.041683,0.003367,0.306394,0.127946,0.212121,0.077441,0.030303,0.276559,0.272727,0.212121,0.144781,0.262626,0.074074,0.201498,0.001361,0.229935,0.274055,454.0,102.2,3257.0,112.4,284.0,297.0,0.055693,0.02624,0.0,0.007003,0.030347,0.010101,0.152702,0.070707,0.070707,0.030303,0.023569,0.091943,0.107744,0.124579,0.050505,0.080808,0.016835,0.047248,0.00048,0.129384,0.125121,424.0,95.7,2997.0,107.6,284.0,297.0
1597592,72,9,bottom,2,atBat,669016,Flyout,field_out,Brandon Marsh flies out to left fielder Lourdes Gurriel Jr.,0,4,2,669016,Brandon Marsh,L,623149,Paul Sewald,R,,,,,,,748537,Clear.,"5 mph, R To L.",Citizens Bank Park.,20231024,Arizona Diamondbacks,Philadelphia Phillies,2023-10-24,L,2681,,,,,,,0.0,,,,,,,0.0,,,0.0,0.0,59,5,R To L,-5.0,0.0,fo,2023,0,0.0,0.0,0.0,0.0,1,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,,,,0,0,0,0,4.0,2.0,-2.0,1,1,0,0,,0,0,0,1.01,1.0,1.03,0.81,1.08,0.95,1.02,1597592,0.161101,0.060512,0.004143,0.026141,0.119638,0.010067,0.289737,0.100671,0.157718,0.030201,0.030201,0.240228,0.16443,0.204698,0.201342,0.285235,0.053691,423.0,101.4,3253.0,109.8,260.0,298.0,0.049638,0.006567,0.0,0.020352,0.055127,0.0,0.14377,0.080537,0.050336,0.010067,0.020134,0.085903,0.067114,0.100671,0.073826,0.083893,0.016779,438.0,96.0,2752.0,110.4,260.0,298.0,0.369493,0.132382,0.001535,0.000555,0.381602,0.131683,0.14722,0.067621,0.161101,0.060512,0.004143,0.026141,0.119638,0.010067,0.289737,0.100671,0.157718,0.030201,0.030201,0.240228,0.16443,0.204698,0.201342,0.285235,0.053691,0.14722,0.001535,0.381602,0.369493,423.0,101.4,3253.0,109.8,260.0,298.0,0.049638,0.006567,0.0,0.020352,0.055127,0.0,0.14377,0.080537,0.050336,0.010067,0.020134,0.085903,0.067114,0.100671,0.073826,0.083893,0.016779,0.067621,0.000555,0.131683,0.132382,438.0,96.0,2752.0,110.4,260.0,298.0
1597593,73,9,bottom,3,atBat,595909,Flyout,field_out,Jake Cave flies out to right fielder Corbin Carroll.,0,4,2,595909,Jake Cave,L,623149,Paul Sewald,R,,,,,,,748537,Clear.,"5 mph, R To L.",Citizens Bank Park.,20231024,Arizona Diamondbacks,Philadelphia Phillies,2023-10-24,L,2681,,,,,,,0.0,,,,,,,0.0,,,0.0,0.0,59,5,R To L,-5.0,0.0,fo,2023,0,0.0,0.0,0.0,0.0,1,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,,,,0,0,0,0,4.0,2.0,-2.0,1,1,0,0,,0,0,0,1.01,1.0,1.03,0.81,1.08,0.95,1.02,1597593,0.129534,0.046693,0.007178,0.021442,0.071595,0.005814,0.256942,0.197674,0.19186,0.040698,0.023256,0.238221,0.180233,0.267442,0.215116,0.25,0.052326,428.0,101.4,3288.0,107.0,159.0,172.0,0.086001,0.011377,0.0,0.03526,0.09551,0.0,0.24909,0.145349,0.087209,0.017442,0.034884,0.148831,0.116279,0.174419,0.127907,0.145349,0.02907,438.0,96.0,2752.0,110.4,159.0,172.0,0.283034,0.22936,0.002077,0.001571,0.282256,0.228148,0.125375,0.117158,0.129534,0.046693,0.007178,0.021442,0.071595,0.005814,0.256942,0.197674,0.19186,0.040698,0.023256,0.238221,0.180233,0.267442,0.215116,0.25,0.052326,0.125375,0.002077,0.282256,0.283034,428.0,101.4,3288.0,107.0,159.0,172.0,0.086001,0.011377,0.0,0.03526,0.09551,0.0,0.24909,0.145349,0.087209,0.017442,0.034884,0.148831,0.116279,0.174419,0.127907,0.145349,0.02907,0.117158,0.001571,0.228148,0.22936,438.0,96.0,2752.0,110.4,159.0,172.0
1597594,8,2,top,0,atBat,666971,Single,single,Lourdes Gurriel Jr. singles on a sharp line drive to left fielder Brandon Marsh.,0,1,0,666971,Lourdes Gurriel Jr.,R,624133,Ranger Suarez,L,"{'id': 666971, 'fullName': 'Lourdes Gurriel Jr.', 'link': '/api/v1/people/666971'}",,,,1B,,748537,Clear.,"5 mph, R To L.",Citizens Bank Park.,20231024,Arizona Diamondbacks,Philadelphia Phillies,2023-10-24,L,2681,,,,,,,0.0,,,,,,,0.0,,,0.0,0.0,59,5,R To L,-5.0,0.0,b1,2023,0,1.0,0.0,0.0,0.0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,,,,0,0,0,1,1.0,0.0,1.0,1,1,0,0,,0,0,0,1.01,1.0,1.03,0.81,1.08,0.95,1.02,1597594,0.193865,0.044536,0.00518,0.027876,0.076,0.012121,0.109761,0.127273,0.29697,0.078788,0.030303,0.319564,0.260606,0.30303,0.248485,0.345455,0.072727,425.0,101.9,2984.0,107.5,151.0,165.0,0.298268,0.063119,0.0,0.047052,0.174247,0.006061,0.419038,0.218182,0.387879,0.127273,0.066667,0.423297,0.369697,0.563636,0.230303,0.424242,0.09697,410.0,96.8,2778.0,115.4,151.0,165.0,0.354951,0.565621,0.002715,0.004058,0.359578,0.588746,0.138525,0.204274,0.193865,0.044536,0.00518,0.027876,0.076,0.012121,0.109761,0.127273,0.29697,0.078788,0.030303,0.319564,0.260606,0.30303,0.248485,0.345455,0.072727,0.138525,0.002715,0.359578,0.354951,425.0,101.9,2984.0,107.5,151.0,165.0,0.298268,0.063119,0.0,0.047052,0.174247,0.006061,0.419038,0.218182,0.387879,0.127273,0.066667,0.423297,0.369697,0.563636,0.230303,0.424242,0.09697,0.204274,0.004058,0.588746,0.565621,410.0,96.8,2778.0,115.4,151.0,165.0
1597595,9,2,top,1,atBat,446334,Flyout,field_out,Evan Longoria flies out to left fielder Brandon Marsh.,0,1,0,446334,Evan Longoria,R,624133,Ranger Suarez,L,"{'id': 666971, 'fullName': 'Lourdes Gurriel Jr.', 'link': '/api/v1/people/666971'}",,,,,,748537,Clear.,"5 mph, R To L.",Citizens Bank Park.,20231024,Arizona Diamondbacks,Philadelphia Phillies,2023-10-24,L,2681,,,,,,,0.0,,,,,,,0.0,,,0.0,0.0,59,5,R To L,-5.0,0.0,fo,2023,0,0.0,0.0,0.0,0.0,1,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,"{'id': 666971, 'fullName': 'Lourdes Gurriel Jr.', 'link': '/api/v1/people/666971'}",,,1,0,0,1,1.0,0.0,1.0,1,1,0,0,,0,0,0,1.01,1.0,1.03,0.81,1.08,0.95,1.02,1597595,0.102078,0.052383,0.0,0.041428,0.107702,0.0,0.289845,0.166667,0.18254,0.039683,0.02381,0.257524,0.277778,0.198413,0.134921,0.34127,0.055556,463.0,97.8,2971.0,109.7,113.0,126.0,0.398526,0.082656,0.0,0.061615,0.228181,0.007937,0.54874,0.277778,0.507937,0.166667,0.087302,0.554294,0.484127,0.730159,0.301587,0.555556,0.126984,410.0,96.8,2778.0,115.4,113.0,126.0,0.316517,0.747717,0.003297,0.007171,0.303591,0.778914,0.176668,0.267502,0.102078,0.052383,0.0,0.041428,0.107702,0.0,0.289845,0.166667,0.18254,0.039683,0.02381,0.257524,0.277778,0.198413,0.134921,0.34127,0.055556,0.176668,0.003297,0.303591,0.316517,463.0,97.8,2971.0,109.7,113.0,126.0,0.398526,0.082656,0.0,0.061615,0.228181,0.007937,0.54874,0.277778,0.507937,0.166667,0.087302,0.554294,0.484127,0.730159,0.301587,0.555556,0.126984,0.267502,0.007171,0.778914,0.747717,410.0,96.8,2778.0,115.4,113.0,126.0


In [24]:
complete_dataset[inputs_plus].tail()

Unnamed: 0,batterName,pitcherName,batter,pitcher,batSide,pitchHand,eventsModel,b1_b,b2_b,b3_b,hr_b,bb_b,hbp_b,so_b,fo_b,go_b,lo_b,po_b,estimated_woba_using_speedangle_b,to_left_b,to_middle_b,to_right_b,hard_hit_b,barrel_b,iso_b,slg_b,obp_b,woba_b,totalDistance_b,launchSpeed_b,b1_b_long,b2_b_long,b3_b_long,hr_b_long,bb_b_long,hbp_b_long,so_b_long,fo_b_long,go_b_long,lo_b_long,po_b_long,estimated_woba_using_speedangle_b_long,to_left_b_long,to_middle_b_long,to_right_b_long,hard_hit_b_long,barrel_b_long,iso_b_long,slg_b_long,obp_b_long,woba_b_long,totalDistance_b_long,launchSpeed_b_long,b1_p,b2_p,b3_p,hr_p,bb_p,hbp_p,so_p,fo_p,go_p,lo_p,po_p,estimated_woba_using_speedangle_p,to_left_p,to_middle_p,to_right_p,hard_hit_p,barrel_p,iso_p,slg_p,obp_p,woba_p,maxSpeed_p,maxSpin_p,b1_p_long,b2_p_long,b3_p_long,hr_p_long,bb_p_long,hbp_p_long,so_p_long,fo_p_long,go_p_long,lo_p_long,po_p_long,estimated_woba_using_speedangle_p_long,to_left_p_long,to_middle_p_long,to_right_p_long,hard_hit_p_long,barrel_p_long,iso_p_long,slg_p_long,obp_p_long,woba_p_long,maxSpeed_p_long,maxSpin_p_long,venue_1,venue_1.1,venue_2,venue_2.1,venue_3,venue_3.1,venue_4,venue_4.1,venue_5,venue_5.1,venue_7,venue_7.1,venue_10,venue_10.1,venue_12,venue_12.1,venue_13,venue_14,venue_14.1,venue_15,venue_15.1,venue_16,venue_17,venue_17.1,venue_19,venue_19.1,venue_22,venue_22.1,venue_31,venue_31.1,venue_32,venue_32.1,venue_680,venue_680.1,venue_2392,venue_2392.1,venue_2394,venue_2394.1,venue_2395,venue_2395.1,venue_2535,venue_2536,venue_2602,venue_2602.1,venue_2680,venue_2680.1,venue_2681,venue_2681.1,venue_2701,venue_2735,venue_2735.1,venue_2756,venue_2889,venue_2889.1,venue_3289,venue_3289.1,venue_3309,venue_3309.1,venue_3312,venue_3312.1,venue_3313,venue_3313.1,venue_4169,venue_4169.1,venue_4705,venue_4705.1,venue_5010,venue_5325,venue_5325.1,venue_5365,venue_5381,venue_5381.1,venue_5445,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,p_L,b_L,x_vect,y_vect,temperature,onFirst,onSecond,onThird,inning,top,score_diff
1597591,Nick Castellanos,Paul Sewald,592206,623149,R,R,fo,0.104767,0.019429,0.0,0.06069,0.041683,0.003367,0.306394,0.127946,0.212121,0.077441,0.030303,0.276559,0.272727,0.212121,0.144781,0.262626,0.074074,0.201498,0.001361,0.229935,0.274055,454.0,112.4,0.104767,0.019429,0.0,0.06069,0.041683,0.003367,0.306394,0.127946,0.212121,0.077441,0.030303,0.276559,0.272727,0.212121,0.144781,0.262626,0.074074,0.201498,0.001361,0.229935,0.274055,454.0,112.4,0.055693,0.02624,0.0,0.007003,0.030347,0.010101,0.152702,0.070707,0.070707,0.030303,0.023569,0.091943,0.107744,0.124579,0.050505,0.080808,0.016835,0.047248,0.00048,0.129384,0.125121,95.7,2997.0,0.055693,0.02624,0.0,0.007003,0.030347,0.010101,0.152702,0.070707,0.070707,0.030303,0.023569,0.091943,0.107744,0.124579,0.050505,0.080808,0.016835,0.047248,0.00048,0.129384,0.125121,95.7,2997.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,-5.0,0.0,59,0,0,0,9,0,
1597592,Brandon Marsh,Paul Sewald,669016,623149,L,R,fo,0.161101,0.060512,0.004143,0.026141,0.119638,0.010067,0.289737,0.100671,0.157718,0.030201,0.030201,0.240228,0.16443,0.204698,0.201342,0.285235,0.053691,0.14722,0.001535,0.381602,0.369493,423.0,109.8,0.161101,0.060512,0.004143,0.026141,0.119638,0.010067,0.289737,0.100671,0.157718,0.030201,0.030201,0.240228,0.16443,0.204698,0.201342,0.285235,0.053691,0.14722,0.001535,0.381602,0.369493,423.0,109.8,0.049638,0.006567,0.0,0.020352,0.055127,0.0,0.14377,0.080537,0.050336,0.010067,0.020134,0.085903,0.067114,0.100671,0.073826,0.083893,0.016779,0.067621,0.000555,0.131683,0.132382,96.0,2752.0,0.049638,0.006567,0.0,0.020352,0.055127,0.0,0.14377,0.080537,0.050336,0.010067,0.020134,0.085903,0.067114,0.100671,0.073826,0.083893,0.016779,0.067621,0.000555,0.131683,0.132382,96.0,2752.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,-5.0,0.0,59,0,0,0,9,0,-2.0
1597593,Jake Cave,Paul Sewald,595909,623149,L,R,fo,0.129534,0.046693,0.007178,0.021442,0.071595,0.005814,0.256942,0.197674,0.19186,0.040698,0.023256,0.238221,0.180233,0.267442,0.215116,0.25,0.052326,0.125375,0.002077,0.282256,0.283034,428.0,107.0,0.129534,0.046693,0.007178,0.021442,0.071595,0.005814,0.256942,0.197674,0.19186,0.040698,0.023256,0.238221,0.180233,0.267442,0.215116,0.25,0.052326,0.125375,0.002077,0.282256,0.283034,428.0,107.0,0.086001,0.011377,0.0,0.03526,0.09551,0.0,0.24909,0.145349,0.087209,0.017442,0.034884,0.148831,0.116279,0.174419,0.127907,0.145349,0.02907,0.117158,0.001571,0.228148,0.22936,96.0,2752.0,0.086001,0.011377,0.0,0.03526,0.09551,0.0,0.24909,0.145349,0.087209,0.017442,0.034884,0.148831,0.116279,0.174419,0.127907,0.145349,0.02907,0.117158,0.001571,0.228148,0.22936,96.0,2752.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,-5.0,0.0,59,0,0,0,9,0,-2.0
1597594,Lourdes Gurriel Jr.,Ranger Suarez,666971,624133,R,L,b1,0.193865,0.044536,0.00518,0.027876,0.076,0.012121,0.109761,0.127273,0.29697,0.078788,0.030303,0.319564,0.260606,0.30303,0.248485,0.345455,0.072727,0.138525,0.002715,0.359578,0.354951,425.0,107.5,0.193865,0.044536,0.00518,0.027876,0.076,0.012121,0.109761,0.127273,0.29697,0.078788,0.030303,0.319564,0.260606,0.30303,0.248485,0.345455,0.072727,0.138525,0.002715,0.359578,0.354951,425.0,107.5,0.298268,0.063119,0.0,0.047052,0.174247,0.006061,0.419038,0.218182,0.387879,0.127273,0.066667,0.423297,0.369697,0.563636,0.230303,0.424242,0.09697,0.204274,0.004058,0.588746,0.565621,96.8,2778.0,0.298268,0.063119,0.0,0.047052,0.174247,0.006061,0.419038,0.218182,0.387879,0.127273,0.066667,0.423297,0.369697,0.563636,0.230303,0.424242,0.09697,0.204274,0.004058,0.588746,0.565621,96.8,2778.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,-5.0,0.0,59,0,0,0,2,1,1.0
1597595,Evan Longoria,Ranger Suarez,446334,624133,R,L,fo,0.102078,0.052383,0.0,0.041428,0.107702,0.0,0.289845,0.166667,0.18254,0.039683,0.02381,0.257524,0.277778,0.198413,0.134921,0.34127,0.055556,0.176668,0.003297,0.303591,0.316517,463.0,109.7,0.102078,0.052383,0.0,0.041428,0.107702,0.0,0.289845,0.166667,0.18254,0.039683,0.02381,0.257524,0.277778,0.198413,0.134921,0.34127,0.055556,0.176668,0.003297,0.303591,0.316517,463.0,109.7,0.398526,0.082656,0.0,0.061615,0.228181,0.007937,0.54874,0.277778,0.507937,0.166667,0.087302,0.554294,0.484127,0.730159,0.301587,0.555556,0.126984,0.267502,0.007171,0.778914,0.747717,96.8,2778.0,0.398526,0.082656,0.0,0.061615,0.228181,0.007937,0.54874,0.277778,0.507937,0.166667,0.087302,0.554294,0.484127,0.730159,0.301587,0.555556,0.126984,0.267502,0.007171,0.778914,0.747717,96.8,2778.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,-5.0,0.0,59,1,0,0,2,1,1.0


In [None]:
# Estimated woba using speedangle is 0 when missing but they're actually missing