# Missed Call Exploration for MLB Hitters 2023

<img src="https://thestadiumreviews.com/wp-content/uploads/2022/02/how-does-scoring-work-in-baseball.jpg" alt="Change in Run Expectancy" style="float: left; margin-right: 10px;" />

### by John Nahra

## Finding Change in Run Expectancy for each Base-Out-Count-Pitch

In [1]:
#import basic packages
import pandas as pd
import numpy as np

In [2]:
#install wheel
pip install --upgrade pip setuptools wheel

Note: you may need to restart the kernel to use updated packages.


In [3]:
#install pynacl
!pip install pynacl



In [4]:
#install pybaseball
!pip install pybaseball



In [5]:
#import statcast packages
from pybaseball import statcast
from pybaseball import statcast_batter

In [6]:
#import and enable cache
from pybaseball import cache

cache.enable()

In [7]:
#read in player ID csv
df_ids = pd.read_csv('player_ids.csv')

In [9]:
#keep ID and name
df_ids = df_ids[['MLBID','MLBNAME']]

In [10]:
#drop missing values
df_ids = df_ids.dropna()

In [11]:
#change ID to integer type
df_ids['MLBID'] = df_ids['MLBID'].astype(int)

In [12]:
#download statcast data for the 2023 season
df = statcast(start_dt="2023-03-30", end_dt="2023-08-14",verbose=True)

This is a large query, it may take a moment to complete


100%|██████████| 138/138 [00:06<00:00, 22.01it/s]


In [14]:
#merge IDs to statcast data
df = df.merge(df_ids,left_on='batter',right_on='MLBID')

In [15]:
#drop duplicate player name column
df = df.drop('player_name',axis=1)

In [16]:
#rename column to player name
df = df.rename(columns={'MLBNAME':'player_name'})

In [17]:
#only keep called strikes and balls
df = df.loc[(df['description'] == 'called_strike') | (df['description'] == 'ball')]

In [18]:
#only keep pitches with a non-missing run expectancy value
df = df[df['delta_run_exp'].notna()]

In [19]:
#only keep pitches where we can determine whether the call was missed
df = df[df['zone'].notna()]

In [20]:
#create a function that returns the team a batter plays for
def batter_team(df):
    if df['inning_topbot'] == 'Top':
        return df['away_team']
    else:
        return df['home_team']

In [21]:
#apply function to create a new variable
df['batter_team'] = df.apply(batter_team,axis=1)

In [22]:
#create a new variable that combines the player's name and their team (in order to group by team, account for trades)
df['player_name'] = df['player_name'] + '-' + df['batter_team']

In [23]:
#look at event types of pitches
df['events'].value_counts()

walk                          10334
strikeout                      6945
caught_stealing_2b              133
strikeout_double_play            17
pickoff_1b                        7
caught_stealing_home              6
wild_pitch                        5
pickoff_caught_stealing_2b        5
caught_stealing_3b                4
other_out                         3
pickoff_3b                        2
catcher_interf                    1
Name: events, dtype: int64

In [24]:
#only keep walks, strikeouts, and non-event pitch results
df = df.loc[(df['events']=='walk') | (df['events']=='strikeout') | (df['events'].isna())]

In [26]:
#create new variable of the count
df['count'] = df['balls'].astype(str) + '-' + df['strikes'].astype(str)

In [27]:
#creates variable of runner on first or not
df['on_base'] = np.where(df['on_1b'].notna(),'1','_')

In [28]:
#adds whether runner is on second or not
df['on_base'] = df['on_base'] + '-' + np.where(df['on_2b'].notna(),'2','_')

In [29]:
#adds whether runner is on third or not
df['on_base'] = df['on_base'] + '-' + np.where(df['on_3b'].notna(),'3','_')

In [30]:
#creates df of pitch type grouped by player name
df_all_calls = df.groupby(by=['player_name','description'])['pitch_type'].count().reset_index().pivot_table('pitch_type',['player_name'],'description').reset_index()

In [32]:
#creates new variable of unique plate appearance ID
df['pa'] = df['game_pk'].astype(str) + '-' + df['at_bat_number'].astype(str)

In [33]:
#creates df of change in run expectancy grouped by runners on base, outs, count, and pitch type
df_grouped = df.groupby(by=['on_base','outs_when_up','count','type'])['delta_run_exp'].agg(pd.Series.mode).to_frame().reset_index()

## Finding All Missed Calls

In [35]:
#creates function that categorizes each pitch as an actual strike or actual ball based on the zone
def actual_strike(df):
    if df['zone'] > 10:
        return 'B'
    else:
        return 'S'

In [36]:
#applies function to create a new variable correct_type
df['correct_type'] = df.apply(actual_strike,axis=1)

In [37]:
#merges dfs such that another delta run expectancy is added
#one is based on what actually happened, another is based on the change in run expectancy if the call was correct
df_merged = df.merge(df_grouped,left_on=['on_base','outs_when_up','count','correct_type'],right_on=['on_base','outs_when_up','count','type'])

In [38]:
#creates new variable that subtracts changes in run expectancy to get the swing effect of a missed call
df_merged['missed_call_delta_run_exp'] = df_merged['delta_run_exp_x'] - df_merged['delta_run_exp_y']

In [39]:
#sum the variable to get total net change in run expectancy from missed calls
df_merged['missed_call_delta_run_exp'].sum()

-188.21299999999997

In [40]:
#creates new df that only includes missed calls
df_mlb = df_merged[df_merged['missed_call_delta_run_exp']!=0].groupby(by=['player_name','correct_type']).count().reset_index().iloc[0:,0:3]

In [42]:
#pivots df
df_mlb_pivoted = df_mlb.pivot_table('pitch_type', ['player_name'], 'correct_type')

In [43]:
#resets index
df_mlb_pivoted.reset_index(inplace=True)

In [45]:
#fills missing values with 0
df_mlb_pivoted.fillna(0,inplace=True)

In [46]:
#merges two tables to have missed calls and all calls on each batter
df_missed_and_total = df_mlb_pivoted.merge(df_all_calls,left_on='player_name',right_on='player_name')

In [47]:
#fills missing values with 0
df_missed_and_total.fillna(0,inplace=True)

In [49]:
#creates df that has player name and total net change in run expectancy for missed balls and missed strikes
df_runs = df_merged.groupby(by=['player_name','correct_type'])['missed_call_delta_run_exp'].sum().to_frame().reset_index().pivot_table('missed_call_delta_run_exp','player_name','correct_type').reset_index()

In [50]:
#renames columns
df_runs.rename(columns={'B':'missed_call_dre_B','S':'missed_call_dre_S'},inplace=True)

In [52]:
#merges two tables to have missed calls, all calls, and net change in run expectancy for missed balls and strikes
df_missed_and_total_plus_runs = df_missed_and_total.merge(df_runs,left_on='player_name',right_on='player_name')

In [54]:
#creates subset df just of player name and team
df_batter_team = df[['player_name','batter_team']]

In [55]:
#drops duplicates
df_batter_team = df_batter_team.drop_duplicates()

In [56]:
#adds batter team to mlb pivoted df through merge
df_mlb_pivoted = df_mlb_pivoted.merge(df_batter_team,how='left',left_on='player_name',right_on='player_name')

In [58]:
#adds batter team to df missed and total plus runs through merge
df_missed_and_total_plus_runs = df_missed_and_total_plus_runs.merge(df_batter_team,how='left',left_on='player_name',right_on='player_name')

In [60]:
#saves mlb pivoted to csv
df_mlb_pivoted.to_csv('mlb_calls_pivoted.csv')

In [61]:
#saves df missed and total plus runs to csv
df_missed_and_total_plus_runs.to_csv('mlb_missed_and_total_calls_plus_runs.csv')