In [15]:
#Load packages
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# I. Comparing Event Run Values

In [16]:
# Load run expenctancy function
def calc_run_expectancy(data):
    print(data.columns)
    # Step 2
    df = data[['home_team','away_team','half','gameId','batterName','batterId','event','start1B','start2B',\
               'start3B','end1B','end2B','end3B','startOuts','endOuts','runsFuture','runsOnPlay','outsInInning',\
               'venueId','batterPos']].copy()
    
    # Step 3
    df['1B_st'] = np.where(df['start1B'].isna(), 0, 1)
    df['2B_st'] = np.where(df['start2B'].isna(), 0, 1)
    df['3B_st'] = np.where(df['start3B'].isna(), 0, 1)
    
    # Step 4
    df['start'] = df['1B_st'].astype(str) + df['2B_st'].astype(str) + df['3B_st'].astype(str) + " " + df['startOuts'].astype(str)
    
    # Step 5
    df['1B_en'] = np.where(df['end1B'].isna(), 0, 1)
    df['2B_en'] = np.where(df['end2B'].isna(), 0, 1)
    df['3B_en'] = np.where(df['end3B'].isna(), 0, 1)
    
    # Step 6
    df['end'] = df['1B_en'].astype(str) + df['2B_en'].astype(str) + df['3B_en'].astype(str) + " " + df['endOuts'].astype(str)

    # Step 7
    df = df[((df['start'] != df['end']) | (df['runsOnPlay']>0)) & (df['outsInInning']==3)]
    
    # Step 8
    run_exp = df.groupby('start')['runsFuture'].mean().reset_index().rename({'runsFuture':'startRE'},axis=1)
    df = df.merge(run_exp, on='start', how='left')
    
    # Step 9
    run_exp = run_exp.rename({'start':'state','startRE':'RE'},axis=1)
    re_tuples = list(run_exp.to_records(index=False))  # convert run_exp into list of tuples
    re_tuples = re_tuples + [(str(st)+" 3", 0) for st in run_exp.state.str[:3].unique()]
    run_exp = pd.DataFrame.from_records(re_tuples, columns=run_exp.columns)
    
    # Step 10
    df = (df.merge(run_exp, left_on='end', right_on='state', how='left')
            .drop(['state','1B_st','2B_st','3B_st','1B_en','2B_en','3B_en','start1B','start2B','start3B',
                   'end1B','end2B','end3B'], axis=1)
            .rename({'RE': 'endRE'}, axis=1))
    
    # Step 11
    df['RV'] = df['runsOnPlay'] + (df['endRE'] - df['startRE'])
    
    return df

# Step 2
# Read in MLBAM Data for 2014-2017
re14 = calc_run_expectancy(pd.read_csv("../Data/MLBAM14.csv"))
re15 = calc_run_expectancy(pd.read_csv("../Data/MLBAM15.csv"))
re16 = calc_run_expectancy(pd.read_csv("../Data/MLBAM16.csv"))
re17 = calc_run_expectancy(pd.read_csv("../Data/MLBAM17.csv"))

Index(['Unnamed: 0', 'inning', 'batterId', 'pitcherId', 'event', 'x', 'y',
       'ab_num', 'timestamp', 'stand', 'throws', 'runnerMovement', 'half',
       'balls', 'strikes', 'endOuts', 'actionId', 'description', 'game_type',
       'home_team', 'home_teamId', 'home_lg', 'away_team', 'away_teamId',
       'away_lg', 'venueId', 'stadium', 'field_teamId', 'playerId.1B',
       'playerId.2B', 'playerId.3B', 'playerId.C', 'playerId.CF',
       'playerId.LF', 'playerId.RF', 'playerId.SS', 'batterPos', 'batterName',
       'pitcherName', 'runsOnPlay', 'startOuts', 'runsInInning', 'runsITD',
       'runsFuture', 'start1B', 'start2B', 'start3B', 'end1B', 'end2B',
       'end3B', 'outsInInning', 'startCode', 'endCode', 'fielderId', 'gameId',
       'isPA', 'isAB', 'isHit', 'isBIP', 'our.x', 'our.y', 'r', 'theta'],
      dtype='object')
Index(['Unnamed: 0', 'inning', 'batterId', 'pitcherId', 'event', 'x', 'y',
       'ab_num', 'timestamp', 'stand', 'throws', 'runnerMovement', 'half',
       'b

In [17]:
re14_17 = [re14, re15, re16, re17]

# Step 3 & 4
rv_event = re14_17[0].groupby('event')['RV'].mean().reset_index().rename({'RV':'RV14'},axis=1)
for re_yr, yr in zip(re14_17[1:], range(15, 18)):
    rv_event = pd.merge(rv_event, re_yr.groupby('event')['RV'].mean().reset_index().rename({'RV':f'RV{yr}'}, axis=1), on='event')

# Step 5
# rv_event = rv_event[rv_event['event'] != 'Sacrifice Bunt DP']  # Row doesn't exist in original frame

# Step 6
corrs = rv_event.corr()

# Step 7
rv_event = rv_event.set_index('event')
rv_event['RVavg'] = rv_event.mean(axis=1)
# mean squared error
rv_event['RVmse'] = ((rv_event['RV14']-rv_event['RVavg'])**2 +
                     (rv_event['RV15']-rv_event['RVavg'])**2 +
                     (rv_event['RV16']-rv_event['RVavg'])**2 +
                     (rv_event['RV17']-rv_event['RVavg'])**2)

In [18]:
# Quiz Q1
corrs.unstack().sort_values()

RV16  RV17    0.994964
RV17  RV16    0.994964
RV15  RV17    0.996412
RV17  RV15    0.996412
RV14  RV17    0.997153
RV17  RV14    0.997153
RV15  RV16    0.997570
RV16  RV15    0.997570
RV14  RV16    0.997951
RV16  RV14    0.997951
RV14  RV15    0.998819
RV15  RV14    0.998819
RV14  RV14    1.000000
RV15  RV15    1.000000
RV16  RV16    1.000000
RV17  RV17    1.000000
dtype: float64

In [19]:
# Quiz Q2
rv_event.sort_values('RVmse', ascending=False)

Unnamed: 0_level_0,RV14,RV15,RV16,RV17,RVavg,RVmse
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Triple Play,-1.608127,-1.688951,-1.637543,-1.471223,-1.601461,0.025963
Sac Fly DP,-0.506647,-0.581165,-0.370443,-0.543715,-0.500493,0.025327
Batter Interference,-0.319625,-0.363838,-0.284649,-0.430019,-0.349533,0.011787
Bunt Lineout,-0.30381,-0.421575,-0.352295,-0.328292,-0.351493,0.007724
Catcher Interference,0.380337,0.318276,0.301623,0.39907,0.349826,0.006675
Fan interference,0.63356,0.577453,0.533316,0.590743,0.583768,0.005113
Grounded Into DP,-0.744986,-0.768327,-0.773318,-0.818005,-0.776159,0.002792
Fielders Choice,0.698076,0.719351,0.701447,0.764112,0.720747,0.002769
Double Play,-0.828774,-0.854665,-0.864981,-0.897164,-0.861396,0.002402
Strikeout - DP,-0.630963,-0.650543,-0.622787,-0.682819,-0.646778,0.002139


In [20]:
# Quiz Q3
rv_event['RV14']

event
Batter Interference    -0.319625
Bunt Groundout         -0.194784
Bunt Lineout           -0.303810
Bunt Pop Out           -0.316440
Catcher Interference    0.380337
Double                  0.737518
Double Play            -0.828774
Fan interference        0.633560
Field Error             0.462976
Fielders Choice         0.698076
Fielders Choice Out    -0.614846
Flyout                 -0.229247
Forceout               -0.314060
Grounded Into DP       -0.744986
Groundout              -0.194175
Hit By Pitch            0.312109
Home Run                1.397934
Intent Walk             0.161638
Lineout                -0.229463
Pop Out                -0.250836
Runner Out             -0.230461
Sac Bunt               -0.098461
Sac Fly                 0.006005
Sac Fly DP             -0.506647
Single                  0.438810
Strikeout              -0.242226
Strikeout - DP         -0.630963
Triple                  1.065651
Triple Play            -1.608127
Walk                    0.297689
Name

# II. Comparing Player Run Values

In [21]:
# Step 1 & 2
rv_player = re14_17[0].groupby(['batterId','batterName'],as_index=False).agg({'RV':'sum'}).rename({'RV':'RV14'},axis=1)
for re_yr, yr in zip(re14_17[1:], range(15, 18)):
    rv_player = pd.merge(rv_player, re_yr.groupby(['batterId','batterName'],as_index=False)
                                         .agg({'RV':'sum'})
                                         .rename({'RV':f'RV{yr}'}, axis=1), 
                         on=['batterId','batterName'],
                         how='inner',
                         suffixes=(f'_{yr}',f'_{int(yr)+1}'))

# Step 3
corrs

# Step 4
reg = smf.ols(formula='RV17~RV14+RV15+RV16', data=rv_player).fit()
reg.summary()

0,1,2,3
Dep. Variable:,RV17,R-squared:,0.308
Model:,OLS,Adj. R-squared:,0.302
Method:,Least Squares,F-statistic:,54.61
Date:,"Wed, 22 Dec 2021",Prob (F-statistic):,3.2e-29
Time:,08:55:37,Log-Likelihood:,-1458.4
No. Observations:,372,AIC:,2925.0
Df Residuals:,368,BIC:,2941.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0472,0.650,0.073,0.942,-1.231,1.326
RV14,0.0620,0.056,1.101,0.272,-0.049,0.173
RV15,0.3509,0.054,6.480,0.000,0.244,0.457
RV16,0.2673,0.059,4.532,0.000,0.151,0.383

0,1,2,3
Omnibus:,39.399,Durbin-Watson:,1.924
Prob(Omnibus):,0.0,Jarque-Bera (JB):,89.833
Skew:,0.548,Prob(JB):,3.11e-20
Kurtosis:,5.144,Cond. No.,20.1


In [22]:
# Quiz Q1: 
# 0.5461 WRONG
# 0.5101 WRONG
# 0.4663 WRONG
# Quiz Q2: 
# 0.305 WRONG
# 0.303 WRONG
# 0.308 CORRECT
# Quiz Q3: 
# 0.2673 WRONG
# 0.062 WRONG
# 0.3509 CORRECT
rv_player.corr()

Unnamed: 0,batterId,RV14,RV15,RV16,RV17
batterId,1.0,-0.14421,-0.003344,-0.056668,0.091117
RV14,-0.14421,1.0,0.466299,0.426629,0.322764
RV15,-0.003344,0.466299,1.0,0.546136,0.510132
RV16,-0.056668,0.426629,0.546136,1.0,0.457391
RV17,0.091117,0.322764,0.510132,0.457391,1.0


# III. Comparing Team Run Values

In [29]:
from functools import reduce
def team_rv(df, yr):
    # Step 1
    df['team'] = np.where(df['half']=='top', df['away_team'], df['home_team'])
    
    # Step 2
    rv_team = df.groupby('team', as_index=False)['RV'].sum().rename({'RV':f'RV{yr}'},axis=1)
    return rv_team

# Step 3
team_rvs = [team_rv(re_, yr) for re_, yr in zip(re14_17, range(14,18))]
team_rvs = reduce(lambda x, y: pd.merge(x, y, on='team'), team_rvs)

# Step 4
corr_tm = team_rvs.corr()

# Step 5
reg = smf.ols(formula='RV17~RV14+RV15+RV16', data=team_rvs).fit()
reg.summary()

0,1,2,3
Dep. Variable:,RV17,R-squared:,0.127
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,1.265
Date:,"Wed, 22 Dec 2021",Prob (F-statistic):,0.307
Time:,09:12:25,Log-Likelihood:,-166.93
No. Observations:,30,AIC:,341.9
Df Residuals:,26,BIC:,347.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.4437,12.406,-0.036,0.972,-25.944,25.056
RV14,-0.0553,0.241,-0.230,0.820,-0.550,0.440
RV15,0.0706,0.240,0.294,0.771,-0.424,0.565
RV16,0.3788,0.231,1.638,0.113,-0.097,0.854

0,1,2,3
Omnibus:,0.584,Durbin-Watson:,1.712
Prob(Omnibus):,0.747,Jarque-Bera (JB):,0.061
Skew:,0.065,Prob(JB):,0.97
Kurtosis:,3.179,Cond. No.,77.2


In [27]:
# Quiz Q1


[   team        RV14
 0   ana  104.641532
 1   ari  -48.253012
 2   atl  -81.885752
 3   bal   46.586965
 4   bos  -37.383914
 5   cha   -1.307578
 6   chn  -54.987532
 7   cin  -57.227567
 8   cle   -3.725729
 9   col   92.456063
 10  det  104.034237
 11  hou  -21.754850
 12  kca   -0.991208
 13  lan   61.903335
 14  mia  -25.016653
 15  mil   -6.885752
 16  min   59.586965
 17  nya  -29.729405
 18  nyn  -38.805740
 19  oak   63.139694
 20  phi  -56.278457
 21  pit   25.746988
 22  sdn -123.071220
 23  sea  -14.834861
 24  sfn    8.877890
 25  sln  -33.176676
 26  tba  -51.805740
 27  tex  -17.176676
 28  tor   69.954225
 29  was   23.114248,
    team        RV15
 0   ana  -25.056067
 1   ari   31.924551
 2   atl -108.464134
 3   bal   29.882644
 4   bos   65.372953
 5   cha  -79.136738
 6   chn   -3.627047
 7   cin  -61.401274
 8   cle   -4.178644
 9   col   47.127799
 10  det   -3.565758
 11  hou   47.413289
 12  kca   39.250376
 13  lan  -17.872201
 14  mia  -73.525423
 15  mil  -2