In [2]:
#Load packages
import pandas as pd
import numpy as np

# Display configuration: how much of a long/wide frame pandas prints in cell output
nrows = 40
pd.set_option('display.min_rows', nrows)     # rows shown once a repr gets truncated
pd.set_option('display.max_rows', nrows)     # row count above which a repr is truncated
pd.set_option('display.max_columns', 50)     # column count above which a repr is truncated

# I. Coding Run Expectancy Dataset (2017)

In [3]:
# Read in MLBAM Data for 2017
# (relative path: resolved against the kernel's current working directory)

MLBAM17 = pd.read_csv("../Data/MLBAM17.csv")

In [10]:
# Run through all steps in a single function
def calc_run_expectancy(data):
    """Build a play-level run-expectancy / run-value table from play-by-play data.

    Every play is given a ``start`` and an ``end`` state string of the form
    ``"<1B><2B><3B> <outs>"`` (for example ``"101 2"``), where each base digit
    is 1 when the corresponding ``start*B`` / ``end*B`` column is not NaN.
    The run expectancy (RE) of a state is the mean of ``runsFuture`` over all
    retained plays that start in that state; any state with 3 outs has RE 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Play-by-play frame containing at least the columns selected in Step 2.
        It is not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        One row per retained play (see Step 7) with the descriptive columns,
        the ``start`` / ``end`` state strings, ``startRE``, ``endRE`` and
        ``RV = runsOnPlay + (endRE - startRE)``. The raw base-runner columns
        are dropped. ``endRE`` is NaN only for end states with fewer than
        3 outs that never occur as a start state.
    """
    # Step 2: work on a copy of just the columns used below
    df = data[['batterName','batterId','event', 'start1B', 'start2B', 'start3B', 'end1B', 'end2B', 'end3B',
               'startOuts','endOuts','runsFuture','runsOnPlay','outsInInning','stand', 'throws','venueId',
               'stadium', 'batterPos']].copy()

    # Steps 3-6: encode base occupancy plus the out count as a state string,
    # once for the situation before the play and once for the situation after it
    for when in ('start', 'end'):
        first, second, third = (df[when + base].notna().astype(int).astype(str)
                                for base in ('1B', '2B', '3B'))
        df[when] = first + second + third + " " + df[when + 'Outs'].astype(str)

    # Step 7: keep plays that changed the state or scored a run, and only
    # from innings that were played to three outs
    changed_state = df['start'] != df['end']
    scored = df['runsOnPlay'] > 0
    full_inning = df['outsInInning'] == 3
    df = df[(changed_state | scored) & full_inning]

    # Step 8: run expectancy of each start state = mean runs scored from there on
    run_exp = (df.groupby('start')['runsFuture'].mean()
                 .rename('RE').rename_axis('state').reset_index())
    df = df.merge(run_exp.rename(columns={'state': 'start', 'RE': 'startRE'}),
                  on='start', how='left')

    # Step 9: append the three-out states, whose run expectancy is 0 by definition.
    # They are derived from the base configurations seen at the start OR the end of
    # a play, so an inning-ending play is never left without an endRE just because
    # its final base configuration does not occur as a start state.
    base_states = pd.concat([df['start'], df['end']]).str[:3].unique()
    terminal = pd.DataFrame({'state': [bases + " 3" for bases in base_states]}).assign(RE=0.0)
    run_exp = pd.concat([run_exp, terminal], ignore_index=True)

    # Step 10: attach the run expectancy of the end state and drop the raw runner columns
    df = (df.merge(run_exp.rename(columns={'RE': 'endRE'}),
                   left_on='end', right_on='state', how='left')
            .drop(columns=['state', 'start1B', 'start2B', 'start3B',
                           'end1B', 'end2B', 'end3B']))

    # Step 11: run value = runs scored on the play plus the change in run expectancy
    df['RV'] = df['runsOnPlay'] + (df['endRE'] - df['startRE'])

    return df

# Play-level run expectancy / run value table for the 2017 season
re17 = calc_run_expectancy(data=MLBAM17)

In [22]:
# Quiz Q1: share of each event type among the retained 2017 plays, in percent
re17['event'].value_counts(normalize=True).mul(100)

Strikeout               21.571737
Groundout               17.910593
Single                  14.434022
Flyout                  10.697100
Walk                     7.975600
Lineout                  6.078452
Pop Out                  4.682519
Double                   4.487662
Home Run                 3.256275
Grounded Into DP         2.056822
Forceout                 1.970219
Hit By Pitch             0.946680
Field Error              0.850334
Sac Fly                  0.613799
Intent Walk              0.500674
Sac Bunt                 0.490931
Triple                   0.427061
Double Play              0.228957
Runner Out               0.214343
Fielders Choice Out      0.165087
Bunt Groundout           0.151555
Strikeout - DP           0.080108
Bunt Pop Out             0.070906
Fielders Choice          0.049255
Fan interference         0.023816
Catcher Interference     0.023275
Batter Interference      0.018944
Sac Fly DP               0.011908
Bunt Lineout             0.007578
Triple Play   

In [27]:
# Quiz Q2: number of 2017 plays whose start state contains '111' (bases loaded)
re17['start'].str.contains('111').sum()

4364

In [36]:
# Quiz Q3: 2017 strikeouts per batter position, smallest count first
(re17.loc[re17['event'].eq('Strikeout')]
     .groupby('batterPos')['batterName']
     .count()
     .sort_values())

batterPos
PH       1
P     1872
DH    2354
UN    2594
2B    3710
SS    3734
3B    4023
C     4090
LF    4317
CF    4374
RF    4386
1B    4399
Name: batterName, dtype: int64

# II. Coding Run Expectancy Dataset (2016)

In [37]:
# Read in MLBAM Data for 2016
# (relative path: resolved against the kernel's current working directory)

MLBAM16 = pd.read_csv("../Data/MLBAM16.csv")

In [38]:
# Play-level run expectancy / run value table for the 2016 season
re16 = calc_run_expectancy(data=MLBAM16)

In [43]:
# Quiz Q1
# Share of each event type among the retained 2016 plays, as a fraction of 1
# (this cell does not multiply by 100).
re16['event'].value_counts(normalize=True)

Strikeout               0.210505
Groundout               0.184451
Single                  0.148367
Flyout                  0.110479
Walk                    0.076279
Lineout                 0.058300
Pop Out                 0.046326
Double                  0.044257
Home Run                0.030062
Forceout                0.020966
Grounded Into DP        0.020168
Hit By Pitch            0.008884
Field Error             0.008634
Sac Fly                 0.006402
Sac Bunt                0.005414
Intent Walk             0.004838
Triple                  0.004703
Double Play             0.002427
Runner Out              0.002292
Bunt Groundout          0.001808
Fielders Choice Out     0.001553
Bunt Pop Out            0.000755
Strikeout - DP          0.000733
Fielders Choice         0.000521
Fan interference        0.000239
Batter Interference     0.000206
Catcher Interference    0.000206
Sac Fly DP              0.000130
Bunt Lineout            0.000049
Triple Play             0.000038
Sacrifice 

In [47]:
# Quiz Q2: number of 2016 plays whose start state does not contain '000',
# i.e. plays that began with at least one runner on base
(~re16['start'].str.contains('000')).sum()

80387

In [50]:
# Quiz Q3: 2016 home runs per batter position, smallest count first
(re16.loc[re16['event'].eq('Home Run')]
     .groupby('batterPos')['batterName']
     .count()
     .sort_values())

batterPos
P      22
UN    178
DH    424
SS    486
CF    532
C     541
2B    576
LF    576
RF    694
3B    710
1B    797
Name: batterName, dtype: int64

# III. Comparing 2016 vs. 2017

In [58]:
# Step 1: mean run value of every event type, one table per season
rv_events16 = re16.groupby('event')['RV'].mean().rename('RV16').reset_index()
rv_events17 = re17.groupby('event')['RV'].mean().rename('RV17').reset_index()

# Step 2: line the two seasons up by event (default inner join keeps events seen in both)
rv_events = pd.merge(rv_events16, rv_events17, on='event')

# Step 3: total run value accumulated under each batter name, one table per season
rv_batter16 = re16.groupby('batterName')['RV'].sum().rename('RV16').reset_index()
rv_batter17 = re17.groupby('batterName')['RV'].sum().rename('RV17').reset_index()

# Step 4: keep only batter names that appear in both seasons
rv_batter = pd.merge(rv_batter16, rv_batter17, how='inner', on='batterName')

In [62]:
# Quiz Q1: the five batter names with the highest total run value in 2017
rv_batter17.sort_values(by='RV17', ascending=False).head(5)

Unnamed: 0,batterName,RV17
909,Votto,67.650742
87,Blackmon,62.032793
445,Judge,57.081625
37,Arenado,56.111576
331,Goldschmidt,56.078941


In [64]:
# Quiz Q2: the five batter names with the lowest total run value in 2016
rv_batter16.sort_values(by='RV16', ascending=True).head(5)

Unnamed: 0,batterName,RV16
370,Hechavarria,-32.142529
611,"Norris, De",-26.84125
245,"Escobar, A",-26.55553
102,Burns,-23.335961
808,Shuck,-23.297166


In [73]:
# Quiz Q3: events whose mean run value moved the most (in absolute terms) from 2016 to 2017
rv_events['diff'] = rv_events['RV17'].sub(rv_events['RV16'])
rv_events.sort_values(by='diff', key=lambda col: col.abs(), ascending=False).head(5)

Unnamed: 0,event,RV16,RV17,diff
23,Sac Fly DP,-0.370443,-0.543715,-0.173272
28,Triple Play,-1.637543,-1.471223,0.166319
0,Batter Interference,-0.284649,-0.430019,-0.145369
4,Catcher Interference,0.301623,0.39907,0.097447
9,Fielders Choice,0.701447,0.764112,0.062665


In [74]:
# Quiz Q4: batter names with the largest gain in total run value from 2016 to 2017
rv_batter['diff'] = rv_batter['RV17'].sub(rv_batter['RV16'])
rv_batter.sort_values(by='diff', ascending=False).head(5)

Unnamed: 0,batterName,RV16,RV17,diff
312,Judge,-4.179157,57.081625,61.260782
683,Zimmerman,-22.942623,33.179676,56.1223
230,"Gonzalez, M",-16.649256,35.951866,52.601121
595,Stanton,10.070038,51.784088,41.714049
121,Conforto,-3.607318,35.82716,39.434477
