In [202]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Read the raw pitch-level export and turn game_date into real datetimes so the
# date can be formatted when the per-pitch key is built.
nola = pd.read_csv("Nola_all.csv").assign(
    game_date=lambda raw: pd.to_datetime(raw["game_date"])
)

In [203]:
#Create index because sv_id was incomplete
# Key layout: YYYYMMDD + at_bat_number (zero-padded to 3) + pitch_number (zero-padded to 2).
# The key is assembled column-wise rather than written one element at a time via
# `nola.pitch_pk[ix] = ...`: that chained assignment only emits a
# SettingWithCopyWarning on older pandas (silenced by the warnings filter above)
# and writes nothing at all once copy-on-write is enabled.
nola["pitch_pk"] = (
    nola["game_date"].dt.strftime("%Y%m%d")
    + nola["at_bat_number"].astype(str).str.zfill(3)
    + nola["pitch_number"].astype(str).str.zfill(2)
)

# The keys are fixed-width as long as at_bat_number < 1000 and pitch_number < 100,
# so sorting the strings orders the pitches by date, at-bat and pitch.
# reset_index(drop=True) yields a clean 0..n-1 index without creating a throwaway
# 'index' column that then has to be dropped again.
nola = (
    nola.sort_values(by="pitch_pk")
    .reset_index(drop=True)
    .drop(columns=['player_name', 'game_date', 'batter_name', 'home_team',
                   'away_team', 'inning_topbot', 'description', 'des', 'at_bat_number'])
)

In [139]:
#Create pitch count & batter count.
# pitch_count : 1-based running number of pitches within the current game.
# batter_count: 1-based running number of batters within the current game; it
#               goes up every time the batter differs from the previous row.
#
# Computed with grouped cumulative operations instead of element-wise writes such
# as `nola.pitch_count[ix] = i`, which are chained assignments (warning-only on
# older pandas, a no-op under copy-on-write) and relied on the index being 0..n-1.
# Both columns come out as integers rather than object columns seeded with None.

# A new "game run" starts whenever game_pk differs from the row above; this is the
# same adjacency test the counters are meant to reset on.
game_run = nola["game_pk"].ne(nola["game_pk"].shift()).cumsum()

nola["pitch_count"] = nola.groupby(game_run).cumcount() + 1

# A row opens a new batter if the batter changed or a new game started.
starts_new_batter = (
    nola["batter"].ne(nola["batter"].shift()) | game_run.ne(game_run.shift())
)
nola["batter_count"] = starts_new_batter.astype(int).groupby(game_run).cumsum()

In [148]:
#Dummies made it look like runners are always on
# Invert each base indicator: 0 becomes 1 and any non-zero value becomes 0.
# NOTE: this is a toggle - running the cell a second time flips the flags back.
for base_col in ("on_3b", "on_2b", "on_1b"):
    nola[base_col] = np.logical_not(nola[base_col]).astype(int)

In [149]:
#Now make an interaction variable for balls and strikes. Higher favors pitcher.
# count_rating   : 0.5 baseline, +0.25 per strike, -(0.5/3) per ball,
#                  so a 3-0 count is a 0 and a 0-2 count is a 1.
# runner_pressure: bases weighted 1/2/3 (first/second/third) and divided by 6.
# score_diff     : fld_score minus bat_score.
nola = nola.assign(
    count_rating=lambda df: 0.5 + df["strikes"] * 0.25 - df["balls"] * (0.5 / 3),
    runner_pressure=lambda df: (df["on_1b"] + 2 * df["on_2b"] + 3 * df["on_3b"]) / 6.0,
    score_diff=lambda df: df["fld_score"] - df["bat_score"],
)

In [180]:
#We want to use wOBA, a measure of at-bat success, and launch-speed angle, a measure of contact quality.
#To make them informative, they should be historical averages taking into account the extent of history.
#Engineer a feature to show the interaction.
#
# For every row that carries a woba_value (in the sample output that is the last
# pitch of a plate appearance - confirm for the full data set):
#   woba_denom = number of EARLIER rows with a woba_value for the same batter
#   woba_hist  = sum of woba_value over those earlier rows (missing when there are none)
# so woba_hist / woba_denom is the batter's historical average going into the row.
# Rows without a woba_value stay missing in both columns.
#
# Why this replaces the nested loop: the backwards scan started at the current row
# itself, so it always matched immediately, saw that woba_denom was still unset,
# wrote 0 and stopped. woba_hist was therefore never populated and woba_denom was 0
# everywhere. The accumulating branch would also have added the current row's own
# outcome to its "history". Only earlier outcomes are used here.
#
# Requires rows in chronological order (the frame is sorted by pitch_pk earlier).
# TODO: build the same pair of features for launch_speed_angle
# (launch_spang_hist / launch_spang_denom).
pa_rows = nola.loc[nola["woba_value"].notna(), ["batter", "woba_value"]]
by_batter = pa_rows.groupby("batter", sort=False)

# cumcount() is 0 for a batter's first outcome, 1 for the second, and so on.
prior_count = by_batter.cumcount()

# Shift within each batter so the running total excludes the current outcome;
# the first outcome for a batter has no history and stays NaN.
prior_total = (
    by_batter["woba_value"].shift()
    .groupby(pa_rows["batter"], sort=False)
    .cumsum()
)

# Align back onto the full frame; the nullable Int64 dtype keeps the counts
# integral while still allowing missing values on the other rows.
nola["woba_denom"] = prior_count.reindex(nola.index).astype("Int64")
nola["woba_hist"] = prior_total.reindex(nola.index)

In [209]:
# Preview the first five rows of the working frame.
# NOTE(review): the saved output under this cell does not contain the engineered
# columns (pitch_count, count_rating, ...), so it predates those cells - re-run
# the notebook top to bottom before relying on it.
nola.head(5)

Unnamed: 0,pitch_type,batter,pitch_number,inning,bat_score,fld_score,stand,balls,strikes,outs_when_up,on_3b,on_2b,on_1b,woba_value,launch_speed_angle,game_pk,pitch_pk
0,FF,444379,1,1,0,0,1,0,0,0,1,1,1,,,415056,2015072100101
1,FF,444379,2,1,0,0,1,1,0,0,1,1,1,,,415056,2015072100102
2,FF,444379,3,1,0,0,1,1,1,0,1,1,1,,,415056,2015072100103
3,SI,444379,4,1,0,0,1,2,1,0,1,1,1,1.25,,415056,2015072100104
4,FF,519306,1,1,0,0,0,0,0,0,1,0,1,,,415056,2015072100201


In [204]:
# Rows with a non-null batter per game, largest first.
nola.groupby('game_pk')['batter'].count().sort_values(ascending=False)

game_pk
567208    117
565970    115
530622    114
630853    113
631153    113
         ... 
448253     68
529416     68
415964     67
415638     65
565899     63
Name: batter, Length: 139, dtype: int64

In [205]:
# Rows with a non-null batter per pitch type, most frequent first.
nola.groupby('pitch_type')['batter'].count().sort_values(ascending=False)

pitch_type
CU    4138
FF    4015
SI    2761
CH    2256
Name: batter, dtype: int64

In [208]:
# Rows with a non-null inning per batter, most-seen batters first.
nola.groupby('batter')['inning'].count().sort_values(ascending=False)

batter
543685    182
518692    180
624424    178
455976    169
665742    160
         ... 
591741      2
543194      1
621261      1
621219      1
647336      1
Name: inning, Length: 506, dtype: int64

In [None]:
# Persist the cleaned, feature-augmented frame; the positional index is not written.
nola.to_csv(path_or_buf='Nola_clean.csv', index=False)