# Cleaning Data

In [283]:
import pandas as pd
import numpy as np

In [284]:
change_str_to_onehot = True
change_str_to_num = False

Load Dataset

In [285]:
player = 'cole'
# player = 'scherzer'
# player = 'kershaw'

data = pd.read_csv('./data/' + player + '.csv')
print(data)

       Unnamed: 0 pitch_type   game_date  release_speed  release_pos_x  \
0               0         KC  2018-10-14           80.6          -2.04   
1               1         SL  2018-10-14           88.0          -2.01   
2               2         CH  2018-10-14           86.1          -2.23   
3               3         CH  2018-10-14           87.5          -2.14   
4               4         KC  2018-10-14           80.5          -2.04   
...           ...        ...         ...            ...            ...   
21489        5407         SL  2019-03-29           89.8          -2.35   
21490        5408         FF  2019-03-29           96.3          -2.16   
21491        5409         SL  2019-03-29           88.5          -2.34   
21492        5410         FT  2019-03-29           96.7          -2.34   
21493        5411         FF  2019-03-29           96.2          -2.24   

       release_pos_z   player_name  batter  pitcher     events  ... fld_score  \
0               5.61  Cole, Ge

Drop NA in the `pitch_type` column, which is our label

In [286]:
data.dropna(how='any', subset=['pitch_type'])

Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,0,KC,2018-10-14,80.6,-2.04,5.61,"Cole, Gerrit",543877,543037,field_out,...,4,4,5,5,4,Standard,Strategic,27.0,-0.010,-0.084
1,1,SL,2018-10-14,88.0,-2.01,5.74,"Cole, Gerrit",543877,543037,,...,4,4,5,5,4,Standard,Strategic,133.0,0.000,-0.016
2,2,CH,2018-10-14,86.1,-2.23,5.45,"Cole, Gerrit",598265,543037,field_out,...,4,4,5,5,4,Infield shift,Standard,220.0,-0.015,-0.160
3,3,CH,2018-10-14,87.5,-2.14,5.37,"Cole, Gerrit",598265,543037,,...,4,4,5,5,4,Infield shift,Standard,229.0,0.000,-0.034
4,4,KC,2018-10-14,80.5,-2.04,5.63,"Cole, Gerrit",598265,543037,,...,4,4,5,5,4,Infield shift,Standard,33.0,0.000,0.027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21489,5407,SL,2019-03-29,89.8,-2.35,5.95,"Cole, Gerrit",640457,543037,,...,0,0,0,0,0,Infield shift,Standard,158.0,0.000,0.000
21490,5408,FF,2019-03-29,96.3,-2.16,5.89,"Cole, Gerrit",640457,543037,,...,0,0,0,0,0,Infield shift,Standard,209.0,0.000,-0.073
21491,5409,SL,2019-03-29,88.5,-2.34,5.96,"Cole, Gerrit",640457,543037,,...,0,0,0,0,0,Infield shift,Standard,174.0,0.000,0.054
21492,5410,FT,2019-03-29,96.7,-2.34,5.77,"Cole, Gerrit",640457,543037,,...,0,0,0,0,0,Infield shift,Standard,225.0,0.000,0.027


Simplify the batters on base columns:

In [287]:
data['on_base'] = np.sum([
    pd.notnull(data['on_1b']),
    pd.notnull(data['on_2b']),
    pd.notnull(data['on_3b'])
], axis=0)

In the data we have, the pitch (label) and the results of the pitch are in the same row. We would like the label to be on the same row as the previous pitch data, so we need to shift some of the columns down one row. 

In [288]:
to_shift_down = [
    'events',
    'release_speed',
    'zone',
    'type',
    'bb_type',
    'plate_x',
    'plate_z',
    'pitch_type'
]

groups = data.groupby(['game_pk', 'inning'])

list_of_dfs = []
for _, g in groups:
    list_of_dfs.append(g)

for i, df in enumerate(list_of_dfs):
    for column in df:
        if column in to_shift_down:
            df["prev_" + column] = df[column].shift(periods=1)
            df.drop(df.head(1).index, inplace=True)
    list_of_dfs[i] = df
            
cleaned_data = pd.concat(list_of_dfs, axis=0, sort=False)

for col in to_shift_down:
    if col != 'pitch_type':
        cleaned_data.pop(col)

cleaned_data = cleaned_data.sort_index()

Now, we explicitly list the columns that we want to keep:

In [289]:
keep_columns = [
    'pitch_type', # label
    'prev_pitch_type', # include the previous pitch type as well
    # 'release_speed', # previous pitch velocity
    'stand',
    'balls',
    'strikes',
    'plate_x',
    'plate_y',
    'plate_z',
    'outs_when_up',
    'inning',
    'game_pk',
    'pitch_num',
    'bat_score',
    'fld_score',
    'on_base',
    'prev_type',
    # 'prev_bb_type',
    'prev_plate_x',
    'prev_plate_z'
]

Delete the unnecessary columns:

In [290]:
for column in cleaned_data:
    if column not in keep_columns:
        cleaned_data.pop(column)

print("Kept Columns:")
for column in cleaned_data:
    print(column)

Kept Columns:
pitch_type
stand
balls
strikes
outs_when_up
inning
game_pk
bat_score
fld_score
on_base
prev_pitch_type
prev_type
prev_plate_x
prev_plate_z


Change pitch_type to a binary classification: 1 for fastball, 0 for not fastball.

In [291]:
change_pitches = True

print("Unique pitch types:", cleaned_data['pitch_type'].unique())

fastball_types = [
    'FF', # Four-seam Fastball
    'FA', # Four-seam Fastball
    'FT', # Two-seam Fastball / Sinker
    'SI'  # Two-seam Fastball / Sinker
]

if change_pitches:
    cleaned_data['pitch_type'] = cleaned_data['pitch_type'].map(lambda pt: 1 if pt in fastball_types else 0)
    cleaned_data['prev_pitch_type'] = cleaned_data['prev_pitch_type'].map(lambda pt: 1 if pt in fastball_types else 0)

pct_fastball = cleaned_data['pitch_type'].sum() / len(cleaned_data['pitch_type'])
print('Percent of pitches that are fastballs:', pct_fastball * 100)

Unique pitch types: ['FF' 'CH' 'SL' 'KC' nan 'FT' 'PO' 'SI' 'IN' 'CU']
Percent of pitches that are fastballs: 61.64656727756148


Change some of the kept data to one-hot format:

In [292]:
change_to_oh = [
    'stand',
    'prev_type',
    # 'prev_bb_type',
]

In [293]:
if change_str_to_onehot:
    for oh_column in change_to_oh:
        drop_first = (oh_column == 'stand')
        dummies = pd.get_dummies(cleaned_data[oh_column], drop_first=drop_first, prefix=oh_column)
        cleaned_data = pd.concat([cleaned_data, dummies], axis=1)
        cleaned_data.pop(oh_column)

print(cleaned_data)

       pitch_type  balls  strikes  outs_when_up  inning  game_pk  bat_score  \
8               1      0        0             0       6   563394          5   
17              0      0        1             0       5   563394          5   
18              1      0        0             0       5   563394          5   
34              0      0        2             1       3   563394          2   
35              0      0        1             1       3   563394          2   
...           ...    ...      ...           ...     ...      ...        ...   
21489           0      2        2             0       1   566665          0   
21490           1      2        1             0       1   566665          0   
21491           0      1        1             0       1   566665          0   
21492           1      0        1             0       1   566665          0   
21493           1      0        0             0       1   566665          0   

       fld_score  on_base  prev_pitch_type  prev_pl

Change field score and batting score to score differential. Positive differential means the field team (pitching team) is ahead, negative means behind.

In [294]:
fld_score = 'fld_score'
bat_score = 'bat_score'

cleaned_data['score_diff'] = cleaned_data[fld_score] - cleaned_data[bat_score]

cleaned_data.pop(fld_score)
cleaned_data.pop(bat_score)

print(cleaned_data['score_diff'])

8       -1
17      -1
18      -1
34       2
35       2
        ..
21489    0
21490    0
21491    0
21492    0
21493    0
Name: score_diff, Length: 10531, dtype: int64


Inspect the fully cleaned data:

- Balls: the number of balls 

In [295]:
print(cleaned_data)

       pitch_type  balls  strikes  outs_when_up  inning  game_pk  on_base  \
8               1      0        0             0       6   563394        0   
17              0      0        1             0       5   563394        0   
18              1      0        0             0       5   563394        0   
34              0      0        2             1       3   563394        3   
35              0      0        1             1       3   563394        3   
...           ...    ...      ...           ...     ...      ...      ...   
21489           0      2        2             0       1   566665        0   
21490           1      2        1             0       1   566665        0   
21491           0      1        1             0       1   566665        0   
21492           1      0        1             0       1   566665        0   
21493           1      0        0             0       1   566665        0   

       prev_pitch_type  prev_plate_x  prev_plate_z  stand_R  prev_type_B  \

Save the data:

In [296]:
cleaned_data.to_csv('./cleaned_data/' + player + '.csv')

Remove remaining NaNs and write data to `.npy` files

In [297]:
import os, re, os.path

playerPath = 'cleaned_data/' + player
dataPath = playerPath + '/data'
labelsPath = playerPath + '/labels'

def deleteAllFilesInFolder(mypath):
    for root, dirs, files in os.walk(mypath):
        for file in files:
            os.remove(os.path.join(root, file))

deleteAllFilesInFolder(dataPath)
deleteAllFilesInFolder(labelsPath)
deleteAllFilesInFolder(playerPath)

In [298]:
cleaned_data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

#########################################
# Write data separated by inning for LSTM
#########################################

inning_data = cleaned_data.copy(deep=True)

groups = inning_data.groupby(['game_pk', 'inning'])

inning_dfs = []
for _, g in groups:
    inning_dfs.append(g)

def writeFiles(inning_df):
    game = inning_df.iloc[0]['game_pk']
    inning = inning_df.iloc[0]['inning']
    labels = inning_df.pop('pitch_type').astype('category').cat.codes
    np.save('./cleaned_data/' + player + '/labels/' + player + '_' + str(int(game)) + '_' + str(int(inning)) + '.npy', labels)
    np.save('./cleaned_data/' + player + '/data/' + player + '_' + str(int(game)) + '_' + str(int(inning)) + '.npy', inning_df)

for inning_df in inning_dfs:
    writeFiles(inning_df)

#####################################
# Write all data at once, unseparated
#####################################

l = cleaned_data.pop('pitch_type').astype('category')
labels = l.cat.codes
np.save('./cleaned_data/' + player + '/labels.npy', labels)
np.save('./cleaned_data/' + player + '/data.npy', cleaned_data)

In [299]:
print(l.cat)
print(labels.value_counts())

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x0000018C10B1BCD0>
1    6481
0    4020
dtype: int64
