# Cleaning Data

In [50]:
import pandas as pd
import numpy as np

In [51]:
change_str_to_onehot = True
change_str_to_num = False

Load Dataset

In [52]:
# player = 'cole'
# player = 'scherzer'
# player = 'maddux' # does not have plate_x/plate_z for all pitches
player = 'kershaw'
data = pd.read_csv('./data/' + player + '.csv')
print(data)

       Unnamed: 0 pitch_type   game_date  release_speed  release_pos_x  \
0               0        NaN  2003-10-03            NaN            NaN   
1               1        NaN  2003-10-03            NaN            NaN   
2               2        NaN  2003-10-03            NaN            NaN   
3               3        NaN  2003-10-03            NaN            NaN   
4               4        NaN  2003-10-03            NaN            NaN   
...           ...        ...         ...            ...            ...   
29742       14212        NaN  2004-04-07            NaN            NaN   
29743       14213        NaN  2004-04-07            NaN            NaN   
29744       14214        NaN  2004-04-07            NaN            NaN   
29745       14215        NaN  2004-04-07            NaN            NaN   
29746       14216        NaN  2004-04-07            NaN            NaN   

       release_pos_z   player_name  batter  pitcher        events  ...  \
0                NaN  Maddux, Greg  1

Simplify the batters on base columns:

In [53]:
data['on_base'] = np.sum([
    pd.notnull(data['on_1b']),
    pd.notnull(data['on_2b']),
    pd.notnull(data['on_3b'])
], axis=0)

In the data we have, the pitch (label) and the results of the pitch are in the same row. We would like the label to be on the same row as the previous pitch data, so we need to shift some of the columns down one row. 

In [54]:
to_shift_down = [
    'events',
    'release_speed',
    'zone',
    'type',
    'bb_type',
    'plate_x',
    'plate_z'
]

groups = data.groupby(['game_pk', 'inning'])

list_of_dfs = []
for _, g in groups:
    list_of_dfs.append(g)

for i, df in enumerate(list_of_dfs):
    for column in df:
        if column in to_shift_down:
            df["prev_" + column] = df[column].shift(periods=1)
            df.drop(df.head(1).index, inplace=True)
    list_of_dfs[i] = df
            
cleaned_data = pd.concat(list_of_dfs, axis=0, sort=False)

for col in to_shift_down:
    cleaned_data.pop(col)

Now, we explicitly list the columns that we want to keep:

In [55]:
keep_columns = [
    'pitch_type', # label
    'release_speed', # previous pitch velocity
    'stand',
    'balls',
    'strikes',
    'plate_x',
    'plate_y',
    'plate_z',
    'outs_when_up',
    'inning',
    'game_pk',
    'pitch_num',
    'bat_score',
    'fld_score',
    'on_base',
    'prev_type',
    # 'prev_bb_type',
    'prev_plate_x',
    'prev_plate_z'
]

Delete the unnecessary columns:

In [56]:
for column in cleaned_data:
    if column not in keep_columns:
        cleaned_data.pop(column)

print("Kept Columns:")
for column in cleaned_data:
    print(column)

Kept Columns:
pitch_type
stand
balls
strikes
outs_when_up
inning
game_pk
bat_score
fld_score
on_base
prev_type
prev_plate_x
prev_plate_z


Change pitch_type to a binary classification: 1 for fastball, 0 for not fastball.

In [57]:
change_pitches = True

print("Unique pitch types:", cleaned_data['pitch_type'].unique())

fastball_types = [
    'FF', # Four-seam Fastball
    'FA', # Four-seam Fastball
    'FT', # Two-seam Fastball / Sinker
    'SI'  # Two-seam Fastball / Sinker
]

if change_pitches:
    cleaned_data['pitch_type'] = cleaned_data['pitch_type'].map(lambda pt: 1 if pt in fastball_types else 0)

pct_fastball = cleaned_data['pitch_type'].sum() / len(cleaned_data['pitch_type'])
print('Percent of pitches that are fastballs:', pct_fastball * 100)

Unique pitch types: [nan 'SL' 'FA' 'CH' 'CU' 'FC' 'SI' 'PO' 'FF' 'IN']
Percent of pitches that are fastballs: 10.635575145741505


Change some of the kept data to one-hot format:

In [58]:
change_to_oh = [
    'stand',
    'prev_type',
    # 'prev_bb_type',
]

In [59]:
if change_str_to_onehot:
    for oh_column in change_to_oh:
        drop_first = (oh_column == 'stand')
        dummies = pd.get_dummies(cleaned_data[oh_column], drop_first=drop_first, prefix=oh_column)
        cleaned_data = pd.concat([cleaned_data, dummies], axis=1)
        cleaned_data.pop(oh_column)

print(cleaned_data)

       pitch_type  balls  strikes  outs_when_up  inning  game_pk  bat_score  \
14352           0      1        1             1       2      112          0   
14353           0      0        1             1       2      112          0   
14354           0      0        0             1       2      112          0   
14355           0      0        1             1       2      112          0   
14356           0      0        0             1       2      112          0   
...           ...    ...      ...           ...     ...      ...        ...   
15550           1      1        0             1       5   243837          3   
15551           1      0        0             1       5   243837          3   
15552           1      0        0             0       5   243837          3   
15553           0      0        1             0       5   243837          3   
15554           1      0        0             0       5   243837          3   

       fld_score  on_base  prev_plate_x  prev_plate

Change field score and batting score to score differential. Positive differential means the field team (pitching team) is ahead, negative means behind.

In [60]:
fld_score = 'fld_score'
bat_score = 'bat_score'

cleaned_data['score_diff'] = cleaned_data[fld_score] - cleaned_data[bat_score]

cleaned_data.pop(fld_score)
cleaned_data.pop(bat_score)

print(cleaned_data['score_diff'])

14352    0
14353    0
14354    0
14355    0
14356    0
        ..
15550   -3
15551   -3
15552   -3
15553   -3
15554   -3
Name: score_diff, Length: 14066, dtype: int64


Inspect the fully cleaned data:

- Balls: the number of balls 

In [61]:
print(cleaned_data)

       pitch_type  balls  strikes  outs_when_up  inning  game_pk  on_base  \
14352           0      1        1             1       2      112        2   
14353           0      0        1             1       2      112        2   
14354           0      0        0             1       2      112        2   
14355           0      0        1             1       2      112        1   
14356           0      0        0             1       2      112        1   
...           ...    ...      ...           ...     ...      ...      ...   
15550           1      1        0             1       5   243837        1   
15551           1      0        0             1       5   243837        1   
15552           1      0        0             0       5   243837        1   
15553           0      0        1             0       5   243837        0   
15554           1      0        0             0       5   243837        0   

       prev_plate_x  prev_plate_z  stand_R  prev_type_B  prev_type_S  \
143

Save the data:

In [62]:
cleaned_data.to_csv('./cleaned_data/' + player + '.csv')

Remove remaining NaNs and write data to `.npy` files

In [63]:
import os, re, os.path

playerPath = 'cleaned_data/' + player
dataPath = playerPath + '/data'
labelsPath = playerPath + '/labels'

def deleteAllFilesInFolder(mypath):
    for root, dirs, files in os.walk(mypath):
        for file in files:
            os.remove(os.path.join(root, file))

deleteAllFilesInFolder(dataPath)
deleteAllFilesInFolder(labelsPath)
deleteAllFilesInFolder(playerPath)

In [64]:
cleaned_data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

#########################################
# Write data separated by inning for LSTM
#########################################

inning_data = cleaned_data.copy(deep=True)

groups = inning_data.groupby(['game_pk', 'inning'])

inning_dfs = []
for _, g in groups:
    inning_dfs.append(g)

def writeFiles(inning_df):
    game = inning_df.iloc[0]['game_pk']
    inning = inning_df.iloc[0]['inning']
    labels = inning_df.pop('pitch_type').astype('category').cat.codes
    np.save('./cleaned_data/' + player + '/labels/' + player + '_' + str(int(game)) + '_' + str(int(inning)) + '.npy', labels)
    np.save('./cleaned_data/' + player + '/data/' + player + '_' + str(int(game)) + '_' + str(int(inning)) + '.npy', inning_df)

for inning_df in inning_dfs:
    writeFiles(inning_df)

#####################################
# Write all data at once, unseparated
#####################################

l = cleaned_data.pop('pitch_type').astype('category')
labels = l.cat.codes
np.save('./cleaned_data/' + player + '/labels.npy', labels)
np.save('./cleaned_data/' + player + '/data.npy', cleaned_data)

In [65]:
# print(l.cat)
# print(labels.value_counts())