# Cleaning Data

In [182]:
import pandas as pd
import numpy as np

In [183]:
change_str_to_onehot = True
change_str_to_num = False

Load Dataset

In [184]:
player = 'cole'
# player = 'scherzer'
# player = 'maddux' # does not have plate_x/plate_z for all pitches
# player = 'kershaw'
data = pd.read_csv('./data/' + player + '.csv')
print(data)

       Unnamed: 0 pitch_type   game_date  release_speed  release_pos_x  \
0               0         CH  2013-10-19           85.0          -3.00   
1               1         SL  2013-10-19           84.2          -3.20   
2               2         FF  2013-10-19           95.8          -3.29   
3               3         FF  2013-10-19           94.6          -3.29   
4               4         FF  2013-10-19           93.0          -3.22   
...           ...        ...         ...            ...            ...   
40332        1208         FF  2020-07-23           96.1          -3.28   
40333        1209         CU  2020-07-23           79.7          -3.36   
40334        1210         FF  2020-07-23           96.7          -3.21   
40335        1211         FF  2020-07-23           96.1          -3.38   
40336        1212         FF  2020-07-23           95.3          -3.28   

       release_pos_z    player_name  batter  pitcher     events  ...  \
0               5.25  Scherzer, Max  59

Simplify the batters on base columns:

In [185]:
data['on_base'] = np.sum([
    pd.notnull(data['on_1b']),
    pd.notnull(data['on_2b']),
    pd.notnull(data['on_3b'])
], axis=0)

In the data we have, the pitch (label) and the results of the pitch are in the same row. We would like the label to be on the same row as the previous pitch data, so we need to shift some of the columns down one row. 

In [186]:
to_shift_down = [
    'events',
    'release_speed',
    'zone',
    'type',
    'bb_type',
    'plate_x',
    'plate_z'
]

groups = data.groupby(['game_pk', 'inning'])

list_of_dfs = []
for _, g in groups:
    list_of_dfs.append(g)

for i, df in enumerate(list_of_dfs):
    for column in df:
        if column in to_shift_down:
            df["prev_" + column] = df[column].shift(periods=1)
            df.drop(df.head(1).index, inplace=True)
    list_of_dfs[i] = df
            
cleaned_data = pd.concat(list_of_dfs, axis=0, sort=False)

for col in to_shift_down:
    cleaned_data.pop(col)

Now, we explicitly list the columns that we want to keep:

In [187]:
keep_columns = [
    'pitch_type', # label
    'release_speed', # previous pitch velocity
    'stand',
    'balls',
    'strikes',
    'plate_x',
    'plate_y',
    'plate_z',
    'outs_when_up',
    'inning',
    'game_pk',
    'pitch_num',
    'bat_score',
    'fld_score',
    'on_base',
    'prev_type',
    # 'prev_bb_type',
    'prev_plate_x',
    'prev_plate_z'
]

Delete the unnecessary columns:

In [188]:
for column in cleaned_data:
    if column not in keep_columns:
        cleaned_data.pop(column)

print("Kept Columns:")
for column in cleaned_data:
    print(column)

Kept Columns:
pitch_type
stand
balls
strikes
outs_when_up
inning
game_pk
bat_score
fld_score
on_base
prev_type
prev_plate_x
prev_plate_z


Change pitch_type to a binary classification: 1 for fastball, 0 for not fastball.

In [189]:
change_pitches = True

print("Unique pitch types:", cleaned_data['pitch_type'].unique())

fastball_types = [
    'FF', # Four-seam Fastball
    'FA', # Four-seam Fastball
    'FT', # Two-seam Fastball / Sinker
    'SI'  # Two-seam Fastball / Sinker
]

if change_pitches:
    cleaned_data['pitch_type'] = cleaned_data['pitch_type'].map(lambda pt: 1 if pt in fastball_types else 0)

pct_fastball = cleaned_data['pitch_type'].sum() / len(cleaned_data['pitch_type'])
print('Percent of pitches that are fastballs:', pct_fastball * 100)

Unique pitch types: ['FT' 'FF' 'CH' 'SL' nan 'IN' 'PO' 'FC' 'CU']
Percent of pitches that are fastballs: 58.378045562494506


Change some of the kept data to one-hot format:

In [190]:
change_to_oh = [
    'stand',
    'prev_type',
    # 'prev_bb_type',
]

In [191]:
if change_str_to_onehot:
    for oh_column in change_to_oh:
        drop_first = (oh_column == 'stand')
        dummies = pd.get_dummies(cleaned_data[oh_column], drop_first=drop_first, prefix=oh_column)
        cleaned_data = pd.concat([cleaned_data, dummies], axis=1)
        cleaned_data.pop(oh_column)

print(cleaned_data)

       pitch_type  balls  strikes  outs_when_up  inning  game_pk  bat_score  \
18251           1      1        0             1       4   234156          6   
18252           1      0        0             1       4   234156          6   
18253           1      0        0             0       4   234156          6   
18241           1      1        1             0       5   234156          6   
18242           0      1        0             0       5   234156          6   
...           ...    ...      ...           ...     ...      ...        ...   
39893           1      2        2             0       7   631679          4   
39894           0      1        2             0       7   631679          4   
39895           1      1        1             0       7   631679          4   
39896           1      1        0             0       7   631679          4   
39897           0      0        0             0       7   631679          4   

       fld_score  on_base  prev_plate_x  prev_plate

Change field score and batting score to score differential. Positive differential means the field team (pitching team) is ahead, negative means behind.

In [192]:
fld_score = 'fld_score'
bat_score = 'bat_score'

cleaned_data['score_diff'] = cleaned_data[fld_score] - cleaned_data[bat_score]

cleaned_data.pop(fld_score)
cleaned_data.pop(bat_score)

print(cleaned_data['score_diff'])

18251   -4
18252   -4
18253   -4
18241   -3
18242   -3
        ..
39893    1
39894    1
39895    1
39896    1
39897    1
Name: score_diff, Length: 22738, dtype: int64


Inspect the fully cleaned data:

- Balls: the number of balls 

In [193]:
print(cleaned_data)

       pitch_type  balls  strikes  outs_when_up  inning  game_pk  on_base  \
18251           1      1        0             1       4   234156        0   
18252           1      0        0             1       4   234156        0   
18253           1      0        0             0       4   234156        0   
18241           1      1        1             0       5   234156        0   
18242           0      1        0             0       5   234156        0   
...           ...    ...      ...           ...     ...      ...      ...   
39893           1      2        2             0       7   631679        0   
39894           0      1        2             0       7   631679        0   
39895           1      1        1             0       7   631679        0   
39896           1      1        0             0       7   631679        0   
39897           0      0        0             0       7   631679        0   

       prev_plate_x  prev_plate_z  stand_R  prev_type_B  prev_type_S  \
182

Save the data:

In [194]:
cleaned_data.to_csv('./cleaned_data/' + player + '.csv')

Remove remaining NaNs and write data to `.npy` files

In [195]:
separate_by_inning = False

cleaned_data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

if separate_by_inning:
    groups = cleaned_data.groupby(['game_pk', 'inning'])

    inning_dfs = []
    for _, g in groups:
        inning_dfs.append(g)

    def writeFiles(inning_df):
        game = inning_df.iloc[0]['game_pk']
        inning = inning_df.iloc[0]['inning']
        labels = inning_df.pop('pitch_type').astype('category').cat.codes
        np.save('./cleaned_data/' + player + '/labels/' + player + '_' + str(game) + '_' + str(inning) + '.npy', labels)
        np.save('./cleaned_data/' + player + '/data/' + player + '_' + str(game) + '_' + str(inning) + '.npy', inning_df)

    for inning_df in inning_dfs:
        writeFiles(inning_df)
else:
    l = cleaned_data.pop('pitch_type').astype('category')
    labels = l.cat.codes
    np.save('./cleaned_data/' + player + '/labels.npy', labels)
    np.save('./cleaned_data/' + player + '/data.npy', cleaned_data)

In [196]:
print(l.cat)
print(labels.value_counts())

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x000002059668D0D0>
1    13201
0     9312
dtype: int64
