# Cleaning Data

In [311]:
import pandas as pd
import numpy as np

Load Dataset

In [312]:
player = 'cole'
data = pd.read_csv('./data/' + player + '.csv')
print(data)

       Unnamed: 0 pitch_type   game_date  release_speed  release_pos_x  \
0               0         KC  2018-10-14           80.6          -2.04   
1               1         SL  2018-10-14           88.0          -2.01   
2               2         CH  2018-10-14           86.1          -2.23   
3               3         CH  2018-10-14           87.5          -2.14   
4               4         KC  2018-10-14           80.5          -2.04   
...           ...        ...         ...            ...            ...   
21489        5407         SL  2019-03-29           89.8          -2.35   
21490        5408         FF  2019-03-29           96.3          -2.16   
21491        5409         SL  2019-03-29           88.5          -2.34   
21492        5410         FT  2019-03-29           96.7          -2.34   
21493        5411         FF  2019-03-29           96.2          -2.24   

       release_pos_z   player_name  batter  pitcher     events  ... fld_score  \
0               5.61  Cole, Ge

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Simplify the batters on base columns:

In [313]:
data['on_base'] = np.sum([
    pd.notnull(data['on_1b']),
    pd.notnull(data['on_2b']),
    pd.notnull(data['on_3b'])
], axis=0)

In the data we have, the pitch (label) and the results of the pitch are in the same row. We would like the label to be on the same row as the previous pitch data, so we need to shift some of the columns down one row. 

In [314]:
to_shift_down = [
    'events',
    'zone',
    'type',
    'bb_type',
    'plate_x',
    'plate_z'
]

groups = data.groupby(['game_pk', 'inning'])

list_of_dfs = []
for _, g in groups:
    list_of_dfs.append(g)

for i, df in enumerate(list_of_dfs):
    for column in df:
        if column in to_shift_down:
            df["prev_" + column] = df[column].shift(periods=1)
            df.drop(df.head(1).index, inplace=True)
    list_of_dfs[i] = df
            
cleaned_data = pd.concat(list_of_dfs, axis=0, sort=False)

for col in to_shift_down:
    cleaned_data.pop(col)

Now, we explicitly list the columns that we want to keep:

In [315]:
keep_columns = [
    'pitch_type',
    'release_speed',
    'stand',
    'balls',
    'strikes',
    'plate_x',
    'plate_y',
    'plate_z',
    'outs_when_up',
    'inning',
    'game_pk',
    'pitch_num',
    'bat_score',
    'fld_score',
    'on_base',
    'prev_type',
    'prev_bb_type',
    'prev_plate_x',
    'prev_plate_z'
]

Delete the unnecessary columns:

In [316]:
for column in cleaned_data:
    if column not in keep_columns:
        cleaned_data.pop(column)

print("Kept Columns:")
for column in cleaned_data:
    print(column)

Kept Columns:
pitch_type
release_speed
stand
balls
strikes
outs_when_up
inning
game_pk
bat_score
fld_score
on_base
prev_type
prev_bb_type
prev_plate_x
prev_plate_z


Change some of the kept data to one-hot format:

In [317]:
change_to_oh = [
    'stand',
    'prev_type',
    'prev_bb_type',
]

In [318]:
for oh_column in change_to_oh:
    drop_first = (oh_column == 'stand')
    dummies = pd.get_dummies(cleaned_data[oh_column], drop_first=drop_first, prefix=oh_column)
    cleaned_data = pd.concat([cleaned_data, dummies], axis=1)
    cleaned_data.pop(oh_column)
    
print(cleaned_data)

      pitch_type  release_speed  balls  strikes  outs_when_up  inning  \
16070         SL           86.0      0        2             2       1   
16071         SI           97.6      0        1             2       1   
16072         FF           97.2      0        0             2       1   
16073         CH           87.5      0        2             1       1   
16074         FF           96.4      0        1             1       1   
...          ...            ...    ...      ...           ...     ...   
16092         KC           83.4      0        1             1       5   
16093         SL           89.1      0        0             1       5   
16094         KC           82.9      1        1             0       5   
16095         FF           96.1      1        0             0       5   
16096         FF           97.1      0        0             0       5   

       game_pk  bat_score  fld_score  on_base  prev_plate_x  prev_plate_z  \
16070   347712          0          0        1 

Save the data:

In [319]:
cleaned_data.to_csv('./cleaned_data/' + player + '.csv')

Write data to `.npy` files

In [320]:
groups = data.groupby(['game_pk', 'inning'])

inning_dfs = []
for _, g in groups:
    inning_dfs.append(g)
    
def writeFiles(inning_df):
    game = inning_df.iloc[0]['game_pk']
    inning = inning_df.iloc[0]['inning']
    labels = inning_df.pop('pitch_type').astype('category').cat.codes
    np.save('./cleaned_data/cole/cole_' + str(game) + '_' + str(inning) + '_labels.npy', labels)
    np.save('./cleaned_data/cole/cole_' + str(game) + '_' + str(inning) + '_data.npy', inning_df)

for inning_df in inning_dfs:
    writeFiles(inning_df)

In [321]:
np.load('./cleaned_data/cole/cole_347712_1_data.npy', allow_pickle=True)

array([[16064, '2013-06-11', 97.7, ..., 0.042, -0.352, 2],
       [16065, '2013-06-11', 99.6, ..., 0.0, 0.05, 2],
       [16066, '2013-06-11', 97.4, ..., 0.0, -0.092, 2],
       ...,
       [16079, '2013-06-11', 100.1, ..., 0.022, -0.141, 0],
       [16080, '2013-06-11', 97.2, ..., 0.0, -0.046, 0],
       [16081, '2013-06-11', 96.8, ..., 0.0, -0.033, 0]], dtype=object)