All data was scraped from the [Baseball Savant](https://baseballsavant.mlb.com/) website using the `pybaseball` library.

# Imports

In [74]:
import pybaseball as pb
import project_baseball as project

In [75]:
import pandas as pd
import numpy as np

# Show every column when a wide frame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Import Data
# The path is relative to the notebook's working directory
mlb2020 = pd.read_csv('data/2020mlb.csv')
mlb2020.shape

(279660, 90)

For a full explanation of each column (the data dictionary), go [here](https://baseballsavant.mlb.com/csv-docs).

# Cleaning

---
The cleaning will be shown on the `mlb2020` dataset but will be applied on past data as well.

This function, defined in the `project_baseball.py` file, renames the pitcher, batter, and fielder columns.

```python
def renaming_fielders(mlb):
    """Rename the pitcher, batter and fielder columns to readable names.

    The frame is modified in place and also returned, so both
    ``renaming_fielders(df)`` and ``df = renaming_fielders(df)`` work.

    Parameters
    ----------
    mlb : pandas.DataFrame
        Frame whose columns are renamed. Labels that are not present are
        left alone (``DataFrame.rename`` ignores missing labels by default).

    Returns
    -------
    pandas.DataFrame
        The same ``mlb`` object, with the renamed columns.
    """
    mlb.rename(columns={
        'player_name': 'Pitcher_name',  # Rename pitcher
        'fielder_2': 'Catcher',         # rename Catcher
        # Rename other fielders
        'fielder_3': 'FirstBasemen',
        'fielder_4': 'SecondBasemen',
        'fielder_5': 'ThirdBasemen',
        'fielder_6': 'ShortStop',
        'fielder_7': 'LeftField',
        'fielder_8': 'CenterField',
        'fielder_9': 'RightField',
        # Player ids. This rename used to be a separate call without
        # inplace=True whose result was discarded, so it had no effect.
        'batter': 'batter_id',
        'pitcher': 'pitcher_id',
    }, inplace=True)
    return mlb
```

In [76]:
# Rename the pitcher/batter/fielder columns. The return value is only displayed, not
# assigned, so this presumably relies on the helper renaming `mlb2020` in place
# (the snippet above uses inplace=True) — verify against project_baseball.py.
project.renaming_fielders(mlb2020)

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,Pitcher_name,batter_id,pitcher_id,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,Catcher,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,0,FF,2020-10-27,96.7,1.58,5.99,Julio Urias,642715.0,628711.0,strikeout,called_strike,,,,,4.0,Willy Adames called out on strikes.,W,R,L,LAD,TB,S,2.0,,0.0,2.0,2020.0,0.2,1.6,-0.53,2.29,,,,2.0,9.0,Top,,,,,605131.0,,,-5.950264,-140.490456,-7.897391,3.772000,32.321911,-8.981441,3.50,1.69,,,,95.4,2615.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.82,,,0.0,1.0,0.0,0.0,,65.0,3.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
1,1,FF,2020-10-27,94.1,2.91,5.45,Julio Urias,642715.0,628711.0,,called_strike,,,,,1.0,,W,R,L,LAD,TB,S,,,0.0,1.0,2020.0,0.8,1.3,-0.55,3.03,,,,2.0,9.0,Top,,,,,605131.0,,,-10.560246,-136.599519,-3.429867,11.723598,29.183810,-15.237217,3.50,1.69,,,,93.4,2470.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.59,,,,,,,,65.0,2.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
2,2,FF,2020-10-27,94.9,1.77,6.02,Julio Urias,642715.0,628711.0,,swinging_strike,,,,,2.0,,W,R,L,LAD,TB,S,,,0.0,0.0,2020.0,0.2,1.5,-0.04,3.32,,,,2.0,9.0,Top,,,,,605131.0,,,-5.199252,-138.098234,-4.637970,4.158758,30.838499,-12.535677,3.50,1.69,,,,94.0,2397.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.76,,,,,,,,65.0,1.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
3,3,FF,2020-10-27,94.4,1.66,5.93,Julio Urias,670712.0,628711.0,strikeout,called_strike,,,,,4.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,S,2.0,,3.0,2.0,2020.0,0.3,1.5,-0.37,2.15,,,,1.0,9.0,Top,,,,,605131.0,,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,3.34,1.53,,,,93.7,2508.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.60,,,0.0,1.0,0.0,0.0,,64.0,6.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
4,4,CU,2020-10-27,81.4,1.46,6.06,Julio Urias,670712.0,628711.0,,ball,,,,,13.0,,W,R,L,LAD,TB,B,,,2.0,2.0,2020.0,-1.7,-0.2,-0.14,0.96,,,,1.0,9.0,Top,,,,,605131.0,,,-0.383207,-118.447810,-4.454166,-15.553576,24.451936,-32.892744,3.34,1.53,,,,80.2,3031.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.84,,,,,,,,64.0,5.0,Curveball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279655,20155,FF,2020-07-23,96.1,-3.28,5.13,Max Scherzer,543305.0,453286.0,field_out,hit_into_play,,,,,5.0,"Aaron Hicks grounds out sharply, second basema...",R,L,R,WSH,NYY,X,4.0,ground_ball,2.0,2.0,2020.0,-0.8,1.2,-0.19,2.41,,,,0.0,1.0,Top,163.49,140.27,,,435559.0,,,9.899700,-139.626080,-4.473902,-12.016982,30.128689,-14.962333,3.41,1.56,91.0,106.7,4.0,96.3,2450.0,6.4,630851.0,453286.0,435559.0,519346.0,516770.0,452678.0,607208.0,664057.0,645302.0,594809.0,54.09,0.657,0.591,0.0,1.0,0.0,0.0,4.0,1.0,5.0,4-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infield shift,Standard
279656,20157,CU,2020-07-23,79.7,-3.36,5.48,Max Scherzer,543305.0,453286.0,,called_strike,,,,,7.0,,R,L,R,WSH,NYY,S,,,2.0,1.0,2020.0,0.9,-0.8,-0.41,1.85,,,,0.0,1.0,Top,,,,,435559.0,,,4.691537,-116.064292,0.319935,7.680712,22.130450,-39.867212,3.51,1.65,,,,79.6,2846.0,6.2,630851.0,453286.0,435559.0,519346.0,516770.0,452678.0,607208.0,664057.0,645302.0,594809.0,54.28,,,,,,,,1.0,4.0,Curveball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infield shift,Standard
279657,20159,FF,2020-07-23,96.7,-3.21,5.33,Max Scherzer,543305.0,453286.0,,called_strike,,,,,9.0,,R,L,R,WSH,NYY,S,,,2.0,0.0,2020.0,-0.8,1.3,0.48,2.07,,,,0.0,1.0,Top,,,,,435559.0,,,11.547336,-140.289508,-5.998194,-12.798480,30.880442,-14.099707,3.33,1.65,,,,96.4,2421.0,6.2,630851.0,453286.0,435559.0,519346.0,516770.0,452678.0,607208.0,664057.0,645302.0,594809.0,54.29,,,,,,,,1.0,3.0,4-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infield shift,Standard
279658,20162,FF,2020-07-23,96.1,-3.38,5.33,Max Scherzer,543305.0,453286.0,,ball,,,,,11.0,,R,L,R,WSH,NYY,B,,,1.0,0.0,2020.0,-0.8,1.4,-1.17,2.75,,,,0.0,1.0,Top,,,,,435559.0,,,7.600636,-139.767747,-4.483222,-11.964765,30.151437,-12.476899,3.58,1.75,,,,95.9,2477.0,6.1,630851.0,453286.0,435559.0,519346.0,516770.0,452678.0,607208.0,664057.0,645302.0,594809.0,54.38,,,,,,,,1.0,2.0,4-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infield shift,Standard


In [77]:
# Drop what we don't need: the *_deprecated fields, umpire, sv_id and the
# '.1'-suffixed pitcher / fielder_2 columns.
unused_columns = [
    'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated',
    'tfs_deprecated', 'tfs_zulu_deprecated', 'umpire', 'sv_id', 'pitcher.1', 'fielder_2.1',
]
mlb2020.drop(columns=unused_columns, inplace=True)

In [78]:
# Make datetime
mlb2020['game_date']=pd.to_datetime(mlb2020['game_date'])

In [79]:
mlb2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279660 entries, 0 to 279659
Data columns (total 80 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   index                            279660 non-null  int64         
 1   pitch_type                       279089 non-null  object        
 2   game_date                        279660 non-null  datetime64[ns]
 3   release_speed                    279652 non-null  float64       
 4   release_pos_x                    279652 non-null  float64       
 5   release_pos_z                    279652 non-null  float64       
 6   Pitcher_name                     279660 non-null  object        
 7   batter_id                        279660 non-null  float64       
 8   pitcher_id                       279660 non-null  float64       
 9   events                           70437 non-null   object        
 10  description                      279660 non-

In [80]:
mlb2020.columns

Index(['index', 'pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'Pitcher_name', 'batter_id', 'pitcher_id', 'events',
       'description', 'zone', 'des', 'game_type', 'stand', 'p_throws',
       'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'balls',
       'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b',
       'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'hc_x',
       'hc_y', 'Catcher', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top',
       'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle',
       'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk',
       'FirstBasemen', 'SecondBasemen', 'ThirdBasemen', 'ShortStop',
       'LeftField', 'CenterField', 'RightField', 'release_pos_y',
       'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
       'woba_value', 'woba_denom', 'babip_value', 'iso_value',
       'launch_speed_angle', 'at_bat_number', '

In [81]:
mlb2020[mlb2020['pitch_type']=='FO']['pitch_name']

249966    Forkball
250000    Forkball
252757    Forkball
Name: pitch_name, dtype: object

In [82]:
# Rename CS to CU for Curveball.
# Use .replace() rather than .map(): mapping through a dict turns every code that is
# not listed in it (e.g. 'FT') into NaN, which the later fillna('U') would then label
# as unknown. .replace() only changes 'CS' and leaves every other code untouched.
mlb2020['pitch_type'] = mlb2020['pitch_type'].replace({'CS': 'CU'})

# Dealing with Nulls

---
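I considered dropping the rows with a null `release_speed`, since that would most likely remove the same rows that are null in the release position, zone, movement, velocity, and acceleration columns (8 rows each). The drop is currently commented out in the cell below, so those rows are kept.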

In [83]:
# re run to check every so often
pd.DataFrame(mlb2020.isnull().sum()).T

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,Pitcher_name,batter_id,pitcher_id,events,description,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,Catcher,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,0,571,0,8,8,8,0,0,0,209223,0,8,209224,0,0,0,0,0,0,219112,233143,0,0,0,8,8,8,8,252778,226576,191254,0,0,0,233192,233192,0,8,8,8,8,8,8,8,8,195879,197092,197092,8,354,595,0,0,0,0,0,0,0,0,8,233450,233450,209223,209530,209223,209223,233450,0,0,571,0,0,0,0,0,0,0,0,525,525


In [84]:
# mlb2020 = mlb2020[mlb2020['release_speed'].notnull()].copy() # dropped 8 nulls

For `pitch_type`, I will replace the nulls with "U", and for `pitch_name` with "Unknown".

In [85]:
# Assign the filled columns back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form triggers a FutureWarning in recent pandas and
# no longer updates the frame once Copy-on-Write is enabled.
mlb2020['pitch_type'] = mlb2020['pitch_type'].fillna('U')
mlb2020['pitch_name'] = mlb2020['pitch_name'].fillna('Unknown')

For fielding alignments, I will also fill with "Unknown". Although they may or may not be vital for our model, I will still apply them.

In [86]:
# Assign the filled columns back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form triggers a FutureWarning in recent pandas and
# no longer updates the frame once Copy-on-Write is enabled.
mlb2020['if_fielding_alignment'] = mlb2020['if_fielding_alignment'].fillna('Unknown')
mlb2020['of_fielding_alignment'] = mlb2020['of_fielding_alignment'].fillna('Unknown')

The other NaNs depend on the batter/runners producing hits, steals, or sacrifice flies. Those will remain and may not play a role.

# Feature Engineering 

---
## Strike Attempt

This will show whether a pitch results in a strike or another event.

In [87]:
# Collapse each pitch's `description` into a coarse outcome label:
# 'strike', 'ball', 'out' or 'ob' (on base).
# Good for creating a model to determine if a ball is a strike or not.
# Descriptions that are not listed in the mapping become NaN.
# NOTE(review): 'pitchout' used to map to 'out'; if project.strike_attempt_column
# carries the same mapping, it presumably needs the same correction — verify.
mlb2020['strike_attempt']=mlb2020['description'].map(
{'called_strike':'strike',
 'swinging_strike':'strike',
 'ball':'ball',
 'foul':'strike', # False in out_via_description
 'hit_into_play':'out',
 'blocked_ball':'ball',
 'hit_into_play_score':'ob',
 'swinging_strike_blocked':'strike',
 'hit_into_play_no_out':'ob',
 'foul_bunt':'strike',
 'foul_tip':'strike',
 'hit_by_pitch':'ob',
 'missed_bunt':'strike',
 'pitchout':'ball', # a pitchout is thrown deliberately wide of the zone: it is a ball, not an out
 'bunt_foul_tip':'strike'})

In [88]:
mlb2020['strike_attempt'].value_counts(normalize=True)

strike    0.463309
ball      0.367157
out       0.107009
ob        0.062526
Name: strike_attempt, dtype: float64

## Create Batter Name

---


```python
def batter_name(des):
    """Return the first two words of a play description as the batter's name.

    Parameters
    ----------
    des : str or any
        Play description, e.g. "Willy Adames called out on strikes.".
        Non-string values (such as the NaN of a pitch without a description)
        are accepted.

    Returns
    -------
    str or float
        The first two space-separated words joined by one space, or ``np.nan``
        when ``des`` is not a string.

    Notes
    -----
    Only two words are kept, so names with more than two words are shortened.
    Assumes the description starts with the batter's name — TODO confirm this
    holds for every event type.
    """
    # Check the type explicitly instead of wrapping the split in a bare
    # `except:`, which would also hide unrelated errors.
    if not isinstance(des, str):
        return np.nan
    return ' '.join(des.split(' ', 2)[:2])
# Rows without a description take the name from the nearest row above that has one.
# The filled Series is assigned directly: ffill(inplace=True) on a column selection is
# a chained in-place update that stops modifying the frame under pandas Copy-on-Write.
mlb2020['batter_name'] = mlb2020['des'].map(batter_name).ffill()
mlb2020.head()
```

In [89]:
def batter_name(des):
    """Return the first two words of a play description as the batter's name.

    Parameters
    ----------
    des : str or any
        Play description, e.g. "Willy Adames called out on strikes.".
        Non-string values (such as the NaN of a pitch without a description)
        are accepted.

    Returns
    -------
    str or float
        The first two space-separated words joined by one space, or ``np.nan``
        when ``des`` is not a string.

    Notes
    -----
    Only two words are kept, so names with more than two words are shortened.
    Assumes the description starts with the batter's name — TODO confirm this
    holds for every event type.
    """
    # Check the type explicitly instead of wrapping the split in a bare
    # `except:`, which would also hide unrelated errors.
    if not isinstance(des, str):
        return np.nan
    return ' '.join(des.split(' ', 2)[:2])

# Rows without a description take the name from the nearest row above that has one.
# The filled Series is assigned directly: ffill(inplace=True) on a column selection is
# a chained in-place update that stops modifying the frame under pandas Copy-on-Write.
mlb2020['batter_name'] = mlb2020['des'].map(batter_name).ffill()
# mlb2020.head(10)

Creates an easier way to access the count of a current pitch.

```python
def the_count(df):
    """Add a ``count`` column of the form "<balls>-<strikes>" and return the frame.

    Each ``balls`` / ``strikes`` value is converted with ``int`` and then
    ``str`` before the two parts are joined with a dash. The column is
    written onto ``df`` itself, and that same object is returned.
    """
    ball_part = df['balls'].map(lambda value: str(int(value)))
    strike_part = df['strikes'].map(lambda value: str(int(value)))
    df['count'] = ball_part + "-" + strike_part
    return df
```

In [90]:
# Build the count label "<balls>-<strikes>": every value goes through int, then str,
# and the two parts are joined with a dash.
# NOTE(review): the recorded output contains '4-2', which is not a regular count —
# presumably a data-entry anomaly; worth checking those rows.
ball_part = mlb2020['balls'].map(lambda value: str(int(value)))
strike_part = mlb2020['strikes'].map(lambda value: str(int(value)))
mlb2020['count'] = ball_part + "-" + strike_part
mlb2020['count'].unique()

array(['0-2', '0-1', '0-0', '3-2', '2-2', '1-2', '1-1', '1-0', '2-1',
       '2-0', '3-1', '3-0', '4-2'], dtype=object)

---
# Select Features

If I were to go with trying to figure out whether a pitch was a `strike`, `ball`, `out`, or `on-base`, I would select these features.

```python
features = ['game_date','pitch_type','Pitcher_name', 'batter_id','pitcher_id','release_speed', 'release_pos_x', 'release_pos_z', 'stand', 'p_throws', 'balls', 'strikes',
            'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'inning', 'inning_topbot',
            'effective_speed', 'release_spin_rate', 'release_extension','Catcher','FirstBasemen', 'SecondBasemen', 'ThirdBasemen', 'ShortStop',
            'LeftField', 'CenterField', 'RightField', 'at_bat_number', 'pitch_number', 'pitch_name',
            'bat_score', 'fld_score', 'post_bat_score', 'post_fld_score', 'if_fielding_alignment', 'of_fielding_alignment',
            'count', 'strike_attempt','events','description','type','bb_type','vx0','vy0','vz0',
            'ax','ay','az','outs_when_up']

target = mlb2020['strike_attempt']  # or mlb2020['pitch_name']
```

If I wanted to predict the next pitch of a sequence based on factors such as lineup or current at-bat, I would need these features, most likely.

```python
features = <need to select>

target = mlb2020['pitch_name', 'pitch_speed'] # anything after pitch name is debatable
```

In [91]:
# Columns kept for modelling. The list order becomes the column order of mlb2020new,
# and the same list is reused further down to subset the 2016-2019 frames.
features = ['game_date','pitch_type','Pitcher_name','pitcher_id','batter_name', 'batter_id','release_speed', 'release_pos_x', 'release_pos_z', 'stand', 'p_throws', 'balls', 'strikes',
            'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'inning', 'inning_topbot',
            'effective_speed', 'release_spin_rate', 'release_extension','Catcher','FirstBasemen', 'SecondBasemen', 'ThirdBasemen', 'ShortStop',
            'LeftField', 'CenterField', 'RightField', 'at_bat_number', 'pitch_number', 'pitch_name',
            'bat_score', 'fld_score', 'post_bat_score', 'post_fld_score', 'if_fielding_alignment', 'of_fielding_alignment',
            'count', 'strike_attempt','events','description','type','bb_type','vx0','vy0','vz0',
            'ax','ay','az','outs_when_up']
# .copy() makes mlb2020new an independent frame rather than a selection of mlb2020
mlb2020new = mlb2020[features].copy()

# Results

In [92]:
mlb2020new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279660 entries, 0 to 279659
Data columns (total 55 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   game_date              279660 non-null  datetime64[ns]
 1   pitch_type             279660 non-null  object        
 2   Pitcher_name           279660 non-null  object        
 3   pitcher_id             279660 non-null  float64       
 4   batter_name            279660 non-null  object        
 5   batter_id              279660 non-null  float64       
 6   release_speed          279652 non-null  float64       
 7   release_pos_x          279652 non-null  float64       
 8   release_pos_z          279652 non-null  float64       
 9   stand                  279660 non-null  object        
 10  p_throws               279660 non-null  object        
 11  balls                  279660 non-null  float64       
 12  strikes                279660 non-null  floa

In [93]:
mlb2020new.sample(5)

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
200435,2020-08-14,FF,Justin Wilson,458677.0,Jay Bruce,457803.0,95.3,1.58,6.57,L,L,0.0,2.0,0.5,1.7,-0.65,2.7,,,516416.0,8.0,Bot,94.7,2245.0,6.0,467092.0,624413.0,572821.0,665926.0,642708.0,642086.0,607043.0,624424.0,70.0,3.0,4-Seam Fastball,5.0,4.0,5.0,4.0,Infield shift,Standard,0-2,strike,,foul,S,,-6.845119,-138.46487,-8.205981,7.510568,30.18321,-9.207605,1.0
51629,2020-09-19,SI,Nate Jones,518858.0,Nick Madrigal,663611.0,95.1,-1.26,6.22,R,R,0.0,0.0,-1.4,1.3,0.96,1.03,,,,9.0,Top,94.4,2516.0,5.9,571466.0,458015.0,519058.0,553993.0,520471.0,608385.0,669222.0,592206.0,68.0,1.0,Sinker,5.0,0.0,5.0,0.0,Standard,Standard,0-0,ball,,ball,B,,8.807103,-137.91956,-10.723336,-19.50801,28.109883,-13.6625,2.0
12889,2020-09-30,CU,Tyler Glasnow,607192.0,Lourdes Gurriel,666971.0,81.6,-1.5,6.25,R,R,0.0,1.0,0.5,-1.6,0.76,1.54,,,,5.0,Top,82.3,3106.0,7.3,572287.0,670712.0,664040.0,650490.0,642715.0,622534.0,595281.0,592669.0,46.0,2.0,Curveball,2.0,8.0,2.0,8.0,Standard,Standard,0-1,strike,,swinging_strike,S,,4.259492,-118.571869,-0.790301,3.405501,27.710315,-46.878804,2.0
104081,2020-09-06,CU,Masahiro Tanaka,547888.0,Rio Ruiz,547004.0,76.4,-1.42,5.72,L,R,0.0,0.0,0.5,-0.5,0.83,0.63,,,,4.0,Bot,75.9,2594.0,6.0,456124.0,645801.0,518934.0,609280.0,642180.0,643565.0,458731.0,640449.0,30.0,1.0,Curveball,2.0,1.0,2.0,1.0,Infield shift,Standard,0-0,ball,,ball,B,,3.727574,-111.173482,-2.826898,3.831759,20.43721,-35.704322,0.0
171448,2020-08-21,SI,Matt Shoemaker,533167.0,Joey Wendle,621563.0,90.9,-1.87,6.11,L,R,0.0,0.0,-1.2,1.1,-1.37,2.9,,596847.0,,1.0,Bot,91.0,2074.0,6.5,643376.0,665489.0,592273.0,543768.0,669289.0,666971.0,545341.0,624415.0,8.0,1.0,Sinker,1.0,0.0,1.0,0.0,Standard,Standard,0-0,ball,,ball,B,,3.948463,-132.272785,-4.402896,-15.402631,29.927042,-18.908887,2.0


In [94]:
# Save to new csv
mlb2020new.to_csv('data/2020cleaned.csv', index=False)

In [95]:
# NOTE(review): this reads 'data/cleaned2020.csv', which is not the file written by the
# previous cell ('data/2020cleaned.csv'). Presumably it is an earlier cleaned export kept
# for comparison — confirm the path is intentional.
check = pd.read_csv('data/cleaned2020.csv')

In [96]:
check.head()

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,events,description,stand,p_throws,balls,strikes,type,bb_type,vx0,vy0,vz0,ax,ay,az,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,effective_speed,pitcher,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,home_team,away_team,first,second,third
0,0,FF,2020-10-27,96.7,1.58,5.99,Julio Urias,642715.0,strikeout,called_strike,R,L,0.0,2.0,S,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,0.2,1.6,-0.53,2.29,,,,2.0,9.0,Top,95.4,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,3.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0
1,1,FF,2020-10-27,94.1,2.91,5.45,Julio Urias,642715.0,,called_strike,R,L,0.0,1.0,S,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,0.8,1.3,-0.55,3.03,,,,2.0,9.0,Top,93.4,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,2.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0
2,2,FF,2020-10-27,94.9,1.77,6.02,Julio Urias,642715.0,,swinging_strike,R,L,0.0,0.0,S,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,0.2,1.5,-0.04,3.32,,,,2.0,9.0,Top,94.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,1.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0
3,3,FF,2020-10-27,94.4,1.66,5.93,Julio Urias,670712.0,strikeout,called_strike,R,L,3.0,2.0,S,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,0.3,1.5,-0.37,2.15,,,,1.0,9.0,Top,93.7,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,6.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0
4,4,CU,2020-10-27,81.4,1.46,6.06,Julio Urias,670712.0,,ball,R,L,2.0,2.0,B,,-0.383207,-118.44781,-4.454166,-15.553576,24.451936,-32.892744,-1.7,-0.2,-0.14,0.96,,,,1.0,9.0,Top,80.2,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,5.0,Curveball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0


In [97]:
# Print every column of mlb2020new that the `check` frame does not have
check_columns = check.columns
for col in mlb2020new.columns:
    if col in check_columns:
        continue
    print(col)

Pitcher_name
pitcher_id
batter_name
batter_id
release_spin_rate
release_extension
Catcher
FirstBasemen
SecondBasemen
ThirdBasemen
ShortStop
LeftField
CenterField
RightField
count
strike_attempt


In [98]:
check[check['type']=='X']['events'].unique()

array(['field_out', 'grounded_into_double_play', 'home_run', 'double',
       'single', 'fielders_choice', 'triple', 'force_out', 'sac_bunt',
       'sac_fly', 'fielders_choice_out', 'double_play', 'field_error',
       'sac_fly_double_play', 'triple_play'], dtype=object)

In [99]:
# Print every column of the original cleaned file (`check`) that mlb2020new does not
# have, i.e. the candidates to add into mlb2020new
new_columns = mlb2020new.columns
for col in check.columns:
    if col in new_columns:
        continue
    print(col)

index
player_name
batter
pitcher
fielder_2
fielder_3
fielder_4
fielder_5
fielder_6
fielder_7
fielder_8
fielder_9
home_team
away_team
first
second
third


In [100]:
# Check the changes from the original to now
mlb2020.shape, mlb2020new.shape

((279660, 83), (279660, 55))

After selecting the columns I want for this, I will import 2019 and before data. I want to concatenate and clean it up so I can put it in a model soon.

---
# Import the other years and apply changes


In [101]:
mlb2020new.head()

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,2020-10-27,FF,Julio Urias,628711.0,Willy Adames,642715.0,96.7,1.58,5.99,R,L,0.0,2.0,0.2,1.6,-0.53,2.29,,,,9.0,Top,95.4,2615.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,3.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-2,strike,strikeout,called_strike,S,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,2.0
1,2020-10-27,FF,Julio Urias,628711.0,Willy Adames,642715.0,94.1,2.91,5.45,R,L,0.0,1.0,0.8,1.3,-0.55,3.03,,,,9.0,Top,93.4,2470.0,5.9,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,2.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-1,strike,,called_strike,S,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,2.0
2,2020-10-27,FF,Julio Urias,628711.0,Willy Adames,642715.0,94.9,1.77,6.02,R,L,0.0,0.0,0.2,1.5,-0.04,3.32,,,,9.0,Top,94.0,2397.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,1.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-0,strike,,swinging_strike,S,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,2.0
3,2020-10-27,FF,Julio Urias,628711.0,Mike Brosseau,670712.0,94.4,1.66,5.93,R,L,3.0,2.0,0.3,1.5,-0.37,2.15,,,,9.0,Top,93.7,2508.0,5.9,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,6.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,3-2,strike,strikeout,called_strike,S,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,1.0
4,2020-10-27,CU,Julio Urias,628711.0,Mike Brosseau,670712.0,81.4,1.46,6.06,R,L,2.0,2.0,-1.7,-0.2,-0.14,0.96,,,,9.0,Top,80.2,3031.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,5.0,Curveball,1.0,3.0,1.0,3.0,Standard,Standard,2-2,ball,,ball,B,,-0.383207,-118.44781,-4.454166,-15.553576,24.451936,-32.892744,1.0


In [102]:
# Load the four earlier seasons, newest first, then show each frame's shape
season_files = ('data/2019mlb.csv', 'data/2018mlb.csv', 'data/2017mlb.csv', 'data/2016mlb.csv')
mlb2019, mlb2018, mlb2017, mlb2016 = map(pd.read_csv, season_files)
mlb2019.shape, mlb2018.shape, mlb2017.shape, mlb2016.shape

((743572, 90), (731207, 90), (732476, 90), (726023, 90))

In [103]:
# Apply changes to 2016-2019 data (this cell: 2019).
# Run the project_baseball helpers in order, each one receiving the previous result,
# then keep only the selected feature columns.
for step in (project.cleaning, project.strike_attempt_column,
             project.fill_in_batters, project.the_count):
    mlb2019 = step(mlb2019)
mlb2019 = mlb2019[features]
print(mlb2019.shape)
mlb2019.sample(3)

(743572, 55)


Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
86751,2019-09-11,SI,Brusdar Graterol,660813.0,Adam Eaton,594809.0,99.3,-1.25,6.47,L,R,2.0,2.0,-1.3,0.8,0.18,1.92,,,,7.0,Top,98.8,1996.0,5.9,488771.0,553902.0,570731.0,501303.0,593871.0,650333.0,664774.0,592696.0,53.0,7.0,Sinker,5.0,2.0,5.0,2.0,Standard,Standard,2-2,strike,,foul,S,,6.75522,-144.273524,-8.753788,-18.788046,30.987968,-20.049696,1.0
100795,2019-09-08,FF,Ryan Brasier,518489.0,Gary Sanchez,596142.0,97.2,-1.21,5.72,R,R,2.0,2.0,-0.4,1.4,-0.06,2.44,,,544369.0,5.0,Top,98.6,2390.0,7.0,506702.0,519048.0,571788.0,646240.0,593428.0,643217.0,598265.0,605141.0,40.0,5.0,4-Seam Fastball,7.0,4.0,7.0,4.0,Standard,Standard,2-2,strike,strikeout,swinging_strike,S,,4.103698,-141.553618,-6.543501,-6.698093,31.627955,-12.29496,0.0
145716,2019-08-28,SL,Tanner Roark,543699.0,Meibrys Viloria,650619.0,88.4,-2.13,5.99,L,R,2.0,0.0,0.2,0.6,0.86,2.75,,656811.0,,2.0,Bot,88.3,2191.0,5.8,543302.0,621566.0,595777.0,640461.0,543760.0,664913.0,592192.0,543257.0,19.0,3.0,Slider,0.0,2.0,0.0,2.0,Standard,Standard,2-0,strike,,foul,S,,6.900045,-128.699067,-3.158942,0.565006,22.095825,-25.007257,2.0


In [104]:
# 2018: run the project_baseball helpers in order, each one receiving the previous
# result, then keep only the selected feature columns.
for step in (project.cleaning, project.strike_attempt_column,
             project.fill_in_batters, project.the_count):
    mlb2018 = step(mlb2018)
mlb2018 = mlb2018[features]
print(mlb2018.shape)
mlb2018.sample(3)

(731207, 55)


Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
468023,2018-06-04,SL,Louis Coleman,488786.0,Giancarlo Stanton,519317.0,82.6,-3.87,4.75,R,R,3.0,2.0,0.6,-0.1,0.27,1.38,,,596142.0,7.0,Top,82.6,2410.0,6.4,543510.0,408234.0,500135.0,600869.0,592348.0,622682.0,547982.0,592206.0,58.0,6.0,Slider,2.0,3.0,2.0,3.0,Standard,Strategic,3-2,strike,strikeout,swinging_strike,S,,8.198005,-120.029645,-1.055199,4.575824,23.392827,-32.713017,1.0
28918,2018-09-26,CU,Wei-Yin Chen,612672.0,Anthony Rendon,543685.0,72.3,1.48,6.04,R,L,0.0,0.0,-0.4,-1.0,-0.92,1.18,,645302.0,607208.0,2.0,Bot,71.7,2244.0,5.8,595453.0,518618.0,607471.0,605119.0,595375.0,542364.0,642423.0,598284.0,19.0,1.0,Curveball,5.0,0.0,5.0,0.0,Standard,Standard,0-0,strike,,swinging_strike_blocked,S,,-4.165788,-105.152472,-0.683396,-1.871679,17.290108,-39.23976,1.0
554769,2018-05-13,SI,Raisel Iglesias,628452.0,Matt Kemp,461314.0,97.7,-3.25,4.74,R,R,1.0,1.0,-1.3,1.0,0.76,2.82,,,,9.0,Bot,97.7,2501.0,6.4,488810.0,458015.0,607468.0,553993.0,606299.0,594807.0,571740.0,594988.0,75.0,3.0,Sinker,3.0,5.0,3.0,5.0,Standard,Standard,1-1,ball,,ball,B,,13.808765,-141.64033,-1.937811,-20.752453,30.671155,-18.654505,2.0


In [105]:
# 2017: run the project_baseball helpers in order, each one receiving the previous
# result, then keep only the selected feature columns.
for step in (project.cleaning, project.strike_attempt_column,
             project.fill_in_batters, project.the_count):
    mlb2017 = step(mlb2017)
mlb2017 = mlb2017[features]
print(mlb2017.shape)
mlb2017.sample(3)

(732476, 55)


Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
408788,2017-06-23,CH,Jose Urena,570632.0,Miguel Montero,471083.0,89.5,-1.65,5.9,L,R,3.0,2.0,-0.7,1.6,-0.24,1.88,,,,4.0,Top,89.1,1709.0,6.2,592663.0,571506.0,543829.0,445988.0,595375.0,542303.0,592885.0,519317.0,26.0,9.0,Changeup,0.0,1.0,0.0,1.0,Standard,Standard,3-2,out,field_out,hit_into_play,X,ground_ball,4.977495,-129.870406,-7.247442,-9.267667,28.097312,-12.644219,1.0
211346,2017-08-13,CH,Chris Stratton,608717.0,Andrew Stevenson,664057.0,84.2,-1.6,6.23,L,R,0.0,0.0,-1.2,0.9,0.35,1.25,,,,5.0,Bot,84.1,1693.0,6.4,460026.0,624507.0,605412.0,467055.0,543063.0,592620.0,491676.0,519037.0,38.0,1.0,Changeup,0.0,3.0,0.0,3.0,Strategic,Standard,0-0,strike,,swinging_strike,S,,6.908137,-122.106076,-6.946384,-13.520489,23.641127,-21.904281,0.0
300136,2017-07-22,U,Tanner Roark,543699.0,Daniel Descalso,518614.0,92.8,-3.05,5.78,L,R,2.0,1.0,-1.4,0.9,0.69,1.28,,,,4.0,Bot,91.4,2129.0,5.5,446308.0,475582.0,502517.0,543685.0,594694.0,502317.0,571718.0,547180.0,29.0,4.0,2-Seam Fastball,1.0,1.0,1.0,1.0,Standard,Standard,2-1,ball,,ball,B,,12.156258,-134.113999,-7.484473,-18.301582,25.546871,-20.181385,2.0


In [106]:
# Run the 2016 frame through the project helpers in the same order as the
# other seasons, then keep only the columns listed in `features`.
mlb2016 = (
    mlb2016
    .pipe(project.cleaning)
    .pipe(project.strike_attempt_column)
    .pipe(project.fill_in_batters)
    .pipe(project.the_count)
    .loc[:, features]
)
print(mlb2016.shape)
mlb2016.sample(3)

(726023, 55)


Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
104643,2016-09-09,FF,Adam Warren,476589.0,Kevin Kiermaier,595281.0,92.6,-1.39,5.89,L,R,0.0,0.0,-0.6,1.9,-0.67,1.57,,,,7.0,Top,91.0,2201.0,5.9,596142.0,407893.0,516770.0,452104.0,544369.0,608701.0,453056.0,592450.0,63.0,1.0,4-Seam Fastball,4.0,7.0,4.0,7.0,Standard,Standard,0-0,out,field_out,hit_into_play,X,ground_ball,2.582,-134.513,-9.527,-6.739,32.686,-11.047,0.0
650275,2016-04-23,FF,Blake Snell,605483.0,Carlos Beltran,136860.0,95.0,0.91,7.15,R,L,3.0,0.0,0.6,2.3,0.71,3.59,,,,1.0,Bot,94.4,2510.0,6.7,474233.0,489149.0,523253.0,456665.0,543543.0,457775.0,595281.0,572816.0,6.0,4.0,4-Seam Fastball,0.0,0.0,0.0,0.0,Infield shift,Standard,3-0,ball,walk,ball,B,,-1.736,-138.035,-8.363,9.219,37.86,-4.838,2.0
583653,2016-05-10,FF,Sean O'Sullivan,457711.0,Coco Crisp,424825.0,93.0,-1.9,5.91,L,R,1.0,2.0,-0.7,1.5,0.72,1.77,,,,1.0,Top,93.0,2226.0,6.5,452672.0,543768.0,456030.0,592710.0,593428.0,455759.0,598265.0,605141.0,1.0,4.0,4-Seam Fastball,0.0,0.0,0.0,0.0,Standard,Strategic,1-2,ball,,ball,B,,7.667,-135.069,-7.845,-7.672,29.104,-15.045,0.0


---
### Check for nulls in the new dataframes

In [107]:
# 95% of the 2019 row count (presumably a yardstick for the null counts below -- confirm).
len(mlb2019) * .95

706393.4

In [108]:
# One-row table: number of missing values in each 2019 column.
mlb2019.isna().sum().to_frame().T
# mlb2019.dropna(thresh=49).shape

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,0,0,0,0,0,0,5738,5757,5757,0,0,0,0,5757,5757,5757,5757,675504,607993,518755,0,0,5332,18564,5759,57,57,57,57,57,57,57,57,0,0,0,0,0,0,0,0,0,0,0,554707,0,0,616038,5757,5757,5757,5757,5757,5757,0


In [109]:
# 95% of the 2018 row count (presumably a yardstick for the null counts below -- confirm).
len(mlb2018) * .95

694646.65

In [110]:
# One-row table: number of missing values in each 2018 column.
mlb2018.isna().sum().to_frame().T
# mlb2018.dropna(thresh=49).shape

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,0,0,0,0,0,0,3163,3039,3039,0,0,0,0,3039,3039,3039,3039,663247,598019,508052,0,0,3059,14280,3192,335,335,335,335,335,335,335,335,0,0,0,0,0,0,0,0,0,0,2,544045,0,0,603274,3039,3039,3039,3039,3039,3039,0


In [111]:
# 95% of the 2017 row count (presumably a yardstick for the null counts below -- confirm).
len(mlb2017) * .95

695852.2

In [112]:
# One-row table: number of missing values in each 2017 column.
mlb2017.isna().sum().to_frame().T
# mlb2017.dropna(thresh=49).shape

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,0,0,0,0,0,0,5725,3198,3198,0,0,0,0,3198,3198,3198,3198,664432,597917,508844,0,0,5545,17799,5746,1766,1766,1766,1766,1766,1766,1766,1766,0,0,0,0,0,0,0,0,0,0,0,544982,0,0,603092,3198,3198,3198,3198,3198,3198,0


In [113]:
# 95% of the 2016 row count (presumably a yardstick for the null counts below -- confirm).
len(mlb2016) * .95

689721.85

In [114]:
# One-row table: number of missing values in each 2016 column.
mlb2016.isna().sum().to_frame().T
# mlb2016.dropna(thresh=49).shape

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,0,0,0,0,0,0,106,11179,11179,0,0,0,0,107,107,107,107,655516,590537,507004,0,0,10650,57417,10929,6710,6710,6710,6710,6710,6710,6710,6710,0,0,0,0,0,0,0,0,0,0,3329,538452,0,0,595480,107,107,107,107,107,107,0


---
### Examine nulls in certain parts of the data

In [115]:
# Show the 2019 pitches that have no release position recorded.
# TODO: make a model to determine the perfect strike zone segment
mlb2019.loc[mlb2019['release_pos_x'].isna()]

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
514,2019-10-29,U,Justin Verlander,434378.0,Anthony Rendon,543685.0,,,,R,R,0.0,0.0,,,,,,,594809.0,3.0,Top,,,,455139.0,493329.0,514888.0,608324.0,621043.0,488726.0,543807.0,502210.0,20.0,1.0,Unknown,1.0,2.0,1.0,2.0,Strategic,Standard,0-0,ball,,ball,B,,,,,,,,2.0
2130,2019-10-22,U,Max Scherzer,453286.0,Alex Bregman,608324.0,,,,R,R,0.0,2.0,,,,,,543807.0,514888.0,1.0,Bot,,,,435559.0,475582.0,452678.0,543685.0,607208.0,665742.0,645302.0,594809.0,8.0,3.0,Unknown,0.0,0.0,0.0,0.0,Standard,Standard,0-2,ball,,ball,B,,,,,,,,1.0
12669,2019-09-29,U,Sam Selman,572125.0,Enrique Hernandez,571771.0,,,,R,L,0.0,2.0,,,,,,,666158.0,3.0,Top,,,,457763.0,474832.0,643289.0,446334.0,543063.0,596103.0,573262.0,664041.0,25.0,3.0,Unknown,5.0,0.0,5.0,0.0,Standard,Standard,0-2,ball,,blocked_ball,B,,,,,,,,1.0
12698,2019-09-29,U,Dereck Rodriguez,605446.0,A.J. Pollock,572041.0,,,,R,R,1.0,2.0,,,,,,621035.0,571970.0,2.0,Top,,,,457763.0,474832.0,643289.0,446334.0,543063.0,596103.0,573262.0,664041.0,17.0,5.0,Unknown,5.0,0.0,5.0,0.0,Standard,Standard,1-2,ball,,blocked_ball,B,,,,,,,,1.0
12980,2019-09-29,U,Tanner Roark,543699.0,Austin Nola,543592.0,,,,R,R,0.0,0.0,,,,,,,,3.0,Bot,,,,572033.0,664913.0,595777.0,641914.0,620439.0,543257.0,621450.0,572039.0,24.0,1.0,Unknown,2.0,0.0,2.0,0.0,Standard,Standard,0-0,strike,,called_strike,S,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743567,2019-03-20,U,Mike Fiers,571666.0,Mitch Haniger,571745.0,,,,R,R,0.0,2.0,,,,,,,,1.0,Top,,,,460026.0,621566.0,595777.0,656305.0,543760.0,640461.0,657656.0,572039.0,2.0,3.0,Unknown,0.0,0.0,0.0,0.0,Unknown,Unknown,0-2,strike,,foul,S,,,,,,,,1.0
743568,2019-03-20,U,Mike Fiers,571666.0,Mitch Haniger,571745.0,,,,R,R,0.0,1.0,,,,,,,,1.0,Top,,,,460026.0,621566.0,595777.0,656305.0,543760.0,640461.0,657656.0,572039.0,2.0,2.0,Unknown,0.0,0.0,0.0,0.0,Unknown,Unknown,0-1,strike,,called_strike,S,,,,,,,,1.0
743569,2019-03-20,U,Mike Fiers,571666.0,Mitch Haniger,571745.0,,,,R,R,0.0,0.0,,,,,,,,1.0,Top,,,,460026.0,621566.0,595777.0,656305.0,543760.0,640461.0,657656.0,572039.0,2.0,1.0,Unknown,0.0,0.0,0.0,0.0,Unknown,Unknown,0-0,strike,,called_strike,S,,,,,,,,1.0
743570,2019-03-20,U,Mike Fiers,571666.0,Dee Gordon,543829.0,,,,L,R,1.0,0.0,,,,,,,,1.0,Top,,,,460026.0,621566.0,595777.0,656305.0,543760.0,640461.0,657656.0,572039.0,1.0,2.0,Unknown,0.0,0.0,0.0,0.0,Unknown,Unknown,1-0,out,field_out,hit_into_play,X,line_drive,,,,,,,0.0


In [116]:
# Rich Hill's 2016 pitches that have no catcher recorded -- practically an
# inning's worth of data. The other fielder columns are empty on these rows too,
# so we do not know who his defense was.
mlb2016.loc[mlb2016['Catcher'].isna() & mlb2016['Pitcher_name'].eq('Rich Hill')]

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
21632,2016-09-30,FF,Rich Hill,448179.0,Angel Pagan,434636.0,90.7,,,R,L,2.0,2.0,0.9,1.5,-0.62,2.45,,474832.0,,3.0,Bot,,,,,,,,,,,,28.0,5.0,4-Seam Fastball,2.0,2.0,2.0,2.0,Unknown,Unknown,2-2,out,field_out,hit_into_play,X,line_drive,-7.891,-131.796,-6.373,13.298,29.163,-16.821,2.0
82303,2016-09-15,FF,Rich Hill,448179.0,Kyle Jensen,571812.0,91.7,,,R,L,1.0,1.0,1.3,1.0,-0.19,3.36,,,,2.0,Bot,,,,,,,,,,,,15.0,3.0,4-Seam Fastball,0.0,0.0,0.0,0.0,Unknown,Unknown,1-1,strike,,called_strike,S,,-9.246,-133.267,-2.493,18.616,28.747,-21.567,2.0
550412,2016-05-18,CU,Rich Hill,448179.0,Mitch Moreland,519048.0,74.6,,,L,L,2.0,2.0,-1.1,-1.3,0.16,2.08,,,607387.0,2.0,Top,,,,,,,,,,,,14.0,5.0,Curveball,0.0,3.0,0.0,3.0,Unknown,Unknown,2-2,strike,strikeout,swinging_strike,S,,-2.129,-108.633,2.154,-8.098,21.596,-44.134,1.0
684304,2016-04-15,FF,Rich Hill,448179.0,Alex Gordon,460086.0,90.1,,,L,L,0.0,0.0,1.1,1.4,1.15,3.89,,,434778.0,3.0,Top,,,,,,,,,,,,25.0,1.0,4-Seam Fastball,3.0,1.0,3.0,1.0,Unknown,Unknown,0-0,ball,,ball,B,,-4.728,-131.23,-1.873,15.257,29.437,-17.319,1.0
725139,2016-04-04,CU,Rich Hill,448179.0,Melky Cabrera,466320.0,75.4,,,R,L,0.0,0.0,-1.3,-0.8,1.52,3.28,,,,2.0,Top,,,,,,,,,,,,8.0,1.0,Curveball,0.0,0.0,0.0,0.0,Unknown,Unknown,0-0,ball,,ball,B,,0.23,-109.75,4.38,-9.68,23.76,-40.74,0.0
725156,2016-04-04,U,Rich Hill,448179.0,Todd Frazier,453943.0,91.2,,,R,L,1.0,2.0,1.8,1.0,1.02,1.89,,,547989.0,1.0,Top,,,,,,,,,,,,4.0,4.0,2-Seam Fastball,0.0,0.0,0.0,0.0,Unknown,Unknown,1-2,strike,strikeout,swinging_strike,S,,-7.93,-132.65,-3.66,24.83,32.02,-22.59,2.0
725157,2016-04-04,CU,Rich Hill,448179.0,Todd Frazier,453943.0,75.3,,,R,L,1.0,1.0,-1.2,-0.7,-0.12,2.92,,,547989.0,1.0,Top,,,,,,,,,,,,4.0,3.0,Curveball,0.0,0.0,0.0,0.0,Unknown,Unknown,1-1,strike,,foul,S,,-2.63,-109.55,3.31,-9.45,21.9,-39.72,2.0
725158,2016-04-04,U,Rich Hill,448179.0,Todd Frazier,453943.0,90.2,,,R,L,0.0,1.0,1.6,0.9,-1.09,2.47,,,547989.0,1.0,Top,,,,,,,,,,,,4.0,2.0,2-Seam Fastball,0.0,0.0,0.0,0.0,Unknown,Unknown,0-1,ball,,ball,B,,-11.37,-130.92,-3.32,21.55,25.97,-23.98,2.0
725159,2016-04-04,CU,Rich Hill,448179.0,Todd Frazier,453943.0,75.4,,,R,L,0.0,0.0,-1.2,-1.0,0.24,1.87,,,547989.0,1.0,Top,,,,,,,,,,,,4.0,1.0,Curveball,0.0,0.0,0.0,0.0,Unknown,Unknown,0-0,strike,,called_strike,S,,-2.07,-109.88,1.93,-9.45,23.7,-42.32,2.0
725160,2016-04-04,CU,Rich Hill,448179.0,Jose Abreu,547989.0,75.6,,,R,L,3.0,2.0,-1.1,-1.0,-2.44,1.79,,,,1.0,Top,,,,,,,,,,,,3.0,6.0,Curveball,0.0,0.0,0.0,0.0,Unknown,Unknown,3-2,ob,hit_by_pitch,hit_by_pitch,B,,-7.08,-109.89,1.38,-7.95,26.09,-41.99,2.0


In [117]:
# save the cleaned data
# (every write below is commented out, so running this cell does not touch the
#  files on disk; uncomment to regenerate the per-season cleaned CSVs)
# mlb2019.to_csv('data/2019cleaned.csv', index=False)
# mlb2018.to_csv('data/2018cleaned.csv', index=False)
# mlb2017.to_csv('data/2017cleaned.csv', index=False)
# mlb2016.to_csv('data/2016cleaned.csv', index=False)

In [118]:
# quick check: dtypes and non-null counts of the cleaned 2019 frame
mlb2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 743572 entries, 0 to 743571
Data columns (total 55 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   game_date              743572 non-null  datetime64[ns]
 1   pitch_type             743572 non-null  object        
 2   Pitcher_name           743572 non-null  object        
 3   pitcher_id             743572 non-null  float64       
 4   batter_name            743572 non-null  object        
 5   batter_id              743572 non-null  float64       
 6   release_speed          737834 non-null  float64       
 7   release_pos_x          737815 non-null  float64       
 8   release_pos_z          737815 non-null  float64       
 9   stand                  743572 non-null  object        
 10  p_throws               743572 non-null  object        
 11  balls                  743572 non-null  float64       
 12  strikes                743572 non-null  floa

### Import the dataframe created from EDA notebook.

This dataframe came from a concatenation done in another notebook. The code was similar to the snippet below. (Sorry, I accidentally deleted the actual code.)

```python
df = pd.concat(<list of mlb df>, something )
df.reset_index(drop=True, inplace=True)
```

In [119]:
# Load the combined multi-season frame that was saved from the EDA notebook.
df = pd.read_csv('data/big_daddy_baseball.csv')

In [120]:
# Peek at the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,2020-10-27,FF,Julio Urias,628711.0,Willy Adames,642715.0,96.7,1.58,5.99,R,L,0.0,2.0,0.2,1.6,-0.53,2.29,,,,9.0,Top,95.4,2615.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,3.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-2,strike,strikeout,called_strike,S,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,2.0
1,2020-10-27,FF,Julio Urias,628711.0,Willy Adames,642715.0,94.1,2.91,5.45,R,L,0.0,1.0,0.8,1.3,-0.55,3.03,,,,9.0,Top,93.4,2470.0,5.9,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,2.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-1,strike,,called_strike,S,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,2.0
2,2020-10-27,FF,Julio Urias,628711.0,Willy Adames,642715.0,94.9,1.77,6.02,R,L,0.0,0.0,0.2,1.5,-0.04,3.32,,,,9.0,Top,94.0,2397.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,1.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-0,strike,,swinging_strike,S,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,2.0
3,2020-10-27,FF,Julio Urias,628711.0,Mike Brosseau,670712.0,94.4,1.66,5.93,R,L,3.0,2.0,0.3,1.5,-0.37,2.15,,,,9.0,Top,93.7,2508.0,5.9,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,6.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,3-2,strike,strikeout,called_strike,S,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,1.0
4,2020-10-27,CU,Julio Urias,628711.0,Mike Brosseau,670712.0,81.4,1.46,6.06,R,L,2.0,2.0,-1.7,-0.2,-0.14,0.96,,,,9.0,Top,80.2,3031.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,5.0,Curveball,1.0,3.0,1.0,3.0,Standard,Standard,2-2,ball,,ball,B,,-0.383207,-118.44781,-4.454166,-15.553576,24.451936,-32.892744,1.0


# More Feature Engineering

---
I am doing this here because I now have the large combined dataframe to build on. Basically, I concatenated the yearly dataframes in the EDA notebook so that I could save the result and re-import it here.

The `subset` dataframe is where I test out all of my feature engineering.

In [121]:
# Scratch copy of the first 300 rows, so experiments here never modify `df`.
subset = df.head(300).copy()
subset.shape

(300, 55)

In [122]:
# out_type: keep the batted-ball type where one was recorded, otherwise fall
# back to that pitch's strike_attempt value.
subset['out_type'] = subset['bb_type'].where(subset['bb_type'].notnull(), subset['strike_attempt'])

In [123]:
# re-ran many times
# (no random_state is passed, so every run shows a different set of rows)
subset.sample(5)

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up,out_type
271,2020-10-25,SL,Ryan Thompson,657044.0,Mookie Betts,605141.0,78.4,-3.14,3.5,R,R,2.0,2.0,1.0,1.0,0.13,1.65,,,605131.0,9.0,Top,78.9,2142.0,6.8,605421.0,670712.0,664040.0,621563.0,642715.0,622534.0,595281.0,640457.0,68.0,5.0,Slider,4.0,2.0,4.0,2.0,Standard,Standard,2-2,strike,strikeout,swinging_strike,S,,5.231537,-114.083932,1.160008,7.835857,22.473663,-24.344484,2.0,strike
205,2020-10-27,FS,Tony Gonsolin,664062.0,Mike Zunino,572287.0,86.5,-0.73,6.17,R,R,3.0,2.0,-1.5,0.8,0.6,1.88,,595281.0,,2.0,Top,86.0,1909.0,6.1,605131.0,571970.0,621035.0,457759.0,608369.0,572041.0,641355.0,605141.0,12.0,8.0,Split-Finger,1.0,0.0,1.0,0.0,Infield shift,Standard,3-2,strike,strikeout,swinging_strike,S,,6.175086,-125.793829,-5.672806,-17.024449,26.489874,-22.299717,1.0,strike
144,2020-10-27,CU,Blake Snell,605483.0,Corey Seager,608369.0,81.0,1.7,6.62,L,L,2.0,2.0,-1.2,-1.1,0.25,1.52,,,,4.0,Bot,81.9,2196.0,7.2,572287.0,596847.0,664040.0,621563.0,642715.0,668227.0,595281.0,622534.0,28.0,5.0,Curveball,0.0,1.0,0.0,1.0,Infield shift,Standard,2-2,strike,strikeout,swinging_strike,S,,-1.019145,-117.862399,-2.580918,-10.562428,25.253453,-42.109005,0.0,strike
153,2020-10-27,SI,Alex Wood,622072.0,Willy Adames,642715.0,92.4,2.09,5.18,R,L,0.0,2.0,1.0,1.3,0.13,3.99,,,,4.0,Top,92.4,2148.0,6.3,605131.0,571970.0,621035.0,457759.0,608369.0,572041.0,641355.0,605141.0,26.0,4.0,Sinker,1.0,0.0,1.0,0.0,Standard,Standard,0-2,strike,strikeout,swinging_strike,S,,-7.275763,-134.443773,-0.08059,14.218321,29.243224,-16.45485,1.0,strike
218,2020-10-27,SL,Tony Gonsolin,664062.0,Willy Adames,642715.0,88.9,-0.33,6.3,R,R,2.0,1.0,0.2,0.4,1.24,1.78,,,,2.0,Top,88.9,2690.0,6.1,605131.0,571970.0,621035.0,457759.0,608369.0,572041.0,641355.0,605141.0,10.0,4.0,Slider,1.0,0.0,1.0,0.0,Standard,Standard,2-1,strike,,swinging_strike,S,,3.423566,-129.439657,-5.989458,1.655473,25.024084,-26.210436,0.0,strike


In [124]:
# Replace missing on_2b / on_3b values with 0, one column at a time.
# Note: on_1b is not filled in this scratch step.
for base_col in ('on_2b', 'on_3b'):
    subset[base_col] = subset[base_col].fillna(0)

In [125]:
# Preview only: shows on_3b cast to int; the result is not assigned back to `subset`.
subset['on_3b'].map(int)

0      0
1      0
2      0
3      0
4      0
      ..
295    0
296    0
297    0
298    0
299    0
Name: on_3b, Length: 300, dtype: int64

In [126]:
# Spot-check a single random row after the fill (on_2b / on_3b now show 0.0 where they were empty).
subset.sample()

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up,out_type
189,2020-10-27,CH,Blake Snell,605483.0,Will Smith,669257.0,90.9,2.17,6.62,R,L,2.0,0.0,1.1,1.1,1.17,2.31,0.0,0.0,,2.0,Bot,91.9,1870.0,7.0,572287.0,596847.0,664040.0,621563.0,642715.0,668227.0,595281.0,622534.0,16.0,3.0,Changeup,0.0,1.0,0.0,1.0,Infield shift,Standard,2-0,strike,,called_strike,S,,-4.861208,-132.128125,-7.263912,13.55682,28.56894,-18.197502,1.0,strike


In [127]:
# Left disabled: on_1b was not included in the fillna(0) cell above, so it still
# contains NaN and int() would raise on those rows.
# subset['on_1b']=subset['on_1b'].map(int)

In [128]:
# Make 'stand','p_throws','inning_topbot' binary columns
# (scratch version, left disabled; the encoding is applied to `df` in the
#  "Make Binary columns" cells further down)
# make_binary = ['stand','p_throws']
# pd.get_dummies(subset, columns=['inning_topbot'], drop_first=True)


In [129]:
# Check dtypes / non-null counts of the scratch frame (now includes the added out_type column).
subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 56 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   game_date              300 non-null    object 
 1   pitch_type             300 non-null    object 
 2   Pitcher_name           300 non-null    object 
 3   pitcher_id             300 non-null    float64
 4   batter_name            300 non-null    object 
 5   batter_id              300 non-null    float64
 6   release_speed          300 non-null    float64
 7   release_pos_x          300 non-null    float64
 8   release_pos_z          300 non-null    float64
 9   stand                  300 non-null    object 
 10  p_throws               300 non-null    object 
 11  balls                  300 non-null    float64
 12  strikes                300 non-null    float64
 13  pfx_x                  300 non-null    float64
 14  pfx_z                  300 non-null    float64
 15  plate_

---
---
Apply what was done on the subset to `df`.

In [130]:
# Dtypes of the full frame before any conversion (IDs and counts are still float64).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3212938 entries, 0 to 3212937
Data columns (total 55 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   game_date              object 
 1   pitch_type             object 
 2   Pitcher_name           object 
 3   pitcher_id             float64
 4   batter_name            object 
 5   batter_id              float64
 6   release_speed          float64
 7   release_pos_x          float64
 8   release_pos_z          float64
 9   stand                  object 
 10  p_throws               object 
 11  balls                  float64
 12  strikes                float64
 13  pfx_x                  float64
 14  pfx_z                  float64
 15  plate_x                float64
 16  plate_z                float64
 17  on_3b                  float64
 18  on_2b                  float64
 19  on_1b                  float64
 20  inning                 float64
 21  inning_topbot          object 
 22  effective_speed   

In [139]:
# NOTE: this cell was executed out of order (In [139]); its saved output shows `df`
# after the int / binary conversion cells further down had already run. On a fresh
# top-to-bottom run it displays the unconverted frame instead.
df.head(3)

Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,2020-10-27,FF,Julio Urias,628711,Willy Adames,642715,96.7,1.58,5.99,1,0,0,2,0.2,1.6,-0.53,2.29,0,0,0,9,1,95.4,2615.0,5.7,605131,571970,571771,621458,608369,621035,641355,605141,65,3,4-Seam Fastball,1,3,1,3,Standard,Standard,0-2,strike,strikeout,called_strike,S,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,2
1,2020-10-27,FF,Julio Urias,628711,Willy Adames,642715,94.1,2.91,5.45,1,0,0,1,0.8,1.3,-0.55,3.03,0,0,0,9,1,93.4,2470.0,5.9,605131,571970,571771,621458,608369,621035,641355,605141,65,2,4-Seam Fastball,1,3,1,3,Standard,Standard,0-1,strike,,called_strike,S,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,2
2,2020-10-27,FF,Julio Urias,628711,Willy Adames,642715,94.9,1.77,6.02,1,0,0,0,0.2,1.5,-0.04,3.32,0,0,0,9,1,94.0,2397.0,5.7,605131,571970,571771,621458,608369,621035,641355,605141,65,1,4-Seam Fastball,1,3,1,3,Standard,Standard,0-0,strike,,swinging_strike,S,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,2


In [132]:
# Convert to date time
# (read_csv left game_date as plain strings -- dtype `object` in df.info() above)
df['game_date']=pd.to_datetime(df['game_date'])

##### Make IDs into integers

In [133]:
# Every column that holds a player ID. Missing IDs (an empty base, an unrecorded
# fielder) are replaced with 0 so the columns can be cast to int afterwards.
ids = (
    ['pitcher_id', 'batter_id']
    + ['on_1b', 'on_2b', 'on_3b']
    + ['Catcher', 'FirstBasemen', 'SecondBasemen', 'ThirdBasemen',
       'ShortStop', 'LeftField', 'CenterField', 'RightField']
)
df[ids] = df[ids].fillna(0)

In [134]:
# Cast all ID columns to 64-bit integers in one vectorised step rather than
# calling int() on every value in a Python loop (millions of rows x 13 columns).
# Missing IDs were replaced with 0 in the previous cell, so no NaN is left to
# block the cast.
df[ids] = df[ids].astype('int64')

##### Make Binary columns

In [135]:
# Make 'stand' and 'p_throws' binary columns (1 = 'R', 0 otherwise).
# drop_first=True drops the first category's indicator, leaving only the `_R`
# column, which is then renamed back to the original column name.
# dtype is pinned to uint8 because pandas >= 2.0 returns bool indicators by
# default, which would write True/False instead of 1/0 to the saved CSV.
make_binary = ['stand','p_throws']
df = pd.get_dummies(df, columns=make_binary, drop_first=True, dtype='uint8')

df = df.rename(columns={'stand_R':'stand', 'p_throws_R':'p_throws'})

In [136]:
# Same application for 'inning_topbot' (1 = 'Top', 0 otherwise).
# dtype is pinned to uint8 so the indicator stays 0/1 on pandas >= 2.0, where
# get_dummies returns bool columns by default.
df = pd.get_dummies(df, columns=['inning_topbot'], drop_first=True, dtype='uint8')
df = df.rename(columns={'inning_topbot_Top':'inning_topbot'})

##### Turn the rest of the column dtypes into `int`

In [137]:
# Turn these count / score columns into int dtypes.
# One vectorised astype replaces the per-value int() loop; like the loop, it
# raises if any of these columns still contains NaN.
integerify = ['outs_when_up','bat_score','fld_score','post_bat_score','post_fld_score','pitch_number',
             'at_bat_number','inning','balls','strikes']
df[integerify] = df[integerify].astype('int64')

In [138]:
# Put the columns back in the order given by `features`, then write the
# converted frame to disk without the index column.
df = df.loc[:, features]
df.to_csv('data/big_daddy_baseball_cleaned.csv', index=False)

In [142]:
# Column names and order of the frame after it was restricted to `features` and saved.
df.columns

Index(['game_date', 'pitch_type', 'Pitcher_name', 'pitcher_id', 'batter_name',
       'batter_id', 'release_speed', 'release_pos_x', 'release_pos_z', 'stand',
       'p_throws', 'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z',
       'on_3b', 'on_2b', 'on_1b', 'inning', 'inning_topbot', 'effective_speed',
       'release_spin_rate', 'release_extension', 'Catcher', 'FirstBasemen',
       'SecondBasemen', 'ThirdBasemen', 'ShortStop', 'LeftField',
       'CenterField', 'RightField', 'at_bat_number', 'pitch_number',
       'pitch_name', 'bat_score', 'fld_score', 'post_bat_score',
       'post_fld_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'count', 'strike_attempt', 'events', 'description', 'type', 'bb_type',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'outs_when_up'],
      dtype='object')