# Imports

In [1]:
import pybaseball as pb
import project_baseball as project

In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

# Import Data
# Read the 2020 season CSV (path is relative to the notebook's working directory)
# and show its (rows, columns) shape as the cell output.
mlb2020 = pd.read_csv('data/2020mlb.csv')
mlb2020.shape

(279660, 90)

For a full description of each column in the data, see the [Baseball Savant CSV documentation](https://baseballsavant.mlb.com/csv-docs).

# Cleaning

---
The cleaning will be shown on the `mlb2020` dataset but will be applied on past data as well.

This function, defined in the `project_baseball.py` file, renames the player and fielder columns.

```python
def renaming_fielders(mlb):
    """Rename the player and fielder columns to descriptive names, in place.

    Parameters
    ----------
    mlb : pandas.DataFrame
        Frame using the raw column names (``player_name``, ``batter``,
        ``pitcher``, ``fielder_2`` ... ``fielder_9``). Columns that are not
        present are simply skipped by ``rename``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place).
    """
    # One rename call for every column. Previously the batter/pitcher rename
    # was issued without inplace=True and its result was discarded, so those
    # two columns silently kept their original names.
    mlb.rename(columns={
        'player_name': 'Pitcher_name',   # pitcher's name
        'batter': 'batter_id',
        'pitcher': 'pitcher_id',
        'fielder_2': 'Catcher',
        'fielder_3': 'FirstBasemen',
        'fielder_4': 'SecondBasemen',
        'fielder_5': 'ThirdBasemen',
        'fielder_6': 'ShortStop',
        'fielder_7': 'LeftField',
        'fielder_8': 'CenterField',
        'fielder_9': 'RightField',
    }, inplace=True)
    return mlb
```

In [3]:
# Rename the player/fielder columns. Per the snippet shown above, the helper renames
# in place and returns the same frame, so the DataFrame repr is this cell's output.
project.renaming_fielders(mlb2020)

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,Pitcher_name,batter_id,pitcher_id,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,Catcher,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,0,FF,2020-10-27,96.7,1.58,5.99,Julio Urias,642715.0,628711.0,strikeout,called_strike,,,,,4.0,Willy Adames called out on strikes.,W,R,L,LAD,TB,S,2.0,,0.0,2.0,2020.0,0.2,1.6,-0.53,2.29,,,,2.0,9.0,Top,,,,,605131.0,,,-5.950264,-140.490456,-7.897391,3.772000,32.321911,-8.981441,3.50,1.69,,,,95.4,2615.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.82,,,0.0,1.0,0.0,0.0,,65.0,3.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
1,1,FF,2020-10-27,94.1,2.91,5.45,Julio Urias,642715.0,628711.0,,called_strike,,,,,1.0,,W,R,L,LAD,TB,S,,,0.0,1.0,2020.0,0.8,1.3,-0.55,3.03,,,,2.0,9.0,Top,,,,,605131.0,,,-10.560246,-136.599519,-3.429867,11.723598,29.183810,-15.237217,3.50,1.69,,,,93.4,2470.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.59,,,,,,,,65.0,2.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
2,2,FF,2020-10-27,94.9,1.77,6.02,Julio Urias,642715.0,628711.0,,swinging_strike,,,,,2.0,,W,R,L,LAD,TB,S,,,0.0,0.0,2020.0,0.2,1.5,-0.04,3.32,,,,2.0,9.0,Top,,,,,605131.0,,,-5.199252,-138.098234,-4.637970,4.158758,30.838499,-12.535677,3.50,1.69,,,,94.0,2397.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.76,,,,,,,,65.0,1.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
3,3,FF,2020-10-27,94.4,1.66,5.93,Julio Urias,670712.0,628711.0,strikeout,called_strike,,,,,4.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,S,2.0,,3.0,2.0,2020.0,0.3,1.5,-0.37,2.15,,,,1.0,9.0,Top,,,,,605131.0,,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,3.34,1.53,,,,93.7,2508.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.60,,,0.0,1.0,0.0,0.0,,64.0,6.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
4,4,CU,2020-10-27,81.4,1.46,6.06,Julio Urias,670712.0,628711.0,,ball,,,,,13.0,,W,R,L,LAD,TB,B,,,2.0,2.0,2020.0,-1.7,-0.2,-0.14,0.96,,,,1.0,9.0,Top,,,,,605131.0,,,-0.383207,-118.447810,-4.454166,-15.553576,24.451936,-32.892744,3.34,1.53,,,,80.2,3031.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.84,,,,,,,,64.0,5.0,Curveball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279655,20155,FF,2020-07-23,96.1,-3.28,5.13,Max Scherzer,543305.0,453286.0,field_out,hit_into_play,,,,,5.0,"Aaron Hicks grounds out sharply, second basema...",R,L,R,WSH,NYY,X,4.0,ground_ball,2.0,2.0,2020.0,-0.8,1.2,-0.19,2.41,,,,0.0,1.0,Top,163.49,140.27,,,435559.0,,,9.899700,-139.626080,-4.473902,-12.016982,30.128689,-14.962333,3.41,1.56,91.0,106.7,4.0,96.3,2450.0,6.4,630851.0,453286.0,435559.0,519346.0,516770.0,452678.0,607208.0,664057.0,645302.0,594809.0,54.09,0.657,0.591,0.0,1.0,0.0,0.0,4.0,1.0,5.0,4-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infield shift,Standard
279656,20157,CU,2020-07-23,79.7,-3.36,5.48,Max Scherzer,543305.0,453286.0,,called_strike,,,,,7.0,,R,L,R,WSH,NYY,S,,,2.0,1.0,2020.0,0.9,-0.8,-0.41,1.85,,,,0.0,1.0,Top,,,,,435559.0,,,4.691537,-116.064292,0.319935,7.680712,22.130450,-39.867212,3.51,1.65,,,,79.6,2846.0,6.2,630851.0,453286.0,435559.0,519346.0,516770.0,452678.0,607208.0,664057.0,645302.0,594809.0,54.28,,,,,,,,1.0,4.0,Curveball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infield shift,Standard
279657,20159,FF,2020-07-23,96.7,-3.21,5.33,Max Scherzer,543305.0,453286.0,,called_strike,,,,,9.0,,R,L,R,WSH,NYY,S,,,2.0,0.0,2020.0,-0.8,1.3,0.48,2.07,,,,0.0,1.0,Top,,,,,435559.0,,,11.547336,-140.289508,-5.998194,-12.798480,30.880442,-14.099707,3.33,1.65,,,,96.4,2421.0,6.2,630851.0,453286.0,435559.0,519346.0,516770.0,452678.0,607208.0,664057.0,645302.0,594809.0,54.29,,,,,,,,1.0,3.0,4-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infield shift,Standard
279658,20162,FF,2020-07-23,96.1,-3.38,5.33,Max Scherzer,543305.0,453286.0,,ball,,,,,11.0,,R,L,R,WSH,NYY,B,,,1.0,0.0,2020.0,-0.8,1.4,-1.17,2.75,,,,0.0,1.0,Top,,,,,435559.0,,,7.600636,-139.767747,-4.483222,-11.964765,30.151437,-12.476899,3.58,1.75,,,,95.9,2477.0,6.1,630851.0,453286.0,435559.0,519346.0,516770.0,452678.0,607208.0,664057.0,645302.0,594809.0,54.38,,,,,,,,1.0,2.0,4-Seam Fastball,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infield shift,Standard


In [4]:
# Drop what we don't need (modifies mlb2020 in place)

mlb2020.drop(
    columns=[
        # legacy / deprecated fields, empty in the rows displayed above
        'spin_dir',
        'spin_rate_deprecated',
        'break_angle_deprecated',
        'break_length_deprecated',
        'tfs_deprecated',
        'tfs_zulu_deprecated',
        # also empty in the rows displayed above
        'umpire',
        'sv_id',
        # '.1' copies; in the displayed rows they repeat pitcher_id / Catcher
        'pitcher.1',
        'fielder_2.1',
    ],
    inplace=True,
)

In [5]:
# Make datetime: parse the game_date strings (e.g. '2020-10-27') into datetime64 values
mlb2020['game_date']=pd.to_datetime(mlb2020['game_date'])

In [6]:
# Check dtypes and non-null counts after the column drop and date conversion
mlb2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279660 entries, 0 to 279659
Data columns (total 80 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   index                            279660 non-null  int64         
 1   pitch_type                       279089 non-null  object        
 2   game_date                        279660 non-null  datetime64[ns]
 3   release_speed                    279652 non-null  float64       
 4   release_pos_x                    279652 non-null  float64       
 5   release_pos_z                    279652 non-null  float64       
 6   Pitcher_name                     279660 non-null  object        
 7   batter_id                        279660 non-null  float64       
 8   pitcher_id                       279660 non-null  float64       
 9   events                           70437 non-null   object        
 10  description                      279660 non-

In [7]:
# List the column names that remain after the drop
mlb2020.columns

Index(['index', 'pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'Pitcher_name', 'batter_id', 'pitcher_id', 'events',
       'description', 'zone', 'des', 'game_type', 'stand', 'p_throws',
       'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'balls',
       'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b',
       'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'hc_x',
       'hc_y', 'Catcher', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top',
       'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle',
       'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk',
       'FirstBasemen', 'SecondBasemen', 'ThirdBasemen', 'ShortStop',
       'LeftField', 'CenterField', 'RightField', 'release_pos_y',
       'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
       'woba_value', 'woba_denom', 'babip_value', 'iso_value',
       'launch_speed_angle', 'at_bat_number', '

In [8]:
# Look up the pitch name recorded for the 'FO' pitch-type code
mlb2020.loc[mlb2020['pitch_type'] == 'FO', 'pitch_name']

249966    Forkball
250000    Forkball
252757    Forkball
Name: pitch_name, dtype: object

In [9]:
# Rename CS to CU for Curveball.
# replace() rewrites only the 'CS' code and leaves every other value untouched.
# The previous map() with an explicit dict turned any pitch_type that was not
# one of its keys into NaN, which would silently discard codes that appear in
# other seasons when the same cleaning is reused on past data.
mlb2020['pitch_type'] = mlb2020['pitch_type'].replace({'CS': 'CU'})

# Dealing with Nulls

---
I will drop the rows where `release_speed` is null; the same rows are most likely the ones that are also missing the release position, zone, movement, velocity, and acceleration values.

In [10]:
# Null count per column, laid out as one wide row; re-run to check every so often
mlb2020.isna().sum().to_frame().T

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,Pitcher_name,batter_id,pitcher_id,events,description,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,Catcher,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,0,571,0,8,8,8,0,0,0,209223,0,8,209224,0,0,0,0,0,0,219112,233143,0,0,0,8,8,8,8,252778,226576,191254,0,0,0,233192,233192,0,8,8,8,8,8,8,8,8,195879,197092,197092,8,354,595,0,0,0,0,0,0,0,0,8,233450,233450,209223,209530,209223,209223,233450,0,0,571,0,0,0,0,0,0,0,0,525,525


In [11]:
# Keep only the pitches that have a release speed (dropped 8 nulls)
mlb2020 = mlb2020.dropna(subset=['release_speed']).copy()

For `pitch_type`, I will replace the nulls with "U", and for `pitch_name`, with "Unknown".

In [12]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# selected column: that chained in-place form warns in pandas 2.x and no longer
# updates the frame once Copy-on-Write is active.
mlb2020['pitch_type'] = mlb2020['pitch_type'].fillna('U')
mlb2020['pitch_name'] = mlb2020['pitch_name'].fillna('Unknown')

For fielding alignments, I will also fill with "Unknown". Although they may or may not be vital for our model, I will still apply them.

In [13]:
# Assign the filled columns back rather than using fillna(inplace=True) on a
# selected column, which is chained assignment and does not reliably modify
# mlb2020 (warning in pandas 2.x, no effect under Copy-on-Write).
mlb2020['if_fielding_alignment'] = mlb2020['if_fielding_alignment'].fillna('Unknown')
mlb2020['of_fielding_alignment'] = mlb2020['of_fielding_alignment'].fillna('Unknown')

The other NaNs are in columns that are only filled in when the batter or runners do something, such as hits, steals, or sacrifice flies. Those will remain and may not play a role.

# Feature Engineering 

---
## Strike Attempt

This will show whether a pitch results in a strike or in some other event.

In [14]:
# Collapse the per-pitch `description` into four coarse outcome labels:
# 'strike', 'ball', 'out' and 'ob' (on base).

# Good for creating a model to determine if a ball is a strike or not
# Series.map with a dict leaves NaN for any description that is not a key below.
mlb2020['strike_attempt']=mlb2020['description'].map(
{'called_strike':'strike',
 'swinging_strike':'strike',
 'ball':'ball',
 'foul':'strike', # False in out_via_description
 'hit_into_play':'out',
 'blocked_ball':'ball',
 # NOTE(review): a scoring play is labelled 'ob' even when its event is an out
 # (the sample output below shows 'field_out' with 'hit_into_play_score') - confirm this is intended
 'hit_into_play_score':'ob',
 'swinging_strike_blocked':'strike',
 'hit_into_play_no_out':'ob',
 'foul_bunt':'strike',
 'foul_tip':'strike',
 'hit_by_pitch':'ob',
 'missed_bunt':'strike',
 # NOTE(review): a pitchout is normally recorded as a ball, not an out - confirm 'out' is intended
 'pitchout':'out',
 'bunt_foul_tip':'strike'})

In [15]:
# Share of each outcome label (value_counts ignores NaN by default)
mlb2020['strike_attempt'].value_counts(normalize=True)

strike    0.463315
ball      0.367146
out       0.107012
ob        0.062528
Name: strike_attempt, dtype: float64

## Create Batter Name

---


```python
def batter_name(des):
    """Return the first two words of a play description as the batter's name.

    Parameters
    ----------
    des : str or missing value
        Play description text, e.g. ``'Willy Adames called out on strikes.'``.

    Returns
    -------
    str or float
        The first two space-separated words joined by a space, or ``np.nan``
        when ``des`` is not a string (missing descriptions arrive as NaN).

    Notes
    -----
    Only two words are kept, so longer names are truncated.
    """
    # Explicit type check instead of a bare `except:`, which also swallowed
    # unrelated errors (even KeyboardInterrupt).
    if not isinstance(des, str):
        return np.nan
    return ' '.join(des.split(' ', 2)[:2])
# Rows are listed last pitch first within a plate appearance and only that row
# carries a description, so forward-filling copies the name onto the earlier pitches.
# The filled result is assigned directly; ffill(inplace=True) on a selected column
# is chained assignment and does not reliably update the frame.
mlb2020['batter_name'] = mlb2020['des'].map(batter_name).ffill()
mlb2020.head()
```

In [16]:
def batter_name(des):
    """Return the first two words of a play description as the batter's name.

    Parameters
    ----------
    des : str or missing value
        Play description text, e.g. ``'Willy Adames called out on strikes.'``.

    Returns
    -------
    str or float
        The first two space-separated words joined by a space, or ``np.nan``
        when ``des`` is not a string (missing descriptions arrive as NaN).

    Notes
    -----
    Only two words are kept, so longer names are truncated.
    """
    # Explicit type check instead of a bare `except:`, which also swallowed
    # unrelated errors (even KeyboardInterrupt).
    if not isinstance(des, str):
        return np.nan
    return ' '.join(des.split(' ', 2)[:2])

# Rows are listed last pitch first within a plate appearance and only that row
# carries a description, so forward-filling copies the name onto the earlier pitches.
# The filled result is assigned directly; ffill(inplace=True) on a selected column
# is chained assignment and does not reliably update the frame.
mlb2020['batter_name'] = mlb2020['des'].map(batter_name).ffill()
# mlb2020.head(10)

In [17]:
# Build the "balls-strikes" count label: cast each float column to int, then to text,
# and join the two pieces with a dash.
mlb2020['count'] = (
    mlb2020['balls'].astype(int).astype(str)
    .str.cat(mlb2020['strikes'].astype(int).astype(str), sep='-')
)
mlb2020['count'].unique()

array(['0-2', '0-1', '0-0', '3-2', '2-2', '1-2', '1-1', '1-0', '2-1',
       '2-0', '3-1', '3-0', '4-2'], dtype=object)

---
# Select Features

If I were to go with trying to figure out whether a pitch was a `strike`, `ball`, `out`, or `on-base`, I would select these features.

```python
features = ['game_date','pitch_type','Pitcher_name', 'batter_id','pitcher_id','release_speed', 'release_pos_x', 'release_pos_z', 'stand', 'p_throws', 'balls', 'strikes',
            'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'inning', 'inning_topbot',
            'effective_speed', 'release_spin_rate', 'release_extension','Catcher','FirstBasemen', 'SecondBasemen', 'ThirdBasemen', 'ShortStop',
            'LeftField', 'CenterField', 'RightField', 'at_bat_number', 'pitch_number', 'pitch_name',
            'bat_score', 'fld_score', 'post_bat_score', 'post_fld_score', 'if_fielding_alignment', 'of_fielding_alignment',
            'count', 'strike_attempt','events','description','type','bb_type','vx0','vy0','vz0',
            'ax','ay','az','outs_when_up']

# Pick ONE target column. Writing `series_a or series_b` raises
# "The truth value of a Series is ambiguous", so the alternative is kept as a comment.
target = mlb2020['strike_attempt']  # alternatively: mlb2020['pitch_name']
```

If I wanted to predict the next pitch of a sequence based on factors such as lineup or current at-bat, I would need these features, most likely.

```python
features = ...  # TODO: need to select

# Selecting several columns needs a list inside the brackets; the pitch speed
# column in this data is `release_speed` (there is no `pitch_speed` column).
target = mlb2020[['pitch_name', 'release_speed']] # anything after pitch name is debatable
```

In [18]:
# Columns kept for modelling; the list order is the column order of mlb2020new.
# NOTE(review): the engineered `batter_name` column is not in this list, so it is
# not carried into mlb2020new - confirm that is intended.
features = [
    # game, pitch and player identification
    'game_date', 'pitch_type', 'Pitcher_name', 'batter_id', 'pitcher_id',
    # release, handedness and count
    'release_speed', 'release_pos_x', 'release_pos_z', 'stand', 'p_throws', 'balls', 'strikes',
    # pfx_* and plate_* measurements
    'pfx_x', 'pfx_z', 'plate_x', 'plate_z',
    # runners and inning
    'on_3b', 'on_2b', 'on_1b', 'inning', 'inning_topbot',
    # speed, spin and extension
    'effective_speed', 'release_spin_rate', 'release_extension',
    # fielders
    'Catcher', 'FirstBasemen', 'SecondBasemen', 'ThirdBasemen', 'ShortStop',
    'LeftField', 'CenterField', 'RightField',
    # position of the pitch within the game
    'at_bat_number', 'pitch_number', 'pitch_name',
    # score and fielding alignment
    'bat_score', 'fld_score', 'post_bat_score', 'post_fld_score',
    'if_fielding_alignment', 'of_fielding_alignment',
    # engineered columns and outcome labels
    'count', 'strike_attempt', 'events', 'description', 'type', 'bb_type',
    # v*0 / a* components and outs
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'outs_when_up',
]
mlb2020new = mlb2020.loc[:, features].copy()

# Results

In [19]:
# Confirm the selected columns, their dtypes and non-null counts
mlb2020new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 279652 entries, 0 to 279659
Data columns (total 54 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   game_date              279652 non-null  datetime64[ns]
 1   pitch_type             279652 non-null  object        
 2   Pitcher_name           279652 non-null  object        
 3   batter_id              279652 non-null  float64       
 4   pitcher_id             279652 non-null  float64       
 5   release_speed          279652 non-null  float64       
 6   release_pos_x          279652 non-null  float64       
 7   release_pos_z          279652 non-null  float64       
 8   stand                  279652 non-null  object        
 9   p_throws               279652 non-null  object        
 10  balls                  279652 non-null  float64       
 11  strikes                279652 non-null  float64       
 12  pfx_x                  279652 non-null  floa

In [20]:
# Spot-check five random rows; the fixed seed keeps the displayed sample
# reproducible across Restart & Run All.
mlb2020new.sample(5, random_state=42)

Unnamed: 0,game_date,pitch_type,Pitcher_name,batter_id,pitcher_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
125522,2020-09-02,FF,Nick Tropeano,500874.0,607374.0,91.9,-1.76,6.12,R,R,3.0,1.0,-1.2,1.2,-0.84,2.31,,,,4.0,Top,90.9,2279.0,5.7,607732.0,592567.0,624428.0,663647.0,621028.0,546990.0,657061.0,570256.0,28.0,5.0,4-Seam Fastball,3.0,1.0,3.0,1.0,Standard,Standard,3-1,strike,,foul,S,,4.753227,-133.643085,-6.36497,-14.841024,27.670233,-16.513535,1.0
260206,2020-07-28,SI,Aaron Civale,547989.0,650644.0,92.7,-1.12,6.06,R,R,0.0,1.0,-1.0,0.9,-0.33,1.94,641313.0,660162.0,,5.0,Top,93.3,2309.0,6.3,547379.0,467793.0,514917.0,608070.0,502273.0,656185.0,605548.0,669288.0,44.0,2.0,Sinker,1.0,4.0,1.0,4.0,Standard,Standard,0-1,ob,field_out,hit_into_play_score,X,ground_ball,4.300765,-134.911068,-6.798754,-13.638796,25.720912,-19.736722,1.0
178279,2020-08-19,SL,Jacob deGrom,621446.0,594798.0,91.9,-1.12,5.62,R,R,0.0,0.0,0.1,0.3,0.48,1.61,,,465041.0,2.0,Bot,93.1,2502.0,6.8,467092.0,624413.0,641645.0,605204.0,642708.0,642086.0,607043.0,624424.0,18.0,1.0,Slider,0.0,1.0,0.0,1.0,Standard,Standard,0-0,out,field_out,hit_into_play,X,ground_ball,3.779151,-133.793948,-4.969629,0.949965,26.669977,-28.143366,2.0
193545,2020-08-15,SI,Jonathan Hernandez,596115.0,642546.0,97.9,-1.91,5.45,R,R,1.0,0.0,-1.1,0.6,0.0,1.62,,547172.0,641658.0,7.0,Bot,98.7,2186.0,6.3,624431.0,518618.0,596059.0,643396.0,462101.0,425783.0,669256.0,608336.0,58.0,2.0,Sinker,2.0,3.0,2.0,3.0,Standard,Standard,1-0,out,grounded_into_double_play,hit_into_play,X,ground_ball,7.778678,-142.526861,-6.402343,-17.149011,26.748391,-22.477266,1.0
247955,2020-08-01,CH,Trevor Richards,600524.0,670950.0,83.7,-1.16,6.4,R,R,3.0,2.0,-1.1,1.0,0.38,0.79,,,,6.0,Bot,84.2,2521.0,6.7,572287.0,670712.0,621563.0,650490.0,642715.0,660294.0,622534.0,592669.0,45.0,6.0,Changeup,2.0,1.0,2.0,1.0,Infield shift,Standard,3-2,ball,walk,ball,B,,5.655333,-121.556881,-8.508987,-11.554075,23.183169,-20.984139,0.0


In [21]:
# Save to new csv
# mlb2020new.to_csv('data/2020cleaned.csv', index=False)

In [22]:
# Load a previously saved cleaned file so its columns can be compared with mlb2020new below
check = pd.read_csv('data/cleaned2020.csv')

In [23]:
# Preview the first rows of the previously saved file
check.head()

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,events,description,stand,p_throws,balls,strikes,type,bb_type,vx0,vy0,vz0,ax,ay,az,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,effective_speed,pitcher,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,home_team,away_team,first,second,third
0,0,FF,2020-10-27,96.7,1.58,5.99,Julio Urias,642715.0,strikeout,called_strike,R,L,0.0,2.0,S,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,0.2,1.6,-0.53,2.29,,,,2.0,9.0,Top,95.4,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,3.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0
1,1,FF,2020-10-27,94.1,2.91,5.45,Julio Urias,642715.0,,called_strike,R,L,0.0,1.0,S,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,0.8,1.3,-0.55,3.03,,,,2.0,9.0,Top,93.4,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,2.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0
2,2,FF,2020-10-27,94.9,1.77,6.02,Julio Urias,642715.0,,swinging_strike,R,L,0.0,0.0,S,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,0.2,1.5,-0.04,3.32,,,,2.0,9.0,Top,94.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,1.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0
3,3,FF,2020-10-27,94.4,1.66,5.93,Julio Urias,670712.0,strikeout,called_strike,R,L,3.0,2.0,S,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,0.3,1.5,-0.37,2.15,,,,1.0,9.0,Top,93.7,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,6.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0
4,4,CU,2020-10-27,81.4,1.46,6.06,Julio Urias,670712.0,,ball,R,L,2.0,2.0,B,,-0.383207,-118.44781,-4.454166,-15.553576,24.451936,-32.892744,-1.7,-0.2,-0.14,0.96,,,,1.0,9.0,Top,80.2,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,5.0,Curveball,1.0,3.0,1.0,3.0,Standard,Standard,LAD,TB,0,0,0


In [24]:
# Print every column of mlb2020new that the previously saved file does not have
for col in mlb2020new.columns:
    if col in check.columns:
        continue
    print(col)

Pitcher_name
batter_id
pitcher_id
release_spin_rate
release_extension
Catcher
FirstBasemen
SecondBasemen
ThirdBasemen
ShortStop
LeftField
CenterField
RightField
count
strike_attempt


In [25]:
# Distinct events recorded on rows whose `type` is 'X'
check.loc[check['type'] == 'X', 'events'].unique()

array(['field_out', 'grounded_into_double_play', 'home_run', 'double',
       'single', 'fielders_choice', 'triple', 'force_out', 'sac_bunt',
       'sac_fly', 'fielders_choice_out', 'double_play', 'field_error',
       'sac_fly_double_play', 'triple_play'], dtype=object)

In [26]:
# compare what is in original cleaned and add it into mlb2020new
for col in check.columns:
    if col not in mlb2020new.columns:
        print(col)

index
player_name
batter
pitcher
fielder_2
fielder_3
fielder_4
fielder_5
fielder_6
fielder_7
fielder_8
fielder_9
home_team
away_team
first
second
third


After selecting the columns I want for this, I will import 2019 and before data. I want to concatenate and clean it up so I can put it in a model soon.

---
# Import the other years


In [27]:
# Reference view of the cleaned 2020 layout before loading the other seasons
mlb2020new.head()

Unnamed: 0,game_date,pitch_type,Pitcher_name,batter_id,pitcher_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,2020-10-27,FF,Julio Urias,642715.0,628711.0,96.7,1.58,5.99,R,L,0.0,2.0,0.2,1.6,-0.53,2.29,,,,9.0,Top,95.4,2615.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,3.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-2,strike,strikeout,called_strike,S,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,2.0
1,2020-10-27,FF,Julio Urias,642715.0,628711.0,94.1,2.91,5.45,R,L,0.0,1.0,0.8,1.3,-0.55,3.03,,,,9.0,Top,93.4,2470.0,5.9,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,2.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-1,strike,,called_strike,S,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,2.0
2,2020-10-27,FF,Julio Urias,642715.0,628711.0,94.9,1.77,6.02,R,L,0.0,0.0,0.2,1.5,-0.04,3.32,,,,9.0,Top,94.0,2397.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,65.0,1.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,0-0,strike,,swinging_strike,S,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,2.0
3,2020-10-27,FF,Julio Urias,670712.0,628711.0,94.4,1.66,5.93,R,L,3.0,2.0,0.3,1.5,-0.37,2.15,,,,9.0,Top,93.7,2508.0,5.9,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,6.0,4-Seam Fastball,1.0,3.0,1.0,3.0,Standard,Standard,3-2,strike,strikeout,called_strike,S,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,1.0
4,2020-10-27,CU,Julio Urias,670712.0,628711.0,81.4,1.46,6.06,R,L,2.0,2.0,-1.7,-0.2,-0.14,0.96,,,,9.0,Top,80.2,3031.0,5.7,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,64.0,5.0,Curveball,1.0,3.0,1.0,3.0,Standard,Standard,2-2,ball,,ball,B,,-0.383207,-118.44781,-4.454166,-15.553576,24.451936,-32.892744,1.0


In [28]:
# Load the 2019-2016 season files, newest first, one frame per season
mlb2019, mlb2018, mlb2017, mlb2016 = map(
    pd.read_csv,
    ['data/2019mlb.csv', 'data/2018mlb.csv', 'data/2017mlb.csv', 'data/2016mlb.csv'],
)

In [29]:
# Run the 2019 frame through the project helpers in sequence; each one takes the
# frame and returns the processed frame.
mlb2019 = (
    mlb2019
    .pipe(project.cleaning)
    .pipe(project.strike_attempt_column)
    .pipe(project.fill_in_batters)
)
mlb2019.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,Pitcher_name,batter_id,pitcher_id,events,description,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,Catcher,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,strike_attempt,batter_name
0,0,SL,2019-10-30,87.9,-2.65,5.5,Daniel Hudson,488726.0,543339.0,strikeout,swinging_strike,14.0,Michael Brantley strikes out swinging.,W,L,R,HOU,WSH,S,2.0,,3.0,2.0,2019.0,0.0,0.2,0.88,1.03,,,,2.0,9.0,Bot,,,543228.0,8.472927,-127.797998,-5.181622,-1.384471,23.932114,-29.091156,3.35,1.4,,,,87.8,2461.0,6.1,599377.0,475582.0,452678.0,543685.0,607208.0,665742.0,645302.0,594809.0,54.42,,,0.0,1.0,0.0,0.0,,79.0,7.0,Slider,2.0,6.0,2.0,6.0,6.0,2.0,2.0,6.0,Infield shift,Standard,strike,Michael Brantley
1,1,FF,2019-10-30,95.9,-2.77,5.52,Daniel Hudson,488726.0,543339.0,,foul,7.0,,W,L,R,HOU,WSH,S,,,3.0,2.0,2019.0,-0.6,1.5,-0.47,1.92,,,,2.0,9.0,Bot,,,543228.0,7.236909,-139.21242,-7.101361,-8.940442,32.683954,-11.563514,3.35,1.56,277.0,,,94.4,2572.0,5.7,599377.0,475582.0,452678.0,543685.0,607208.0,665742.0,645302.0,594809.0,54.83,,,,,,,,79.0,6.0,4-Seam Fastball,2.0,6.0,2.0,6.0,6.0,2.0,2.0,6.0,Infield shift,Standard,strike,Michael Brantley
2,2,FF,2019-10-30,96.5,-2.68,5.42,Daniel Hudson,488726.0,543339.0,,ball,14.0,,W,L,R,HOU,WSH,B,,,2.0,2.0,2019.0,-0.7,1.4,1.68,1.35,,,,2.0,9.0,Bot,,,543228.0,12.923798,-139.703293,-8.230555,-11.485524,32.459653,-12.44245,3.53,1.63,,,,95.3,2637.0,5.9,599377.0,475582.0,452678.0,543685.0,607208.0,665742.0,645302.0,594809.0,54.55,,,,,,,,79.0,5.0,4-Seam Fastball,2.0,6.0,2.0,6.0,6.0,2.0,2.0,6.0,Infield shift,Standard,ball,Michael Brantley
3,3,FF,2019-10-30,96.0,-2.65,5.55,Daniel Hudson,488726.0,543339.0,,foul,9.0,,W,L,R,HOU,WSH,S,,,2.0,1.0,2019.0,-0.8,1.5,0.75,2.05,,,,2.0,9.0,Bot,,,543228.0,10.686962,-139.133193,-6.89762,-12.790017,31.422309,-11.667275,3.35,1.56,382.0,,,94.9,2598.0,5.9,599377.0,475582.0,452678.0,543685.0,607208.0,665742.0,645302.0,594809.0,54.6,,,,,,,,79.0,4.0,4-Seam Fastball,2.0,6.0,2.0,6.0,6.0,2.0,2.0,6.0,Infield shift,Standard,strike,Michael Brantley
4,4,SL,2019-10-30,86.7,-2.73,5.59,Daniel Hudson,488726.0,543339.0,,ball,14.0,,W,L,R,HOU,WSH,B,,,1.0,1.0,2019.0,-0.1,0.5,1.27,2.17,,,,2.0,9.0,Bot,,,543228.0,9.633513,-126.044028,-2.982919,-2.223969,21.921075,-26.826688,3.59,1.63,,,,87.0,2598.0,6.2,599377.0,475582.0,452678.0,543685.0,607208.0,665742.0,645302.0,594809.0,54.28,,,,,,,,79.0,3.0,Slider,2.0,6.0,2.0,6.0,6.0,2.0,2.0,6.0,Infield shift,Standard,ball,Michael Brantley


In [34]:
# Null count per column for the cleaned 2019 frame, laid out as one wide row
mlb2019.isna().sum().to_frame().T

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,Pitcher_name,batter_id,pitcher_id,events,description,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,Catcher,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,strike_attempt,batter_name
0,0,0,0,0,21,21,0,0,0,550439,0,21,550435,0,0,0,0,0,0,575027,611294,0,0,0,21,21,21,21,670336,603493,514827,0,0,0,611657,611657,0,21,21,21,21,21,21,21,21,550341,611223,611222,0,12826,21,0,0,0,0,0,0,0,0,21,612943,612943,550439,552088,550439,550439,612943,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
# Show the 2019 rows that have no release_pos_x recorded
# (idea: make a model to determine the perfect strike zone segment)
mlb2019.loc[mlb2019['release_pos_x'].isna()]

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,Pitcher_name,batter_id,pitcher_id,events,description,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,Catcher,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,strike_attempt,batter_name
41612,21387,U,2019-09-22,71.8,,,Robbie Ray,605361.0,592662.0,,called_strike,,,R,L,L,SD,ARI,S,,,0.0,0.0,2019.0,,,,,,,,1.0,5.0,Bot,,,608348.0,,,,,,,,,,,,0.0,,,566352.0,572233.0,527038.0,500871.0,605113.0,668942.0,501659.0,430945.0,,,,,,,,,42.0,1.0,Unknown,2.0,4.0,2.0,4.0,4.0,2.0,2.0,4.0,Standard,Standard,strike,Nick Martini
71084,2118,U,2019-09-15,55.6,,,Shane Bieber,570731.0,669456.0,,blocked_ball,,,R,R,R,CLE,MIN,B,,,0.0,0.0,2019.0,,,,,,,553902.0,1.0,6.0,Top,,,547379.0,,,,,,,,,,,,0.0,,,565390.0,467793.0,543401.0,475247.0,596019.0,656185.0,640458.0,624577.0,,,,,,,,,45.0,1.0,Unknown,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Standard,Standard,ball,Jonathan Schoop
156292,14240,U,2019-08-25,72.4,,,Dario Agrazal,578428.0,642607.0,single,hit_into_play_no_out,,Jose Iglesias singles on a bunt ground ball to...,R,R,R,PIT,CIN,X,5.0,ground_ball,1.0,1.0,2019.0,,,,,,,606157.0,0.0,3.0,Top,114.46,179.7,553869.0,,,,,,,,,,,,0.0,,,566245.0,605137.0,624428.0,592567.0,621028.0,668804.0,516782.0,591741.0,,,,0.9,,1.0,0.0,,26.0,3.0,Unknown,5.0,3.0,3.0,5.0,3.0,5.0,3.0,5.0,Standard,Strategic,ob,Jose Iglesias
194814,2338,U,2019-08-16,63.6,,,Adam Wainwright,606157.0,425794.0,,called_strike,,,R,R,R,CIN,STL,S,,,1.0,1.0,2019.0,,,,,,,,1.0,4.0,Bot,,,425877.0,,,,,,,,,,,,0.0,,,565282.0,502671.0,543939.0,572761.0,657557.0,542303.0,451594.0,669242.0,,,,,,,,,32.0,3.0,Unknown,0.0,6.0,0.0,6.0,6.0,0.0,0.0,6.0,Standard,Standard,strike,Aristides Aquino
324867,10944,U,2019-07-15,70.0,,,Kyle Hendricks,592200.0,543294.0,,ball,,,R,R,R,CHC,CIN,B,,,0.0,0.0,2019.0,,,,,,578428.0,,2.0,4.0,Top,,,605170.0,,,,,,,,,,,,0.0,,,565172.0,519203.0,608365.0,623520.0,595879.0,656941.0,518792.0,592178.0,,,,,,,,,28.0,1.0,Unknown,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Standard,Standard,ball,Curt Casali
332459,16886,U,2019-07-13,86.1,,,Lou Trivino,607223.0,642152.0,strikeout,called_strike,,AJ Reed called out on strikes.,R,L,R,OAK,CWS,S,2.0,,1.0,2.0,2019.0,,,,,,,,0.0,8.0,Top,,,543302.0,,,,,,,,,,,,0.0,,,566131.0,621566.0,620439.0,640461.0,543760.0,543257.0,657656.0,592192.0,,,,0.0,1.0,0.0,0.0,,70.0,6.0,Unknown,12.0,2.0,2.0,12.0,2.0,12.0,2.0,12.0,Standard,Standard,strike,AJ Reed
354930,4598,U,2019-07-04,89.9,,,Lou Trivino,488771.0,642152.0,,foul,,,R,L,R,OAK,MIN,S,,,0.0,1.0,2019.0,,,,,,,570731.0,1.0,8.0,Top,,,543302.0,,,,,,,,,,,,0.0,,,566129.0,621566.0,595777.0,656305.0,543760.0,543257.0,657656.0,592192.0,,,,,,,,,68.0,2.0,Unknown,3.0,2.0,2.0,3.0,2.0,3.0,2.0,3.0,Standard,Standard,strike,Jason Castro
354974,5082,U,2019-07-04,56.3,,,Zack Littell,656305.0,641793.0,,ball,,,R,R,R,OAK,MIN,B,,,0.0,0.0,2019.0,,,,,,543257.0,543302.0,2.0,6.0,Bot,,,488771.0,,,,,,,,,,,,0.0,,,566129.0,501303.0,570731.0,593934.0,593871.0,650333.0,621439.0,596146.0,,,,,,,,,58.0,1.0,Unknown,3.0,2.0,3.0,2.0,2.0,3.0,3.0,2.0,Infield shift,Standard,ball,Matt Chapman
368561,21972,U,2019-06-30,57.4,,,Robbie Ray,605412.0,592662.0,field_out,hit_into_play,,"Joe Panik grounds out, first baseman Christian...",R,L,L,SF,ARI,X,3.0,ground_ball,1.0,1.0,2019.0,,,,,,,,1.0,2.0,Bot,162.91,155.48,608348.0,,,,,,,,,,82.9,-21.0,0.0,,,566513.0,572233.0,545121.0,500871.0,605113.0,641796.0,606466.0,430945.0,,0.052,0.047,0.0,1.0,0.0,0.0,2.0,14.0,3.0,Unknown,2.0,0.0,2.0,0.0,0.0,2.0,2.0,0.0,Standard,Standard,out,Joe Panik
425624,6414,U,2019-06-16,84.9,,,Mike Leake,543760.0,502190.0,single,hit_into_play_no_out,,Marcus Semien singles on a line drive to cente...,R,R,R,OAK,SEA,X,8.0,line_drive,1.0,0.0,2019.0,,,,,,,,0.0,3.0,Bot,127.09,86.69,608596.0,,,,,,,,,,90.4,15.0,0.0,,,566119.0,543592.0,543829.0,572122.0,641487.0,607776.0,605480.0,570267.0,,0.922,0.882,0.9,1.0,1.0,0.0,4.0,22.0,2.0,Unknown,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Standard,Standard,ob,Marcus Semien


In [39]:
# Mean of release_pos_z over the 2019 frame (NaN rows are skipped by mean()).
# NOTE(review): presumably a candidate fill value for the missing rows shown above - confirm
mlb2019['release_pos_z'].mean()

5.896874546802518