In [1]:
import pybaseball as pb
import pandas as pd

In [2]:
from pybaseball import cache

In [3]:
cache.enable()

In [10]:
pd.set_option('display.max_columns', None)

# Gathering the Data per season

---
We selected the 20125-2020 seasons to get enough data to run this model. This is a re-run of the previous project.

The dates include the opening days until the very last pitch of the World Series of the respective years.

In [4]:
mlb2020 = pb.statcast(start_dt='2020-07-23', end_dt='2020-10-27')
mlb2019 = pb.statcast(start_dt='2019-03-20', end_dt='2019-10-30')
mlb2018 = pb.statcast(start_dt='2018-03-29', end_dt='2018-10-28')
mlb2017 = pb.statcast(start_dt='2017-04-02', end_dt='2017-11-01')
mlb2016 = pb.statcast(start_dt='2016-04-03', end_dt='2016-11-02')
mlb2015 = pb.statcast(start_dt='2015-04-05', end_dt='2015-11-01')

print('Finished Acquiring Data')

This is a large query, it may take a moment to complete
Completed sub-query from 2020-07-23 to 2020-07-28
Completed sub-query from 2020-07-29 to 2020-08-03
Completed sub-query from 2020-08-04 to 2020-08-09
Completed sub-query from 2020-08-10 to 2020-08-15
Completed sub-query from 2020-08-16 to 2020-08-21
Completed sub-query from 2020-08-22 to 2020-08-27
Completed sub-query from 2020-08-28 to 2020-09-02
Completed sub-query from 2020-09-03 to 2020-09-08
Completed sub-query from 2020-09-09 to 2020-09-14
Completed sub-query from 2020-09-15 to 2020-09-20
Completed sub-query from 2020-09-21 to 2020-09-26
Completed sub-query from 2020-09-27 to 2020-10-02
Completed sub-query from 2020-10-03 to 2020-10-08
Completed sub-query from 2020-10-09 to 2020-10-14
Completed sub-query from 2020-10-15 to 2020-10-20
Completed sub-query from 2020-10-21 to 2020-10-26
Completed sub-query from 2020-10-27 to 2020-10-27
This is a large query, it may take a moment to complete
Completed sub-query from 2019-03-20 to

In [5]:
# We will check to see if the data was scraped properly

mlb2020.head()

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,77,FF,2020-10-27,96.7,1.58,5.99,"Urías, Julio",642715.0,628711.0,strikeout,...,3.0,1.0,3.0,1.0,3.0,Standard,Standard,149.0,0.011,-0.067
1,79,FF,2020-10-27,94.1,2.91,5.45,"Urías, Julio",642715.0,628711.0,,...,3.0,1.0,3.0,1.0,3.0,Standard,Standard,127.0,0.0,-0.027
2,83,FF,2020-10-27,94.9,1.77,6.02,"Urías, Julio",642715.0,628711.0,,...,3.0,1.0,3.0,1.0,3.0,Standard,Standard,147.0,0.0,-0.017
3,89,FF,2020-10-27,94.4,1.66,5.93,"Urías, Julio",670712.0,628711.0,strikeout,...,3.0,1.0,3.0,1.0,3.0,Standard,Standard,153.0,0.023,-0.208
4,93,CU,2020-10-27,81.4,1.46,6.06,"Urías, Julio",670712.0,628711.0,,...,3.0,1.0,3.0,1.0,3.0,Standard,Standard,308.0,0.0,0.06


In [6]:
# If you choose to save the data as separate CSVs based on year, you can execute

# mlbYEAR.to_csv('mlbYEAR.csv', index=False) 

Since we are after one big concatenated file, we will save it as one gigantic file.

In [11]:
mlbseasons = [mlb2020, mlb2019, mlb2018, mlb2017, mlb2016, mlb2015]
df = pd.concat(mlbseasons)

df.head(10)

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,fielder_2,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,77,FF,2020-10-27,96.7,1.58,5.99,"Urías, Julio",642715.0,628711.0,strikeout,called_strike,,,,,4.0,Willy Adames called out on strikes.,W,R,L,LAD,TB,S,2.0,,0.0,2.0,2020.0,0.18,1.63,-0.53,2.29,,,,2.0,9.0,Top,,,,,605131.0,,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,3.5,1.69,,,,95.4,2615.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.82,,,0.0,1.0,0.0,0.0,,65.0,3.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard,149.0,0.011,-0.067
1,79,FF,2020-10-27,94.1,2.91,5.45,"Urías, Julio",642715.0,628711.0,,called_strike,,,,,1.0,Willy Adames called out on strikes.,W,R,L,LAD,TB,S,,,0.0,1.0,2020.0,0.75,1.3,-0.55,3.03,,,,2.0,9.0,Top,,,,,605131.0,,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,3.49,1.69,,,,93.4,2470.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.59,,,,,,,,65.0,2.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard,127.0,0.0,-0.027
2,83,FF,2020-10-27,94.9,1.77,6.02,"Urías, Julio",642715.0,628711.0,,swinging_strike,,,,,2.0,Willy Adames called out on strikes.,W,R,L,LAD,TB,S,,,0.0,0.0,2020.0,0.23,1.47,-0.04,3.32,,,,2.0,9.0,Top,,,,,605131.0,,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,3.5,1.69,,,,94.0,2397.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.76,,,,,,,,65.0,1.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard,147.0,0.0,-0.017
3,89,FF,2020-10-27,94.4,1.66,5.93,"Urías, Julio",670712.0,628711.0,strikeout,called_strike,,,,,4.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,S,2.0,,3.0,2.0,2020.0,0.27,1.5,-0.37,2.15,,,,1.0,9.0,Top,,,,,605131.0,,,-5.843595,-137.294295,-7.414897,4.754147,30.016237,-11.778755,3.24,1.53,,,,93.7,2508.0,5.9,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.6,,,0.0,1.0,0.0,0.0,,64.0,6.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard,153.0,0.023,-0.208
4,93,CU,2020-10-27,81.4,1.46,6.06,"Urías, Julio",670712.0,628711.0,,ball,,,,,13.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,B,,,2.0,2.0,2020.0,-1.67,-0.15,-0.14,0.96,,,,1.0,9.0,Top,,,,,605131.0,,,-0.383207,-118.44781,-4.454166,-15.553576,24.451936,-32.892744,3.2,1.53,,,,80.2,3031.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.84,,,,,,,,64.0,5.0,Curveball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard,308.0,0.0,0.06
5,95,FF,2020-10-27,95.6,1.81,5.95,"Urías, Julio",670712.0,628711.0,,ball,,,,,11.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,B,,,1.0,2.0,2020.0,0.3,1.44,-0.47,3.94,,,,1.0,9.0,Top,,,,,605131.0,,,-6.600812,-139.135295,-2.905682,5.33565,30.811825,-12.976271,3.22,1.53,,,,94.9,2437.0,5.8,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.66,,,,,,,,64.0,4.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard,148.0,0.0,0.03
6,98,CU,2020-10-27,80.6,1.5,6.11,"Urías, Julio",670712.0,628711.0,,foul,,,,,13.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,S,,,1.0,1.0,2020.0,-1.57,-0.09,-0.97,2.06,,,,1.0,9.0,Top,,,,,605131.0,,,-2.476312,-117.286244,-2.102737,-13.916321,24.086999,-32.781095,3.34,1.53,8.0,92.8,-16.0,79.3,2971.0,5.6,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.94,,,,,,,,64.0,3.0,Curveball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Infield shift,Standard,128.0,0.0,-0.044
7,102,FF,2020-10-27,94.7,1.82,5.95,"Urías, Julio",670712.0,628711.0,,foul,,,,,5.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,S,,,1.0,0.0,2020.0,0.5,1.41,0.08,2.59,,,,1.0,9.0,Top,,,,,605131.0,,,-5.607907,-137.809921,-6.197171,7.630847,29.808875,-13.053227,3.34,1.53,240.0,77.3,29.0,93.9,2478.0,5.8,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.72,,,,,,,,64.0,2.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Infield shift,Standard,140.0,0.0,-0.033
8,108,FF,2020-10-27,94.3,1.67,6.11,"Urías, Julio",670712.0,628711.0,,ball,,,,,12.0,Mike Brosseau called out on strikes.,W,R,L,LAD,TB,B,,,0.0,0.0,2020.0,0.52,1.44,1.18,3.52,,,,1.0,9.0,Top,,,,,605131.0,,,-2.392348,-137.363478,-4.225068,7.109486,29.169727,-13.246297,3.24,1.53,,,,93.6,2581.0,5.7,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.79,,,,,,,,64.0,1.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Infield shift,Standard,146.0,0.0,0.027
9,114,FF,2020-10-27,95.2,1.72,6.09,"Urías, Julio",622534.0,628711.0,field_out,hit_into_play,,,,,6.0,Manuel Margot flies out to right fielder Mooki...,W,R,L,LAD,TB,X,9.0,fly_ball,1.0,2.0,2020.0,0.4,1.48,0.4,2.56,,,,0.0,9.0,Top,191.34,99.03,,,605131.0,,,-4.306764,-138.539023,-6.867868,6.166594,30.824839,-11.731952,3.25,1.53,283.0,83.4,44.0,94.0,2450.0,5.6,635886.0,628711.0,605131.0,571970.0,571771.0,621458.0,608369.0,621035.0,641355.0,605141.0,54.92,0.027,0.041,0.0,1.0,0.0,0.0,3.0,63.0,4.0,4-Seam Fastball,3.0,1.0,1.0,3.0,1.0,3.0,1.0,3.0,Standard,Standard,148.0,0.037,-0.18


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3925776 entries, 0 to 712838
Data columns (total 93 columns):
 #   Column                           Dtype         
---  ------                           -----         
 0   index                            int64         
 1   pitch_type                       object        
 2   game_date                        datetime64[ns]
 3   release_speed                    float64       
 4   release_pos_x                    float64       
 5   release_pos_z                    float64       
 6   player_name                      object        
 7   batter                           float64       
 8   pitcher                          float64       
 9   events                           object        
 10  description                      object        
 11  spin_dir                         float64       
 12  spin_rate_deprecated             float64       
 13  break_angle_deprecated           float64       
 14  break_length_deprecated          fl