#  Setup

In [1]:
!pip install pybaseball
import pybaseball
from pybaseball import statcast, statcast_pitcher, playerid_lookup

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
# Cache the sub-queries on disk so an interrupted download can be resumed
# instead of restarted (the logged run of this query took over 13 minutes).
pybaseball.cache.enable()

# Pitch-by-pitch Statcast data for 2022-04-07 through 2022-10-05.
data = statcast('2022-04-07', '2022-10-05')

# Persist the download. A later cell reloads the season with
# pd.read_csv('data2022.csv') and nothing else in the notebook writes that
# file, so without this line a fresh "Restart & Run All" fails at that cell.
# The index is written as well (to_csv default), which is what produces the
# extra 'Unnamed: 0' column seen after the reload.
data.to_csv('data2022.csv')

print(data.shape)
data.head(10)

This is a large query, it may take a moment to complete


That's a nice request you got there. It'd be a shame if something were to happen to it.
We strongly recommend that you enable caching before running this. It's as simple as `pybaseball.cache.enable()`.
Since the Statcast requests can take a *really* long time to run, if something were to happen, like: a disconnect;
gremlins; computer repair by associates of Rudy Giuliani; electromagnetic interference from metal trash cans; etc.;
you could lose a lot of progress. Enabling caching will allow you to immediately recover all the successful
subqueries if that happens.
100%|██████████| 182/182 [13:44<00:00,  4.53s/it]


(708540, 92)


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
3171,CH,2022-10-05,80.8,-0.76,6.61,"Baker, Bryan",624415,641329,field_out,hit_into_play,...,5,4,5,4,5,Infield shift,Standard,213,0.035,-0.099
3387,FF,2022-10-05,97.7,-0.58,6.6,"Baker, Bryan",643376,641329,strikeout,swinging_strike,...,5,4,5,4,5,Standard,Standard,189,0.052,-0.134
3467,CH,2022-10-05,84.9,-0.55,6.58,"Baker, Bryan",643376,641329,,ball,...,5,4,5,4,5,Standard,Standard,212,0.0,0.023
3590,FF,2022-10-05,97.2,-0.42,6.6,"Baker, Bryan",643376,641329,,swinging_strike,...,5,4,5,4,5,Standard,Standard,192,0.0,-0.036
3846,FC,2022-10-05,86.2,-0.55,6.64,"Baker, Bryan",643376,641329,,called_strike,...,5,4,5,4,5,Standard,Standard,127,0.0,-0.031
3886,FF,2022-10-05,97.5,-0.56,6.68,"Baker, Bryan",643376,641329,,ball,...,5,4,5,4,5,Standard,Standard,194,0.0,0.023
4028,CH,2022-10-05,84.9,-0.58,6.65,"Baker, Bryan",665489,641329,strikeout,called_strike,...,5,4,5,4,5,Standard,Standard,207,0.071,-0.158
4181,FF,2022-10-05,98.2,-0.59,6.72,"Baker, Bryan",665489,641329,,ball,...,5,4,5,4,5,Standard,Standard,190,0.0,0.017
4387,CH,2022-10-05,83.8,-0.41,6.57,"Baker, Bryan",665489,641329,,called_strike,...,5,4,5,4,5,Standard,Standard,211,0.0,-0.046
4557,FF,2022-10-05,96.7,-0.41,6.75,"Baker, Bryan",665489,641329,,swinging_strike,...,5,4,5,4,5,Standard,Standard,192,0.0,-0.034




In [30]:
# Load the season from the local CSV file 'data2022.csv'.
# NOTE(review): the reloaded frame has one more column than the fresh download
# (an int64 'Unnamed: 0' column) - presumably the index that was written when
# the CSV was saved; confirm. It is discarded by the column selection below.
data = pd.read_csv('data2022.csv')
data.shape

(708540, 93)

# Data Cleaning

## Determining Training Data Columns

In [31]:
# Print one (column name, dtype) tuple per line for every column in `data`.
for column, column_dtype in zip(data.columns, data.dtypes):
    print((column, column_dtype))

('Unnamed: 0', dtype('int64'))
('pitch_type', dtype('O'))
('game_date', dtype('O'))
('release_speed', dtype('float64'))
('release_pos_x', dtype('float64'))
('release_pos_z', dtype('float64'))
('player_name', dtype('O'))
('batter', dtype('int64'))
('pitcher', dtype('int64'))
('events', dtype('O'))
('description', dtype('O'))
('spin_dir', dtype('float64'))
('spin_rate_deprecated', dtype('float64'))
('break_angle_deprecated', dtype('float64'))
('break_length_deprecated', dtype('float64'))
('zone', dtype('float64'))
('des', dtype('O'))
('game_type', dtype('O'))
('stand', dtype('O'))
('p_throws', dtype('O'))
('home_team', dtype('O'))
('away_team', dtype('O'))
('type', dtype('O'))
('hit_location', dtype('float64'))
('bb_type', dtype('O'))
('balls', dtype('int64'))
('strikes', dtype('int64'))
('game_year', dtype('int64'))
('pfx_x', dtype('float64'))
('pfx_z', dtype('float64'))
('plate_x', dtype('float64'))
('plate_z', dtype('float64'))
('on_3b', dtype('float64'))
('on_2b', dtype('float64'))
(

https://baseballsavant.mlb.com/csv-docs 

Which columns should we keep, based on the documentation? Only metrics derived from the movement of the pitch itself, with no regard for the pitcher who threw it; the one exception is the pitcher's handedness.
  - Motivation: left- and right-handed pitchers have different pitch mixes (e.g. there are more right-handed changeups than left-handed ones)

In [32]:
# Columns retained for modelling: pitcher identity and handedness, the pitch
# labels, and the per-pitch measurements. Order matches the original selection.
kept_columns = [
    'pitcher', 'player_name', 'p_throws',
    'pitch_type', 'pitch_name',
    'release_spin_rate', 'release_speed', 'effective_speed', 'spin_axis',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'pfx_x', 'pfx_z',
    'plate_x', 'plate_z',
]
data = data[kept_columns]

In [33]:
# Print one (column name, dtype) tuple per line to confirm which columns
# remain in `data`.
for column, column_dtype in zip(data.columns, data.dtypes):
    print((column, column_dtype))

('pitcher', dtype('int64'))
('player_name', dtype('O'))
('p_throws', dtype('O'))
('pitch_type', dtype('O'))
('pitch_name', dtype('O'))
('release_spin_rate', dtype('float64'))
('release_speed', dtype('float64'))
('effective_speed', dtype('float64'))
('spin_axis', dtype('float64'))
('vx0', dtype('float64'))
('vy0', dtype('float64'))
('vz0', dtype('float64'))
('ax', dtype('float64'))
('ay', dtype('float64'))
('az', dtype('float64'))
('pfx_x', dtype('float64'))
('pfx_z', dtype('float64'))
('plate_x', dtype('float64'))
('plate_z', dtype('float64'))


In [34]:
# Number of pitches per human-readable pitch name (value_counts sorts the
# result from most to least frequent).
pitch_names = data['pitch_name'].value_counts()
pitch_names

4-Seam Fastball    232954
Slider             147516
Sinker             111403
Changeup            80275
Curveball           59570
Cutter              50758
Knuckle Curve       13541
Split-Finger        10358
Fastball             1279
Eephus                494
Knuckleball            19
Name: pitch_name, dtype: int64

In [35]:
# Number of pitches per abbreviated pitch-type code (most frequent first).
pitch_types = data['pitch_type'].value_counts()
pitch_types

FF    232954
SL    147516
SI    111403
CH     80275
CU     59473
FC     50758
KC     13541
FS     10358
FA      1279
EP       494
CS        97
KN        19
Name: pitch_type, dtype: int64

In [36]:
# Inspect the pitches whose pitch_name is the generic 'Fastball'.
data.loc[data['pitch_name'] == 'Fastball']

Unnamed: 0,pitcher,player_name,p_throws,pitch_type,pitch_name,release_spin_rate,release_speed,effective_speed,spin_axis,vx0,vy0,vz0,ax,ay,az,pfx_x,pfx_z,plate_x,plate_z
6761,620443,"Torrens, Luis",R,FA,Fastball,2028.0,88.8,87.4,198.0,5.908894,-129.211593,-6.160026,-9.017058,23.988244,-16.007847,-0.70,1.34,0.83,2.94
6762,620443,"Torrens, Luis",R,FA,Fastball,2013.0,87.1,86.0,210.0,7.767040,-126.695007,-5.562338,-13.780615,23.249160,-19.449337,-1.14,1.09,0.36,2.11
6763,620443,"Torrens, Luis",R,FA,Fastball,2006.0,84.8,83.5,215.0,5.576926,-123.404175,-4.159193,-5.468194,23.603541,-16.933643,-0.43,1.43,0.36,2.97
6764,620443,"Torrens, Luis",R,FA,Fastball,1920.0,85.1,84.2,205.0,8.035447,-123.709060,-4.118744,-8.744140,22.830860,-21.535999,-0.70,0.97,1.11,2.37
6765,620443,"Torrens, Luis",R,FA,Fastball,1929.0,86.6,85.1,197.0,10.013253,-125.686932,-5.303777,-13.215244,24.124622,-19.228439,-1.06,1.13,1.72,2.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694456,608348,"Kelly, Carson",R,FA,Fastball,1857.0,78.6,76.1,205.0,5.157492,-114.128093,-3.105126,-2.843360,26.246587,-16.087129,-0.19,1.82,0.61,3.86
694457,608348,"Kelly, Carson",R,FA,Fastball,1592.0,65.5,63.3,206.0,4.637400,-95.083521,-3.870533,-5.016676,17.420544,-21.174033,-0.70,1.76,0.39,1.62
694459,608348,"Kelly, Carson",R,FA,Fastball,1949.0,79.0,76.6,211.0,6.208427,-114.661908,-5.294731,-6.994042,25.365648,-17.540070,-0.65,1.58,0.79,2.66
694460,608348,"Kelly, Carson",R,FA,Fastball,1509.0,65.2,63.4,210.0,2.917025,-94.862012,-4.324639,-3.695498,16.350107,-21.961350,-0.53,1.62,-0.38,1.32


In [37]:
# Who threw the pitches named 'Fastball', and how many each.
data.loc[data['pitch_name'] == 'Fastball', 'player_name'].value_counts()

Alberto, Hanser           151
Gordon, Nick               85
Kelly, Carson              67
VanMeter, Josh             62
Phillips, Brett            59
Harrison, Josh             51
Stubbs, Garrett            45
Brosseau, Mike             44
Myers, Wil                 38
Torrens, Luis              33
Clement, Ernie             32
McKenna, Ryan              31
Hager, Jake                28
Strange-Gordon, Dee        28
Bethancourt, Christian     27
Bradley Jr., Jackie        27
Pinder, Chad               26
Castillo, Diego            26
Schrock, Max               26
Culberson, Charlie         26
Maton, Nick                25
Schwindel, Frank           24
Simmons, Andrelton         21
Reynolds, Matt             20
Pujols, Albert             19
González, Luis             18
Leon, Sandy                17
Castro, Harold             16
Ford, Mike                 15
Plawecki, Kevin            14
Caratini, Victor           14
Dickerson, Corey           12
Merrifield, Whit           12
Molina, Ya

In [38]:
# Inspect the pitches whose pitch_name is 'Eephus'.
data.loc[data['pitch_name'] == 'Eephus']

Unnamed: 0,pitcher,player_name,p_throws,pitch_type,pitch_name,release_spin_rate,release_speed,effective_speed,spin_axis,vx0,vy0,vz0,ax,ay,az,pfx_x,pfx_z,plate_x,plate_z
22034,545121,"Vargas, Ildemaro",R,EP,Eephus,841.0,38.6,37.1,204.0,0.657855,-54.456680,11.293683,0.051253,5.996353,-30.418288,0.06,1.67,-0.16,4.32
22036,545121,"Vargas, Ildemaro",R,EP,Eephus,812.0,38.9,37.2,216.0,-0.237409,-54.731675,12.313730,-0.238409,5.931549,-30.790850,-0.13,1.51,-0.99,5.20
22038,545121,"Vargas, Ildemaro",R,EP,Eephus,898.0,39.0,37.6,214.0,1.599343,-55.654232,9.203681,-0.112455,6.065402,-29.648962,0.03,1.86,0.50,2.68
22039,545121,"Vargas, Ildemaro",R,EP,Eephus,786.0,38.8,37.7,200.0,-0.054562,-55.025723,10.620663,0.609237,4.665442,-30.573263,0.30,1.33,-0.71,3.81
23315,665019,"Clemens, Kody",R,EP,Eephus,1042.0,46.1,44.3,201.0,0.513493,-66.848762,5.700501,-0.052311,5.908564,-28.959118,0.00,1.28,-0.85,3.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644539,660636,"Castillo, Diego",R,EP,Eephus,1658.0,67.3,66.6,215.0,6.064892,-97.949027,0.604866,-1.991434,14.706473,-20.967695,-0.17,1.78,1.63,3.66
688029,571976,"Myers, Wil",R,EP,Eephus,1881.0,51.5,49.3,29.0,0.924721,-74.678987,3.750270,3.443837,10.185812,-37.814632,0.96,-1.32,0.52,1.22
688034,571976,"Myers, Wil",R,EP,Eephus,1982.0,54.6,52.1,41.0,2.643121,-79.223556,3.304773,2.718303,12.443245,-38.140390,0.76,-1.25,1.46,1.36
688039,571976,"Myers, Wil",R,EP,Eephus,1991.0,55.3,52.9,35.0,1.963514,-80.321091,2.368406,3.810391,12.177114,-38.965036,0.96,-1.45,0.83,0.68


In [39]:
# Who threw the pitches named 'Eephus', and how many each.
data.loc[data['pitch_name'] == 'Eephus', 'player_name'].value_counts()

Clemens, Kody             86
González, Luis            68
Castro, Harold            34
Castillo, Diego           34
Bethancourt, Christian    25
Neuse, Sheldon            24
Batten, Matthew           23
Chang, Yu                 22
Lopez, Alejo              19
Astudillo, Willians       17
Schwindel, Frank          16
Stubbs, Garrett           15
Ruf, Darin                14
Walton, Donovan           13
VanMeter, Josh            12
Escobar, Alcides           9
Wynns, Austin              9
Reyes, Pablo               8
Arcia, Orlando             8
Sanchez, Carlos            7
Knizner, Andrew            6
Dickerson, Corey           6
Myers, Wil                 5
Vargas, Ildemaro           4
Bradley Jr., Jackie        3
Palacios, Jermaine         3
Gonzalez, Marwin           2
Alberto, Hanser            1
Molina, Yadier             1
Name: player_name, dtype: int64

In [40]:
# Knuckleballs: inspect the pitches whose pitch_name is 'Knuckleball'.
data.loc[data['pitch_name'] == 'Knuckleball']

Unnamed: 0,pitcher,player_name,p_throws,pitch_type,pitch_name,release_spin_rate,release_speed,effective_speed,spin_axis,vx0,vy0,vz0,ax,ay,az,pfx_x,pfx_z,plate_x,plate_z
366740,676391,"Clement, Ernie",R,KN,Knuckleball,1065.0,49.9,49.3,216.0,-0.396687,-72.632626,2.717848,-1.610823,8.003952,-28.684959,-0.47,1.12,-2.39,1.3
366745,676391,"Clement, Ernie",R,KN,Knuckleball,1168.0,53.9,52.1,211.0,1.539443,-78.019912,6.574567,-3.299501,10.679282,-28.226153,-0.76,1.25,-1.19,5.22
366749,676391,"Clement, Ernie",R,KN,Knuckleball,411.0,53.7,50.6,152.0,-1.421582,-76.858301,11.792394,0.412825,12.469656,-32.580382,0.05,0.46,-2.61,7.88
366751,676391,"Clement, Ernie",R,KN,Knuckleball,1105.0,51.0,49.6,209.0,2.345995,-74.120517,3.781626,-2.718059,8.595961,-29.513097,-0.66,0.9,-0.51,2.56
366756,676391,"Clement, Ernie",R,KN,Knuckleball,1056.0,46.9,45.3,202.0,0.931493,-67.736107,7.751091,-0.549931,6.895081,-30.741344,-0.15,0.78,-0.89,4.43
386797,676391,"Clement, Ernie",R,KN,Knuckleball,1226.0,52.5,50.5,234.0,2.899821,-76.104455,4.117468,-4.514909,10.298678,-27.806319,-1.07,1.34,-0.47,3.17
386799,676391,"Clement, Ernie",R,KN,Knuckleball,90.0,55.2,52.4,342.0,0.01439,-79.896212,5.281347,-7.300036,14.351981,-31.144068,-1.76,0.55,-3.18,3.33
386812,676391,"Clement, Ernie",R,KN,Knuckleball,2841.0,50.2,47.1,92.0,-0.284259,-72.478502,5.370313,0.752915,13.607667,-35.748256,0.21,-0.66,-1.96,0.76
386814,676391,"Clement, Ernie",R,KN,Knuckleball,275.0,50.3,47.5,213.0,-1.182588,-72.441673,8.129041,0.8518,11.779551,-32.357822,0.19,0.42,-2.62,4.12
386817,676391,"Clement, Ernie",R,KN,Knuckleball,292.0,61.0,58.9,238.0,4.953775,-88.584033,2.117073,-4.101253,14.788409,-33.315609,-0.63,-0.1,0.84,1.54


In [41]:
# Who threw the pitches named 'Knuckleball', and how many each.
data.loc[data['pitch_name'] == 'Knuckleball', 'player_name'].value_counts()

Clement, Ernie      10
Mayfield, Jack       7
Schwindel, Frank     2
Name: player_name, dtype: int64

In [42]:
# Number of rows to delete: total count of the pitch names inspected above
# (Fastball, Eephus, Knuckleball). They are selected by name rather than by
# position (`[-3:]`), so the result no longer depends on these three happening
# to be the least frequent entries of the sorted counts.
pitch_names[pitch_names.index.isin(['Fastball', 'Eephus', 'Knuckleball'])].sum()

1792

In [55]:
# Keep every row whose pitch_type is not FA, EP or KN. Rows with a missing
# pitch_type do not match any of the codes, so they are kept here.
df = data.loc[~data['pitch_type'].isin(['FA', 'EP', 'KN'])]
df.shape

(706748, 19)

In [56]:
# Sanity check: how many rows the pitch-type filter removed.
print(len(data) - len(df))

1792


In [57]:
# Display the frame that remains after the FA / EP / KN pitch types were filtered out.
df

Unnamed: 0,pitcher,player_name,p_throws,pitch_type,pitch_name,release_spin_rate,release_speed,effective_speed,spin_axis,vx0,vy0,vz0,ax,ay,az,pfx_x,pfx_z,plate_x,plate_z
0,641329,"Baker, Bryan",R,CH,Changeup,1771.0,80.8,81.5,213.0,3.558407,-117.646983,-4.681001,-13.171122,21.657751,-24.034138,-1.34,0.80,-0.32,2.24
1,641329,"Baker, Bryan",R,FF,4-Seam Fastball,2434.0,97.7,98.1,189.0,2.238826,-142.091777,-8.968560,-2.851911,33.061824,-8.887788,-0.17,1.58,0.10,2.59
2,641329,"Baker, Bryan",R,CH,Changeup,1856.0,84.9,85.2,212.0,6.936655,-123.422405,-5.325648,-13.889678,25.079523,-22.776142,-1.22,0.83,1.35,2.31
3,641329,"Baker, Bryan",R,FF,4-Seam Fastball,2258.0,97.2,97.5,192.0,-0.055174,-141.325311,-10.022382,-1.712935,33.111733,-6.725581,-0.13,1.74,-0.55,2.30
4,641329,"Baker, Bryan",R,FC,Cutter,2348.0,86.2,86.7,127.0,0.906464,-125.636009,-2.473559,6.513804,24.352439,-31.861988,0.63,0.00,0.36,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708535,660271,"Ohtani, Shohei",R,FS,Split-Finger,1314.0,91.8,93.8,234.0,2.903163,-133.823709,-4.599003,-4.200020,23.666550,-30.580581,-0.30,0.08,-0.89,2.05
708536,660271,"Ohtani, Shohei",R,SL,Slider,2440.0,86.9,88.6,92.0,1.715667,-126.691081,-4.680738,11.802514,22.704431,-25.935414,1.11,0.51,-0.64,1.74
708537,660271,"Ohtani, Shohei",R,FF,4-Seam Fastball,2320.0,99.2,100.7,207.0,10.013029,-144.212678,-7.642544,-1.416227,28.775681,-19.306867,0.04,0.81,1.53,1.84
708538,660271,"Ohtani, Shohei",R,FF,4-Seam Fastball,2164.0,97.9,99.9,206.0,4.033907,-142.625616,-7.606169,0.276497,27.659035,-16.289654,0.08,1.06,-0.62,2.01


## Missing Values

In [58]:
# Count the missing values in each column.
df.isna().sum()

pitcher                 0
player_name             0
p_throws                0
pitch_type            373
pitch_name            373
release_spin_rate    2288
release_speed         429
effective_speed       503
spin_axis            2310
vx0                   413
vy0                   413
vz0                   413
ax                    413
ay                    413
az                    413
pfx_x                 427
pfx_z                 417
plate_x               413
plate_z               413
dtype: int64

In [61]:
# Example: count, per pitcher, the pitches whose spin rate is missing.
df.loc[df['release_spin_rate'].isna(), 'player_name'].value_counts()

Lodolo, Nick        106
Ashcraft, Graham    104
Smyly, Drew          92
Sampson, Adrian      87
Thompson, Keegan     74
                   ... 
Cano, Yennier         1
Hamilton, Ian         1
Sands, Cole           1
Snell, Blake          1
Montas, Frankie       1
Name: player_name, Length: 358, dtype: int64

Option A: For each pitcher, compute the average of every metric column and fill that pitcher's missing values in each column with the corresponding average.

Option B: Delete every row that contains *any* missing value.

In [68]:
# Rows of `df` that have at least one missing value
missing_row_vals = df.isna().any(axis='columns').sum()
print(missing_row_vals)
# Those rows as a percentage of the whole regular-season dataset (`data`)
print('Percentage Error of Statcast', round(100 * (missing_row_vals / len(data)), 2))

2472
Percentage Error of Statcast 0.35


In [70]:
# Remove all rows containing missing values
num_rows1 = df.shape[0]

# Drop NaN rows, drop the abbreviated pitch_type column (pitch_name stays as
# the pitch label), and renumber the index from 0.
# errors='ignore' keeps the cell re-runnable: once pitch_type has been removed,
# executing the cell again no longer raises a KeyError.
df = (
    df.dropna()
      .drop(columns=['pitch_type'], errors='ignore')
      .reset_index(drop=True)
)

print(df.shape)
print('Number of rows removed:', num_rows1 - df.shape[0])

(704276, 19)
Number of rows removed: 2472


In [72]:
# Display the cleaned frame (rows with missing values dropped, index reset).
df

Unnamed: 0,pitcher,player_name,p_throws,pitch_name,release_spin_rate,release_speed,effective_speed,spin_axis,vx0,vy0,vz0,ax,ay,az,pfx_x,pfx_z,plate_x,plate_z
0,641329,"Baker, Bryan",R,Changeup,1771.0,80.8,81.5,213.0,3.558407,-117.646983,-4.681001,-13.171122,21.657751,-24.034138,-1.34,0.80,-0.32,2.24
1,641329,"Baker, Bryan",R,4-Seam Fastball,2434.0,97.7,98.1,189.0,2.238826,-142.091777,-8.968560,-2.851911,33.061824,-8.887788,-0.17,1.58,0.10,2.59
2,641329,"Baker, Bryan",R,Changeup,1856.0,84.9,85.2,212.0,6.936655,-123.422405,-5.325648,-13.889678,25.079523,-22.776142,-1.22,0.83,1.35,2.31
3,641329,"Baker, Bryan",R,4-Seam Fastball,2258.0,97.2,97.5,192.0,-0.055174,-141.325311,-10.022382,-1.712935,33.111733,-6.725581,-0.13,1.74,-0.55,2.30
4,641329,"Baker, Bryan",R,Cutter,2348.0,86.2,86.7,127.0,0.906464,-125.636009,-2.473559,6.513804,24.352439,-31.861988,0.63,0.00,0.36,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
704271,660271,"Ohtani, Shohei",R,Split-Finger,1314.0,91.8,93.8,234.0,2.903163,-133.823709,-4.599003,-4.200020,23.666550,-30.580581,-0.30,0.08,-0.89,2.05
704272,660271,"Ohtani, Shohei",R,Slider,2440.0,86.9,88.6,92.0,1.715667,-126.691081,-4.680738,11.802514,22.704431,-25.935414,1.11,0.51,-0.64,1.74
704273,660271,"Ohtani, Shohei",R,4-Seam Fastball,2320.0,99.2,100.7,207.0,10.013029,-144.212678,-7.642544,-1.416227,28.775681,-19.306867,0.04,0.81,1.53,1.84
704274,660271,"Ohtani, Shohei",R,4-Seam Fastball,2164.0,97.9,99.9,206.0,4.033907,-142.625616,-7.606169,0.276497,27.659035,-16.289654,0.08,1.06,-0.62,2.01


## Preprocessing: Scaling Pitch Names

# Model Construction

https://erainnovator.com/data-preprocessing-with-python/

https://www.geeksforgeeks.org/multiclass-classification-using-scikit-learn/

https://www.tutorialspoint.com/scikit_learn/scikit_learn_kneighbors_classifier.htm

https://towardsdatascience.com/logistic-regression-classifier-8583e0c3cf9

https://towardsdatascience.com/python-scikit-learn-logistic-regression-classification-eb9c8de8938d
