In [63]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, train_test_split

# Import Data

In [25]:
# Load the concatenated 2015-19 data (train) and the cleaned 2020 data (test),
# then show both shapes as a quick sanity check.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/2020cleaned.csv')
)
train.shape, test.shape

((3646118, 55), (279660, 55))

In [28]:
# Keep only the training rows where all three of these measurements are present.
train = train.dropna(subset=['release_pos_x', 'release_spin_rate', 'effective_speed'])

In [23]:
# Row/column count of the training frame, followed by its first three rows.
print(train.shape)
train.iloc[:3]

(3486421, 55)


Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,2015-11-01,FF,Wade Davis,451584,Wilmer Flores,527038,96.1,-1.92,5.95,1,1,1,2,-0.27,1.7,-0.93,1.62,0,624424,0,12,0,95.4,2463.0,6.4,460077,543333,450314,519058,444876,460086,456715,449181,94,6,4-Seam Fastball,2,7,2,7,Standard,Strategic,1-2,strike,strikeout,called_strike,S,,3.075,-139.609,-9.724,-2.028,34.78,-11.706,2
1,2015-11-01,FC,Wade Davis,451584,Wilmer Flores,527038,93.1,-1.5,6.02,1,1,1,2,0.24,0.72,0.78,1.55,0,624424,0,12,0,92.2,2705.0,5.9,460077,543333,450314,519058,444876,460086,456715,449181,94,5,Cutter,2,7,2,7,Standard,Strategic,1-2,strike,,foul,S,,5.156,-135.442,-7.227,5.004,28.835,-25.323,2
2,2015-11-01,FF,Wade Davis,451584,Wilmer Flores,527038,97.0,-1.37,6.08,1,1,1,2,-0.5,1.41,1.32,2.47,0,624424,0,12,0,96.4,2362.0,6.3,460077,543333,450314,519058,444876,460086,456715,449181,94,4,4-Seam Fastball,2,7,2,7,Standard,Strategic,1-2,strike,,foul,S,,8.459,-140.843,-7.313,-5.412,31.551,-15.189,2


In [29]:
# Count the missing values in each training column.
train.isna().sum()

game_date                      0
pitch_type                     0
Pitcher_name                   0
pitcher_id                     0
batter_name                    0
batter_id                      0
release_speed                  0
release_pos_x                  0
release_pos_z                  0
stand                          0
p_throws                       0
balls                          0
strikes                        0
pfx_x                          0
pfx_z                          0
plate_x                        0
plate_z                        0
on_3b                          0
on_2b                          0
on_1b                          0
inning                         0
inning_topbot                  0
effective_speed                0
release_spin_rate              0
release_extension              1
Catcher                        0
FirstBasemen                   0
SecondBasemen                  0
ThirdBasemen                   0
ShortStop                      0
LeftField 

In [30]:
# Keep only the test rows where all three of these measurements are present
# (same filter as applied to the training frame).
test = test.dropna(subset=['release_pos_x', 'release_spin_rate', 'effective_speed'])

In [24]:
# Row/column count of the test frame, followed by its first three rows.
print(test.shape)
test.iloc[:3]

(279061, 55)


Unnamed: 0,game_date,pitch_type,Pitcher_name,pitcher_id,batter_name,batter_id,release_speed,release_pos_x,release_pos_z,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,inning,inning_topbot,effective_speed,release_spin_rate,release_extension,Catcher,FirstBasemen,SecondBasemen,ThirdBasemen,ShortStop,LeftField,CenterField,RightField,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,count,strike_attempt,events,description,type,bb_type,vx0,vy0,vz0,ax,ay,az,outs_when_up
0,2020-10-27,FF,Julio Urias,628711,Willy Adames,642715,96.7,1.58,5.99,1,0,0,2,0.2,1.6,-0.53,2.29,0,0,0,9,1,95.4,2615.0,5.7,605131,571970,571771,621458,608369,621035,641355,605141,65,3,4-Seam Fastball,1,3,1,3,Standard,Standard,0-2,strike,strikeout,called_strike,S,,-5.950264,-140.490456,-7.897391,3.772,32.321911,-8.981441,2
1,2020-10-27,FF,Julio Urias,628711,Willy Adames,642715,94.1,2.91,5.45,1,0,0,1,0.8,1.3,-0.55,3.03,0,0,0,9,1,93.4,2470.0,5.9,605131,571970,571771,621458,608369,621035,641355,605141,65,2,4-Seam Fastball,1,3,1,3,Standard,Standard,0-1,strike,,called_strike,S,,-10.560246,-136.599519,-3.429867,11.723598,29.18381,-15.237217,2
2,2020-10-27,FF,Julio Urias,628711,Willy Adames,642715,94.9,1.77,6.02,1,0,0,0,0.2,1.5,-0.04,3.32,0,0,0,9,1,94.0,2397.0,5.7,605131,571970,571771,621458,608369,621035,641355,605141,65,1,4-Seam Fastball,1,3,1,3,Standard,Standard,0-0,strike,,swinging_strike,S,,-5.199252,-138.098234,-4.63797,4.158758,30.838499,-12.535677,2


In [31]:
# Count the missing values in each test column.
test.isna().sum()

game_date                     0
pitch_type                    0
Pitcher_name                  0
pitcher_id                    0
batter_name                   0
batter_id                     0
release_speed                 0
release_pos_x                 0
release_pos_z                 0
stand                         0
p_throws                      0
balls                         0
strikes                       0
pfx_x                         0
pfx_z                         0
plate_x                       0
plate_z                       0
on_3b                         0
on_2b                         0
on_1b                         0
inning                        0
inning_topbot                 0
effective_speed               0
release_spin_rate             0
release_extension           245
Catcher                       0
FirstBasemen                  0
SecondBasemen                 0
ThirdBasemen                  0
ShortStop                     0
LeftField                     0
CenterFi

# Select Features

In [15]:
# Display every column name so the feature subset below can be chosen from it.
train.columns

Index(['game_date', 'pitch_type', 'Pitcher_name', 'pitcher_id', 'batter_name',
       'batter_id', 'release_speed', 'release_pos_x', 'release_pos_z', 'stand',
       'p_throws', 'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z',
       'on_3b', 'on_2b', 'on_1b', 'inning', 'inning_topbot', 'effective_speed',
       'release_spin_rate', 'release_extension', 'Catcher', 'FirstBasemen',
       'SecondBasemen', 'ThirdBasemen', 'ShortStop', 'LeftField',
       'CenterField', 'RightField', 'at_bat_number', 'pitch_number',
       'pitch_name', 'bat_score', 'fld_score', 'post_bat_score',
       'post_fld_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'count', 'strike_attempt', 'events', 'description', 'type', 'bb_type',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'outs_when_up'],
      dtype='object')

In [33]:
# Name of the label column.
target = 'strike_attempt'

# Predictor columns selected from the raw frames.
features = [
    # object-dtype columns (dummy-encoded further down)
    'Pitcher_name', 'batter_name', 'pitch_name',
    # numeric columns
    'release_speed', 'release_pos_x', 'release_pos_z',
    'stand', 'p_throws',
    'balls', 'strikes',
    'pfx_x', 'pfx_z',
    'plate_x', 'plate_z',
    'inning', 'inning_topbot',
    'effective_speed', 'release_spin_rate',
    'pitch_number',
    'bat_score', 'fld_score',
    'vx0', 'vy0', 'vz0',
    'ax', 'ay', 'az',
    'outs_when_up',
]

# Create Train Test split

In [34]:
# Separate predictors and label for the training frame and for the 2020 test frame.
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

# LabelEncode `y`

In [38]:
# Encoder used to turn the target labels into integer codes.
le = LabelEncoder()

In [39]:
# Learn the label-to-integer mapping from the training labels only.
le.fit(y_train)

LabelEncoder()

In [40]:
# Encode both label vectors with the mapping learned from the training labels.
y_train_le, y_test_le = (le.transform(labels) for labels in (y_train, y_test))

In [41]:
# Inspect the encoded training labels.
y_train_le

array([3, 3, 3, ..., 3, 3, 2])

In [42]:
# Inspect the encoded test labels.
y_test_le

array([3, 3, 3, ..., 3, 0, 0])

In [48]:
# Round-trip check: decoding the encoded test labels should reproduce y_test element-wise.
# NOTE(review): the displayed Series is truncated, so eyeballing it does not cover every
# row; wrapping the comparison in .all() would verify the whole vector.
le.inverse_transform(y_test_le) == y_test

0         True
1         True
2         True
3         True
4         True
          ... 
279655    True
279656    True
279657    True
279658    True
279659    True
Name: strike_attempt, Length: 279306, dtype: bool

### Baseline Accuracy

In [51]:
# Share of each class in the training labels. Always predicting the most frequent
# class would score that class's share, which is the baseline to beat.
y_train.value_counts(normalize=True)
# Aim to beat a 46% accuracy score (the share of the most frequent class).

strike    0.459818
ball      0.357926
out       0.114810
ob        0.067447
Name: strike_attempt, dtype: float64

# Dummy what needs to be dummied

In [52]:
# Check column dtypes to see which predictors are non-numeric and need dummy encoding.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3486421 entries, 0 to 3645428
Data columns (total 28 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Pitcher_name       object 
 1   batter_name        object 
 2   pitch_name         object 
 3   release_speed      float64
 4   release_pos_x      float64
 5   release_pos_z      float64
 6   stand              int64  
 7   p_throws           int64  
 8   balls              int64  
 9   strikes            int64  
 10  pfx_x              float64
 11  pfx_z              float64
 12  plate_x            float64
 13  plate_z            float64
 14  inning             int64  
 15  inning_topbot      int64  
 16  effective_speed    float64
 17  release_spin_rate  float64
 18  pitch_number       int64  
 19  bat_score          int64  
 20  fld_score          int64  
 21  vx0                float64
 22  vy0                float64
 23  vz0                float64
 24  ax                 float64
 25  ay                

In [53]:
# One-hot encode the three object-dtype columns, separately for train and test.
X_train_dummy, X_test_dummy = (
    pd.get_dummies(frame, columns=['Pitcher_name', 'batter_name', 'pitch_name'])
    for frame in (X_train, X_test)
)

In [55]:
# Column check: print every encoded train column that has no counterpart in test.
for train_col in X_train_dummy:
    if train_col in X_test_dummy:
        continue
    print(train_col)

Pitcher_name_A.J. Achter
Pitcher_name_A.J. Burnett
Pitcher_name_A.J. Griffin
Pitcher_name_A.J. Morris
Pitcher_name_A.J. Puk
Pitcher_name_A.J. Schugel
Pitcher_name_AJ Reed
Pitcher_name_Aaron Altherr
Pitcher_name_Aaron Blair
Pitcher_name_Aaron Brooks
Pitcher_name_Aaron Harang
Pitcher_name_Aaron Laffey
Pitcher_name_Aaron Sanchez
Pitcher_name_Aaron Thompson
Pitcher_name_Aaron Wilkerson
Pitcher_name_Abel De Los Santos
Pitcher_name_Adalberto Mejia
Pitcher_name_Adam Conley
Pitcher_name_Adam LaRoche
Pitcher_name_Adam Liberatore
Pitcher_name_Adam Loewen
Pitcher_name_Adam McCreery
Pitcher_name_Adam Rosales
Pitcher_name_Adam Warren
Pitcher_name_Adam Wilk
Pitcher_name_Addison Reed
Pitcher_name_Adonis Rosa
Pitcher_name_Adrian Sampson
Pitcher_name_Akeel Morris
Pitcher_name_Al Alburquerque
Pitcher_name_Alan Busenitz
Pitcher_name_Albert Suarez
Pitcher_name_Alec Asher
Pitcher_name_Alejandro Chacin
Pitcher_name_Alex Avila
Pitcher_name_Alex Blandino
Pitcher_name_Alex Gordon
Pitcher_name_Alex Meyer
Pitche

In [56]:
# Column check in the other direction: print every encoded test column missing from train.
for test_col in X_test_dummy:
    if test_col in X_train_dummy:
        continue
    print(test_col)

Pitcher_name_Aaron Fletcher
Pitcher_name_Adonis Medina
Pitcher_name_Albert Abreu
Pitcher_name_Alex Vesia
Pitcher_name_Andre Scrubb
Pitcher_name_Angel Perdomo
Pitcher_name_Anthony Bemboom
Pitcher_name_Anthony Castro
Pitcher_name_Anthony Misiewicz
Pitcher_name_Antonio Santos
Pitcher_name_Ashton Goudeau
Pitcher_name_Beau Burrows
Pitcher_name_Ben Braymer
Pitcher_name_Bernardo Flores Jr.
Pitcher_name_Blake Cederlind
Pitcher_name_Blake Taylor
Pitcher_name_Brady Singer
Pitcher_name_Brailyn Marquez
Pitcher_name_Brandon Bailey
Pitcher_name_Brandon Bielak
Pitcher_name_Brandon Leibrandt
Pitcher_name_Brandon Waddell
Pitcher_name_Braxton Garrett
Pitcher_name_Brett Eibner
Pitcher_name_Brock Holt
Pitcher_name_Brooks Kriske
Pitcher_name_Brooks Raley
Pitcher_name_Bruce Zimmermann
Pitcher_name_Caleb Baragar
Pitcher_name_Cam Hill
Pitcher_name_Carlos Hernandez
Pitcher_name_Carlos Sanabria
Pitcher_name_Carson Kelly
Pitcher_name_Casey Mize
Pitcher_name_Clarke Schmidt
Pitcher_name_Codi Heuer
Pitcher_name_Cod

### Drop what won't fit

In [58]:
# Columns that exist on only one side of the train/test pair.
drop_train = [name for name in X_train_dummy if name not in X_test_dummy]
drop_test = [name for name in X_test_dummy if name not in X_train_dummy]

In [60]:
# How many columns will be removed from train and from test, respectively.
tuple(map(len, (drop_train, drop_test)))

(2855, 259)

In [62]:
# Keep only the columns present in BOTH encoded frames.
# The shared set is re-derived here instead of dropping in place, so the cell can
# be re-run safely (an in-place drop raises KeyError the second time because the
# columns are already gone). Selecting both frames with the same list also gives
# train and test an identical column order.
shared_columns = [col for col in X_train_dummy.columns if col in X_test_dummy.columns]
X_train_dummy = X_train_dummy[shared_columns]
X_test_dummy = X_test_dummy[shared_columns]

# Let's set it up

In [None]:
# Author's caveat: the data being fed only contains players that have played at
# least 6 seasons, so there may be a lot of inaccuracies due to that.

# Configure the random forest (fixed seed, 8 parallel jobs, progress logging on).
rf = RandomForestClassifier(
    n_jobs=8,
    random_state=66,
    verbose=1,
)
# Train on the encoded training predictors and the original (string) labels.
rf.fit(X_train_dummy, y_train)

In [None]:
# Cross-validate on the training data and report the mean of the per-fold scores.
fold_scores = cross_val_score(rf, X_train_dummy, y_train, n_jobs=8, verbose=1)
fold_scores.mean()