## Updated model on individual pitchers

### Importing Packages and Data

In [1]:
# General Packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
data_raw = pd.read_csv('savant_2021_regular.csv').drop(['Unnamed: 0'], axis=1)

### Quickly cleaning
This is the same cleaning and basic EDA process from the previous notebook, with the addition of the last step:

In [3]:
# Changing Fastball to 4-Seam Fastball
data_raw['pitch_name'].mask(data_raw['pitch_name'] == 'Fastball', data_raw['pitch_name'].str.lower(), inplace=True)
data_raw['pitch_name'].mask(data_raw['pitch_name'] == 'fastball', '4-Seam Fastball', inplace=True)

In [4]:
# Dropping pitches with a count less than 100 
x = data_raw.groupby(['pitch_name']).size().reset_index().rename(columns={0:'count'})
rare = x.loc[x['count'] < 100]
rare = rare['pitch_name'].to_list()

In [5]:
df = data_raw.loc[~data_raw['pitch_name'].isin(rare)]

#### Looking at pitchers

In [6]:
pitchers = df.groupby(['pitcher']).size().reset_index().rename(columns={0:'count'})
pitchers.sort_values(['count'], ascending = True)


Unnamed: 0,pitcher,count
125,Brett Phillips,1
331,Hanser Alberto,2
363,Jack Mayfield,3
769,Sergio Alcántara,4
720,Romy Gonzalez,4
...,...,...
713,Robbie Ray,3141
869,Walker Buehler,3152
566,Luis Castillo,3164
855,Tyler Mahle,3196


**Wait a second**- Hanser Alberto is an infielder. How did he pitch two pitches? 
When a team is down several runs in the late innings, sometimes the manager will wave the white flag and allow a position player to pitch so that he doesn't "burn" any more of his bullpen arms.

Let's see what was going on in the game when he came in to pitch: 

In [7]:
df[df['pitcher'] == 'Hanser Alberto']

Unnamed: 0,batter,pitcher,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,events,description,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,fielder_2,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
639012,Brandon Lowe,Hanser Alberto,FA,2021-04-20,64.9,-2.31,5.78,field_out,hit_into_play,4.0,Brandon Lowe flies out to left fielder Andrew ...,R,L,R,KC,TB,X,7.0,fly_ball,1,0,2021,-0.04,1.11,-0.28,2.66,,621563.0,,2,9,Top,78.82,120.58,521692.0,3.574256,-94.381933,0.987472,-0.833119,15.26678,-25.98073,3.3,1.5,228.0,87.0,54.0,62.8,1342.0,4.6,634407,593643,521692.0,467793.0,593160.0,641531.0,670032.0,643217.0,572191.0,664728.0,55.89,0.017,0.017,0.0,1.0,0.0,0.0,3.0,86,2,4-Seam Fastball,7,14,14,7,14,7,14,7,Infield shift,Standard,221.0,0.0,-0.343,664040,loweb001,lowebr01,18882,2018.0,2021.0
639013,Brandon Lowe,Hanser Alberto,FA,2021-04-20,64.8,-2.31,5.8,,ball,14.0,Brandon Lowe flies out to left fielder Andrew ...,R,L,R,KC,TB,B,,,0,0,2021,-0.37,1.1,0.83,1.23,,621563.0,,2,9,Top,,,521692.0,5.935795,-94.228769,-1.544814,-3.095241,14.168066,-25.594118,3.25,1.46,,,,62.4,1304.0,4.2,634407,593643,521692.0,467793.0,593160.0,641531.0,670032.0,643217.0,572191.0,664728.0,56.3,,,,,,,,86,1,4-Seam Fastball,7,14,14,7,14,7,14,7,Infield shift,Standard,226.0,0.0,0.025,664040,loweb001,lowebr01,18882,2018.0,2021.0


As expected, Alberto came in with his team down 14-7 and 2 outs in the top of 9th. His two fastballs were zooming at almost 65 mph, which was all he needed to get Brandon Lowe out. 

Though I love watching a position player pitch, I don't want Alberto's 65 mph fastball to throw off the model since it is an outlier and much slower than average. I'm going to remove any players from the dataset who threw less than 50 pitches. 

Sorry, Hanser.

In [8]:
not_pitcher = pitchers.loc[pitchers['count'] <= 50]
not_pitcher = not_pitcher['pitcher'].to_list()

df = df.loc[~df['pitcher'].isin(not_pitcher)]

In [9]:
df['pitch_name'].dropna(inplace=True)

In [10]:
# saving copy 
data = df.copy()
data.to_csv('cleaned_pitch_model.csv')

In [11]:
data = pd.read_csv('cleaned_pitch_model.csv')

Because each pitcher is different, I'm going to create a function that allows the user to decide between using just fastballs/off-speed pitches or all the pitches a pitcher throws.

In [12]:
def pitch_model(data,fb_group, os_group):
    data = data.loc[:,['pitch_name',
                       'release_speed',
                       'release_spin_rate',
                       'vx0','vy0','vz0',
                       'ax','ay','az']]
    
    data = data.dropna()
    
    def fb_filter(x):
        if x=='4-Seam Fastball' or x=='Sinker' or x=='Cutter' or x=='Split-Finger':
            return 'Fastballs'
        return x
    
    if fb_group == True:
        data['pitch_name'] = data['pitch_name'].apply(fb_filter)
    
    
    def os_filter(x):
        if x=='Curveball' or x=='Knuckle Curve' or x=='Slider' or x=='Changeup':
            return 'Off-speed'
        return x
        
    if os_group == True:
        data['pitch_name'] = data['pitch_name'].apply(os_filter)
    
    # x = ['release_speed','vx0','vy0','vz0','ax','ay','az','release_spin_rate']
    # y = 'pitch_name'
    
    X = data.loc[:,['release_speed','vx0','vy0','vz0','ax','ay','az','release_spin_rate']]
    y = data.loc[:,['pitch_name']]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
    
    dt = DecisionTreeClassifier(max_depth=10, min_samples_split=50)
    
    dt.fit(X_train, y_train)
    
    # train_pred= dt_model.predict(X_train)
    # acc_train = (accuracy_score(train_pred, y_train)*100).round(2)
    # print('Training Set Accuracy Score: {0} %'.format(acc_train.round(2)))
    
    y_pred = dt.predict(X_test)
    acc_dt = (accuracy_score(y_pred, y_test)*100).round(2)
    print('Test Set Accuracy Score: {0} %'.format(acc_dt.round(2)))
    
    
    print(classification_report(y_pred, y_test))
    

In [15]:
def pitch_model(data,fb_group, os_group):
    data = data.loc[:,['pitch_name',
                       'release_speed',
                       'release_spin_rate',
                       'vx0','vy0','vz0',
                       'ax','ay','az']]
    
    data = data.dropna()
    
    def fb_filter(x):
        if x=='4-Seam Fastball' or x=='Sinker' or x=='Cutter' or x=='Split-Finger':
            return 'Fastballs'
        return x
    
    if fb_group == True:
        data['pitch_name'] = data['pitch_name'].apply(fb_filter)
    
    
    def os_filter(x):
        if x=='Curveball' or x=='Knuckle Curve' or x=='Slider' or x=='Changeup':
            return 'Off-speed'
        return x
        
    if os_group == True:
        data['pitch_name'] = data['pitch_name'].apply(os_filter)
        
    x = ['release_speed','vx0','vy0','vz0','ax','ay','az','release_spin_rate']
    y = 'pitch_name'
    
    train, test = train_test_split(data, test_size=0.2)
    
    dt = DecisionTreeClassifier(max_depth=10, min_samples_split=50)
    
    dt.fit(train[x], train[y])
    
    # train_pred= dt_model.predict(X_train)
    # acc_train = (accuracy_score(train_pred, y_train)*100).round(2)
    # print('Training Set Accuracy Score: {0} %'.format(acc_train.round(2)))
    
    y_pred = dt.predict(test[x])
    acc_dt = (accuracy_score(y_pred, test[y])*100).round(2)
    print('Test Set Accuracy Score: {0} %'.format(acc_dt.round(2)))
    print(classification_report(y_pred, test[y]))
    
    # test['pitch_hat'] = y_pred
    # test['correct'] = (test['pitch_hat'] == test['pitch_name'])
    
    # probs = pd.DataFrame(dt.predict_proba(test[x]), index=test.index, columns=dt.classes_)
    # results = pd.concat([test[['pitch_name', 'pitch_hat', 'correct']], probs], axis=1)
    # results = results.round(2)
    
    # return(results)
    

Testing the pitch model function:

In [16]:
pitch_model(data, fb_group = False, os_group = False)

Test Set Accuracy Score: 82.36 %
                 precision    recall  f1-score   support

4-Seam Fastball       0.93      0.91      0.92     50995
       Changeup       0.87      0.84      0.86     16644
      Curveball       0.73      0.75      0.74     11521
         Cutter       0.45      0.61      0.52      6887
  Knuckle Curve       0.23      0.56      0.33      1117
         Sinker       0.80      0.84      0.82     21057
         Slider       0.88      0.75      0.81     31662
   Split-Finger       0.32      0.77      0.45       920

       accuracy                           0.82    140803
      macro avg       0.65      0.75      0.68    140803
   weighted avg       0.85      0.82      0.83    140803



The results are almost identical to the original pitch model from the previous post. 

Now, testing with the fastball and offspeed:

In [19]:
pitch_model(data, fb_group = True, os_group= True )

Test Set Accuracy Score: 93.05 %
              precision    recall  f1-score   support

   Fastballs       0.93      0.95      0.94     81233
   Off-speed       0.93      0.90      0.92     59570

    accuracy                           0.93    140803
   macro avg       0.93      0.93      0.93    140803
weighted avg       0.93      0.93      0.93    140803



With the filters on, the results are *slightly* better than the previous post. 

Running the model again with the fastball filter on and the off-speed filter off: 

In [20]:
pitch_model(data, fb_group = True, os_group= False)

Test Set Accuracy Score: 88.3 %
               precision    recall  f1-score   support

     Changeup       0.88      0.85      0.86     16415
    Curveball       0.77      0.75      0.76     12172
    Fastballs       0.93      0.95      0.94     81575
Knuckle Curve       0.29      0.59      0.39      1340
       Slider       0.85      0.79      0.82     29301

     accuracy                           0.88    140803
    macro avg       0.74      0.79      0.75    140803
 weighted avg       0.89      0.88      0.88    140803



And finally, the fastball fitler off and the off-speed filter on:

In [21]:
pitch_model(data, fb_group = False, os_group= True)

Test Set Accuracy Score: 87.66 %
                 precision    recall  f1-score   support

4-Seam Fastball       0.93      0.91      0.92     51281
         Cutter       0.54      0.59      0.57      8539
      Off-speed       0.92      0.91      0.91     58738
         Sinker       0.81      0.83      0.82     21150
   Split-Finger       0.37      0.73      0.49      1095

       accuracy                           0.88    140803
      macro avg       0.72      0.79      0.74    140803
   weighted avg       0.88      0.88      0.88    140803




-----------
So far, we've used data from all of the pitchers in 2021. Since each pitcher has his own set of pitches that he throws a certain way, it makes sense that the model has trouble telling the difference between certain pitch types. 


Now let's try the model with a single pitcher. We'll start with 2021 NL Cy Young winner, Corbin Burnes. 

In [27]:
burnes = data.loc[data['pitcher'] == "Corbin Burnes"]

In [28]:
# Filters off
pitch_model(burnes, fb_group = False, os_group= False)

Test Set Accuracy Score: 99.42 %
                 precision    recall  f1-score   support

4-Seam Fastball       0.67      0.80      0.73         5
       Changeup       1.00      1.00      1.00        48
      Curveball       1.00      1.00      1.00       103
         Cutter       1.00      1.00      1.00       264
         Sinker       0.98      0.98      0.98        49
         Slider       1.00      1.00      1.00        50

       accuracy                           0.99       519
      macro avg       0.94      0.96      0.95       519
   weighted avg       0.99      0.99      0.99       519



In [29]:
# Filters on
pitch_model(burnes, fb_group = True, os_group= True)

Test Set Accuracy Score: 99.42 %
              precision    recall  f1-score   support

   Fastballs       0.99      1.00      1.00       315
   Off-speed       1.00      0.99      0.99       204

    accuracy                           0.99       519
   macro avg       0.99      0.99      0.99       519
weighted avg       0.99      0.99      0.99       519



In [30]:
# Fastballs grouped
pitch_model(burnes, fb_group = True, os_group= False)

Test Set Accuracy Score: 99.81 %
              precision    recall  f1-score   support

    Changeup       0.98      1.00      0.99        55
   Curveball       1.00      1.00      1.00        89
   Fastballs       1.00      1.00      1.00       322
      Slider       1.00      1.00      1.00        53

    accuracy                           1.00       519
   macro avg       1.00      1.00      1.00       519
weighted avg       1.00      1.00      1.00       519



In [31]:
# Off Speed grouped
pitch_model(burnes, fb_group = False, os_group= True)

Test Set Accuracy Score: 99.42 %
                 precision    recall  f1-score   support

4-Seam Fastball       0.85      1.00      0.92        11
         Cutter       1.00      1.00      1.00       267
      Off-speed       1.00      0.99      1.00       197
         Sinker       1.00      0.98      0.99        44

       accuracy                           0.99       519
      macro avg       0.96      0.99      0.97       519
   weighted avg       0.99      0.99      0.99       519



The model does not perform well on Burnes's 4-Seam Fastball. Probably because he doesn't use it very often. 

-----------
Now the 2021 AL Cy Young winner, Robbie Ray:

In [33]:
ray = data.loc[data['pitcher'] == "Robbie Ray"]

In [34]:
# Filters off
pitch_model(ray, fb_group = False, os_group= False)

Test Set Accuracy Score: 98.73 %
                 precision    recall  f1-score   support

4-Seam Fastball       1.00      0.99      1.00       377
       Changeup       0.93      0.96      0.95        27
  Knuckle Curve       0.91      0.97      0.94        33
         Slider       0.99      0.98      0.98       191

       accuracy                           0.99       628
      macro avg       0.96      0.98      0.97       628
   weighted avg       0.99      0.99      0.99       628



In [35]:
#Filters on
pitch_model(ray, fb_group = True, os_group= True)

Test Set Accuracy Score: 99.2 %
              precision    recall  f1-score   support

   Fastballs       1.00      0.99      0.99       366
   Off-speed       0.98      1.00      0.99       262

    accuracy                           0.99       628
   macro avg       0.99      0.99      0.99       628
weighted avg       0.99      0.99      0.99       628



In [36]:
# Fastballs grouped
pitch_model(ray, fb_group = True, os_group= False)

Test Set Accuracy Score: 98.73 %
               precision    recall  f1-score   support

     Changeup       0.88      1.00      0.94        15
    Fastballs       1.00      0.99      1.00       362
Knuckle Curve       0.98      0.93      0.95        43
       Slider       0.98      0.99      0.98       208

     accuracy                           0.99       628
    macro avg       0.96      0.98      0.97       628
 weighted avg       0.99      0.99      0.99       628



In [37]:
# Off Speed grouped
pitch_model(ray, fb_group = False, os_group= True)

Test Set Accuracy Score: 98.73 %
                 precision    recall  f1-score   support

4-Seam Fastball       0.99      0.99      0.99       373
      Off-speed       0.98      0.99      0.99       253
         Sinker       1.00      0.50      0.67         2

       accuracy                           0.99       628
      macro avg       0.99      0.83      0.88       628
   weighted avg       0.99      0.99      0.99       628

