In [None]:
# Player Swing Probability
# Objective: This model attempts to determine whether or not an MLB hitter will swing at a given pitch.
# Author: Jameel Kaba

In [None]:
import pandas as pd
from pybaseball import statcast_batter, statcast_pitcher, playerid_lookup
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm as lgb

In [None]:
# To run the model on a different hitter, replace the name below to look up that player's id
# Looking up Bryce Harper's player id record
playerid_lookup('harper','bryce')

In [None]:
# Getting Bryce Harper's pitch-by-pitch data from 2016-03-15 through 2021-10-01
# (547180 is the batter id passed to statcast_batter)
data0 = statcast_batter('2016-03-15','2021-10-01', 547180)

In [None]:
# (rows, columns) of the raw pull
data0.shape

In [None]:
# Dropping intentional balls and pitchouts: the pitcher is not trying to get a swing on those
not_intentional_ball = data0['description'] != 'intent_ball'
not_pitchout = data0['pitch_type'] != 'PO'
data1 = data0.loc[not_intentional_ball & not_pitchout]

In [None]:
# (rows, columns) after removing intentional balls and pitchouts
data1.shape

In [None]:
# Number of pitches with no recorded plate_z value
data1['plate_z'].isna().sum()

In [None]:
# Keeping only pitches that have both plate coordinates recorded
has_plate_location = data1['plate_x'].notnull() & data1['plate_z'].notnull()
data2 = data1.loc[has_plate_location]

In [None]:
# (rows, columns) after removing pitches without a plate location
data2.shape

In [None]:
# Number of pitches with no recorded pitch type
data2['pitch_type'].isna().sum()

In [None]:
# Number of pitches with no recorded release speed
data2['release_speed'].isna().sum()

In [None]:
# Removing rows with a null pitch type or release speed.
# .copy() makes data3 an independent frame: a 'swing' column is assigned to it
# afterwards, and assigning to a filtered slice raises SettingWithCopyWarning.
data3 = data2.loc[(data2['pitch_type'].notnull()) & (data2['release_speed'].notnull())].copy()

In [None]:
# (rows, columns) after all null filtering
data3.shape

In [None]:
# Remove the cap on how many columns pandas displays
pd.set_option('display.max_columns', None)

In [None]:
# Swing indicator (1 indicates swing, 0 indicates take).
# Every pitch starts as a swing; takes are set to 0 in a later cell.
data3['swing'] = 1

In [None]:
# Listing the distinct pitch outcomes so the non-swing ones can be identified
data3['description'].unique()

In [None]:
# Outcomes where the batter did not swing; those rows get swing = 0
take_descriptions = ['ball','called_strike','blocked_ball','hit_by_pitch',
                     'pitchout']
is_take = data3['description'].isin(take_descriptions)
data3.loc[is_take, 'swing'] = 0

In [None]:
# Flipping the row order and then renumbering the index from 0,
# so pitches are ordered chronologically
rows_reversed = data3.iloc[::-1]
data4 = rows_reversed.reset_index(drop = True)

In [None]:
# Listing the distinct pitch type codes present in the data
data4.pitch_type.unique()

In [None]:
# Categorizing pitches as either fastballs, breaking balls, or offspeed pitches.
# These lists are used further down to bucket the previous pitch type.
# NOTE(review): 'FS' and 'SC' are in the breaking group here; they are often
# grouped with offspeed pitches elsewhere - confirm this grouping is intended.
fastballs = ['FF','FT','SI', 'FC']
breaking = ['CU','SL','KC','KN','FS','SC']
offspeed = ['CH','FO','EP']

In [None]:
# Creating a new column to indicate what the previous pitch thrown was. Default value is 'none'.
# The column is overwritten with the real values once they have been computed.
data4['prev_pitch'] = 'none'

In [None]:
# Building the previous-pitch value for every row (vectorized instead of a Python loop):
# - the first pitch of an at-bat (pitch_number == 1) has no previous pitch -> 'none'
# - otherwise it is the pitch type on the row directly above (rows are chronological)
# - the very first row has no row above it, so it also falls back to 'none'
#   (the old label lookup at position i-1 failed there when that row was not a first pitch)
prior_pitch_type = data4['pitch_type'].astype(object).shift(1)
starts_at_bat = data4['pitch_number'].eq(1)
prev_pitch_list = prior_pitch_type.mask(starts_at_bat, 'none').fillna('none').tolist()

In [None]:
# Setting the prev_pitch column equal to the list we created above
# (one entry per row of data4, in row order)
data4['prev_pitch'] = prev_pitch_list

In [None]:
# Collapsing the raw previous-pitch codes into three buckets: 'fb', 'brk' and 'offs'.
# Codes that are in none of the three lists (including 'none') are left as they are.
for bucket_label, bucket_codes in (('fb', fastballs), ('brk', breaking), ('offs', offspeed)):
    in_bucket = data4['prev_pitch'].isin(bucket_codes)
    data4.loc[in_bucket, 'prev_pitch'] = bucket_label

In [None]:
# Model inputs (location, speed, handedness, movement, release velocity, count,
# previous pitch) followed by the 'swing' target
model_columns = [
    'plate_x', 'plate_z', 'release_speed', 'p_throws',
    'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0',
    'strikes', 'balls', 'prev_pitch', 'swing',
]
relevant_df = data4[model_columns]

In [None]:
# Previewing the first rows of the modelling frame
relevant_df.head()

In [None]:
# One-hot encoding the two categorical features.
# - The prefixes carry no trailing underscore: get_dummies already inserts '_' between
#   prefix and value, so columns come out as throws_L / prev_fb rather than throws__L / prev__fb.
# - dtype=int keeps the dummies numeric, so .values on the frame stays a numeric array
#   instead of mixing in booleans.
hot_df = pd.get_dummies(relevant_df, columns = ['p_throws','prev_pitch'],
                        prefix = ['throws','prev'], dtype = int)

In [None]:
# Target vector: the swing flag as integers
y = hot_df['swing'].astype(int)

In [None]:
# Feature matrix: every column except the target, as a plain array
feature_frame = hot_df.drop(columns = ['swing'])
x = feature_frame.values

In [None]:
# Splitting test and training data: 20% of the rows are held out for testing,
# with a fixed random_state so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)

In [None]:
# Initializing the baseline model (no hyperparameters are overridden)
clf = lgb.LGBMClassifier()

In [None]:
# Training the baseline model on the training split
clf.fit(X_train, y_train)

In [None]:
# Predicting swing/take labels for the held-out test split
y_pred = clf.predict(X_test)

In [None]:
# Test-set accuracy of the baseline model.
# Arguments follow the (y_true, y_pred) order of the accuracy_score signature.
accuracy = metrics.accuracy_score(y_test, y_pred)

In [None]:
# Displaying the baseline model's test-set accuracy
accuracy

In [None]:
# Predicting the training data (to compare against test accuracy)
y_pred_train = clf.predict(X_train)

In [None]:
# Accuracy of the baseline model on the data it was trained on
train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
print('Training-set accuracy score: {0:0.4f}'.format(train_accuracy))

In [None]:
# Baseline model: training score next to test score
clf_train_score = clf.score(X_train, y_train)
clf_test_score = clf.score(X_test, y_test)
print('Training set score: {:.4f}'.format(clf_train_score))
print('Test set score: {:.4f}'.format(clf_test_score))

In [None]:
# Confusion matrix for the baseline model.
# Rows are the actual class and columns the predicted class, ordered 0 (take), 1 (swing).
# With swing as the positive class: cm[1,1] = TP, cm[0,0] = TN, cm[0,1] = FP, cm[1,0] = FN.
cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n',cm)
print('\nTrue Positives(TP) = ', cm[1,1])
print('\nTrue Negatives(TN) = ', cm[0,0])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
# Second model with hand-set hyperparameters (no search is run in this cell)
clf2 = lgb.LGBMClassifier(boosting_type = 'gbdt',
    objective = 'binary',
    learning_rate = 0.099,
    max_depth = 6,
    num_leaves = 26,
    n_estimators = 180)

In [None]:
# Training the new model on the same training split
clf2.fit(X_train, y_train)

In [None]:
# Predicting swing/take labels for the test split with the new model
y_pred2 = clf2.predict(X_test)

In [None]:
# Test-set accuracy of the new model.
# Arguments follow the (y_true, y_pred) order of the accuracy_score signature.
accuracy2 = metrics.accuracy_score(y_test, y_pred2)

In [None]:
# Displaying the new model's test-set accuracy
accuracy2

In [None]:
# New model: training score next to test score
clf2_train_score = clf2.score(X_train, y_train)
clf2_test_score = clf2.score(X_test, y_test)
print('Training set score: {:.4f}'.format(clf2_train_score))
print('Test set score: {:.4f}'.format(clf2_test_score))

In [None]:
# Confusion matrix for new model (actual labels first, predicted labels second)
cm2 = metrics.confusion_matrix(y_test, y_pred2)

In [None]:
# Rows of cm2 are the actual class and columns the predicted class, ordered 0 (take), 1 (swing).
# With swing as the positive class: cm2[1,1] = TP, cm2[0,0] = TN, cm2[0,1] = FP, cm2[1,0] = FN.
print('Confusion matrix\n\n',cm2)
print('\nTrue Positives(TP) = ', cm2[1,1])
print('\nTrue Negatives(TN) = ', cm2[0,0])
print('\nFalse Positives(FP) = ', cm2[0,1])
print('\nFalse Negatives(FN) = ', cm2[1,0])

In [None]:
# Feature column names: every column of the encoded frame except the target
col_names = hot_df.columns.drop('swing')

In [None]:
# Putting the test data back into a dataframe, labelled with the feature column names
test_df2 = pd.DataFrame(data = X_test, columns = col_names)

In [None]:
# Putting the new model's predicted probabilities for the test rows into a df
# (first column labelled as take, second as swing)
gb_prob_df2 = pd.DataFrame(data = clf2.predict_proba(X_test), columns = ['take_probability', 'swing_probability'])

In [None]:
# Adding the swing probability to the df with our test data
# (both frames were built from X_test without an explicit index, so rows line up)
test_df2['swing_probability'] = gb_prob_df2['swing_probability']

In [None]:
# Scatter of test pitches by plate location, colored by predicted swing probability
sns.scatterplot(data = test_df2, x = 'plate_x', y = 'plate_z', hue = 'swing_probability')

In [None]:
# Trying the model out on Clayton Kershaw's pitches:
# pulling his pitch-by-pitch data from 2019-03-15 through 2021-10-01
kershaw = statcast_pitcher('2019-03-15','2021-10-01', 477132)

In [None]:
# Keeping only the rows where the pitch type is a curveball ('CU')
is_curveball = kershaw['pitch_type'] == 'CU'
ck_cu = kershaw.loc[is_curveball]

In [None]:
# Getting the average release speed of his curveball
ck_avg_speed = ck_cu['release_speed'].mean()

In [None]:
# Average movement and release-velocity components of the curveball
ck_cu_means = ck_cu[['pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0']].mean()
ck_avg_pfx_x = ck_cu_means['pfx_x']
ck_avg_pfx_z = ck_cu_means['pfx_z']
ck_avg_vx0 = ck_cu_means['vx0']
ck_avg_vy0 = ck_cu_means['vy0']
ck_avg_vz0 = ck_cu_means['vz0']

# Evenly spaced x and z coordinates covering the area around the strikezone
ck_plate_x = np.linspace(-1.5, 1.5, num = 40)
ck_plate_z = np.linspace(0.5, 4, num = 50)

In [None]:
# Getting the cartesian product of those two arrays: one (x, z) row for every
# combination, with x cycling fastest
plate_product = np.transpose([np.tile(ck_plate_x, len(ck_plate_z)), np.repeat(ck_plate_z, len(ck_plate_x))])

In [None]:
# One row per grid location
ck_df = pd.DataFrame(data = plate_product, columns = ['plate_x','plate_z'])

In [None]:
# Adding the rest of the features, in the same column order the model was trained on
# (the frame is passed to the model as a bare array, so order matters).
# Speed, movement and release velocity are held at the curveball averages.
ck_df['release_speed'] = ck_avg_speed
ck_df['pfx_x'] = ck_avg_pfx_x
ck_df['pfx_z'] = ck_avg_pfx_z
ck_df['vx0'] = ck_avg_vx0
ck_df['vy0'] = ck_avg_vy0
ck_df['vz0'] = ck_avg_vz0
# Situation: 0-0 count, first pitch of the at-bat
ck_df['strikes'] = 0
ck_df['balls'] = 0
# Kershaw throws left-handed
ck_df['throws_L'] = 1
ck_df['throws_R'] = 0
ck_df['prev_brk'] = 0
ck_df['prev_fb'] = 0
ck_df['prev_none'] = 1
ck_df['prev_offs'] = 0

In [None]:
# Getting probabilities for Kershaw's curveball at every grid location
# NOTE(review): this uses the baseline model `clf`, while pitch_swing below uses `clf2` -
# confirm which model is intended here.
swing_prob = clf.predict_proba(ck_df.values)

In [None]:
# Labelling the two probability columns (take first, swing second)
ck_sp = pd.DataFrame(data = swing_prob, columns = ['take_prob','swing_prob'])

In [None]:
# Attaching the swing probability to each grid location
ck_df['swing_prob'] = ck_sp['swing_prob']

In [None]:
# Plotting the grid of locations colored by swing probability.
# Uses ck_df, the frame built above (the previous name, tg_df, is not defined in this notebook).
sns.scatterplot(x = ck_df['plate_x'],y = ck_df['plate_z'],hue = ck_df['swing_prob'])

In [None]:
def pitch_swing(mlbid, pitch_type, strikes, balls, prev_pitch):
    """Plot Bryce Harper's predicted swing probability against one pitcher's pitch.

    Pulls the pitcher's pitches from 2019-03-15 through 2021-10-01, averages the
    speed/movement/release-velocity of the requested pitch type, and scores a grid
    of plate locations with the notebook-level model ``clf2``. The drawn strikezone
    uses the mean ``sz_top``/``sz_bot`` of the notebook-level frame ``data4``.

    Parameters
    ----------
    mlbid : int
        Pitcher id passed to ``statcast_pitcher``.
    pitch_type : str
        Pitch type code to select (e.g. 'CU').
    strikes, balls : int
        Count in which the pitch is thrown.
    prev_pitch : {'fb', 'brk', 'offs', 'none'}
        Category of the previous pitch; 'none' means first pitch of the at-bat.

    Returns
    -------
    The object returned by ``ax.hexbin`` for the plotted grid.

    Raises
    ------
    ValueError
        If ``prev_pitch`` is not one of the four categories, if no data comes back
        for the pitcher, if the pitcher's throwing arm is neither 'L' nor 'R', or
        if the pitcher has no pitches of ``pitch_type`` in the date range.
    """
    import matplotlib.patches as patches

    # Title suffix for each previous-pitch category; also the set of valid values.
    count_text = ' in ' + str(balls) + '-' + str(strikes) + ' count after '
    prev_pitch_messages = {
        'fb': count_text + 'a fastball',
        'brk': count_text + 'a breaking ball',
        'offs': count_text + 'an offspeed pitch',
        'none': ' on first pitch',
    }
    if prev_pitch not in prev_pitch_messages:
        raise ValueError("prev_pitch must be one of 'fb', 'brk', 'offs', 'none'; got "
                         + repr(prev_pitch))
    msg = prev_pitch_messages[prev_pitch]

    data = statcast_pitcher('2019-03-15','2021-10-01',mlbid)
    if data.empty:
        raise ValueError('no pitch data returned for pitcher id ' + str(mlbid))

    # Positional access: does not depend on the frame having a label 0.
    arm = data['p_throws'].iloc[0]
    if arm not in ('L', 'R'):
        raise ValueError('unexpected throwing arm: ' + repr(arm))

    pitch = data.loc[data['pitch_type'] == pitch_type].reset_index(drop = True)
    if pitch.empty:
        raise ValueError('pitcher ' + str(mlbid) + ' has no pitches of type '
                         + repr(pitch_type) + ' in the date range')

    sz_top = data4['sz_top'].mean()
    sz_bot = data4['sz_bot'].mean()

    # Use the average of the speed/movement metrics to test the model
    pitch_means = pitch[['release_speed', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0']].mean()

    # These will be used as the x,z coordinates for the strikezone
    plate_x = np.linspace(start = -1.5, stop = 1.5, num = 40)
    plate_z = np.linspace(start = 0.5, stop = sz_top + 0.75, num = 50)

    # Getting the cartesian product of those two arrays
    plate_product = np.transpose([np.tile(plate_x, len(plate_z)), np.repeat(plate_z, len(plate_x))])

    df = pd.DataFrame(data = plate_product, columns = ['plate_x','plate_z'])

    # Columns are added in the order the model was trained on: the frame is handed
    # to the model as a bare array, so position (not name) identifies each feature.
    for column in ['release_speed', 'pfx_x', 'pfx_z', 'vx0', 'vy0', 'vz0']:
        df[column] = pitch_means[column]
    df['strikes'] = strikes
    df['balls'] = balls
    df['throws_L'] = int(arm == 'L')
    df['throws_R'] = int(arm == 'R')
    df['prev_brk'] = int(prev_pitch == 'brk')
    df['prev_fb'] = int(prev_pitch == 'fb')
    df['prev_none'] = int(prev_pitch == 'none')
    df['prev_offs'] = int(prev_pitch == 'offs')

    # Second probability column is the swing class.
    swing_prob = clf2.predict_proba(df.values)
    df['swing_prob'] = swing_prob[:, 1]

    # Plate is 17 inches wide, centered on x = 0 (coordinates are in feet).
    strikezone = patches.Rectangle((-0.70833,sz_bot), width = 17/12, height = (sz_top-sz_bot), fill = False)

    fig, ax = plt.subplots()
    graph = ax.hexbin(df['plate_x'],df['plate_z'], C = df['swing_prob'],
              gridsize = 12, vmax = 1,vmin =0, cmap = 'Blues')

    name = data['player_name'].iloc[0]
    # The model is trained on Bryce Harper's pitches, so the title names him.
    title = 'Bryce Harper swing probability against ' + name + "'s " + pitch['pitch_name'].iloc[0] + msg
    ax.add_patch(strikezone)

    ax.axis('equal')

    ax.set_title(title)

    cbar = fig.colorbar(graph)

    cbar.set_label('Swing probability')

    return graph

In [None]:
# Looking up Clayton Kershaw's player id record
playerid_lookup('kershaw','clayton')

In [None]:
# Swing probability against pitcher 477132's curveball with 2 strikes and 0 balls, after a fastball
pitch_swing(477132, 'CU',2,0,'fb')