In [None]:
import pandas as pd
import numpy as np
from pybaseball import statcast,playerid_reverse_lookup,batting_stats,playerid_lookup
from datetime import datetime
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly
from sklearn.neighbors import NearestNeighbors as NN
from sklearn.neural_network import MLPClassifier

In [None]:
# You can run this notebook for any season: change YEAR and every query in this cell follows it.
# (Previously the year was hard-coded separately in the Statcast dates and in the
# batting_stats call, so changing one without the other silently mixed two seasons.)
YEAR = 2021

# get statcast data for the whole calendar year
data = statcast(f"{YEAR}-01-01", f"{YEAR}-12-31")
# only keep pitches thrown with fewer than two strikes
data = data.loc[data['strikes'] < 2]
# get batter names, formatted as "Last, First"
data_names = playerid_reverse_lookup(pd.unique(data['batter']), key_type='mlbam')
data_names['batter_name'] = [x.capitalize() +", " + y.capitalize() for x,y in zip(data_names['name_last'],data_names['name_first'])]
data_final = data.merge(data_names[['key_mlbam','batter_name']], right_on=['key_mlbam'],left_on =['batter'], how = 'left')
# get batter stats for the same season as the pitch data
batter_stats = batting_stats(YEAR,qual = 1)
# keep only the columns the later cells read
data_final = data_final[['batter','batter_name','description','plate_x', 'plate_z','launch_speed']]


In [None]:
def scorer(player_data, random_state=None):
    """Grade one batter's swing/take decisions against a location-based exit-velocity model.

    Steps:
      1. Take the batter's balls in play that have a positive launch speed.
      2. For every pitch, collect the batted balls within 0.375 (plate_x/plate_z units)
         of the pitch location and take the 85th percentile of their launch speeds
         ('EVP'; 0 when there are no neighbours).
      3. Label the pitch 'ShouldSwing' when EVP exceeds the batter's global threshold:
         the 80th percentile of all his batted-ball launch speeds, floored at 100.
      4. Fit an MLPClassifier on (plate_x, plate_z) -> ShouldSwing and store its
         predictions on the same pitches as 'MLPShouldSwing'.
      5. Compare the predictions with what the batter actually did.

    Parameters
    ----------
    player_data : pandas.DataFrame
        Pitches for a single batter with columns 'description', 'plate_x', 'plate_z'
        and 'launch_speed'. 'plate_x'/'plate_z' must not contain missing values.
        The frame is modified in place: the columns 'Swing', 'EVP', 'ShouldSwing'
        and 'MLPShouldSwing' are added.
    random_state : int, RandomState instance or None, optional
        Forwarded to MLPClassifier. The default (None) keeps the original unseeded
        behaviour; pass an int for reproducible results.

    Returns
    -------
    tuple
        (swings, bad_swing_rate, good_swing_rate,
         takes, bad_take_rate, good_take_rate, exit_velo_global)
        where a "good" swing is a swing at a pitch the MLP says to swing at, and a
        "good" take is a take on a pitch the MLP says not to swing at.

    Raises
    ------
    ValueError
        If the batter has no batted balls with a launch speed, no swings, or no takes.
    """
    swing_descriptions = ['hit_into_play', 'foul', 'swinging_strike',
                          'swinging_strike_blocked', 'foul_tip']

    in_play = (player_data['launch_speed'] > 0) & (player_data['description'] == 'hit_into_play')
    batted_balls = player_data.loc[in_play].reset_index(drop=True)
    if batted_balls.empty:
        raise ValueError("no batted balls with a recorded launch speed")

    player_data['Swing'] = player_data['description'].isin(swing_descriptions)

    launch_speeds = batted_balls['launch_speed'].to_numpy(dtype=float)
    # Global "hard contact" threshold, never lower than 100.
    exit_velo_global = max(np.percentile(launch_speeds, 80), 100)

    model = NN(radius=0.375)
    model.fit(batted_balls[['plate_x', 'plate_z']].to_numpy())

    # One batched query instead of one query per pitch; each entry holds the positions
    # (into batted_balls) of the batted balls inside the radius of that pitch.
    locations = player_data[['plate_x', 'plate_z']].to_numpy()
    neighbor_indices = model.radius_neighbors(locations, return_distance=False)

    evp = np.zeros(len(player_data))
    for row, idx in enumerate(neighbor_indices):
        if len(idx):
            evp[row] = np.percentile(launch_speeds[idx], 85)
    player_data['EVP'] = evp
    # Pitches without neighbours keep EVP == 0 and therefore never clear the threshold.
    player_data['ShouldSwing'] = evp > exit_velo_global

    model_2 = MLPClassifier(max_iter=3000, random_state=random_state)
    model_2.fit(locations, player_data['ShouldSwing'].to_numpy())
    player_data['MLPShouldSwing'] = model_2.predict(locations)

    swung = player_data['Swing'].to_numpy(dtype=bool)
    predicted_swing = player_data['MLPShouldSwing'].to_numpy(dtype=bool)
    swing_predictions = predicted_swing[swung]
    take_predictions = predicted_swing[~swung]
    if swing_predictions.size == 0 or take_predictions.size == 0:
        raise ValueError("need at least one swing and one take to compute decision rates")

    # Boolean means instead of value_counts()[label]: the label lookup raised KeyError
    # whenever every swing (or every take) fell into a single predicted class.
    return (int(swing_predictions.size),
            float((~swing_predictions).mean()),   # bad swings: swung, model says take
            float(swing_predictions.mean()),      # good swings
            int(take_predictions.size),
            float(take_predictions.mean()),       # bad takes: took, model says swing
            float((~take_predictions).mean()),    # good takes
            exit_velo_global)
            
    

In [None]:
# Score every batter with enough pitches and collect one summary row per batter.
aggregate_columns = ['batter','batter_name','Swings','BadSwingRate','GoodSwingRate','Takes',
                     'BadTakeRate','GoodTakeRate','GoodDecisionRate','GlobalEV','wRC+',
                     'O-Swing% (sc)','Z-Swing% (sc)','Swing% (sc)']
MIN_PITCHES = 600  # batters with fewer located pitches are skipped

# Rows are gathered in a list and turned into a frame once at the end;
# concatenating inside the loop is quadratic and leaves a non-unique index.
aggregate_rows = []

for batter in pd.unique(data_final.batter):
    player_data = data_final.loc[data_final['batter'] == batter].dropna(subset=['plate_x','plate_z']).reset_index(drop = True)
    if len(player_data.index) < MIN_PITCHES:
        continue
    batter_name = pd.unique(player_data['batter_name'])[0]

    # Map the MLBAM id to the FanGraphs id used by batter_stats. Done after the pitch-count
    # filter and checked explicitly so that one unmapped batter cannot abort the whole loop.
    fangraphs_ids = data_names.loc[data_names['key_mlbam'] == batter, 'key_fangraphs']
    if fangraphs_ids.empty:
        print('no FanGraphs id found', batter)
        continue
    player_stats = batter_stats.loc[batter_stats['IDfg'] == fangraphs_ids.values[0]]
    if player_stats.empty:
        # Checked before scoring so the expensive model fit is not wasted.
        print('no batting stats found', batter)
        continue

    try:
        t = scorer(player_data)
        # Swing- and take-weighted share of decisions that agreed with the model.
        good_decision_rate = ((t[2] * t[0]) + (t[5] * t[3])) / (t[3] + t[0])
        row = {'batter':int(batter),'batter_name':batter_name,'Swings':int(t[0]),
               'BadSwingRate':round(t[1],3),'GoodSwingRate':round(t[2],3),'Takes':int(t[3]),
               'BadTakeRate':round(t[4],3), 'GoodTakeRate':round(t[5],3),
               'GoodDecisionRate':round(good_decision_rate,3),
               'GlobalEV':t[6],'wRC+':int(player_stats['wRC+'].values[0]),
               'O-Swing% (sc)':player_stats['O-Swing% (sc)'].values[0],
               'Z-Swing% (sc)':player_stats['Z-Swing% (sc)'].values[0],
               'Swing% (sc)':player_stats['Swing% (sc)'].values[0]}
    except Exception as e:
        # Best effort: report the failing batter and carry on with the rest.
        print(e,batter)
        continue
    aggregate_rows.append(row)

aggregate = pd.DataFrame(aggregate_rows, columns=aggregate_columns)
player_aggregate_frame = aggregate


In [None]:
# Produce the top 10 for whatever stat you want: rank all batters best-first, keep ten rows
top_10 = player_aggregate_frame.sort_values('GoodDecisionRate', ascending=False).iloc[:10]
top_10

In [None]:
## To produce graphs for a particular batter, set batterid to that player's MLBAM ID.
# Use this cell to find their MLBAM key: pass the last name, then the first name.
# fuzzy=True is passed so near-miss spellings can still match (per pybaseball — confirm).


playerid_lookup("pujols", "albert", fuzzy=True)

In [None]:
# MLBAM ID of the batter whose pitches are modelled and plotted in the cell below
# (presumably the key returned by the playerid_lookup cell — confirm).
batterid = 405395

In [None]:
def add_zone(fig, x_min=-0.8, x_max=0.8, z_min=1.8, z_max=3.8):
    """Lock the figure to a 1:1 aspect ratio and outline a rectangular zone on it.

    Parameters
    ----------
    fig : plotly figure
        Figure whose x axis is 'plate_x' and y axis is 'plate_z'. Modified in place.
    x_min, x_max : float, optional
        Horizontal bounds of the rectangle (defaults: -0.8 and 0.8).
    z_min, z_max : float, optional
        Vertical bounds of the rectangle (defaults: 1.8 and 3.8).

    Returns
    -------
    The same figure, with four black line traces (bottom, top, x_min side,
    x_max side) appended and excluded from the legend.
    """
    # Anchor the y scale to x so the rectangle is not stretched.
    fig.update_yaxes(scaleanchor="x", scaleratio=1)

    edges = [
        ([x_min, x_max], [z_min, z_min]),  # bottom edge
        ([x_min, x_max], [z_max, z_max]),  # top edge
        ([x_min, x_min], [z_min, z_max]),  # x_min side
        ([x_max, x_max], [z_min, z_max]),  # x_max side
    ]
    for xs, zs in edges:
        fig.add_scatter(x=xs, y=zs, mode="lines",
                        line=dict(color='black', width=2),
                        showlegend=False)
    return fig
    


# Rebuild the swing model for the single batter selected by batterid and plot it.
swing_descriptions = ['hit_into_play', 'foul', 'swinging_strike',
                      'swinging_strike_blocked', 'foul_tip']

player_data = data_final.loc[data_final['batter'] == batterid].dropna(subset=['plate_x','plate_z']).reset_index(drop = True)
batted_balls = player_data.loc[(player_data['launch_speed'] > 0) & (player_data['description'] == 'hit_into_play' )].reset_index(drop = True)
if batted_balls.empty:
    # A wrong or absent batterid otherwise surfaces as an opaque percentile/sklearn error.
    raise ValueError(f"no batted balls found for batter {batterid}; check the MLBAM ID and the loaded season")

launch_speeds = batted_balls['launch_speed'].to_numpy(dtype=float)
# Hard-contact threshold: 80th percentile of this batter's launch speeds, floored at 100.
exit_velo_per = max(np.percentile(launch_speeds, 80), 100)

top_batted_balls = batted_balls.loc[(batted_balls['launch_speed'] > exit_velo_per) ]
player_data['Swing'] = player_data['description'].isin(swing_descriptions)
swings = player_data.loc[player_data['Swing']].reset_index(drop = True)

# Figure 1: where the hardest-hit balls were located.
fig1 = px.scatter(top_batted_balls,x='plate_x',y='plate_z',color='launch_speed',
                 title = 'Top 80th Percentile Batted Balls By Player', size_max = 20)
fig1 = add_zone(fig1)
fig1.show()

# Local expected exit velocity: 85th percentile launch speed of the batted balls
# within 0.375 of each pitch location (0 when there are none).
model = NN(radius = 0.375)
model.fit(batted_balls[['plate_x','plate_z']].to_numpy())
locations = player_data[['plate_x','plate_z']].to_numpy()
# One batched query instead of one query per pitch.
neighbor_indices = model.radius_neighbors(locations, return_distance=False)

xev = np.zeros(len(player_data))
for row, idx in enumerate(neighbor_indices):
    if len(idx):
        xev[row] = np.percentile(launch_speeds[idx], 85)
player_data['xEV'] = xev
player_data['ShouldSwing'] = xev > exit_velo_per

# max_iter matches scorer() so the plots show the same model as the aggregate table.
model_2 = MLPClassifier(max_iter=3000)
model_2.fit(locations, player_data['ShouldSwing'].to_numpy())
player_data['MLPShouldSwing'] = model_2.predict(locations)

# These were bare expressions in the middle of the cell and therefore displayed nothing.
print(pd.unique(player_data['batter_name'])[0])
print('exit velocity threshold:', exit_velo_per)
print(player_data['MLPShouldSwing'].value_counts(normalize=True))

# Figure 2: raw radius-neighbour labels. Figure 3: the MLP-smoothed version.
fig2 = px.scatter(player_data,x='plate_x',y='plate_z',color='ShouldSwing',hover_data=['xEV'],
                 title = 'Should Player Have Swung? Radius Neighbors Only',color_discrete_sequence =plotly.express.colors.qualitative.Safe )
fig2 = add_zone(fig2)
fig2.show()

fig3 = px.scatter(player_data,x='plate_x',y='plate_z',color='MLPShouldSwing',hover_data=['xEV'],
                 title = 'Should Player Have Swung? MLP',color_discrete_sequence =plotly.express.colors.qualitative.Safe )
fig3 = add_zone(fig3)
fig3.show()