In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from pybaseball import statcast, cache
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import joblib
import math
cache.enable()

In [3]:
sc_23 = pd.read_csv('../statcast_2023.csv')

In [4]:
columns = ['player_name', 'p_throws', 'pitch_name', 'stand', 'delta_run_exp', 'release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'balls', 'strikes', 'release_pos_x', 'release_pos_z', 'spin_axis', 'release_extension']
non_features = ['player_name', 'delta_run_exp', 'stand', 'pitch_name', 'p_throws', 'balls', 'strikes', 'plate_x', 'plate_z']
before_features = [column for column in columns if column not in non_features]
df = sc_23[columns].dropna(subset='delta_run_exp').dropna(subset=before_features)

In [5]:
# Getting lefties on the same scale as righties
df.loc[df['p_throws'] == 'L', 'pfx_x'] *= -1
df.loc[df['p_throws'] == 'L', 'spin_axis'] = 360 - df.loc[df['p_throws'] == 'L', 'spin_axis']
df.loc[df['p_throws'] == 'L', 'release_pos_x'] *= -1

average_extension = df['release_extension'].mean()

def calculate_perceived_velocity(row):
    adjusted_extension = row['release_extension'] - average_extension
    perceived_velocity = row['release_speed'] + (adjusted_extension * (3.5/.5))
    return perceived_velocity

df['perceived_velocity'] = df.apply(calculate_perceived_velocity, axis=1)

In [6]:
arm_angles = np.degrees(np.arctan2(df['release_pos_z'], df['release_pos_x']))
df['arm_angle'] = arm_angles

In [7]:
df['spin_axis_rad'] = df['spin_axis'] * np.pi / 180

df['TSM'] = df['release_spin_rate'] * np.sin(df['spin_axis_rad'])
df['G'] = df['release_spin_rate'] * np.cos(df['spin_axis_rad'])

df['spin_efficiency'] = (df['TSM'] / df['release_spin_rate']) * 100

min_spin_efficiency = df['spin_efficiency'].min(skipna=True)
max_spin_efficiency = df['spin_efficiency'].max(skipna=True)

df['normalized_spin_efficiency'] = (df['spin_efficiency'] - min_spin_efficiency) / (max_spin_efficiency - min_spin_efficiency)

In [8]:
fb_model = joblib.load("models/['4-Seam Fastball', 'Sinker']_optimized_random_stuff.joblib")
bb_model = joblib.load("models/['Curveball', 'Slider', 'Cutter', 'Knuckle Curve', 'Sweeper']_optimized_random_stuff.joblib")
offs_model = joblib.load("models/['Split-Finger', 'Changeup']_optimized_random_stuff.joblib")

In [9]:
features = ['perceived_velocity', 'pfx_x', 'pfx_z', 'arm_angle', 'release_speed', 'release_spin_rate', 'normalized_spin_efficiency']

def predict_pitch(df, pitch_name, model):
    pitch_df = df[df['pitch_name'].isin(pitch_name)]
    X = pitch_df[features].values
    pitch_df['xrv'] = model.predict(X)
    return pitch_df

fastball_sc = predict_pitch(df, ['4-Seam Fastball', 'Sinker'], fb_model)
bb_sc = predict_pitch(df, ['Cutter', 'Curveball', 'Knuckle Curve', 'Slider', 'Sweeper', 'Slow Curve'], bb_model)
offs_sc = predict_pitch(df, ['Split-Finger', 'Changeup', 'Forkball'], offs_model)

total_sc = pd.concat([fastball_sc, bb_sc, offs_sc])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)


In [10]:
def calculate_stuff_plus(pitch_sc, min_num_pitches=100):
    agg_stuff_plus = pitch_sc.groupby(['player_name', 'pitch_name']).agg(
        mean_xrv=('xrv', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        perceived_velo=('perceived_velocity', 'mean'),
        spin_rate=('release_spin_rate', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff+'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff+', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

def calculate_agg_stuff_plus(pitch_sc, min_num_pitches=100):
    agg_stuff_plus = pitch_sc.groupby(['player_name']).agg(
        mean_xrv=('xrv', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        perceived_velo=('perceived_velocity', 'mean'),
        spin_rate=('release_spin_rate', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff+'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff+', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

agg_fb = calculate_stuff_plus(fastball_sc, 100)
agg_bb = calculate_stuff_plus(bb_sc, 100)
agg_offs = calculate_stuff_plus(offs_sc, 50)
agg_total = calculate_agg_stuff_plus(total_sc, 1200)

In [12]:
pd.set_option('display.max_rows', None)
agg_bb

Unnamed: 0,player_name,pitch_name,mean_xrv,num_pitches,velo,perceived_velo,spin_rate,vert_break,horz_break,xrv_100,xrv_100_adj,stuff+
29,"Alvarado, José",Cutter,-0.01119,187,93.650802,96.082641,2196.016043,0.600695,0.178128,-1.119024,8.74525,126.070633
1308,"deGrom, Jacob",Slider,-0.010724,144,91.768056,92.986124,2641.604167,0.312014,0.406042,-1.072377,8.698603,125.39817
157,"Burnes, Corbin",Cutter,-0.010621,969,94.365119,95.057826,2654.764706,0.960784,0.257843,-1.062094,8.68832,125.249931
427,"Glasnow, Tyler",Slider,-0.010532,304,89.544737,96.83945,2644.292763,0.088026,0.183224,-1.053214,8.67944,125.121918
308,"Díaz, Alexis",Slider,-0.010519,345,87.536812,96.152054,2667.973913,0.182783,0.388435,-1.051918,8.678144,125.103249
783,"Miller, Bobby",Slider,-0.010215,216,90.412037,93.807883,2484.347222,0.163796,0.377593,-1.021548,8.647775,124.665436
886,"Oviedo, Johan",Slider,-0.009969,728,88.347253,93.294167,2505.086538,0.250082,0.429162,-0.996857,8.623082,124.309471
1015,"Santos, Gregory",Slider,-0.009744,377,91.657825,92.429806,2564.034483,0.162069,0.473156,-0.974362,8.600588,123.985199
143,"Brown, Hunter",Slider,-0.009616,445,91.824719,92.870934,2146.710112,0.484472,0.26436,-0.961563,8.58779,123.800697
417,"Gilbert, Logan",Slider,-0.009601,495,88.817172,95.228321,1966.115152,0.084121,0.070566,-0.960083,8.586309,123.779358
