In [11]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from pybaseball import statcast, cache
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import joblib
import math
cache.enable()

In [12]:
sc_23 = statcast('2023-04-01', '2023-08-16')

This is a large query, it may take a moment to complete


100%|████████████████████████████████████████████████████████████████████████████████| 138/138 [01:50<00:00,  1.25it/s]


In [13]:
columns = ['player_name', 'p_throws', 'pitch_name', 'stand', 'delta_run_exp', 'release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'balls', 'strikes', 'release_pos_x', 'release_pos_z', 'spin_axis', 'release_extension']
non_features = ['player_name', 'delta_run_exp', 'stand', 'pitch_name', 'p_throws', 'balls', 'strikes', 'plate_x', 'plate_z']
before_features = [column for column in columns if column not in non_features]
df = sc_23[columns].dropna(subset='delta_run_exp').dropna(subset=before_features)

In [14]:
capps_constant = 8.2 - df.release_extension.mean()

In [15]:
df['pfx_x'] = abs(df['pfx_x'])
df.loc[df['p_throws'] == 'L', 'spin_axis'] = 360 - df.loc[df['p_throws'] == 'L', 'spin_axis']
df['release_pos_x'] = abs(df['release_pos_x'])

average_extension = df['release_extension'].mean()
# Found this using the added Carter Capps perceived velo divided by his added extension over average
extension_constant = (3.5/capps_constant)
def calculate_perceived_velocity(row):
    adjusted_extension = row['release_extension'] - average_extension
    perceived_velocity = row['release_speed'] + (adjusted_extension * extension_constant)
    return perceived_velocity

df['perceived_velocity'] = df.apply(calculate_perceived_velocity, axis=1)

In [16]:
arm_angles = np.degrees(np.arctan2(df['release_pos_z'], df['release_pos_x']))
df['arm_angle'] = arm_angles

In [17]:
df['spin_axis_rad'] = df['spin_axis'] * np.pi / 180

df['TSM'] = df['release_spin_rate'] * np.sin(df['spin_axis_rad'])
df['G'] = df['release_spin_rate'] * np.cos(df['spin_axis_rad'])

df['spin_efficiency'] = (df['TSM'] / df['release_spin_rate']) * 100

min_spin_efficiency = df['spin_efficiency'].min(skipna=True)
max_spin_efficiency = df['spin_efficiency'].max(skipna=True)

df['normalized_spin_efficiency'] = (df['spin_efficiency'] - min_spin_efficiency) / (max_spin_efficiency - min_spin_efficiency)

In [20]:
fb_model = joblib.load("models/['4-Seam Fastball', 'Sinker']_optimized_random_stuff.joblib")
bb_model = joblib.load("models/['Curveball', 'Slider', 'Cutter', 'Knuckle Curve', 'Sweeper']_optimized_random_stuff.joblib")
offs_model = joblib.load("models/['Split-Finger', 'Changeup', 'Forkball']_optimized_random_stuff.joblib")

In [21]:
features = ['pfx_x',
 'pfx_z',
 'release_spin_rate',
 'arm_angle',
 'perceived_velocity',
 'normalized_spin_efficiency']

def predict_pitch(df, pitch_name, model):
    pitch_df = df[df['pitch_name'].isin(pitch_name)]
    X = pitch_df[features].values  # Assuming you have defined 'features'
    pitch_df['xrv'] = model.predict(X)
    return pitch_df

fastball_sc = predict_pitch(df, ['4-Seam Fastball', 'Sinker'], fb_model)
bb_sc = predict_pitch(df, ['Cutter', 'Curveball', 'Knuckle Curve', 'Slider', 'Sweeper', 'Slow Curve'], bb_model)
offs_sc = predict_pitch(df, ['Split-Finger', 'Changeup', 'Forkball'], offs_model)

total_sc = pd.concat([fastball_sc, bb_sc, offs_sc])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)


In [37]:
def calculate_stuff_plus(pitch_sc, min_num_pitches=100):
    agg_stuff_plus = pitch_sc.groupby(['player_name', 'pitch_name']).agg(
        mean_xrv=('xrv', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        perceived_velo=('perceived_velocity', 'mean'),
        spin_rate=('release_spin_rate', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff+'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff+', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

def calculate_agg_stuff_plus(pitch_sc, min_num_pitches=100):
    agg_stuff_plus = pitch_sc.groupby(['player_name']).agg(
        mean_xrv=('xrv', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        perceived_velo=('perceived_velocity', 'mean'),
        spin_rate=('release_spin_rate', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff+'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff+', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

agg_fb = calculate_stuff_plus(fastball_sc, 100)
agg_bb = calculate_stuff_plus(bb_sc, 100)
agg_offs = calculate_stuff_plus(offs_sc, 50)
agg_total = calculate_agg_stuff_plus(total_sc, 700)

In [39]:
pd.set_option('display.max_rows', None)
agg_fb

Unnamed: 0,player_name,pitch_name,mean_xrv,num_pitches,velo,perceived_velo,spin_rate,vert_break,horz_break,xrv_100,xrv_100_adj,stuff+
69,"Bautista, Félix",4-Seam Fastball,-0.016352,645,99.537519,99.854709,2416.92093,1.711798,0.568372,-1.635152,7.771778,130.134171
401,"Helsley, Ryan",4-Seam Fastball,-0.011817,221,99.671946,100.080353,2660.669683,1.54629,0.277421,-1.181711,7.318337,122.54155
278,"Fairbanks, Pete",4-Seam Fastball,-0.011789,195,98.770769,99.321649,2441.789744,1.585795,0.144564,-1.178925,7.315552,122.494904
952,"Strider, Spencer",4-Seam Fastball,-0.010972,1366,97.220351,98.405392,2365.819912,1.544451,0.478887,-1.097162,7.233788,121.125816
847,"Romano, Jordan",4-Seam Fastball,-0.010505,279,96.690323,98.987294,2339.473118,1.462581,0.376129,-1.050541,7.187167,120.345177
171,"Chapman, Aroldis",4-Seam Fastball,-0.010153,376,99.152128,100.49117,2436.154255,1.50266,0.389016,-1.015288,7.151915,119.754883
275,"Estrada, Jeremiah",4-Seam Fastball,-0.009726,199,95.71206,96.121021,2285.201005,1.669548,0.291055,-0.972609,7.109235,119.040253
794,"Pérez, Eury",4-Seam Fastball,-0.009521,466,97.671888,98.589609,2637.040773,1.492554,0.717725,-0.95214,7.088767,118.69751
377,"Hader, Josh",Sinker,-0.009273,530,96.035094,96.710271,2145.184906,1.565245,0.624566,-0.927271,7.063897,118.28109
151,"Cano, Yennier",Sinker,-0.009215,449,96.096214,95.268916,2119.688196,-0.227506,1.487862,-0.921463,7.05809,118.183838


In [24]:
fl = pd.read_csv('../../AutomatedPitchTagging/AutoTaggedCSVs/autoMaster 2023-08-16.csv')

In [25]:
fl['RelSide'] = abs(fl['RelSide'])

In [26]:
arm_angles = np.degrees(np.arctan2(fl['RelHeight'], fl['RelSide']))
fl['arm_angle'] = arm_angles

In [27]:
fl = fl.rename(columns={'RelSpeed': 'release_speed', 'HorzBreak': 'pfx_x', 'InducedVertBreak': 'pfx_z', 'RelSide': 'release_pos_x', 'RelHeight': 'release_pos_z', 'Extension': 'release_extension', 'SpinRate': 'release_spin_rate', 'yt_Efficiency': 'normalized_spin_efficiency'})

In [28]:
average_extension = fl['release_extension'].mean()

extension_constant = (3.5/capps_constant)
def calculate_perceived_velocity_fl(row):
    adjusted_extension = row['release_extension'] - average_extension
    perceived_velocity = row['release_speed'] + (adjusted_extension * extension_constant)
    return perceived_velocity

fl['perceived_velocity'] = fl.apply(calculate_perceived_velocity_fl, axis=1)

In [29]:
def predict_pitch_fl(df, pitch_name, model):
    pitch_df = df[df['AutoPitchType'].isin(pitch_name)]
    pitch_df['xrv'] = model.predict(pitch_df[model.feature_names_in_])
    return pitch_df

fastball_fl = predict_pitch_fl(fl, ['Fastball', 'Sinker'], fb_model)
bb_fl = predict_pitch_fl(fl, ['Cutter', 'Slider', 'Curveball'], bb_model)
offs_fl = predict_pitch_fl(fl, ['Splitter', 'Changeup'], offs_model)

total_fl = pd.concat([fastball_fl, bb_fl, offs_fl])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(pitch_df[model.feature_names_in_])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(pitch_df[model.feature_names_in_])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(pitch_df[model.feature_names_in_])


In [30]:
def calculate_stuff_plus(pitch_fl, min_num_pitches=100):
    agg_stuff_plus = pitch_fl.groupby(['Pitcher', 'AutoPitchType']).agg(
        mean_xrv=('xrv', 'mean'),
        num_pitches=('Pitcher', 'count'),
        velo=('release_speed', 'mean'),
        perceived_velo=('perceived_velocity', 'mean'),
        spin_rate=('release_spin_rate', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff+'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff+', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

def calculate_agg_stuff_plus(pitch_fl, min_num_pitches=100):
    agg_stuff_plus = pitch_fl.groupby(['Pitcher']).agg(
        mean_xrv=('xrv', 'mean'),
        num_pitches=('Pitcher', 'count'),
        velo=('release_speed', 'mean'),
        perceived_velo=('perceived_velocity', 'mean'),
        spin_rate=('release_spin_rate', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff+'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff+', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

agg_fb_fl = calculate_stuff_plus(fastball_fl, 100)
agg_bb_fl = calculate_stuff_plus(bb_fl, 100)
agg_offs_fl = calculate_stuff_plus(offs_fl, 50)
agg_total_fl = calculate_agg_stuff_plus(total_fl, 700)

In [31]:
pd.set_option('display.max_rows', None)
agg_total_fl

Unnamed: 0,Pitcher,mean_xrv,num_pitches,velo,perceived_velo,spin_rate,vert_break,horz_break,xrv_100,xrv_100_adj,stuff+
130,Darrien Ragins,-0.01008,737,85.517161,86.345026,2423.428596,7.535849,5.18951,-1.007968,27.931145,152.337479
112,Cole Cook,0.025623,1378,82.029276,81.01032,2444.645474,3.129987,6.506508,2.562292,24.360884,132.865143
101,Chris Burica,0.036652,806,79.518997,79.918591,2175.440214,8.430004,5.718467,3.665194,23.257982,126.849876
220,Jhon Vargas,0.03899,865,82.474856,82.390607,2427.542555,3.031729,9.617362,3.899033,23.024143,125.574516
384,Tim Holdgrafer,0.047558,891,84.707772,84.523191,2206.358677,6.308209,10.753615,4.755819,22.167356,120.901558
375,Steven Fuentes,0.052844,1064,87.90408,88.37213,2263.381171,6.705977,8.81888,5.284415,21.63876,118.018578
197,Jackson Hickert,0.058947,1159,84.978073,84.270083,2048.448405,5.363233,11.862745,5.894714,21.028461,114.689995
353,Ruben Ramirez,0.05922,899,85.181188,85.359906,2280.035437,6.248149,9.367189,5.921958,21.001217,114.541397
278,Luis Perez,0.05924,1057,87.682825,86.604247,2226.030999,8.408988,8.574051,5.923992,20.999184,114.530312
251,Justin Watland,0.060746,840,83.396278,82.752919,2160.164022,9.402167,7.193381,6.074598,20.848577,113.7089


In [None]:
agg_fb