In [25]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from pybaseball import statcast, cache
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import joblib
import math
cache.enable()

In [26]:
sc_23 = statcast('2023-04-01', '2023-08-16')

This is a large query, it may take a moment to complete


100%|████████████████████████████████████████████████████████████████████████████████| 138/138 [01:45<00:00,  1.31it/s]


In [27]:
columns = ['player_name', 'p_throws', 'pitch_name', 'stand', 'delta_run_exp', 'release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'balls', 'strikes', 'release_pos_x', 'release_pos_z', 'spin_axis', 'release_extension']
df = sc_23[columns].dropna(subset='delta_run_exp')

In [28]:
capps_constant = 8.2 - df.release_extension.mean()

In [29]:
df.loc[df['p_throws'] == 'L', 'pfx_x'] *= -1
df.loc[df['p_throws'] == 'L', 'spin_axis'] = 360 - df.loc[df['p_throws'] == 'L', 'spin_axis']
df.loc[df['p_throws'] == 'L', 'release_pos_x'] *= -1

average_extension = df['release_extension'].mean()
# Found this using the added Carter Capps perceived velo divided by his added extension over average
extension_constant = (3.5/capps_constant)
def calculate_perceived_velocity(row):
    adjusted_extension = row['release_extension'] - average_extension
    perceived_velocity = row['release_speed'] + (adjusted_extension * extension_constant)
    return perceived_velocity

df['perceived_velocity'] = df.apply(calculate_perceived_velocity, axis=1)

In [30]:
arm_angles = np.degrees(np.arctan2(df['release_pos_z'], df['release_pos_x']))
df['arm_angle'] = arm_angles

In [31]:
df['spin_axis_rad'] = df['spin_axis'] * np.pi / 180

df['TSM'] = df['release_spin_rate'] * np.sin(df['spin_axis_rad'])
df['G'] = df['release_spin_rate'] * np.cos(df['spin_axis_rad'])

df['spin_efficiency'] = (df['TSM'] / df['release_spin_rate']) * 100

min_spin_efficiency = df['spin_efficiency'].min(skipna=True)
max_spin_efficiency = df['spin_efficiency'].max(skipna=True)

df['normalized_spin_efficiency'] = (df['spin_efficiency'] - min_spin_efficiency) / (max_spin_efficiency - min_spin_efficiency)

In [32]:
def get_pitch_data(df, pitch_name, players):
    velo_map = {player: math.nan for player in players}
    h_movement_map = {player: math.nan for player in players}
    v_movement_map = {player: math.nan for player in players}

    grouped_data = df.loc[(df['pitch_name'] == pitch_name) & (df['player_name'].isin(players))].groupby('player_name').agg({
        'release_speed': 'median',
        'pfx_x': 'median',
        'pfx_z': 'median'
    }).reset_index()

    for _, row in grouped_data.iterrows():
        player_name = row['player_name']
        velo_map[player_name] = row['release_speed']
        h_movement_map[player_name] = row['pfx_x']
        v_movement_map[player_name] = row['pfx_z']

    return velo_map, h_movement_map, v_movement_map

all_players = df['player_name'].unique()

fastball_velo_map, fastball_h_movement_map, fastball_v_movement_map = get_pitch_data(df, '4-Seam Fastball', all_players)
nan_fastball = [key for key in fastball_velo_map if math.isnan(fastball_velo_map[key])]

sinker_velo_map, sinker_h_movement_map, sinker_v_movement_map = get_pitch_data(df, 'Sinker', nan_fastball)
nan_sink = [key for key in sinker_velo_map if math.isnan(sinker_velo_map[key])]

cutter_velo_map, cutter_h_movement_map, cutter_v_movement_map = get_pitch_data(df, 'Cutter', nan_sink)

In [33]:
def calculate_velo_dif(row):
    if row['player_name'] in cutter_velo_map:
        return row['release_speed'] - cutter_velo_map[row['player_name']]
    elif row['player_name'] in sinker_velo_map:
        return row['release_speed'] - sinker_velo_map[row['player_name']]
    return row['release_speed'] - fastball_velo_map[row['player_name']]

def calculate_h_movement_dif(row):
    if row['player_name'] in cutter_velo_map:
        return row['pfx_x'] - cutter_h_movement_map[row['player_name']]
    elif row['player_name'] in sinker_velo_map:
        return row['pfx_x'] - sinker_velo_map[row['player_name']]
    return row['pfx_x'] - fastball_h_movement_map[row['player_name']]

def calculate_v_movement_dif(row):
    if row['player_name'] in cutter_velo_map:
        return row['pfx_z'] - cutter_v_movement_map[row['player_name']]
    elif row['player_name'] in sinker_velo_map:
        return row['pfx_z'] - sinker_velo_map[row['player_name']]
    return row['pfx_z'] - fastball_v_movement_map[row['player_name']]

df['velo_dif'] = df.apply(lambda row: calculate_velo_dif(row), axis=1)
df['h_movement_dif'] = df.apply(lambda row: calculate_h_movement_dif(row), axis=1)
df['v_movement_dif'] = df.apply(lambda row: calculate_v_movement_dif(row), axis=1)

In [34]:
fb_model = joblib.load("models/['4-Seam Fastball', 'Sinker']_optimized_random_stuff.joblib")
bb_model = joblib.load("models/['Curveball', 'Slider', 'Cutter', 'Knuckle Curve', 'Sweeper']_optimized_random_stuff.joblib")
offs_model = joblib.load("models/['Split-Finger', 'Changeup', 'Forkball']_optimized_random_stuff.joblib")

In [35]:
features = ['pfx_x',
 'pfx_z',
 'arm_angle', 'release_speed',
     'perceived_velocity',
 'normalized_spin_efficiency']

df = df.dropna(subset=features)

new_features = features.copy() + ['velo_dif', 'h_movement_dif', 'v_movement_dif']

def predict_pitch(df, pitch_name, model):
    pitch_df = df[df['pitch_name'].isin(pitch_name)]
    X = pitch_df[features].values
    pitch_df['xrv'] = model.predict(X)
    return pitch_df

fastball_sc = predict_pitch(df, ['4-Seam Fastball', 'Sinker'], fb_model)
bb_sc = predict_pitch(df, ['Cutter', 'Curveball', 'Knuckle Curve', 'Slider', 'Sweeper', 'Slow Curve'], bb_model)
offs_sc = predict_pitch(df, ['Split-Finger', 'Changeup', 'Forkball'], offs_model)

total_sc = pd.concat([fastball_sc, bb_sc, offs_sc])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitch_df['xrv'] = model.predict(X)


In [36]:
def calculate_stuff_plus(pitch_sc, min_num_pitches=100):
    agg_stuff_plus = pitch_sc.groupby(['player_name', 'pitch_name']).agg(
        mean_xrv=('xrv', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        perceived_velo=('perceived_velocity', 'mean'),
        spin_rate=('release_spin_rate', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff+'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff+', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

def calculate_agg_stuff_plus(pitch_sc, min_num_pitches=100):
    agg_stuff_plus = pitch_sc.groupby(['player_name']).agg(
        mean_xrv=('xrv', 'mean'),
        num_pitches=('player_name', 'count'),
        velo=('release_speed', 'mean'),
        perceived_velo=('perceived_velocity', 'mean'),
        spin_rate=('release_spin_rate', 'mean'),
        vert_break=('pfx_z', 'mean'),
        horz_break=('pfx_x', 'mean')
    ).reset_index()

    agg_stuff_plus['xrv_100'] = agg_stuff_plus['mean_xrv'] * 100
    agg_stuff_plus['xrv_100_adj'] = abs(agg_stuff_plus['xrv_100'] - agg_stuff_plus['xrv_100'].max())
    agg_stuff_plus['stuff+'] = (agg_stuff_plus['xrv_100_adj'] / agg_stuff_plus['xrv_100_adj'].mean()) * 100
    agg_stuff_plus = agg_stuff_plus.sort_values('stuff+', ascending=False)

    agg_stuff_plus = agg_stuff_plus[agg_stuff_plus['num_pitches'] > min_num_pitches]

    return agg_stuff_plus

agg_fb = calculate_stuff_plus(fastball_sc, 100)
agg_bb = calculate_stuff_plus(bb_sc, 100)
agg_offs = calculate_stuff_plus(offs_sc, 50)
agg_total = calculate_agg_stuff_plus(total_sc, 1200)

In [37]:
pd.set_option('display.max_rows', None)
agg_total

Unnamed: 0,player_name,mean_xrv,num_pitches,velo,perceived_velo,spin_rate,vert_break,horz_break,xrv_100,xrv_100_adj,stuff+
526,"Pivetta, Nick",0.003367,1775,88.692056,89.461154,2462.060845,0.543775,0.132321,0.336703,55.059654,118.548706
27,"Ashcraft, Graham",0.005527,2057,93.143899,91.6657,2576.12105,0.347005,0.438794,0.552664,54.843693,118.083717
113,"Cease, Dylan",0.014869,2348,88.847019,88.520943,2606.755111,0.575524,-0.001533,1.486861,53.909496,116.072296
645,"Strider, Spencer",0.0232,2332,92.586621,93.770257,2352.805746,1.022449,-0.210493,2.320029,53.076328,114.278412
94,"Burnes, Corbin",0.023435,2194,90.789836,91.042383,2603.87876,0.52675,0.12825,2.343528,53.05283,114.227821
447,"Miller, Bobby",0.025443,1246,92.606902,93.448696,2429.802568,0.541091,-0.415722,2.544317,52.852043,113.795509
254,"Greene, Hunter",0.026789,1249,93.96285,93.677016,2317.760608,0.875612,-0.505596,2.678853,52.717506,113.505829
247,"Gore, MacKenzie",0.028246,2181,91.286245,92.50009,2231.37735,0.863237,-0.209766,2.824646,52.571712,113.191925
84,"Brown, Hunter",0.028835,2122,91.155938,91.575702,2245.812912,0.458996,-0.077139,2.883477,52.512882,113.065254
448,"Miller, Bryce",0.031008,1414,91.741867,91.356062,2502.186704,1.033055,-0.163663,3.100846,52.295513,112.597237


In [38]:
importances = zip(bb_model.feature_names_in_, bb_model.feature_importances_)

for feature_name, importance in importances:
    print(f"Feature: {feature_name}, Importance: {importance}")


Feature: pfx_x, Importance: 0.24588432908058167
Feature: pfx_z, Importance: 0.07730956375598907
Feature: release_spin_rate, Importance: 0.2546869218349457
Feature: arm_angle, Importance: 0.10713642090559006
Feature: perceived_velocity, Importance: 0.19892627000808716
Feature: normalized_spin_efficiency, Importance: 0.11605645716190338
