In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [17]:
from collections import defaultdict
import os

import matplotlib.pyplot as plt
import scipy.stats

from jax import pure_callback, random
import jax.numpy as jnp
import optax

import numpyro
from numpyro import handlers
import numpyro.distributions as dist
import jax
import jax.numpy as jno
from numpyro.infer import MCMC, NUTS, Predictive
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler

## Pre-Processing
1. Import players who have over 400 plate appearances in 2024.
2. Import play-by-play data from Statcast in 2024.
3. Filtered specific columns to keep in play-by-play data.
4. Filtered bunts from play-by-play data.
5. Remove NA's from needed columns.
6. Imported residuals of expected bat speed and swing length from GAM (v1).
7. Scaled bat speed and swing length residuals.

In [3]:
#Get players who have 400 plate appearances or more
from baseball_utilities.data import *
fangraphs_batting_min_400_2024 = get_table("fangraphs_batting_min_400_2024")

In [4]:
swing_2024 = get_swing_data([2024])

2024: 322559 rows


In [5]:
from baseball_utilities.preprocessing import *
columns_to_keep = ["pitch_type", 
                     "game_date", 
                     "release_speed", 
                     "player_name",
                     "batter_name",
                     "batter", 
                     "pitcher", 
                     "events",
                     "description",
                     "des", 
                     "zone",
                     "game_type",
                     "stand",
                     "p_throws",
                     "bb_type",
                     "balls",
                     "strikes",
                     "game_year",
                     "plate_x",
                     "plate_z",
                     "launch_speed",
                     "launch_angle",
                     "effective_speed",
                     "release_spin_rate",
                     "release_extension", 
                     "estimated_ba_using_speedangle",
                     "estimated_woba_using_speedangle",
                     "woba_value",
                     "arm_angle",
                     "bat_speed",
                     "swing_length"]
columns_na = ["bat_speed", "swing_length"]
columns_not_na = remove_nan(swing_2024, columns_na)
swing_2024 = remove_columns(swing_2024, columns_to_keep)


No rows dropped.


In [6]:
mlbam_ids_2024 = fangraphs_batting_min_400_2024["mlbam_id"]
swing_2024 = swing_2024[swing_2024["batter"].isin(mlbam_ids_2024)]

In [7]:
swing_2024 = swing_2024[(swing_2024["bat_speed"] > 20) & (swing_2024["swing_length"] > 3)]

In [27]:
swings_2024 = pd.read_csv("swings_2024_residuals_appended.csv")

In [9]:
swings_2024

Unnamed: 0,pitch_type,game_date,release_speed,player_name,batter,pitcher,events,description,des,zone,...,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,arm_angle,bat_speed,swing_length,bat_speed_pred,swing_length_pred,bat_speed_residual,swing_length_residual
0,FF,2024-09-30,94.2,"Díaz, Edwin",621566,621242,field_out,hit_into_play,Matt Olson pops out to shortstop Francisco Lin...,12.0,...,0.037,0.039,0.0,17.8,77.4,7.0,72.630213,6.851235,4.769787,0.148765
1,CU,2024-09-30,86.4,"Johnson, Pierce",596019,572955,home_run,hit_into_play,Francisco Lindor homers (33) on a fly ball to ...,8.0,...,0.791,1.621,2.0,39.4,80.2,8.1,73.566324,7.948831,6.633676,0.151169
2,SL,2024-09-30,89.7,"Díaz, Edwin",542303,621242,strikeout,swinging_strike,Marcell Ozuna strikes out swinging.,6.0,...,,0.000,0.0,25.3,78.4,8.1,74.373570,7.287232,4.026430,0.812768
3,SL,2024-09-30,89.5,"Díaz, Edwin",542303,621242,,swinging_strike,Marcell Ozuna strikes out swinging.,14.0,...,,,,22.1,79.5,8.5,69.598224,8.102444,9.901776,0.397556
4,FF,2024-09-30,96.9,"Díaz, Edwin",542303,621242,,foul,Marcell Ozuna strikes out swinging.,5.0,...,,,,18.4,77.4,8.0,73.445847,6.833112,3.954153,1.166888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196956,SI,2024-04-03,92.3,"Keller, Mitch",682928,656605,field_out,hit_into_play,CJ Abrams pops out to third baseman Ke'Bryan H...,2.0,...,0.003,0.003,0.0,31.7,64.2,6.2,67.746497,6.518272,-3.546497,-0.318272
196957,FF,2024-04-03,92.4,"Keller, Mitch",682928,656605,,foul,CJ Abrams pops out to third baseman Ke'Bryan H...,2.0,...,,,,31.5,66.8,6.0,69.811933,6.778951,-3.011933,-0.778951
196958,CH,2024-04-03,83.3,"Williams, Trevor",668804,592866,field_out,hit_into_play,Bryan Reynolds lines out to right fielder Lane...,5.0,...,0.343,0.439,0.0,15.5,70.9,7.2,71.598932,6.546403,-0.698932,0.653597
196959,CH,2024-04-03,82.5,"Williams, Trevor",665833,592866,strikeout,swinging_strike,Oneil Cruz strikes out swinging.,13.0,...,,0.000,0.0,14.8,82.1,8.5,73.092508,8.483885,9.007492,0.016115


In [28]:
#Scale bat speed and swing length residuals
scaler = StandardScaler()
swings_2024[["bat_speed_residual_scaled", "swing_length_residual_scaled"]] = scaler.fit_transform(
    swings_2024[["bat_speed_residual", "swing_length_residual"]]
)

## Modeling

In [10]:
def swing_mixture_model(y):
    n, d = y.shape

    mu_fg = numpyro.sample("mu_fg", dist.Normal(0,2).expand([d]))
    sigma_fg = numpyro.sample("sigma_fg", dist.HalfNormal(1).expand([d]))
    fg_dist = dist.Independent(dist.Normal(mu_fg, sigma_fg),1)

    mu_bg = numpyro.sample("mu_bg", dist.Normal(0,2).expand([d]))
    sigma_bg = numpyro.sample("sigma_bg", dist.HalfNormal(1).expand([d]))
    bg_dist = dist.Independent(dist.Normal(mu_bg, sigma_bg), 1)

    components = [fg_dist, bg_dist]

    Q = numpyro.sample("Q", dist.Uniform(0, 1))
    mix = dist.Categorical(probs=jnp.array([Q , 1- Q]))

    with numpyro.plate("data", n):
        logp_fg = fg_dist.log_prob(y)
        logp_bg = bg_dist.log_prob(y)

        log_mixture = jnp.logaddexp(
            jnp.log(Q) + logp_fg,
            jnp.log(1.0 - Q) + logp_bg
        )

        numpyro.factor("likelihood", log_mixture.sum())

In [30]:
def run_mixture_model(player_df):
    Y = player_df[["bat_speed_residual_scaled", "swing_length_residual_scaled"]].to_numpy()

    nuts_kernel = NUTS(swing_mixture_model)
    mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
    mcmc.run(jax.random.PRNGKey(0), y = Y)

    posterior_samples = mcmc.get_samples()

    Q = posterior_samples["Q"].mean()
    mu_fg = posterior_samples["mu_fg"].mean(axis=0)
    sigma_fg = posterior_samples["sigma_fg"].mean(axis=0)
    mu_bg = posterior_samples["mu_bg"].mean(axis=0)
    sigma_bg = posterior_samples["sigma_bg"].mean(axis=0)

    logp_fg = (norm.logpdf(Y[:,0], mu_fg[0], sigma_fg[0]) +
               norm.logpdf(Y[:,1], mu_fg[1], sigma_fg[1]))
    logp_bg = (norm.logpdf(Y[:,0], mu_bg[0], sigma_bg[0]) +
               norm.logpdf(Y[:,1], mu_bg[1], sigma_bg[1]))

    p_fg = np.exp(logp_fg)
    p_bg = np.exp(logp_bg)

    outlier_prob = (1-Q) * p_bg / (Q * p_fg + (1 - Q) * p_bg)

    return outlier_prob

In [12]:
swings_2024["batter"].nunique()

207

In [32]:
np.random.seed(32)
unique_batters = swings_2024["batter"].unique()
batter_groups = np.array_split(unique_batters, 3)

In [37]:
group_idx = 2
batter_subset = batter_groups[group_idx]

subset_df = swings_2024[swings_2024["batter"].isin(batter_subset)].copy()

In [29]:
swings_2024

Unnamed: 0,pitch_type,game_date,release_speed,player_name,batter,pitcher,events,description,des,zone,...,woba_value,arm_angle,bat_speed,swing_length,bat_speed_pred,swing_length_pred,bat_speed_residual,swing_length_residual,bat_speed_residual_scaled,swing_length_residual_scaled
0,FF,2024-09-30,94.2,"Díaz, Edwin",621566,621242,field_out,hit_into_play,Matt Olson pops out to shortstop Francisco Lin...,12.0,...,0.0,17.8,77.4,7.0,72.630213,6.851235,4.769787,0.148765,0.778840,0.268969
1,CU,2024-09-30,86.4,"Johnson, Pierce",596019,572955,home_run,hit_into_play,Francisco Lindor homers (33) on a fly ball to ...,8.0,...,2.0,39.4,80.2,8.1,73.566324,7.948831,6.633676,0.151169,1.083187,0.273316
2,SL,2024-09-30,89.7,"Díaz, Edwin",542303,621242,strikeout,swinging_strike,Marcell Ozuna strikes out swinging.,6.0,...,0.0,25.3,78.4,8.1,74.373570,7.287232,4.026430,0.812768,0.657460,1.469494
3,SL,2024-09-30,89.5,"Díaz, Edwin",542303,621242,,swinging_strike,Marcell Ozuna strikes out swinging.,14.0,...,,22.1,79.5,8.5,69.598224,8.102444,9.901776,0.397556,1.616823,0.718785
4,FF,2024-09-30,96.9,"Díaz, Edwin",542303,621242,,foul,Marcell Ozuna strikes out swinging.,5.0,...,,18.4,77.4,8.0,73.445847,6.833112,3.954153,1.166888,0.645658,2.109747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196956,SI,2024-04-03,92.3,"Keller, Mitch",682928,656605,field_out,hit_into_play,CJ Abrams pops out to third baseman Ke'Bryan H...,2.0,...,0.0,31.7,64.2,6.2,67.746497,6.518272,-3.546497,-0.318272,-0.579094,-0.575440
196957,FF,2024-04-03,92.4,"Keller, Mitch",682928,656605,,foul,CJ Abrams pops out to third baseman Ke'Bryan H...,2.0,...,,31.5,66.8,6.0,69.811933,6.778951,-3.011933,-0.778951,-0.491807,-1.408353
196958,CH,2024-04-03,83.3,"Williams, Trevor",668804,592866,field_out,hit_into_play,Bryan Reynolds lines out to right fielder Lane...,5.0,...,0.0,15.5,70.9,7.2,71.598932,6.546403,-0.698932,0.653597,-0.114126,1.181711
196959,CH,2024-04-03,82.5,"Williams, Trevor",665833,592866,strikeout,swinging_strike,Oneil Cruz strikes out swinging.,13.0,...,0.0,14.8,82.1,8.5,73.092508,8.483885,9.007492,0.016115,1.470798,0.029137


In [38]:
for mlbam_id, player_df in subset_df.groupby("batter"):
    outlier_probs = run_mixture_model(player_df)
    swings_2024.loc[player_df.index, "outlier_prob"] = np.asarray(outlier_probs)

  mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
sample: 100%|█| 2000/2000 [00:02<00:00, 757.54it/s, 15 steps of size 2.02e-01. a
sample: 100%|█| 2000/2000 [00:01<00:00, 1041.96it/s, 23 steps of size 2.10e-01. 
  mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
sample: 100%|█| 2000/2000 [00:03<00:00, 637.63it/s, 15 steps of size 1.44e-01. a
sample: 100%|█| 2000/2000 [00:02<00:00, 913.27it/s, 15 steps of size 1.49e-01. a
  mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
sample: 100%|█| 2000/2000 [00:03<00:00, 517.13it/s, 15 steps of size 1.25e-01. a
sample: 100%|█| 2000/2000 [00:02<00:00, 901.99it/s, 19 steps of size 1.43e-01. a
  mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
sample: 100%|█| 2000/2000 [00:02<00:00, 667.46it/s, 15 steps of size 1.67e-01. a
sample: 100%|█| 2000/2000 [00:02<00:00, 939.67it/s, 15 steps of size 1.73e-01. a
  mcmc = MCMC(nuts_kerne

In [39]:
swings_2024

Unnamed: 0,pitch_type,game_date,release_speed,player_name,batter,pitcher,events,description,des,zone,...,arm_angle,bat_speed,swing_length,bat_speed_pred,swing_length_pred,bat_speed_residual,swing_length_residual,bat_speed_residual_scaled,swing_length_residual_scaled,outlier_prob
0,FF,2024-09-30,94.2,"Díaz, Edwin",621566,621242,field_out,hit_into_play,Matt Olson pops out to shortstop Francisco Lin...,12.0,...,17.8,77.4,7.0,72.630213,6.851235,4.769787,0.148765,0.778840,0.268969,0.001757
1,CU,2024-09-30,86.4,"Johnson, Pierce",596019,572955,home_run,hit_into_play,Francisco Lindor homers (33) on a fly ball to ...,8.0,...,39.4,80.2,8.1,73.566324,7.948831,6.633676,0.151169,1.083187,0.273316,0.001764
2,SL,2024-09-30,89.7,"Díaz, Edwin",542303,621242,strikeout,swinging_strike,Marcell Ozuna strikes out swinging.,6.0,...,25.3,78.4,8.1,74.373570,7.287232,4.026430,0.812768,0.657460,1.469494,0.000154
3,SL,2024-09-30,89.5,"Díaz, Edwin",542303,621242,,swinging_strike,Marcell Ozuna strikes out swinging.,14.0,...,22.1,79.5,8.5,69.598224,8.102444,9.901776,0.397556,1.616823,0.718785,0.002070
4,FF,2024-09-30,96.9,"Díaz, Edwin",542303,621242,,foul,Marcell Ozuna strikes out swinging.,5.0,...,18.4,77.4,8.0,73.445847,6.833112,3.954153,1.166888,0.645658,2.109747,0.000029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196956,SI,2024-04-03,92.3,"Keller, Mitch",682928,656605,field_out,hit_into_play,CJ Abrams pops out to third baseman Ke'Bryan H...,2.0,...,31.7,64.2,6.2,67.746497,6.518272,-3.546497,-0.318272,-0.579094,-0.575440,0.033400
196957,FF,2024-04-03,92.4,"Keller, Mitch",682928,656605,,foul,CJ Abrams pops out to third baseman Ke'Bryan H...,2.0,...,31.5,66.8,6.0,69.811933,6.778951,-3.011933,-0.778951,-0.491807,-1.408353,0.207797
196958,CH,2024-04-03,83.3,"Williams, Trevor",668804,592866,field_out,hit_into_play,Bryan Reynolds lines out to right fielder Lane...,5.0,...,15.5,70.9,7.2,71.598932,6.546403,-0.698932,0.653597,-0.114126,1.181711,0.000396
196959,CH,2024-04-03,82.5,"Williams, Trevor",665833,592866,strikeout,swinging_strike,Oneil Cruz strikes out swinging.,13.0,...,14.8,82.1,8.5,73.092508,8.483885,9.007492,0.016115,1.470798,0.029137,0.007303


In [38]:
swings_2024["outlier_prob"].isna().sum()

np.int64(0)

In [40]:
swings_2024.to_csv("swings_2024_outlier_added.csv")

In [None]:
for mlbam_id, player_df in swings_2024.groupby("batter"):
    outlier_probs = run_mixture_model(player_df)
    swings_2024.loc[player_df.index, "outlier_prob"] = np.asarray(outlier_probs)

  mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
sample: 100%|█| 2000/2000 [00:04<00:00, 451.09it/s, 15 steps of size 1.37e-01. a
sample: 100%|█| 2000/2000 [00:02<00:00, 828.22it/s, 15 steps of size 1.66e-01. a
  mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
sample: 100%|█| 2000/2000 [00:02<00:00, 667.83it/s, 15 steps of size 1.70e-01. a
sample: 100%|█| 2000/2000 [00:02<00:00, 974.33it/s, 15 steps of size 1.49e-01. a
  mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
sample: 100%|█| 2000/2000 [00:04<00:00, 490.82it/s, 15 steps of size 1.23e-01. a
sample: 100%|█| 2000/2000 [00:03<00:00, 664.84it/s, 15 steps of size 1.35e-01. a
  mcmc = MCMC(nuts_kernel, num_warmup = 1000, num_samples = 1000, num_chains = 2)
sample: 100%|█| 2000/2000 [00:04<00:00, 446.28it/s, 15 steps of size 8.01e-02. a
sample: 100%|█| 2000/2000 [00:03<00:00, 561.17it/s, 31 steps of size 1.15e-01. a
  mcmc = MCMC(nuts_kerne