In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint

import pickle
import joblib

import warnings
import logging

from itertools import product

#logging.basicConfig(level=logging.DEBUG)
#logging.basicConfig(level=logging.ERROR)

# Execute the project helper scripts so the names they define are available in this notebook.
# NOTE(review): later cells use mlflow, add_diff_features, get_pitch_outcome_features and
# get_average_run_value_per_bip_bucket without a visible import or definition, so they
# presumably come from these scripts — confirm.
%run ../src/data_helpers.py
%run ../src/plot_helpers.py
%run ../src/xgboost_model_helpers.py
%run ../src/mlflow_pipeline_helpers.py

# Seed NumPy's global random number generator
np.random.seed(909)
# NOTE(review): this silences every warning for the rest of the session, including pandas/NumPy
# warnings that could point at real data problems
warnings.filterwarnings('ignore')



In [None]:
def make_stuff_model_xRV_predictions(pred_data, model_dict, feature_li, run_val_df):
    """
    Purpose: Generates expected run value (xRV) predictions using suite of sub-models for pitch, ball-in-play outcomes

    Parameter(s):
        pred_data (pd.DataFrame): Input DataFrame used for model predictions; must contain every column in
            `feature_li`. It is modified in place (probability and xRV columns are added to it)
        model_dict (dict): Dictionary containing trained models with keys:
            "Swing", "Take", "Contact", "Fair", "Launch", and "EV". Each model is called as
            `model.predict(features, params={"predict_method": "predict_proba"})`
        feature_li (list): List of feature names to use for model prediction
        run_val_df (pd.DataFrame): DataFrame containing the context-neutral run values for pitch, ball-in-play
            outcomes. Columns read: `description_bucket`, `launch_angle_bucket`, `EV_bucket`, `mean_run_value`.
            Pitch-outcome values are taken from the first row matching each of the buckets "called_strike",
            "swinging_strike", "ball", "foul" and "hit_by_pitch"; ball-in-play values are taken from the rows
            that have no missing value in any column

    Return(s):
        pd.DataFrame: The same `pred_data` object with added columns for:
            - the swing, take, contact and fair/foul probabilities
            - one "P(<launch>)|P(<EV>)" column per launch angle × exit velocity bucket combination
            - xRV_swing: Expected run value for swing outcomes given the probability of a swing
            - xRV_take: Expected run value for take outcomes given the probability of not swinging
            - xRV_stuff: Overall expected run value

    Raises:
        KeyError: If `model_dict` lacks one of the six model keys or `pred_data` lacks a feature column
        IndexError: If `run_val_df` has no row for one of the required pitch-outcome buckets
    """

    swing_mod = model_dict["Swing"]
    take_mod = model_dict["Take"]
    contact_mod = model_dict["Contact"]
    fair_mod = model_dict["Fair"]
    launch_mod = model_dict["Launch"]
    ev_mod = model_dict["EV"]

    def _outcome_run_value(bucket):
        # First mean_run_value recorded in run_val_df for one pitch-outcome bucket
        matches = run_val_df.loc[run_val_df["description_bucket"] == bucket, "mean_run_value"]
        if matches.empty:
            raise IndexError(f"run_val_df has no row with description_bucket == {bucket!r}")
        return matches.iloc[0]

    # Pull context-neutral run values from the `run_val_df` argument (not from a notebook global).
    # Done before any prediction so a malformed table fails before pred_data is touched.
    cs_val = _outcome_run_value("called_strike")
    sw_val = _outcome_run_value("swinging_strike")
    ball_val = _outcome_run_value("ball")
    foul_val = _outcome_run_value("foul")
    hbp_val = _outcome_run_value("hit_by_pitch")

    # The model inputs are selected once; the columns added below never overlap the feature names
    features = pred_data[feature_li]

    def _predict_proba(model):
        # Class probabilities for every row of `features`
        return model.predict(features, params={"predict_method": "predict_proba"})

    # Predict outcome probabilities for all models
    # NOTE(review): assumes each model returns its class probabilities in the column order labelled here — confirm
    pred_data[["P(No Swing)", "P(Swing)"]] = _predict_proba(swing_mod)
    pred_data[["P(Ball)", "P(Called Strike)", "P(HBP)"]] = _predict_proba(take_mod)
    pred_data[["P(No Contact)", "P(Contact)"]] = _predict_proba(contact_mod)
    pred_data[["P(Foul)", "P(Fair)"]] = _predict_proba(fair_mod)

    launch_cols = ["P(GB)", "P(LD)", "P(FB)", "P(PU)"]
    ev_cols = ["P(<90)", "P(90_95)", "P(95_100)", "P(100_105)", "P(>105)"]

    pred_data[launch_cols] = _predict_proba(launch_mod)
    pred_data[ev_cols] = _predict_proba(ev_mod)

    # Create joint distribution of launch angle × exit velocity
    joint_names = [f"{l}|{v}" for l, v in product(launch_cols, ev_cols)]

    # Per-row outer product of the two marginals, flattened launch-major so it lines up with joint_names.
    # The column count is spelled out because reshape(n, -1) is ambiguous when there are zero rows.
    joint_probs = np.einsum(
        'ij,ik->ijk',
        pred_data[launch_cols].values,
        pred_data[ev_cols].values
    ).reshape(len(pred_data), len(joint_names))

    # Positional assignment: rows are already in pred_data order, so no index alignment is needed
    # (and duplicate index labels cannot break it)
    pred_data[joint_names] = joint_probs

    # Drop original marginal columns
    pred_data.drop(columns=launch_cols + ev_cols, inplace=True)

    # Map joint probabilities to run value; only rows with no missing value describe a ball-in-play bucket
    bip_rows = run_val_df.dropna()
    run_val_map = {
        f"P({launch_bucket})|P({ev_bucket})": run_value
        for launch_bucket, ev_bucket, run_value in zip(
            bip_rows["launch_angle_bucket"], bip_rows["EV_bucket"], bip_rows["mean_run_value"]
        )
    }

    # Align the run value vector to joint_names
    run_val_vector = np.array([run_val_map.get(name, 0.0) for name in joint_names])  # fallback to 0 if missing

    # Expected run value of a fair ball: joint bucket probabilities weighted by their run values
    bip_val = joint_probs @ run_val_vector

    # Compute xRV given context-neutral run values and probabilities, chained together by swing/take outcomes to ball-in-play outcomes
    p_contact = pred_data["P(Contact)"]
    p_hbp = pred_data["P(HBP)"]
    p_called_strike = pred_data["P(Called Strike)"]

    pred_data["xRV_swing"] = ((1 - p_contact) * sw_val) + p_contact * ((pred_data["P(Foul)"] * foul_val) + (pred_data["P(Fair)"] * bip_val))

    # NOTE(review): P(Called Strike) is used here as if it were conditional on "not HBP". If the take model's
    # three probabilities sum to 1 per row, the plain mixture
    # P(Ball)*ball_val + P(Called Strike)*cs_val + P(HBP)*hbp_val may be what is intended — confirm.
    pred_data["xRV_take"] = (p_hbp * hbp_val) + ((1 - p_hbp) * ((p_called_strike * cs_val) + ((1 - p_called_strike) * ball_val)))

    pred_data["xRV_stuff"] = (pred_data["P(Swing)"] * pred_data["xRV_swing"]) + (pred_data["P(No Swing)"] * pred_data["xRV_take"])

    return pred_data

In [7]:
# Load the cleaned and transformed Statcast data, one CSV per season
data2023 = pd.read_csv("/Users/josh/Documents/Baseball/Statcast/data23_trf.csv", low_memory=True)
data2024 = pd.read_csv("/Users/josh/Documents/Baseball/Statcast/data24_trf.csv", low_memory=True)
data2025 = pd.read_csv("/Users/josh/Documents/Baseball/Statcast/data25_trf.csv", low_memory=True)

# Sort key applied to the stacked seasons
sort_cols = ["game_year", "game_date", "game_pk", "at_bat_number", "pitch_number"]

# Stack the seasons, sort, renumber the rows, then discard rows with no arm_angle.
# The dropna runs after reset_index, so the index keeps gaps where rows were removed.
data_concat = (
    pd.concat([data2023, data2024, data2025], axis=0)
    .sort_values(sort_cols)
    .reset_index(drop=True)
    .dropna(subset=["arm_angle"])
)

In [8]:
# Run add_diff_features over the combined frame and over the 2025-only frame, in that order.
# Note that data2025 has not had the arm_angle dropna applied that data_concat has.
data_concat_diff, data2025_diff = map(add_diff_features, (data_concat, data2025))

In [4]:
# Load the six standard stuff sub-models as MLflow pyfunc models.
# The targets on the left and the paths on the right are matched by position.
(
    swing_model,
    take_outcome_model,
    contact_model,
    fair_model,
    launch_bucket_model,
    ev_bucket_model,
) = (
    mlflow.pyfunc.load_model(model_dir)
    for model_dir in (
        "../tmp/p_swing_stuff_xgb",
        "../tmp/p_take_stuff_xgb",
        "../tmp/p_contact_stuff_xgb",
        "../tmp/p_fair_stuff_xgb",
        "../tmp/p_launch_stuff_xgb",
        "../tmp/p_ev_stuff_xgb",
    )
)

In [5]:
# Bundle the standard sub-models under the keys the prediction function looks up
stuff_standard_model_dict = dict(
    Swing=swing_model,
    Take=take_outcome_model,
    Contact=contact_model,
    Fair=fair_model,
    Launch=launch_bucket_model,
    EV=ev_bucket_model,
)

In [9]:
# Load the six movement-diff stuff sub-models as MLflow pyfunc models.
# The targets on the left and the paths on the right are matched by position.
(
    swing_diffs_model,
    take_outcome_diffs_model,
    contact_diffs_model,
    fair_diffs_model,
    launch_bucket_diffs_model,
    ev_bucket_diffs_model,
) = (
    mlflow.pyfunc.load_model(model_dir)
    for model_dir in (
        "../tmp/p_swing_stuff_diffs_xgb",
        "../tmp/p_take_stuff_diffs_xgb",
        "../tmp/p_contact_stuff_diffs_xgb",
        "../tmp/p_fair_stuff_diffs_xgb",
        "../tmp/p_launch_stuff_diffs_xgb",
        "../tmp/p_ev_stuff_diffs_xgb",
    )
)

In [10]:
# Bundle the movement-diff sub-models under the keys the prediction function looks up
stuff_diff_model_dict = dict(
    Swing=swing_diffs_model,
    Take=take_outcome_diffs_model,
    Contact=contact_diffs_model,
    Fair=fair_diffs_model,
    Launch=launch_bucket_diffs_model,
    EV=ev_bucket_diffs_model,
)

In [11]:
# Grab context-neutral run values from data
# The result is passed as `run_val_df` to make_stuff_model_xRV_predictions, which reads its
# description_bucket, launch_angle_bucket, EV_bucket and mean_run_value columns.
# NOTE(review): the meaning of the positional `True` flag is not visible in this notebook —
# check the signature of get_average_run_value_per_bip_bucket.
run_values = get_average_run_value_per_bip_bucket(data_concat, True)

In [13]:
# Build the model-input frames (no predictions are made in this cell):
# one from the combined data, one from the diff-augmented data with the three diff features added
predict_data = get_pitch_outcome_features(
    data_concat,
    "swing_stuff",
    predict=True,
)
predict_data_diff = get_pitch_outcome_features(
    data_concat_diff,
    "swing_stuff",
    predict=True,
    added_features=["effective_speed_diff", "pfx_x_diff", "pfx_z_diff"],
)

In [16]:
# Model input columns; the three *_diff features come last
feature_list = [
    "is_rhh",
    "count",
    "sz_top",
    "sz_bot",
    "release_pos_x",
    "release_pos_z",
    "release_extension",
    "arm_angle",
    "effective_speed",
    "pfx_x",
    "pfx_z",
    "effective_speed_diff",
    "pfx_x_diff",
    "pfx_z_diff",
]

# NOTE(review): [:-2] removes only the last two entries, so the standard models are still given
# "effective_speed_diff". If they are meant to run without any diff feature this should be [:-3] —
# confirm against the features the standard models were trained on.
standard_feature_list = feature_list[:-2]

# The prediction function adds its columns to the frame it is given and returns that same frame
stuff_preds = make_stuff_model_xRV_predictions(
    predict_data, stuff_standard_model_dict, standard_feature_list, run_values
)
stuff_preds_diff = make_stuff_model_xRV_predictions(
    predict_data_diff, stuff_diff_model_dict, feature_list, run_values
)

In [30]:
# Export both prediction sets to CSV.
# Columns common to both files: identifiers, outcomes and the standard model features
shared_cols = [
    'batter', 'pitcher', 'game_year', 'game_date', 'pitch_type', 'p_throws', 'stand', 'player_name', 'at_bat_number', 'pitch_number',
    'description_bucket', 'events', 'balls', 'strikes', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'estimated_slg_using_speedangle',
    'woba_value', 'woba_denom', 'delta_run_exp', 'is_rhh', 'count', 'sz_top', 'sz_bot', 'release_pos_x', 'release_pos_z', 'release_extension',
    'arm_angle', 'effective_speed', 'pfx_x', 'pfx_z'
]
# Extra feature columns written only for the diff models
diff_cols = ['effective_speed_diff', 'pfx_x_diff', 'pfx_z_diff']
# Model outputs, always written last
xrv_cols = ['xRV_swing', 'xRV_take', 'xRV_stuff']

cols = shared_cols + xrv_cols
stuff_preds[cols].to_csv("../stuff_standard_preds.csv", index=False)

cols = shared_cols + diff_cols + xrv_cols
stuff_preds_diff[cols].to_csv("../stuff_diffs_preds.csv", index=False)