# tjStuff+ v3.0

##### By: Thomas Nestico ([@TJStats](https://x.com/TJStats))
##### Data: MLB (Downloaded via my [MLB Stats API Scraper](https://github.com/tnestico/mlb_scraper))
##### [Medium Article](https://medium.com/@thomasjamesnestico/modelling-tjstuff-v3-0-10b48294c7fb)


### Data Loading

I downloaded 2020-24 Data using my MLB Stats API Scraper, and wrote it to a CSV file on my local machine. The data can be downloaded at this [link](https://huggingface.co/datasets/nesticot/mlb_data/blob/main/mlb_pitch_data_2020_2024.csv)

This notebook uses Polars for its DataFrame Library.

In [None]:
# Import the packages
import polars as pl
import numpy as np

# Load the MLB pitch data from a CSV file into a Polars DataFrame.
# `df` is the module-level frame that the later cells read from and reassign.
df = pl.read_csv("mlb_pitch_data_2020_2024.csv")

### Run Values

This project uses Run Values (RV) as the target. Using Baseball Savant's 2024 CSVs, I created a CSV File called 'run_values.csv'. This CSV file hosts the Run Values for specified events from the 2024 MLB Season.

These events include:

* Pitch-Level Events
    * Ball
    * Called Strike
    * Swinging Strike
    * Foul
    * Hit By Pitch

* Batted Ball Events
    * Single
    * Double
    * Triple
    * Home Run
    * Field Out

Each of these events is assigned an RV, which is the average RV of that event for a given count (Balls-Strikes) during the 2024 Season. The following code reassigns the events in the Full Dataset to one of these outcomes and then assigns the corresponding RV to a column called 'target'.

In [None]:
# Assume 'df' is already loaded with necessary pitch data

# Pitch-level outcome groups for the whiff calculation: every raw play
# description listed under a key is collapsed to that key.
des_dict = {
    description: outcome
    for outcome, descriptions in {
        'ball': ('Ball', 'Blocked Ball', 'Pitchout', 'Ball In Dirt'),
        'hit_into_play': ('In play, run(s)', 'In play, out(s)', 'In play, no out'),
        'called_strike': ('Called Strike',),
        'foul': ('Foul', 'Foul Bunt', 'Foul Pitchout'),
        # Foul tips and missed bunts are grouped with swinging strikes (whiffs)
        'swinging_strike': ('Swinging Strike', 'Swinging Strike (Blocked)', 'Foul Tip',
                            'Missed Bunt', 'Bunt Foul Tip'),
        'hit_by_pitch': ('Hit By Pitch',),
    }.items()
    for description in descriptions
}

# Event groups: raw event type -> grouped event (None when the event has no group)
event_dict = {
    # Events that keep their own name
    **{event: event for event in (
        'single', 'double', 'triple', 'home_run',
        'walk', 'strikeout', 'hit_by_pitch', 'field_out',
    )},
    # Events folded into 'field_out'
    **dict.fromkeys((
        'force_out', 'grounded_into_double_play', 'fielders_choice_out',
        'fielders_choice', 'double_play', 'sac_fly', 'sac_bunt',
        'sac_fly_double_play', 'triple_play', 'other_out',
    ), 'field_out'),
    # Events mapped to None
    **dict.fromkeys((
        'game_advisory', np.nan, 'field_error', 'strikeout_double_play',
        'caught_stealing_2b', 'catcher_interf', 'caught_stealing_3b',
        'pickoff_3b', 'caught_stealing_home', 'pickoff_1b', 'pickoff_2b',
        'wild_pitch', 'stolen_base_2b', 'pickoff_caught_stealing_3b',
        'pickoff_caught_stealing_2b', 'sac_bunt_double_play', 'passed_ball',
        'pickoff_caught_stealing_home',
    )),
}

# Distinct (description, count, swing, whiff) combinations, taken from the raw
# play descriptions before they are regrouped. Kept as a lookup for inspection.
df_whiff = df[['play_description', 'balls', 'strikes', 'is_swing', 'is_whiff']].unique()

# Replace play descriptions with the grouped outcomes from des_dict
df = df.with_columns(pl.col("play_description").replace_strict(des_dict, default=None))

# NOTE: df_whiff is deliberately NOT joined back onto df. The swing/whiff flags
# already live on every pitch row, and the previous left join was keyed on
# play_description *after* it had been remapped to grouped outcomes, so the raw
# descriptions in df_whiff never matched and the join only appended all-null
# `is_swing_whiff` / `is_whiff_whiff` columns.

# Convert the 'is_swing' and 'is_whiff' flags to integers, treating nulls as 0
df = df.with_columns(
    pl.col('is_swing').cast(pl.Int32).fill_null(0).alias('is_swing_int'),
    pl.col('is_whiff').cast(pl.Int32).fill_null(0).alias('is_whiff_int'),
)

# Define the target: 1 when the batter swung and missed, 0 otherwise
df = df.with_columns(
    ((pl.col('is_swing_int') == 1) & (pl.col('is_whiff_int') == 1))
    .cast(pl.Int32)
    .alias('target')
)

# df['target'] now contains 1 for whiffs and 0 for non-whiffs



In [None]:
import polars as pl

# Load the run values data from CSV.
# The joins later in this cell key on its `event`, `balls` and `strikes` columns.
df_run_values = pl.read_csv("run_values.csv")

# Pitch-level outcome groups: every raw play description listed under a key is
# collapsed to that key.
des_dict = {
    description: outcome
    for outcome, descriptions in {
        'ball': ('Ball', 'Blocked Ball', 'Pitchout', 'Ball In Dirt'),
        'hit_into_play': ('In play, run(s)', 'In play, out(s)', 'In play, no out'),
        'called_strike': ('Called Strike',),
        'foul': ('Foul', 'Foul Bunt', 'Foul Pitchout'),
        # Foul tips and missed bunts are grouped with swinging strikes (whiffs)
        'swinging_strike': ('Swinging Strike', 'Swinging Strike (Blocked)', 'Foul Tip',
                            'Missed Bunt', 'Bunt Foul Tip'),
        'hit_by_pitch': ('Hit By Pitch',),
    }.items()
    for description in descriptions
}

# Event groups: raw event type -> grouped event (None when the event has no group)
event_dict = {
    # Events that keep their own name
    **{event: event for event in (
        'single', 'double', 'triple', 'home_run',
        'walk', 'strikeout', 'hit_by_pitch', 'field_out',
    )},
    # Events folded into 'field_out'
    **dict.fromkeys((
        'force_out', 'grounded_into_double_play', 'fielders_choice_out',
        'fielders_choice', 'double_play', 'sac_fly', 'sac_bunt',
        'sac_fly_double_play', 'triple_play', 'other_out',
    ), 'field_out'),
    # Events mapped to None
    **dict.fromkeys((
        'game_advisory', np.nan, 'field_error', 'strikeout_double_play',
        'caught_stealing_2b', 'catcher_interf', 'caught_stealing_3b',
        'pickoff_3b', 'caught_stealing_home', 'pickoff_1b', 'pickoff_2b',
        'wild_pitch', 'stolen_base_2b', 'pickoff_caught_stealing_3b',
        'pickoff_caught_stealing_2b', 'sac_bunt_double_play', 'passed_ball',
        'pickoff_caught_stealing_home',
    )),
}

# Join the run values data with the main dataframe based on event type, balls, and strikes
df = df.join(df_run_values,
             left_on=['event_type', 'balls', 'strikes'],
             right_on=['event', 'balls', 'strikes'],
             how='left')

# Replace play descriptions with the grouped outcomes from des_dict
df = df.with_columns(pl.col("play_description").replace_strict(des_dict, default=None))

# Join the run values data again based on the play description, balls, and strikes.
# The second copy of the run value lands in `delta_run_exp_des`.
df = df.join(df_run_values,
             left_on=['play_description', 'balls', 'strikes'],
             right_on=['event', 'balls', 'strikes'],
             how='left',
             suffix='_des')

# NOTE: an experiment that multiplied `delta_run_exp` by 1.5 for swinging
# strikes / strikeouts (producing `delta_run_exp_adjusted`) used to be sketched
# here but is disabled. The target assignment still read that column, which does
# not exist, so the cell failed with a column-not-found error. The target is now
# built from the columns the joins above actually create.

# Assign the target: prefer the event-level run value and fall back to the
# pitch-description run value when the event-level one is null
df = df.with_columns(
    pl.coalesce(pl.col("delta_run_exp"), pl.col("delta_run_exp_des")).alias("target")
)

# The resulting 'target' column holds the (unweighted) run value of each pitch


In [None]:
# Load the run values data from CSV.
# The joins later in this cell key on its `event`, `balls` and `strikes` columns
# and read its `delta_run_exp` column.
df_run_values = pl.read_csv("adj_run_values2.csv")  

# Pitch-level outcome groups: every raw play description listed under a key is
# collapsed to that key.
des_dict = {
    description: outcome
    for outcome, descriptions in {
        'ball': ('Ball', 'Blocked Ball', 'Pitchout', 'Ball In Dirt'),
        'hit_into_play': ('In play, run(s)', 'In play, out(s)', 'In play, no out'),
        'called_strike': ('Called Strike',),
        'foul': ('Foul', 'Foul Bunt', 'Foul Pitchout'),
        'swinging_strike': ('Swinging Strike', 'Swinging Strike (Blocked)', 'Foul Tip',
                            'Missed Bunt', 'Bunt Foul Tip'),
        'hit_by_pitch': ('Hit By Pitch',),
    }.items()
    for description in descriptions
}

# Event groups: raw event type -> grouped event (None when the event has no group)
event_dict = {
    # Events that keep their own name
    **{event: event for event in (
        'single', 'double', 'triple', 'home_run',
        'walk', 'strikeout', 'hit_by_pitch', 'field_out',
    )},
    # Events folded into 'field_out'
    **dict.fromkeys((
        'force_out', 'grounded_into_double_play', 'fielders_choice_out',
        'fielders_choice', 'double_play', 'sac_fly', 'sac_bunt',
        'sac_fly_double_play', 'triple_play', 'other_out',
    ), 'field_out'),
    # Events mapped to None
    **dict.fromkeys((
        'game_advisory', np.nan, 'field_error', 'strikeout_double_play',
        'caught_stealing_2b', 'catcher_interf', 'caught_stealing_3b',
        'pickoff_3b', 'caught_stealing_home', 'pickoff_1b', 'pickoff_2b',
        'wild_pitch', 'stolen_base_2b', 'pickoff_caught_stealing_3b',
        'pickoff_caught_stealing_2b', 'sac_bunt_double_play', 'passed_ball',
        'pickoff_caught_stealing_home',
    )),
}

# Attach run values twice and derive the target in one pipeline:
#   1. left-join run values on (event_type, balls, strikes) -> `delta_run_exp`
#   2. regroup play descriptions with des_dict
#   3. left-join run values on (play_description, balls, strikes) -> `delta_run_exp_des`
#   4. target = event-level run value, or the description-level one when that is null
df = (
    df
    .join(
        df_run_values,
        left_on=['event_type', 'balls', 'strikes'],
        right_on=['event', 'balls', 'strikes'],
        how='left',
    )
    .with_columns(
        pl.col("play_description").replace_strict(des_dict, default=None)
    )
    .join(
        df_run_values,
        left_on=['play_description', 'balls', 'strikes'],
        right_on=['event', 'balls', 'strikes'],
        how='left',
        suffix='_des',
    )
    .with_columns(
        pl.coalesce(pl.col("delta_run_exp"), pl.col("delta_run_exp_des")).alias("target")
    )
)

In [None]:
# Load the run values data from CSV.
# The joins later in this cell key on its `event`, `balls`, `strikes`,
# `pitcher_hand` and `batter_hand` columns and read its `delta_run_exp` column.
df_run_values = pl.read_csv("adj_run_values.csv")  

# Pitch-level outcome groups: every raw play description listed under a key is
# collapsed to that key.
des_dict = {
    description: outcome
    for outcome, descriptions in {
        'ball': ('Ball', 'Blocked Ball', 'Pitchout', 'Ball In Dirt'),
        'hit_into_play': ('In play, run(s)', 'In play, out(s)', 'In play, no out'),
        'called_strike': ('Called Strike',),
        'foul': ('Foul', 'Foul Bunt', 'Foul Pitchout'),
        'swinging_strike': ('Swinging Strike', 'Swinging Strike (Blocked)', 'Foul Tip',
                            'Missed Bunt', 'Bunt Foul Tip'),
        'hit_by_pitch': ('Hit By Pitch',),
    }.items()
    for description in descriptions
}

# Event groups: raw event type -> grouped event (None when the event has no group)
event_dict = {
    # Events that keep their own name
    **{event: event for event in (
        'single', 'double', 'triple', 'home_run',
        'walk', 'strikeout', 'hit_by_pitch', 'field_out',
    )},
    # Events folded into 'field_out'
    **dict.fromkeys((
        'force_out', 'grounded_into_double_play', 'fielders_choice_out',
        'fielders_choice', 'double_play', 'sac_fly', 'sac_bunt',
        'sac_fly_double_play', 'triple_play', 'other_out',
    ), 'field_out'),
    # Events mapped to None
    **dict.fromkeys((
        'game_advisory', np.nan, 'field_error', 'strikeout_double_play',
        'caught_stealing_2b', 'catcher_interf', 'caught_stealing_3b',
        'pickoff_3b', 'caught_stealing_home', 'pickoff_1b', 'pickoff_2b',
        'wild_pitch', 'stolen_base_2b', 'pickoff_caught_stealing_3b',
        'pickoff_caught_stealing_2b', 'sac_bunt_double_play', 'passed_ball',
        'pickoff_caught_stealing_home',
    )),
}

# Attach handedness-specific run values twice and derive the target:
#   1. left-join on (event_type, balls, strikes, pitcher_hand, batter_hand) -> `delta_run_exp`
#   2. regroup play descriptions with des_dict
#   3. left-join on (play_description, balls, strikes, pitcher_hand, batter_hand)
#      -> `delta_run_exp_des`
#   4. target = event-level run value, or the description-level one when that is null
df = (
    df
    .join(
        df_run_values,
        left_on=['event_type', 'balls', 'strikes', 'pitcher_hand', 'batter_hand'],
        right_on=['event', 'balls', 'strikes', 'pitcher_hand', 'batter_hand'],
        how='left',
    )
    .with_columns(
        pl.col("play_description").replace_strict(des_dict, default=None)
    )
    .join(
        df_run_values,
        left_on=['play_description', 'balls', 'strikes', 'pitcher_hand', 'batter_hand'],
        right_on=['event', 'balls', 'strikes', 'pitcher_hand', 'batter_hand'],
        how='left',
        suffix='_des',
    )
    .with_columns(
        pl.coalesce(pl.col("delta_run_exp"), pl.col("delta_run_exp_des")).alias("target")
    )
)

### Feature Engineering

As part of modelling, I engineered features to help improve performance. This included mirroring horizontal break and horizontal release points for Left-Handed Pitchers as well as including the features which relate to each pitcher's primary Fastball.

The following function returns the engineered features.

In [None]:
def feature_engineering(df: pl.DataFrame) -> pl.DataFrame:
    """Add handedness-mirrored and fastball-relative features to pitch-level data.

    The function:
      1. derives ``year`` from the first four characters of ``game_date``;
      2. flips the sign of ``ax`` for left-handed pitchers and the sign of
         ``x0`` for every other pitcher;
      3. finds each pitcher-season's primary fastball (the most-thrown of
         SI/FF/FC, ties broken by the higher average speed) and attaches its
         average ``start_speed``, ``az`` and ``ax``;
      4. falls back to the pitcher's maximum ``start_speed`` / ``az`` / ``ax``
         when the pitcher-season has no fastball baseline;
      5. adds ``speed_diff``, ``az_diff`` and the absolute ``ax_diff`` of every
         pitch relative to that baseline.

    Args:
        df: Pitch-level frame with ``game_date`` (string), ``pitcher_hand``,
            ``pitcher_id``, ``pitch_type``, ``start_speed``, ``ax``, ``az``
            and ``x0`` columns.

    Returns:
        A new frame with ``ax``/``x0`` overwritten by their mirrored values,
        ``year`` cast to Int64, and the baseline and differential columns
        added. The remaining aggregate columns (the primary fastball's pitch
        type and its pitch ``count``) are carried along by the join.
    """
    is_lefty = pl.col('pitcher_hand') == 'L'

    # Extract the year from the game_date column
    df = df.with_columns(
        pl.col('game_date').str.slice(0, 4).alias('year')
    )

    # Mirror horizontal acceleration: negate `ax` for left-handed pitchers
    df = df.with_columns(
        pl.when(is_lefty)
        .then(-pl.col('ax'))
        .otherwise(pl.col('ax'))
        .alias('ax')
    )

    # Mirror horizontal release point: `x0` is kept for left-handed pitchers
    # and negated for everyone else.
    # NOTE(review): this is the opposite hand from the `ax` flip above; both
    # choices put lefties and righties on a common side, but confirm the
    # resulting sign convention is the intended one.
    df = df.with_columns(
        pl.when(is_lefty)
        .then(pl.col('x0'))
        .otherwise(-pl.col('x0'))
        .alias('x0')
    )

    # Only sinkers, four-seamers and cutters can be the primary fastball
    pitch_types = ['SI', 'FF', 'FC']
    df_filtered = df.filter(pl.col('pitch_type').is_in(pitch_types))

    # Per pitcher, season and fastball type: average shape and pitch count
    df_agg = df_filtered.group_by(['pitcher_id', 'year', 'pitch_type']).agg([
        pl.col('start_speed').mean().alias('avg_fastball_speed'),
        pl.col('az').mean().alias('avg_fastball_az'),
        pl.col('ax').mean().alias('avg_fastball_ax'),
        pl.len().alias('count')
    ])

    # Keep one row per pitcher-season: the most-thrown fastball, ties broken
    # by the higher average speed
    df_agg = df_agg.sort(['count', 'avg_fastball_speed'], descending=[True, True])
    df_agg = df_agg.unique(subset=['pitcher_id', 'year'], keep='first')

    # Left join so pitcher-seasons without any fastball are kept (with null
    # baselines) instead of being dropped; the default inner join made the
    # fallbacks below unreachable.
    df = df.join(df_agg, on=['pitcher_id', 'year'], how='left')

    # If there is no fastball baseline, fall back to the pitcher's maximum of
    # the corresponding raw column. The `ax` fallback previously used
    # `.over('ax')`, which just returned each pitch's own `ax`.
    fallback_sources = {
        'avg_fastball_speed': 'start_speed',
        'avg_fastball_az': 'az',
        'avg_fastball_ax': 'ax',
    }
    df = df.with_columns([
        pl.col(baseline)
        .fill_null(pl.col(raw).max().over('pitcher_id'))
        .alias(baseline)
        for baseline, raw in fallback_sources.items()
    ])

    # Calculate pitch differentials relative to the fastball baseline
    df = df.with_columns(
        (pl.col('start_speed') - pl.col('avg_fastball_speed')).alias('speed_diff'),
        (pl.col('az') - pl.col('avg_fastball_az')).alias('az_diff'),
        (pl.col('ax') - pl.col('avg_fastball_ax')).abs().alias('ax_diff')
    )

    # Cast the year column to integer type
    df = df.with_columns(
        pl.col('year').cast(pl.Int64)
    )

    return df

# Run the feature engineering on a clone of `df` and rebind `df` to the result
df = feature_engineering(df.clone())

### Model Training

As outlined in my Medium Article, I will be training on 2020-22 data. I will then validate the model's predictiveness by predicting on 2023 data and comparing the predictions to 2024 results. After I validate that the model is effectively predicting future performance, I will retrain with 2023 data added and then evaluate the model on 2024 data, comparing it to 2024 results.

##### Prepare Training Data

In [None]:
# Training window: the 2020 through 2022 seasons (inclusive)
df_train = df.filter(pl.col('year').is_between(2020, 2022))

# Model inputs: raw pitch characteristics followed by the fastball-relative differentials
features = [
    'start_speed', 'spin_rate', 'extension',
    'az', 'ax',
    'x0', 'z0',
    'speed_diff', 'az_diff', 'ax_diff',
]

# Name of the column the model is fitted against
target = 'target'

# Keep only rows where every feature and the target are present
df_train = df_train.drop_nulls(subset=[*features, target])

### Model Training

I used an LGBMRegressor for the model. I also applied a RobustScaler to the training data.

In [None]:
from pathlib import Path

import joblib
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMRegressor
from sklearn.preprocessing import RobustScaler

# Extract features and target from the training dataframe
X = df_train[features]
y = df_train['target']

# Create a pipeline with RobustScaler and LGBMRegressor
model = make_pipeline(
    RobustScaler(),            # Robust Scaler to scale the features
    LGBMRegressor(
        n_estimators=1000,         # Number of boosting rounds (trees) to be built.
        learning_rate=0.01,        # Step size shrinkage used to prevent overfitting. Smaller values require more boosting rounds.
        num_leaves=31,             # Maximum number of leaves in one tree. Controls the complexity of the model.
        max_depth=-1,              # Maximum depth of the tree. -1 means no limit.
        min_child_samples=20,      # Minimum number of data points required in a leaf. Helps control overfitting.
        # NOTE(review): LightGBM applies row subsampling only when
        # `subsample_freq` > 0; it is not set here, so this may be inert - confirm.
        subsample=0.8,             # Fraction of data to be used for each boosting round.
        colsample_bytree=0.8,      # Fraction of features to be used for each tree. Helps prevent overfitting.
        reg_alpha=0.1,             # L1 regularization term on weights. Helps prevent overfitting.
        reg_lambda=0.2,            # L2 regularization term on weights. Helps prevent overfitting.
        random_state=42,           # Seed for reproducibility.
        force_row_wise=True        # Force row-wise histogram building.
    )
)

# Fit the model to the training data
model.fit(X, y)

# Save the model to a file. The output directory is created first so the dump
# does not fail on a fresh checkout, and the message reports the path that was
# actually written (it previously named a different file).
model_path = Path('model/stuff_modelv2.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)
print(f"Model saved to {model_path.as_posix()}")

### Feature Importance

LGBMRegressor returns feature importance which helps us understand which features are most influential in making predictions. From our trained model, we see that Pitch Velocity and iVB are the most impactful features, which is intuitive.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='whitegrid')

# Pull the fitted LightGBM step out of the pipeline and read its importances
lgbm_model = model.named_steps['lgbmregressor']
feature_importances = lgbm_model.feature_importances_

# One row per feature (in the order of the `features` list), most important first
importance_df = pl.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort(by='Importance', descending=True)

# Horizontal bar chart; the y-axis is inverted so the top feature is drawn first
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.gca().invert_yaxis()
plt.show()

### Testing

To validate the model, we will predict tjStuff+ values on 2023 data and then calculate the correlation of tjStuff+ to 2024 results. The results we will use are FIP, wOBA, and K-BB%. We will also calculate tjStuff+ on 2024 data to evaluate the 'stickiness' of the metric.

In [None]:
# Keep the 2023 and 2024 pitches that have every feature and the target present
df_test = (
    df
    .filter(pl.col('year').is_in([2023, 2024]))
    .drop_nulls(subset=features + [target])
)

# Overwrite `target` with the trained model's prediction for each of those pitches
df_test = df_test.with_columns(
    pl.Series("target", model.predict(df_test[features].to_numpy()))
)

# Plotting helpers: colour and display name for every pitch type code
### PITCH COLOURS ###
pitch_colours = {
    code: {'colour': colour, 'name': name}
    for code, colour, name in (
        ## Fastballs ##
        ('FF', '#FF0000', '4-Seam Fastball'),
        ('FA', '#FF0000', 'Fastball'),
        ('SI', '#623fff', 'Sinker'),
        ('FC', '#FF007D', 'Cutter'),

        ## Offspeed ##
        ('CH', '#FFB000', 'Changeup'),
        ('FS', '#FE6100', 'Splitter'),
        ('SC', '#F08223', 'Screwball'),
        ('FO', '#FFEB00', 'Forkball'),

        ## Sliders ##
        ('SL', '#67E18D', 'Slider'),
        ('ST', '#8e44ad', 'Sweeper'),
        ('SV', '#00d7e1', 'Slurve'),

        ## Curveballs ##
        ('KC', '#648FFF', 'Knuckle Curve'),
        ('CU', '#274BFC', 'Curveball'),
        ('CS', '#3025CE', 'Slow Curve'),
        ('EP', '#C2C2C2', 'Eephus'),

        ## Others ##
        ('KN', '#867A08', 'Knuckleball'),
        ('PO', '#472C30', 'Pitch Out'),
        ('UN', '#9C8975', 'Unknown'),
    )
}

# Pitch type code -> colour
dict_colour = {code: info['colour'] for code, info in pitch_colours.items()}

# Pitch type code -> display name
dict_pitch = {code: info['name'] for code, info in pitch_colours.items()}

# Display name -> pitch type code
dict_pitch_desc_type = {info['name']: code for code, info in pitch_colours.items()}

# Display name -> colour
dict_pitch_name = {info['name']: info['colour'] for info in pitch_colours.values()}

tjStuff+ is normally distributed and can be calculated using the following code.

In [None]:
# Split the predictions by season
df_2023 = df_test.filter(pl.col('year') == 2023)
df_2024 = df_test.filter(pl.col('year') == 2024)

# The 2023 predictions define the scale: their mean and standard deviation
# are used to standardize both seasons
target_mean = df_2023['target'].mean()
target_std = df_2023['target'].std()

# z-score of the predicted target against the 2023 distribution
zscore_expr = ((pl.col('target') - target_mean) / target_std).alias('target_zscore')

# tjStuff+ = 100 minus 10 points per standard deviation
stuff_plus_expr = (100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus')

## 2023 tjStuff+ ##
df_2023 = df_2023.with_columns(zscore_expr).with_columns(stuff_plus_expr)

# Pitch count and mean tjStuff+ per pitcher and year for 2023
df_agg_2023 = df_2023.group_by(['pitcher_id', 'year']).agg(
    pl.col('tj_stuff_plus').len().alias('count'),
    pl.col('tj_stuff_plus').mean()
)

## 2024 tjStuff+ (standardized with the 2023 mean and std) ##
df_2024 = df_2024.with_columns(zscore_expr).with_columns(stuff_plus_expr)

# Pitch count and mean tjStuff+ per pitcher and year for 2024
df_agg_2024 = df_2024.group_by(['pitcher_id', 'year']).agg(
    pl.col('tj_stuff_plus').len().alias('count'),
    pl.col('tj_stuff_plus').mean()
)


The following code plots histograms of 2023 pitch-level tjStuff+, overall and by pitch type.

In [None]:
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn works on pandas, so convert the 2023 frame
df_2023_pd = df_2023.to_pandas()

# Two stacked panels: overall distribution on top, split by pitch type below
fig, ax = plt.subplots(2, 1, figsize=(10, 10))

# Both panels use the same pitch types and the same binning
main_pitch_types = ['FF', 'SI', 'FC', 'SL', 'ST', 'CH', 'FS', 'CU', 'KC']
hist_kwargs = dict(
    data=df_2023_pd[df_2023_pd['pitch_type'].isin(main_pitch_types)],
    x='tj_stuff_plus',
    binrange=[60, 140],
    bins=40,
)

# Top panel: all selected pitch types together
sns.histplot(ax=ax[0], **hist_kwargs)
ax[0].set_title('2023 Pitch Stuff+ Distribution')
ax[0].set_xlabel('Stuff+')

# Bottom panel: the same histogram stacked and coloured by pitch type
sns.histplot(ax=ax[1],
             hue='pitch_type',
             multiple='stack',
             palette=dict_colour,
             **hist_kwargs)
ax[1].set_title('2023 Pitch Stuff+ Distribution by Pitch Type')
ax[1].set_xlabel('Stuff+')

# Change the legend title to 'Pitch Type'
ax[1].get_legend().set_title("Pitch Type")

# Adjust layout to prevent overlap
fig.tight_layout()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pitch types shown in the plot
main_pitch_types = ['FF', 'SI', 'FC', 'SL', 'ST', 'CH', 'FS', 'CU', 'KC']

# Median tjStuff+ per pitch type, highest first
# (the variable keeps its historical name `mean_values`)
mean_values = (
    df_2023_pd[df_2023_pd['pitch_type'].isin(main_pitch_types)]
    .groupby('pitch_type')['tj_stuff_plus']
    .median()
    .sort_values(ascending=False)
)

# Attach each pitch's type-level median and order the rows by it, so the
# plot lists pitch types from highest to lowest median
df_2023_pd['tj_stuff_plus_mean'] = df_2023_pd['pitch_type'].map(mean_values.to_dict())
df_2023_pd = df_2023_pd.sort_values(by='tj_stuff_plus_mean', ascending=False)

# Single panel for the boxen plot
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# Boxen plot of tjStuff+ per pitch type, coloured by pitch type
bp = sns.boxenplot(
    data=df_2023_pd[df_2023_pd['pitch_type'].isin(main_pitch_types)],
    x='tj_stuff_plus',
    y='pitch_type',
    palette=dict_colour,
    ax=ax,
    showfliers=False,  # Do not show outliers
    k_depth=6,         # Number of boxes to draw
)

# Label each row as "<pitch name> (<code>)"
bp.set_yticklabels([
    dict_pitch[tick.get_text()] + f' ({tick.get_text()})'
    for tick in bp.get_yticklabels()
])

# Write each median on its row, on a white box so it stays readable
for index, median in enumerate(mean_values):
    ax.text(median,
            index,
            f'{median:.0f}',
            color='black',
            ha="center",
            va="center",
            bbox=dict(facecolor='white', alpha=1, edgecolor='k'))

# Axis limits, title and labels
ax.set_xlim(60, 140)
ax.set_title('2023 tjStuff+ Distribution and Median by Pitch Type')
ax.set_xlabel('tjStuff+')
ax.set_ylabel('Pitch Type')

# Display the plot
plt.show()

Thanks to the Fangraphs API, we can easily grab 2024 MLB Pitcher Results. I also downloaded Pitching wOBA data from Baseball Savant.

In [None]:
import requests

# Fetch pitcher leaderboard data from the Fangraphs API for the 2023 and 2024 MLB seasons.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception here rather than a
# confusing JSON/KeyError further down.
response = requests.get(
    "https://www.fangraphs.com/api/leaders/major-league/data?age=&pos=all&stats=pit&lg=all&season=2024&season1=2023&ind=1&qual=0&type=8&month=0&pageitems=500000",
    timeout=30,
)
response.raise_for_status()
data = response.json()

# Define the schema explicitly for the incoming data
schema = {
    'playerid': pl.Int64,
    'xMLBAMID': pl.Int64,
    'PlayerName': pl.Utf8,
    'Season': pl.Int64,
    'Team': pl.Utf8,
    'G': pl.Int64,
    'IP': pl.Float64,
    'K-BB%': pl.Float64,
    'ERA': pl.Float64,
    'FIP': pl.Float64,
    'xFIP': pl.Float64,
    'TBF': pl.Int64,
    'Pitches': pl.Int64,
}

# Create a Polars DataFrame from the fetched data
df_fg = pl.DataFrame(data=data['data'], schema=schema)

# Add a column for the previous year (Season - 1)
df_fg = df_fg.with_columns((pl.col('Season') - 1).alias('year_n1'))

# Load wOBA data from a CSV file
df_woba = pl.read_csv('woba_2020_2024.csv')

# Join the Fangraphs data with the wOBA data on player ID and season
df_fg = df_fg.join(df_woba, left_on=['xMLBAMID', 'Season'], right_on=['player_id', 'year'], how='left')

# Self-join: pair every pitcher-season with the same pitcher's previous season.
# Columns from the previous season get the `_2023` suffix.
df_join = df_fg.join(df_fg, left_on=['xMLBAMID', 'year_n1'], right_on=['xMLBAMID', 'Season'], how='inner', suffix='_2023')

# Attach the current-season (2024) aggregated tj_stuff_plus and pitch count
df_join = df_join.join(df_agg_2024, left_on=['xMLBAMID', 'Season'], right_on=['pitcher_id', 'year'], how='inner').sort('tj_stuff_plus', descending=True)

# Attach the previous-season (2023) aggregated tj_stuff_plus and pitch count,
# suffixed with `_2023`
df_join = df_join.join(df_agg_2023, left_on=['xMLBAMID', 'year_n1'], right_on=['pitcher_id', 'year'], how='inner',suffix='_2023').sort('tj_stuff_plus', descending=True)



With the 2023 and 2024 Data loaded into a DataFrame, we can calculate correlations.

In [None]:
# Minimum number of pitches a pitcher needs to be included
min_pitches = 100

# Keep rows where both `count` and `Pitches` reach the threshold
df_join_filter = df_join.filter(
    (pl.col('count') >= min_pitches) & (pl.col('Pitches') >= min_pitches)
)

# Summary of the filtered sample
print('Minimum Pitches:', min_pitches)
print('Sample Size:', len(df_join_filter))
print('Average Pitches:', int(df_join_filter['Pitches'].mean()))

# Absolute correlations: 2023 metrics (rows) against 2024 metrics (columns)
print('Correlation between 2023 and 2024:')
metric_columns = [
    'tj_stuff_plus_2023', 'tj_stuff_plus', 'FIP', 'woba', 'K-BB%',
    'FIP_2023', 'woba_2023', 'K-BB%_2023'
]
full_corr = df_join_filter.to_pandas()[metric_columns].corr()
corr_df = (
    full_corr
    .loc[
        ['tj_stuff_plus_2023', 'FIP_2023', 'woba_2023', 'K-BB%_2023'],
        ['FIP', 'woba', 'K-BB%', 'tj_stuff_plus'],
    ]
    .abs()
    .round(2)
)

# Rename the index and columns for better readability
corr_df.index = ['2023 tjStuff+', '2023 FIP', '2023 wOBA', '2023 K-BB%']
corr_df.columns = ['2024 FIP', '2024 wOBA', '2024 K-BB%', '2024 tjStuff+']

# Display the correlation DataFrame
corr_df

## Updating the Model

I am content with how the model is performing on a predictive level. To evaluate the model on a descriptive level, I will retrain the model using 2020-23 data, and then test on 2024 data.

In [None]:
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMRegressor
from sklearn.preprocessing import RobustScaler


# Model inputs
features = ['start_speed',
            'spin_rate',
            'extension',
            'az',
            'ax',
            'x0',
            'z0',
            'speed_diff',
            'az_diff',
            'ax_diff']

# Column the model learns to predict
target = 'target'

# Training set: the 2020-2023 seasons, keeping only rows with every feature and the target present
train_years = [2020, 2021, 2022, 2023]
df_train = (
    df.filter(pl.col('year').is_in(train_years))
    .drop_nulls(subset=features + [target])
)

# Split into the feature matrix and the target vector
X = df_train[features]
y = df_train[target]

# LightGBM hyperparameters
lgbm_params = dict(
    n_estimators=1000,         # number of boosting rounds (trees)
    learning_rate=0.01,        # shrinkage per round; small values need more rounds
    num_leaves=31,             # cap on leaves per tree
    max_depth=-1,              # -1 leaves tree depth unlimited
    min_child_samples=20,      # minimum rows required in a leaf
    # NOTE(review): LightGBM appears to apply row subsampling only when
    # subsample_freq > 0, which is not set here - confirm this has an effect.
    subsample=0.8,             # fraction of rows sampled per round
    colsample_bytree=0.8,      # fraction of features sampled per tree
    reg_alpha=0.1,             # L1 regularisation strength
    reg_lambda=0.2,            # L2 regularisation strength
    random_state=42,           # fixed seed for reproducibility
    force_row_wise=True        # force row-wise histogram building
)

# Scale the features with RobustScaler, then fit the gradient-boosted regressor
model = make_pipeline(RobustScaler(), LGBMRegressor(**lgbm_params))
model.fit(X, y)

# # Save the model to a file
# import joblib
# joblib.dump(model, 'model/lgbm_model_2020_2023.joblib')
# print("Model saved to model/lgbm_model_2020_2023.joblib")

Let's calculate tjStuff+ again!

In [None]:
# Hold-out set: 2024 pitches with every feature and the target present
rows_2024 = df.filter(pl.col('year').is_in([2024]))
df_test = rows_2024.drop_nulls(subset=features + [target])

# Overwrite 'target' with the model's prediction for each 2024 pitch
predicted_target = model.predict(df_test[features].to_numpy())
df_test = df_test.with_columns(
    pl.Series(name="target", values=predicted_target)
)

# Restrict to 2024 rows (df_test already contains only that year)
df_2024 = df_test.filter(pl.col('year') == 2024)

## 2024 tjStuff+ ##
# Location and spread of the predicted values
target_mean = df_2024['target'].mean()
target_std = df_2024['target'].std()

print('Mean xRV/100:', round(target_mean * 100, 2))
print('StDev xRV/100:', round(target_std * 100, 2))

# z-score the predictions, then map them onto the tjStuff+ scale:
# 100 at the mean, and 10 points per standard deviation with the sign flipped
zscore_expr = (pl.col('target') - target_mean) / target_std
df_2024 = (
    df_2024
    .with_columns(zscore_expr.alias('target_zscore'))
    .with_columns((100 - (pl.col('target_zscore') * 10)).alias('tj_stuff_plus'))
)

# Per pitcher and year: number of pitches ('count') and mean tj_stuff_plus
df_agg_2024 = df_2024.group_by(['pitcher_id', 'year']).agg(
    pl.col('tj_stuff_plus').len().alias('count'),
    pl.col('tj_stuff_plus').mean()
)


Let's plot its pitch level distribution.

In [None]:
# Import necessary libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn works on pandas frames, so convert once up front
df_2024_pd = df_2024.to_pandas()

# Pitch types included in both histograms
hist_pitch_types = ['FF', 'SI', 'FC', 'SL', 'ST', 'CH', 'FS', 'CU', 'KC']
df_hist = df_2024_pd[df_2024_pd['pitch_type'].isin(hist_pitch_types)]

# Two stacked panels: overall distribution on top, split by pitch type below
fig, ax = plt.subplots(2, 1, figsize=(10, 10))

# Binning shared by both panels
hist_kwargs = dict(x='tj_stuff_plus', binrange=[60, 140], bins=40)

# Top panel: all selected pitch types pooled together
sns.histplot(data=df_hist, ax=ax[0], **hist_kwargs)

# Bottom panel: the same histogram stacked and coloured by pitch type
sns.histplot(data=df_hist,
             palette=dict_colour,
             hue='pitch_type',
             multiple='stack',
             ax=ax[1],
             **hist_kwargs)

# Panel titles
ax[0].set_title('2024 Pitch Stuff+ Distribution')
ax[1].set_title('2024 Pitch Stuff+ Distribution by Pitch Type')

# Both panels share the same x-axis label
for panel in ax:
    panel.set_xlabel('tjStuff+')

# Give the hue legend a readable title
ax[1].get_legend().set_title("Pitch Type")

# Tighten the layout so the panels do not overlap
fig.tight_layout()

In [None]:
# Leaderboard of the ten pitchers with the highest mean tjStuff+ in df_agg_2024.
# NOTE(review): no minimum pitch count is applied here, so small samples can
# reach the top ten - confirm that is intended.
#
# pitcher_id is rendered as text. It is joined against the integer xMLBAMID
# column elsewhere in this notebook, so the cast yields digits only and no
# suffix such as '_2024' needs stripping afterwards.
df_leaderboard = (
    df_agg_2024
    .sort('tj_stuff_plus', descending=True)
    .head(10)
    .with_columns(
        pl.col('tj_stuff_plus').round(2),
        pl.col('pitcher_id').cast(pl.Utf8),
    )
)

# Leave the table as the cell's last expression so the notebook renders it.
# plt.show() only displays figures; it cannot render a DataFrame.
df_leaderboard.to_pandas()

Now let's plot its Pitch Type distribution.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Summary statistics of tj_stuff_plus per pitch type, saved to CSV.
# The percentile_2 / percentile_98 columns hold the 0.025 and 0.975 quantiles.
pitch_type_stats = df_2024_pd.groupby('pitch_type')['tj_stuff_plus'].agg(
    mean='mean',
    std='std',
    median='median',
    min='min',
    max='max',
    percentile_2=lambda x: x.quantile(0.025),
    percentile_98=lambda x: x.quantile(0.975)
)
df_2024_pd_group = pitch_type_stats.sort_values('mean', ascending=False).reset_index()
df_2024_pd_group.to_csv('tj_stuff_plus_pitch.csv', index=False, header=True)

# Pitch types shown in the plot
boxen_pitch_types = ['FF', 'SI', 'FC', 'SL', 'ST', 'CH', 'FS', 'CU', 'KC']

# Median tj_stuff_plus for each plotted pitch type, highest first
plotted_rows = df_2024_pd[df_2024_pd['pitch_type'].isin(boxen_pitch_types)]
median_values = plotted_rows.groupby('pitch_type')['tj_stuff_plus'].median().sort_values(ascending=False)

# Tag every pitch with its pitch type's median and order the frame by it,
# so pitch types appear in the plot from highest to lowest median
df_2024_pd['tj_stuff_plus_median'] = df_2024_pd['pitch_type'].map(median_values.to_dict())
df_2024_pd = df_2024_pd.sort_values(by='tj_stuff_plus_median', ascending=False)

# Single-panel figure for the boxen plot
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# One boxen row per pitch type, coloured via dict_colour
boxen_data = df_2024_pd[df_2024_pd['pitch_type'].isin(boxen_pitch_types)]
bp = sns.boxenplot(data=boxen_data,
               x='tj_stuff_plus',
               y='pitch_type',
               palette=dict_colour,
               ax=ax,
               showfliers=False,  # hide outliers
               k_depth=6          # number of box levels to draw
               )

# Replace each pitch code on the y-axis with "<description> (<code>)"
pitch_codes = [tick.get_text() for tick in bp.get_yticklabels()]
bp.set_yticklabels([dict_pitch[code] + f' ({code})' for code in pitch_codes])

# Write each median on its row, in a white box for legibility
label_box = dict(facecolor='white', alpha=1,edgecolor='k')
for y_pos, median in enumerate(median_values):
    ax.text(median,
            y_pos,
            f'{median:.0f}',
            color='black',
            ha="center",
            va="center",
            bbox=label_box
            )

# Axis range, title and labels
ax.set_xlim(60, 140)
ax.set_title('tjStuff+ Distribution and Median by Pitch Type - 2024 Season')
ax.set_xlabel('tjStuff+')
ax.set_ylabel('Pitch Type')

# Display the plot
plt.show()

In [None]:
import requests

# Fetch the pitching leaderboard from the Fangraphs API. The query string sets
# season=2024 and season1=2024, so only the 2024 season is requested.
fangraphs_url = "https://www.fangraphs.com/api/leaders/major-league/data?age=&pos=all&stats=pit&lg=all&season=2024&season1=2024&ind=1&qual=0&type=8&month=0&pageitems=500000"

# requests has no default timeout, so set one to keep the cell from hanging.
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# JSON or KeyError failure further down.
response = requests.get(fangraphs_url, timeout=60)
response.raise_for_status()
data = response.json()

# Columns (and dtypes) to read from the response
schema = {
    'playerid': pl.Int64,
    'xMLBAMID': pl.Int64,
    'PlayerName': pl.Utf8,
    'Season': pl.Int64,
    'Team': pl.Utf8,
    'G': pl.Int64,
    'IP': pl.Float64,
    'K-BB%': pl.Float64,
    'ERA': pl.Float64,
    'FIP': pl.Float64,
    'xFIP': pl.Float64,
    'TBF': pl.Int64,
    'Pitches': pl.Int64,
}

# Build a Polars DataFrame from the 'data' entry of the response
df_fg = pl.DataFrame(data=data['data'], schema=schema)

# Load wOBA data from a CSV file
df_woba = pl.read_csv('woba_2020_2024.csv')

# Left-join wOBA onto the Fangraphs rows by player ID and season,
# keeping pitchers that have no wOBA row
df_fg = df_fg.join(df_woba, left_on=['xMLBAMID', 'Season'], right_on=['player_id', 'year'], how='left')


# Attach the 2024 tjStuff+ aggregates (df_agg_2024) by pitcher and season.
# The inner join keeps only pitchers present in both tables.
# NOTE: suffix='_2023' only renames df_agg_2024 columns whose names clash with
# df_fg; it does not mean that 2023 data is involved.
df_join = df_fg.join(df_agg_2024, left_on=['xMLBAMID', 'Season'], right_on=['pitcher_id', 'year'], how='inner', suffix='_2023')



In [None]:
# Pitch-count floor applied to both the 'count' and 'Pitches' columns
min_pitches = 100

# Keep only the rows that clear the floor in both columns
meets_threshold = (pl.col('count') >= min_pitches) & (pl.col('Pitches') >= min_pitches)
df_join_filter = df_join.filter(meets_threshold)

# Summarise the filtered sample
print('Minimum Pitches:', min_pitches)
print('Sample Size:', len(df_join_filter))
print('Average Pitches:', int(df_join_filter['Pitches'].mean()))

# Pairwise correlations among the 2024 metrics (magnitude only, two decimals)
print('Correlation between 2024 metrics:')
metric_cols = ['tj_stuff_plus', 'FIP', 'woba', 'K-BB%']
corr_df = (
    df_join_filter.to_pandas()[metric_cols]
    .corr()
    .loc[metric_cols, metric_cols]
    .abs()
    .round(2)
)

# Human-readable labels, identical for rows and columns
metric_labels = ['2024 tjStuff+', '2024 FIP', '2024 wOBA', '2024 K-BB%']
corr_df.index = list(metric_labels)
corr_df.columns = list(metric_labels)

# Leave the table as the cell's last expression so the notebook renders it
corr_df


### 2024 Metrics

#### Player Metrics

With the model trained and validated, we can now apply it to 2024 data to get all sorts of metrics! Let's take a look at tjStuff+ by pitcher and pitch type and create a leaderboard.

This code block aggregates tjStuff+ by pitcher and pitch type. To better contextualize tjStuff+, I also calculate a 'Pitch Grade' for each pitch type, which is scaled to the traditional 20-80 Scouting Grades. Grades are treated as normally distributed; however, the Standard Deviation (σ) is not the sample standard deviation. Instead, it is set to one sixth of the difference between the 99.9th and 0.1st percentiles of tjStuff+ for that pitch type, so that this range spans three standard deviations on either side of the mean. Grades are then clipped to the 20-80 range. This ensures that the greatest tjStuff+ pitch of a specific type is graded at roughly 80, while the worst tjStuff+ pitch is graded at roughly 20.

I decided to make it like this because applying the Standard deviation at the pitch level for each pitch type caused very tight distributions, especially for 4-Seam Fastballs. The greatest 4-Seam "Pitch Grade" for this method was 65. While it is mathematically sound, having the best Fastballs in baseball graded as "Good" rather than "Elite" did not sit well with me.

Each pitch type has pitches that can span from 20 to 80 in grade, with grades following a normal distribution.

In [None]:
import pandas as pd
import polars as pl

def pitch_agg(df: pl.DataFrame) -> "tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame, dict]":
    """Aggregate pitch-level tjStuff+ to pitcher and pitcher/pitch-type level.

    Args:
        df: Pitch-level frame. The code reads the columns pitcher_id,
            pitcher_name, year, pitch_type, game_id and tj_stuff_plus.

    Returns:
        A 4-tuple ``(df_plot, df_plot_pitch, df_pitch_all, position_dict)``:

        * df_plot: one row per pitcher/year with pitches, games, mean
          tj_stuff_plus, pitches_per_game, position and pitch_type == 'All'.
        * df_plot_pitch: one row per pitcher/pitch_type/year with pitches,
          games and mean tj_stuff_plus.
        * df_pitch_all: both of the above stacked, joined with per-pitch-type
          distribution statistics, with a 20-80 ``pitch_grade`` and a
          ``position`` column, restricted to a fixed list of pitch types.
        * position_dict: mapping of pitcher_id to 'SP' or 'RP'.

    Side effects:
        Writes the per-pitch-type statistics to 'tj_stuff_plus_pitch.csv' in
        the working directory.
    """
    # Pitcher-level totals: pitch count, distinct games, and mean tj_stuff_plus
    df_plot = df.group_by(['pitcher_id', 'pitcher_name', 'year']).agg(
        pl.col('tj_stuff_plus').len().alias('pitches'),
        pl.col('game_id').n_unique().alias('games'),
        pl.col('tj_stuff_plus').mean()
    )

    # Average number of pitches thrown per game
    df_plot = df_plot.with_columns(
        (pl.col('pitches') / pl.col('games')).alias('pitches_per_game')
    )

    # Classify by workload: 40 or more pitches per game is a starter ('SP'),
    # anything below is a reliever ('RP')
    df_plot = df_plot.with_columns(
        pl.when(pl.col('pitches_per_game') >= 40)
        .then(pl.lit('SP'))
        .otherwise(pl.lit('RP'))
        .alias('position')
    )

    # Lookup of pitcher_id -> position. It is keyed by pitcher_id only, so if a
    # pitcher has rows for several years the last one seen wins.
    position_dict = dict(zip(df_plot['pitcher_id'], df_plot['position']))

    # Mark the pitcher-level rows with the pseudo pitch type 'All'
    df_plot = df_plot.with_columns(
        (pl.lit('All')).alias('pitch_type')
    )

    # Same aggregation, broken out by pitch type
    df_plot_pitch = df.group_by(['pitcher_id', 'pitcher_name', 'pitch_type', 'year']).agg(
        pl.col('tj_stuff_plus').len().alias('pitches'),
        pl.col('game_id').n_unique().alias('games'),
        pl.col('tj_stuff_plus').mean()
    )

    # Stack the 'All' rows and the per-pitch-type rows. pandas.concat fills
    # the columns that only df_plot has with missing values on the other rows.
    df_pitch_all_pd = pd.concat([df_plot.to_pandas(), df_plot_pitch.to_pandas()])
    df_pitch_all = pl.DataFrame(df_pitch_all_pd)

    # Distribution of pitcher-level tj_stuff_plus within each pitch type, using
    # only rows with at least 10 pitches. Despite their names, percentile_1 and
    # percentile_99 hold the 0.1th and 99.9th percentiles.
    df_pitch_types = df_pitch_all.filter(pl.col('pitches') >= 10).to_pandas().groupby('pitch_type')['tj_stuff_plus'].agg(
        mean='mean',
        std='std',
        median='median',
        min='min',
        max='max',
        percentile_1=lambda x: x.quantile(0.001),
        percentile_99=lambda x: x.quantile(0.999)
    ).sort_values('mean', ascending=False).reset_index()

    # Replace the sample std with (p99.9 - p0.1) / 6, so that the span between
    # those percentiles covers six standard deviations (three either side)
    df_pitch_types['std'] = (df_pitch_types['percentile_99'] - df_pitch_types['percentile_1']) / 6
    df_pitch_types.to_csv('tj_stuff_plus_pitch.csv', index=False, header=True)
    df_pitch_types = pl.DataFrame(df_pitch_types)

    # Attach the pitch-type statistics to every row of that pitch type
    df_pitch_all = df_pitch_all.join(df_pitch_types, left_on='pitch_type', right_on='pitch_type')

    # z-score of each row relative to its pitch type, using the rescaled std
    df_pitch_all = df_pitch_all.with_columns(
        ((pl.col('tj_stuff_plus') - pl.col('mean')) / pl.col('std')).alias('pitch_grade')
    )

    # Convert the z-score to the 20-80 scale (50 is average, 10 points per
    # standard deviation) and clip. The expression keeps the name
    # 'pitch_grade', so it overwrites that column.
    df_pitch_all = df_pitch_all.with_columns(
        (pl.col('pitch_grade') * 10 + 50).clip(20, 80)
    )

    # Overwrite 'position' for every row from the pitcher-level lookup.
    # return_dtype is given explicitly so Polars does not have to infer it.
    df_pitch_all = df_pitch_all.with_columns(
        df_pitch_all['pitcher_id'].map_elements(
            lambda x: position_dict.get(x, None),
            return_dtype=pl.Utf8
        ).alias('position')
    )

    # Keep only the listed pitch types (plus the 'All' rows)
    df_pitch_all = df_pitch_all.filter(pl.col('pitch_type').is_in([
        'FS', 'FO', 'SC', 'FF', 'SI', 'SV', 'KC', 'All', 'FC', 'SL', 'ST', 'CU', 'CH', 'KN'
    ]))

    return df_plot, df_plot_pitch, df_pitch_all, position_dict

# Run pitch_agg on the 2024 pitch-level data and unpack its four results:
# the pitcher-level frame, the pitcher/pitch-type frame, the combined graded
# frame, and the pitcher_id -> position lookup
df_plot, df_plot_pitch, df_pitch_all, position_dict = pitch_agg(df_2024)

# Write the combined, graded DataFrame to a CSV file
df_pitch_all.write_csv('tjstuff_plus_pitch_data_2024.csv')

Let's take a look at tjStuff+ by position. Starters (SP) and Relievers (RP) play two distinct roles in baseball. Starters are tasked with pitching longer outings and are geared towards command and control rather than higher velocity and strikeout numbers. Relievers are quite the opposite, as they pitch shorter outings and tend to post incredible K% with less emphasis on lower BB%.

This shows up in the distribution of tjStuff+ by SP and RP. SP are more clustered together, with just a handful displaying elite stuff, while the RP distribution is positively skewed. Thanks to their shorter outings, RP can consistently output higher quality pitches, making both the average and the max greater than for SP.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pitcher-level rows with at least 100 pitches, highest tj_stuff_plus first
df_2024_sp_rp = df_plot.filter(df_plot['pitches']>=100).to_pandas().sort_values(by='tj_stuff_plus', ascending=False)

# Median tj_stuff_plus per position, highest first. It is computed from the
# same filtered sample that is plotted, so the labels match the boxes and the
# "min. 100 Pitches" title. The name mean_values is historical: it holds medians.
mean_values = df_2024_sp_rp.groupby('position')['tj_stuff_plus'].median().sort_values(ascending=False)

# Fix the row order explicitly so that row i of the plot is the i-th entry of
# mean_values; the median labels below are placed by that index
position_order = mean_values.index.tolist()

# Create a subplot for the boxen plot
fig, ax = plt.subplots(1, 1, figsize=(10, 6),dpi=300)

# Boxen plot of pitcher-level tj_stuff_plus, one row per position
bp = sns.boxenplot(data=df_2024_sp_rp,
               x='tj_stuff_plus',
               y='position',
               order=position_order,
               ax=ax,
               palette=[dict_colour[x] for x in dict_colour][7:9],  # two colours taken from dict_colour
               showfliers=True,   # show outliers
               k_depth=6          # number of box levels to draw
               )

# Write each position's median on its row, in a white box for legibility
for index, row in mean_values.reset_index().iterrows():
    ax.text(row['tj_stuff_plus'],
            index,
            f'{row["tj_stuff_plus"]:.0f}',
            color='black',
            ha="center",
            va="center",
            bbox=dict(facecolor='white', alpha=1,edgecolor='k')
            )

# Set the x-axis limits
ax.set_xlim(80, 120)

# Set the title of the plot
ax.set_title('Pitcher Level tjStuff+ Distribution and Median by Position (min. 100 Pitches) - 2024 Season')

# Set the x-axis and y-axis label
ax.set_xlabel('tjStuff+')
ax.set_ylabel('Position')

# Display the plot
plt.show()

This graphic is a simple leader board of the best tjStuff+ pitchers during the 2024 MLB Season. It could be displayed in a table, but I like illustrating leader boards in different ways, such as this.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import datetime

# Metric shown on the leader board
stat_select = 'tj_stuff_plus'

# Number of pitchers listed and the pitch-count floor for eligibility
len_pit = 10
min_pitch = 100

# Grid: one row per pitcher plus a title row and a footer row,
# with a wide centre column between two thin margin columns
num_rows = len_pit + 2
num_cols = 3

# Use Calibri for all text
plt.rcParams['font.family'] = 'calibri'

# Figure and grid specification
fig = plt.figure(figsize=(20, 20), dpi=300)
gs = gridspec.GridSpec(num_rows, num_cols, figure=fig, width_ratios=[0.5, 9, 0.5])


def _headshot_url(i):
    # Build the MLB headshot URL for pitcher id i
    return f'https://img.mlbstatic.com/mlb-photos/image/upload/w_180,d_people:generic:headshot:silo:current.png,q_auto:best,f_auto/v1/people/{i}/headshot/silo/current'


# Add a 'picture' column holding each pitcher's headshot URL
df_plot = df_plot.with_columns(
    pl.col('pitcher_id').map_elements(_headshot_url).alias('picture')
)

# Eligible pitchers, best first
df_plot_pd = df_plot.to_pandas()
sorted_df = df_plot_pd[df_plot_pd['pitches'] >= min_pitch].sort_values(by=stat_select, ascending=False)

# Grid cells (row, centre column) for the listed pitchers
positions = [(grid_row, 1) for grid_row in range(1, len_pit + 1)]

# One grid row per pitcher: rank, name, and metric value
top_rows = sorted_df.head(len_pit).iterrows()
for i, ((row, col), (_, team_row)) in enumerate(zip(positions, top_rows)):
    player = team_row['pitcher_name']
    logo_url = team_row['picture']

    ax = fig.add_subplot(gs[row, col])
    ax.set_xlim(-1, 1)
    ax.axis('off')

    # Headshot rendering is currently disabled:
    # img = plt.imread(logo_url)
    # ax.imshow(img, extent=[-0.6, -0.4, 0, 1], aspect=0.2)

    # Text settings shared by all three labels
    label_kwargs = dict(transform=ax.transAxes, va='center', fontsize=36)

    # Rank (italic), pitcher name (italic), metric value (bold)
    ax.text(0.1, 0.5, f'{i + 1}', ha='center', style='italic', **label_kwargs)
    ax.text(0.45, 0.5, f'{player}', ha='left', style='italic', **label_kwargs)
    ax.text(0.9, 0.5, f'{team_row[stat_select]:.0f}', ha='left', weight='bold', **label_kwargs)

# Border axes that carry the title and footer text; all are drawn without frames
ax_top = fig.add_subplot(gs[0, :])
ax_bot = fig.add_subplot(gs[-1, :])
ax_left = fig.add_subplot(gs[:, 0])
ax_right = fig.add_subplot(gs[:, -1])

for border_ax in (ax_top, ax_bot, ax_left, ax_right):
    border_ax.axis('off')

# Footer: credit, data source, and today's date
ax_bot.text(0.1, 0.5, 'By: @TJStats', fontsize=24, ha='left')
ax_bot.text(0.9, 0.5, 'Data: MLB', fontsize=24, ha='right')
ax_bot.text(0.5, 0.5, datetime.datetime.today().strftime('%Y-%m-%d'), fontsize=16, ha='center')

# Title
ax_top.text(0.5, 0.5, f'tjStuff+ v3.0 Leaders - 2024 MLB Season - min. {min_pitch} Pitches', fontsize=36, ha='center', style='italic', weight='bold')

# Show the plot
plt.show()


This graphic is a leader board of the greatest tjStuff+ by Pitch Type. These pitches are assigned a "Pitch Grade" of 80 by our aforementioned definition. 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import datetime

# Constants
stat_select = 'tj_stuff_plus'

# Pitch types that each get one row on the board
board_pitch_types = ['FF', 'SI', 'FC', 'SL', 'ST', 'CH', 'FS', 'CU', 'KC']
len_pit = len(board_pitch_types)

# Minimum pitches of a given type for a pitcher to be eligible. The same value
# drives both the filter and the title, so the title reports the real threshold.
min_pitch = 150

# Grid: one row per pitch type plus a title row and a footer row,
# with a wide centre column between two thin margin columns
num_rows = len_pit + 2
num_cols = 3

# Set the font style to Calibri
plt.rcParams['font.family'] = 'calibri'

# Create a figure
fig = plt.figure(figsize=(20, 20), dpi=300)

# Create a GridSpec object with different widths for the columns
gs = gridspec.GridSpec(num_rows, num_cols, figure=fig, width_ratios=[0.5, 9, 0.5])

# Best eligible pitcher per pitch type. Rows are sorted best-first, then
# keep='first' with maintain_order=True retains the top row of each pitch type.
# The default keep='any' gives no guarantee about which row survives.
df_pitch_all_best = (
    df_pitch_all
    .filter(
        pl.col('pitch_type').is_in(board_pitch_types) &
        (pl.col('pitches') >= min_pitch)
    )
    .sort(stat_select, descending=True)
    .unique(subset=['pitch_type'], keep='first', maintain_order=True)
)

# Add pitcher image URLs
df_pitch_all_best = df_pitch_all_best.with_columns(
    pl.col('pitcher_id').map_elements(
        lambda i: f'https://img.mlbstatic.com/mlb-photos/image/upload/w_180,d_people:generic:headshot:silo:current.png,q_auto:best,f_auto/v1/people/{i}/headshot/silo/current',
        return_dtype=pl.Utf8
    ).alias('picture')
)

# Convert to pandas, highest metric first, and add the pitch description
sorted_df = df_pitch_all_best.to_pandas().sort_values(by=stat_select, ascending=False)
sorted_df['pitch_description'] = sorted_df['pitch_type'].map(dict_pitch)

# Grid cells (row, centre column) for the listed pitch types
positions = [(i + 1, 1) for i in range(len_pit)]

# One grid row per pitch type: pitch description, pitcher name, metric value
for i, (_, team_row) in enumerate(sorted_df.head(len_pit).iterrows()):
    player = team_row['pitcher_name']
    logo_url = team_row['picture']
    pitch_name = team_row['pitch_description']

    # Determine the position in the grid
    row, col = positions[i]

    # Create a subplot in the GridSpec layout
    ax = fig.add_subplot(gs[row, col])
    ax.set_xlim(-1, 1)
    ax.axis('off')

    # Headshot rendering is currently disabled:
    # img = plt.imread(logo_url)
    # ax.imshow(img, extent=[-0.4, -0.2, 0, 1], aspect=0.2)

    # Pitch description (in the pitch type's colour), player name, metric value
    ax.text(0.1, 0.5, f'{pitch_name}', transform=ax.transAxes, ha='center', va='center', fontsize=36, style='italic', weight='bold', color=dict_colour[team_row['pitch_type']])
    ax.text(0.45, 0.5, f'{player}', transform=ax.transAxes, ha='left', va='center', fontsize=36, style='italic')
    ax.text(0.9, 0.5, f'{team_row[stat_select]:.0f}', transform=ax.transAxes, ha='left', va='center', fontsize=36, weight='bold')

# Border axes that carry the title and footer text
ax_top = fig.add_subplot(gs[0, :])
ax_bot = fig.add_subplot(gs[-1, :])
ax_left = fig.add_subplot(gs[:, 0])
ax_right = fig.add_subplot(gs[:, -1])

ax_top.axis('off')
ax_bot.axis('off')
ax_left.axis('off')
ax_right.axis('off')

# Footer: credit, data source, and today's date
ax_bot.text(s='By: @TJStats', x=0.1, y=0.5, fontsize=24, ha='left')
ax_bot.text(s='Data: MLB', x=0.9, y=0.5, fontsize=24, ha='right')
ax_bot.text(s=f"{datetime.datetime.today().strftime('%Y-%m-%d')}", x=0.5, y=0.5, fontsize=16, ha='center')

# Add the main title
ax_top.text(s=f'Highest tjStuff+ by Pitch Type - 2024 MLB Season - min. {min_pitch} Pitches', x=0.5, y=0.5, fontsize=36, ha='center', style='italic', weight='bold')

# Show the plot
plt.show()


I created a [Streamlit App](https://tjstatsapps-tjstuffplus.hf.space/) which tabulates and plots tjStuff+ for all MLB players during the 2024 MLB Season.

Here is an example

!['Tyler Glasnow'](image/glasnow.png)

#### Team Metrics




Let's calculate some team metrics!

In [None]:
import pandas as pd

# ESPN logo URL template; {slug} is ESPN's lowercase team identifier
_ESPN_LOGO_URL = "https://a.espncdn.com/combiner/i?img=/i/teamlogos/mlb/500/scoreboard/{slug}.png&h=500&w=500"

# (team abbreviation used in this notebook, ESPN slug) for the 30 MLB teams.
# The slug is usually the lowercase abbreviation; AZ -> ari and CWS -> chw differ.
_TEAM_SLUGS = [
    ("AZ", "ari"), ("ATL", "atl"), ("BAL", "bal"), ("BOS", "bos"), ("CHC", "chc"),
    ("CWS", "chw"), ("CIN", "cin"), ("CLE", "cle"), ("COL", "col"), ("DET", "det"),
    ("HOU", "hou"), ("KC", "kc"), ("LAA", "laa"), ("LAD", "lad"), ("MIA", "mia"),
    ("MIL", "mil"), ("MIN", "min"), ("NYM", "nym"), ("NYY", "nyy"), ("OAK", "oak"),
    ("PHI", "phi"), ("PIT", "pit"), ("SD", "sd"), ("SF", "sf"), ("SEA", "sea"),
    ("STL", "stl"), ("TB", "tb"), ("TEX", "tex"), ("TOR", "tor"), ("WSH", "wsh"),
]

# One {"team", "logo_url"} record per team, in the order listed above
mlb_teams = [
    {"team": team, "logo_url": _ESPN_LOGO_URL.format(slug=slug)}
    for team, slug in _TEAM_SLUGS
]

# Create a DataFrame from the list of team records, and a team -> logo URL lookup
df_image = pl.DataFrame(mlb_teams)
image_dict = dict(zip(df_image['team'], df_image['logo_url']))

# Map each pitch's pitcher_id to its position label via position_dict.
# Unknown ids map to null (as pitch_agg does) rather than falling back to the
# integer id, which would mix types in a string column.
df_2024 = df_2024.with_columns(
    pl.col('pitcher_id').map_elements(
        lambda x: position_dict.get(x, None),
        return_dtype=pl.Utf8
    ).alias('position')
)

Here is a leader board for tjStuff+ by team. I will also show tjStuff+ for Starters and Relievers.

In [None]:
import datetime
import io
import urllib.request

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Aggregate to team level: number of pitches and mean tjStuff+ per pitching team
df_plot_team = df_2024.group_by(['pitcher_team']).agg(
    pl.col('tj_stuff_plus').len().alias('pitches'),
    pl.col('tj_stuff_plus').mean(),
)

# Attach each team's logo URL; a team missing from image_dict keeps its abbreviation
df_plot_team = df_plot_team.with_columns(
    pl.col('pitcher_team').map_elements(lambda x: image_dict.get(x, x)).alias('logo')
)

# Metric that is ranked and displayed in the leaderboard
stat_select = 'tj_stuff_plus'

# 13 x 7 grid: row 0 holds the title, rows 1-10 the teams, row 11 the notes and
# row 12 the footer. Columns 1, 3 and 5 hold the three leaderboard columns; the
# remaining narrow columns are spacers.
num_rows = 13
num_cols = 7

# Set the font style to Calibri
plt.rcParams['font.family'] = 'calibri'

fig = plt.figure(figsize=(25, 25), dpi=300)
gs = gridspec.GridSpec(num_rows, num_cols, figure=fig, width_ratios=[0.01, 0.4, 0.1, 0.4, 0.1, 0.4, 0.01])

# Rank teams from highest to lowest value of the selected metric
sorted_df = df_plot_team.to_pandas().sort_values(by=stat_select, ascending=False).reset_index(drop=True)

# Grid cells for ranks 1-10 (left), 11-20 (middle) and 21-30 (right)
positions_left = [(i + 1, 1) for i in range(10)]
positions_middle = [(i + 1, 3) for i in range(10)]
positions_right = [(i + 1, 5) for i in range(10)]
positions = positions_left + positions_middle + positions_right

# Draw one small axes per team (at most 30, matching the number of grid cells)
for i, (_, team_row) in enumerate(sorted_df.head(30).iterrows()):
    logo_url = team_row['logo']
    row, col = positions[i]
    ax = fig.add_subplot(gs[row, col])

    # Identify the team with its logo. The image is downloaded explicitly and
    # handed to imread as a bytes buffer because current matplotlib releases
    # refuse to open URLs themselves. If the download fails, or the value is
    # not a URL at all (team missing from image_dict), the team abbreviation is
    # written instead so the row is never left anonymous.
    try:
        with urllib.request.urlopen(logo_url, timeout=10) as response:
            img = plt.imread(io.BytesIO(response.read()), format='png')
    except (OSError, ValueError):
        ax.text(0.5, 0.5, f"{team_row['pitcher_team']}", transform=ax.transAxes,
                ha='center', va='center', fontsize=36, weight='bold')
    else:
        ax.imshow(img, extent=[0.3, 0.8, 0.3, 0.8], origin='upper')
    ax.axis('off')

    # Rank number to the left of the logo, italicized
    ax.text(-1, 0.5, f'{i + 1}', transform=ax.transAxes, ha='center', va='center', fontsize=48, style='italic')

    # Metric value to the right of the logo, in bold
    ax.text(1.6, 0.5, f'{team_row[stat_select]:.0f}', transform=ax.transAxes, ha='left', va='center', fontsize=48, weight='bold')

# Invisible axes along the borders of the grid, used to anchor the title, notes and footer
ax_top = fig.add_subplot(gs[0, :])
ax_info = fig.add_subplot(gs[-2, :])
ax_bot = fig.add_subplot(gs[-1, :])
ax_left = fig.add_subplot(gs[:, 0])
ax_right = fig.add_subplot(gs[:, -1])
for border_ax in (ax_top, ax_bot, ax_left, ax_right, ax_info):
    border_ax.axis('off')

# Title, anchored to the bottom edge of the top row
ax_middle = fig.add_subplot(gs[0, :])
ax_middle.set_title(label='tjStuff+ by Team - 2024 MLB Season', x=0.5, y=0, fontsize=56, ha='center', va='bottom', style='italic', weight='bold')
ax_middle.axis('off')

# Footer: author, data source and the date the figure was generated
ax_bot.text(s='By: @TJStats', x=0.1, y=0.5, fontsize=24, ha='left')
ax_bot.text(s='Data: MLB', x=0.9, y=0.5, fontsize=24, ha='right')
ax_bot.text(s=f"{datetime.datetime.today().strftime('%Y-%m-%d')}", x=0.5, y=0.5, fontsize=12, ha='center')

# Explanatory notes above the footer
ax_info.text(x=0.5, y=0, s='tjStuff+ calculates the Expected Run Value (xRV) of a pitch regardless of type\n'
                            'tjStuff+ is normally distributed, where 100 is the mean and Standard Deviation is 10\n',
                            ha='center', va='bottom', fontsize=24, style='italic')

# Show the plot
plt.show()


In [None]:
import datetime
import io
import urllib.request

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Starting pitchers (SP) only: number of pitches and mean tjStuff+ per pitching team
df_plot_team_sp = df_2024.filter(df_2024['position'] == 'SP').group_by(['pitcher_team']).agg(
    pl.col('tj_stuff_plus').len().alias('pitches'),
    pl.col('tj_stuff_plus').mean(),
)

# Attach each team's logo URL; a team missing from image_dict keeps its abbreviation
df_plot_team_sp = df_plot_team_sp.with_columns(
    pl.col('pitcher_team').map_elements(lambda x: image_dict.get(x, x)).alias('logo')
)

# Metric that is ranked and displayed in the leaderboard
stat_select = 'tj_stuff_plus'

# 13 x 7 grid: row 0 holds the title, rows 1-10 the teams, row 11 the notes and
# row 12 the footer. Columns 1, 3 and 5 hold the three leaderboard columns; the
# remaining narrow columns are spacers.
num_rows = 13
num_cols = 7

# Set the font style to Calibri
plt.rcParams['font.family'] = 'calibri'

fig = plt.figure(figsize=(25, 25), dpi=300)
gs = gridspec.GridSpec(num_rows, num_cols, figure=fig, width_ratios=[0.01, 0.4, 0.1, 0.4, 0.1, 0.4, 0.01])

# Rank teams from highest to lowest value of the selected metric
sorted_df = df_plot_team_sp.to_pandas().sort_values(by=stat_select, ascending=False).reset_index(drop=True)

# Grid cells for ranks 1-10 (left), 11-20 (middle) and 21-30 (right)
positions_left = [(i + 1, 1) for i in range(10)]
positions_middle = [(i + 1, 3) for i in range(10)]
positions_right = [(i + 1, 5) for i in range(10)]
positions = positions_left + positions_middle + positions_right

# Draw one small axes per team (at most 30, matching the number of grid cells)
for i, (_, team_row) in enumerate(sorted_df.head(30).iterrows()):
    logo_url = team_row['logo']
    row, col = positions[i]
    ax = fig.add_subplot(gs[row, col])

    # Identify the team with its logo. The image is downloaded explicitly and
    # handed to imread as a bytes buffer because current matplotlib releases
    # raise ValueError when imread is given a URL. If the download fails, or
    # the value is not a URL at all (team missing from image_dict), the team
    # abbreviation is written instead so the figure can still be produced.
    try:
        with urllib.request.urlopen(logo_url, timeout=10) as response:
            img = plt.imread(io.BytesIO(response.read()), format='png')
    except (OSError, ValueError):
        ax.text(0.5, 0.5, f"{team_row['pitcher_team']}", transform=ax.transAxes,
                ha='center', va='center', fontsize=36, weight='bold')
    else:
        ax.imshow(img, extent=[0.3, 0.8, 0.3, 0.8], origin='upper')
    ax.axis('off')

    # Rank number to the left of the logo, italicized
    ax.text(-1, 0.5, f'{i + 1}', transform=ax.transAxes, ha='center', va='center', fontsize=48, style='italic')

    # Metric value to the right of the logo, in bold
    ax.text(1.6, 0.5, f'{team_row[stat_select]:.0f}', transform=ax.transAxes, ha='left', va='center', fontsize=48, weight='bold')

# Invisible axes along the borders of the grid, used to anchor the title, notes and footer
ax_top = fig.add_subplot(gs[0, :])
ax_info = fig.add_subplot(gs[-2, :])
ax_bot = fig.add_subplot(gs[-1, :])
ax_left = fig.add_subplot(gs[:, 0])
ax_right = fig.add_subplot(gs[:, -1])
for border_ax in (ax_top, ax_bot, ax_left, ax_right, ax_info):
    border_ax.axis('off')

# Title, anchored to the bottom edge of the top row
ax_middle = fig.add_subplot(gs[0, :])
ax_middle.set_title(label='tjStuff+ by Team Starters - 2024 MLB Season', x=0.5, y=0, fontsize=56, ha='center', va='bottom', style='italic', weight='bold')
ax_middle.axis('off')

# Footer: author, data source and the date the figure was generated
ax_bot.text(s='By: @TJStats', x=0.1, y=0.5, fontsize=24, ha='left')
ax_bot.text(s='Data: MLB', x=0.9, y=0.5, fontsize=24, ha='right')
ax_bot.text(s=f"{datetime.datetime.today().strftime('%Y-%m-%d')}", x=0.5, y=0.5, fontsize=12, ha='center')

# Explanatory notes above the footer
ax_info.text(x=0.5, y=0, s='tjStuff+ calculates the Expected Run Value (xRV) of a pitch regardless of type\n'
                            'tjStuff+ is normally distributed, where 100 is the mean and Standard Deviation is 10\n',
                            ha='center', va='bottom', fontsize=24, style='italic')

# Show the plot
plt.show()


In [None]:
import datetime
import io
import urllib.request

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Relief pitchers (RP) only: number of pitches and mean tjStuff+ per pitching team
df_plot_team_rp = df_2024.filter(df_2024['position'] == 'RP').group_by(['pitcher_team']).agg(
    pl.col('tj_stuff_plus').len().alias('pitches'),
    pl.col('tj_stuff_plus').mean(),
)

# Attach each team's logo URL; a team missing from image_dict keeps its abbreviation
df_plot_team_rp = df_plot_team_rp.with_columns(
    pl.col('pitcher_team').map_elements(lambda x: image_dict.get(x, x)).alias('logo')
)

# Metric that is ranked and displayed in the leaderboard
stat_select = 'tj_stuff_plus'

# 13 x 7 grid: row 0 holds the title, rows 1-10 the teams, row 11 the notes and
# row 12 the footer. Columns 1, 3 and 5 hold the three leaderboard columns; the
# remaining narrow columns are spacers.
num_rows = 13
num_cols = 7

# Set the font style to Calibri
plt.rcParams['font.family'] = 'calibri'

fig = plt.figure(figsize=(25, 25), dpi=300)
gs = gridspec.GridSpec(num_rows, num_cols, figure=fig, width_ratios=[0.01, 0.4, 0.1, 0.4, 0.1, 0.4, 0.01])

# Rank teams from highest to lowest value of the selected metric
sorted_df = df_plot_team_rp.to_pandas().sort_values(by=stat_select, ascending=False).reset_index(drop=True)

# Grid cells for ranks 1-10 (left), 11-20 (middle) and 21-30 (right)
positions_left = [(i + 1, 1) for i in range(10)]
positions_middle = [(i + 1, 3) for i in range(10)]
positions_right = [(i + 1, 5) for i in range(10)]
positions = positions_left + positions_middle + positions_right

# Draw one small axes per team (at most 30, matching the number of grid cells)
for i, (_, team_row) in enumerate(sorted_df.head(30).iterrows()):
    logo_url = team_row['logo']
    row, col = positions[i]
    ax = fig.add_subplot(gs[row, col])

    # Identify the team with its logo. The image is downloaded explicitly and
    # handed to imread as a bytes buffer because current matplotlib releases
    # raise ValueError when imread is given a URL. If the download fails, or
    # the value is not a URL at all (team missing from image_dict), the team
    # abbreviation is written instead so the figure can still be produced.
    try:
        with urllib.request.urlopen(logo_url, timeout=10) as response:
            img = plt.imread(io.BytesIO(response.read()), format='png')
    except (OSError, ValueError):
        ax.text(0.5, 0.5, f"{team_row['pitcher_team']}", transform=ax.transAxes,
                ha='center', va='center', fontsize=36, weight='bold')
    else:
        ax.imshow(img, extent=[0.3, 0.8, 0.3, 0.8], origin='upper')
    ax.axis('off')

    # Rank number to the left of the logo, italicized
    ax.text(-1, 0.5, f'{i + 1}', transform=ax.transAxes, ha='center', va='center', fontsize=48, style='italic')

    # Metric value to the right of the logo, in bold
    ax.text(1.6, 0.5, f'{team_row[stat_select]:.0f}', transform=ax.transAxes, ha='left', va='center', fontsize=48, weight='bold')

# Invisible axes along the borders of the grid, used to anchor the title, notes and footer
ax_top = fig.add_subplot(gs[0, :])
ax_info = fig.add_subplot(gs[-2, :])
ax_bot = fig.add_subplot(gs[-1, :])
ax_left = fig.add_subplot(gs[:, 0])
ax_right = fig.add_subplot(gs[:, -1])
for border_ax in (ax_top, ax_bot, ax_left, ax_right, ax_info):
    border_ax.axis('off')

# Title, anchored to the bottom edge of the top row
ax_middle = fig.add_subplot(gs[0, :])
ax_middle.set_title(label='tjStuff+ by Team Relievers - 2024 MLB Season', x=0.5, y=0, fontsize=56, ha='center', va='bottom', style='italic', weight='bold')
ax_middle.axis('off')

# Footer: author, data source and the date the figure was generated
ax_bot.text(s='By: @TJStats', x=0.1, y=0.5, fontsize=24, ha='left')
ax_bot.text(s='Data: MLB', x=0.9, y=0.5, fontsize=24, ha='right')
ax_bot.text(s=f"{datetime.datetime.today().strftime('%Y-%m-%d')}", x=0.5, y=0.5, fontsize=12, ha='center')

# Explanatory notes above the footer
ax_info.text(x=0.5, y=0, s='tjStuff+ calculates the Expected Run Value (xRV) of a pitch regardless of type\n'
                            'tjStuff+ is normally distributed, where 100 is the mean and Standard Deviation is 10\n',
                            ha='center', va='bottom', fontsize=24, style='italic')

# Show the plot
plt.show()

### Scaling

We can look at the distribution of tjStuff+ as we aggregate to different levels. Recall that tjStuff+ is normally distributed with a mean of 100 and a standard deviation of 10 at the pitch level. As we aggregate, we average over larger and larger samples, which regresses tjStuff+ toward the mean. I do not rescale tjStuff+ after aggregation, so it is important to understand how the distribution of tjStuff+ varies at different aggregation levels.

The following plot illustrates how the distribution of tjStuff+ tightens as we aggregate.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Colours used for the curves, taken from dict_colour in its insertion order
color_list = list(dict_colour.values())

# Team-level table ordered from highest to lowest mean tjStuff+
df_2024_sp_rp = df_plot_team.to_pandas().sort_values(by='tj_stuff_plus', ascending=False)

# tjStuff+ at the three aggregation levels being compared:
#   - every individual pitch
#   - one row per pitcher (all pitch types combined, at least 100 pitches)
#   - one row per team
pitch_level = df_2024['tj_stuff_plus']
qualified_pitchers = df_pitch_all.filter((df_pitch_all['pitch_type'] == 'All') & (df_pitch_all['pitches'] >= 100))
pitcher_level = qualified_pitchers['tj_stuff_plus']
team_level = df_plot_team['tj_stuff_plus']

# Single axes that holds all three density curves
fig, ax = plt.subplots(1, 1, figsize=(10, 6), dpi=300)

# Draw one KDE per level; the curves are drawn on the current axes, which is the
# one created just above
for level_values, level_colour in (
    (pitch_level, color_list[7]),
    (pitcher_level, color_list[8]),
    (team_level, color_list[1]),
):
    level_values.to_pandas().plot.kde(bw_method=0.8, color=level_colour)

# Axis ranges: fixed window on the x-axis, y-axis starting at zero
ax.set_xlim(60, 140)
ax.set_ylim(0, None)

# Title and axis labels
ax.set_title('tjStuff+ Scales\nBy: Thomas Nestico, Data: MLB', fontsize=24)
ax.set_xlabel('tjStuff+')
ax.set_ylabel('Density')

# Legend entries follow the plotting order and report each level's standard deviation
legend_labels = [
    f"Pitch Level - σ = {pitch_level.std():.1f}",
    f"Pitcher Level - σ = {pitcher_level.std():.1f}",
    f"Team Level - σ = {team_level.std():.1f}",
]
ax.legend(legend_labels)

# Display the plot
plt.show()


### Park Factors

The physical characteristics of a pitch can vary depending on the environment. The most popular example of this is Coors Field in Colorado, which is notorious for its extreme elevation, sitting 5200 ft above sea level. Due to this elevation, the air is less dense in Colorado, which causes pitches to have less overall movement. This causes pitches in Colorado to be negatively affected in the calculation of tjStuff+.

The way I calculate the park factors is by first computing each team's tjStuff+ at home and on the road. After that, I transform the tjStuff+ values into their respective Z-Scores and then calculate the CDF probability of each. Finally, I divide the Home CDF by the Road CDF and multiply by 100 to get the Park Factor.

In [None]:
# Work on a pandas copy of the 2024 data for the home/away split
df_2024_pd_pf = df_2024.to_pandas()

# For every game, take the row with the earliest start_time; the team pitching on
# that row is treated as the home team
# NOTE(review): presumably this relies on the home team pitching first in a game - confirm
first_row_per_game = (
    df_2024_pd_pf
    .sort_values(by=['game_id', 'start_time'])
    .groupby(['game_id'])
    .head(1)
)
home_dict_id = first_row_per_game.set_index(['game_id'])['pitcher_team'].to_dict()

# Label every pitch with the home team of its game (stored as a string)
df_2024_pd_pf['home'] = df_2024_pd_pf['game_id'].map(home_dict_id).astype(str)

# True when the pitching team is the home team of that game
df_2024_pd_pf['pitcher_home'] = df_2024_pd_pf['pitcher_team'] == df_2024_pd_pf['home']

import scipy.stats as stats

def calculate_probability(z_score):
    """
    Return the standard normal CDF evaluated at ``z_score``.

    The value is passed straight to ``scipy.stats.norm.cdf``, so the result is
    the probability that a standard normal variable is less than or equal to
    ``z_score``.
    """
    return stats.norm.cdf(z_score)

# Step 1: Mean tjStuff+ per team, split by whether the pitcher was at home.
# After unstack() the columns are the booleans False (away) and True (home).
grouped_means = df_2024_pd_pf.groupby(['pitcher_team', 'pitcher_home'])['tj_stuff_plus'].mean().unstack()

# Step 2: Convert both means to z-scores on the tjStuff+ scale
# (100 subtracted, then divided by 10)
grouped_means['target_true'] = (grouped_means[True] - 100) / 10
grouped_means['target_false'] = (grouped_means[False] - 100) / 10

# Step 3: Convert the z-scores to CDF probabilities.
# 'true_prob' belongs to the home (True) z-score and 'false_prob' to the away
# (False) z-score; previously the two labels were attached the other way round.
grouped_means['false_prob'] = grouped_means['target_false'].apply(calculate_probability)
grouped_means['true_prob'] = grouped_means['target_true'].apply(calculate_probability)

# Step 4: Park factor = Home CDF / Away CDF * 100.
# Both probabilities are CDF values and therefore non-negative, so no abs() is needed.
grouped_means['park_factor'] = grouped_means['true_prob'] / grouped_means['false_prob'] * 100

# Convert the grouped_means DataFrame to a Polars DataFrame
grouped_means = pl.DataFrame(grouped_means.reset_index())

Let's plot Park Factors!

In [None]:
import datetime
import io
import urllib.request

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Attach each team's logo URL; a team missing from image_dict keeps its abbreviation
grouped_means = grouped_means.with_columns(
    pl.col('pitcher_team').map_elements(lambda x: image_dict.get(x, x)).alias('logo')
)

# Metric that is ranked and displayed in the leaderboard
stat_select = 'park_factor'

# 13 x 7 grid: row 0 holds the title, rows 1-10 the teams, row 11 the notes and
# row 12 the footer. Columns 1, 3 and 5 hold the three leaderboard columns; the
# remaining narrow columns are spacers.
num_rows = 13
num_cols = 7

# Set the font style to Calibri
plt.rcParams['font.family'] = 'calibri'

fig = plt.figure(figsize=(25, 25), dpi=300)
gs = gridspec.GridSpec(num_rows, num_cols, figure=fig, width_ratios=[0.01, 0.4, 0.1, 0.4, 0.1, 0.4, 0.01])

# Rank teams from highest to lowest park factor
sorted_df = grouped_means.to_pandas().sort_values(by=stat_select, ascending=False).reset_index(drop=True)

# Grid cells for ranks 1-10 (left), 11-20 (middle) and 21-30 (right)
positions_left = [(i + 1, 1) for i in range(10)]
positions_middle = [(i + 1, 3) for i in range(10)]
positions_right = [(i + 1, 5) for i in range(10)]
positions = positions_left + positions_middle + positions_right

# Draw one small axes per team (at most 30, matching the number of grid cells)
for i, (_, team_row) in enumerate(sorted_df.head(30).iterrows()):
    logo_url = team_row['logo']
    row, col = positions[i]
    ax = fig.add_subplot(gs[row, col])

    # Identify the team with its logo. The image is downloaded explicitly and
    # handed to imread as a bytes buffer because current matplotlib releases
    # raise ValueError when imread is given a URL. If the download fails, or
    # the value is not a URL at all (team missing from image_dict), the team
    # abbreviation is written instead so the figure can still be produced.
    try:
        with urllib.request.urlopen(logo_url, timeout=10) as response:
            img = plt.imread(io.BytesIO(response.read()), format='png')
    except (OSError, ValueError):
        ax.text(0.5, 0.5, f"{team_row['pitcher_team']}", transform=ax.transAxes,
                ha='center', va='center', fontsize=36, weight='bold')
    else:
        ax.imshow(img, extent=[0.3, 0.8, 0.3, 0.8], origin='upper')
    ax.axis('off')

    # Rank number to the left of the logo, italicized
    ax.text(-1, 0.5, f'{i + 1}', transform=ax.transAxes, ha='center', va='center', fontsize=48, style='italic')

    # Metric value to the right of the logo, in bold
    ax.text(1.6, 0.5, f'{team_row[stat_select]:.0f}', transform=ax.transAxes, ha='left', va='center', fontsize=48, weight='bold')

# Invisible axes along the borders of the grid, used to anchor the title, notes and footer
ax_top = fig.add_subplot(gs[0, :])
ax_info = fig.add_subplot(gs[-2, :])
ax_bot = fig.add_subplot(gs[-1, :])
ax_left = fig.add_subplot(gs[:, 0])
ax_right = fig.add_subplot(gs[:, -1])
for border_ax in (ax_top, ax_bot, ax_left, ax_right, ax_info):
    border_ax.axis('off')

# Title, anchored to the bottom edge of the top row
ax_middle = fig.add_subplot(gs[0, :])
ax_middle.set_title(label='tjStuff+ Park Factors - 2024 MLB Season', x=0.5, y=0, fontsize=56, ha='center', va='bottom', style='italic', weight='bold')
ax_middle.axis('off')

# Footer: author, data source and the date the figure was generated
ax_bot.text(s='By: @TJStats', x=0.1, y=0.5, fontsize=32, ha='left')
ax_bot.text(s='Data: MLB', x=0.9, y=0.5, fontsize=32, ha='right')
ax_bot.text(s=f"{datetime.datetime.today().strftime('%Y-%m-%d')}", x=0.5, y=0.5, fontsize=18, ha='center')

# Explanatory notes above the footer. The two lines are joined with an explicit
# newline so that source-code indentation does not end up inside the rendered
# text and push the second line off-centre.
ax_info.text(x=0.5, y=0, s='tjStuff+ Park Factors show the observed effect of tjStuff+ in the selected park\n'
                            'Park Factors are calculated by comparing Home tjStuff+ and Away tjStuff+ at the pitch level',
                            ha='center', va='bottom', fontsize=24, style='italic')

# Show the plot
plt.show()
