In [1]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader

from models.pitch_grader_mlp import PitchGraderMLP
from utils.dataloader import BaseballDataset
from utils.grader import grade_pitcher, grade_batter

# Pitch grader

## Load data
Get the player id info mapped to embedding index

In [2]:
# Player-id -> embedding-index lookups, one JSON file per role.
# JSON object keys are strings, so the player ids arrive as `str`.
with open("data/preprocessed/batter_map_2015_2024.json", "r") as batter_map_file:
    batter_map = json.load(batter_map_file)

with open("data/preprocessed/pitcher_map_2015_2024.json", "r") as pitcher_map_file:
    pitcher_map = json.load(pitcher_map_file)

In [3]:
# Player lookup tables. Later cells read `key_mlbam`, `name_first` and
# `name_last` from them and use their lengths to size the model's embeddings.
pitchers_csv = "data/preprocessed/pitchers.csv"
batters_csv = "data/preprocessed/batters.csv"

pitchers = pd.read_csv(pitchers_csv, index_col=False)
batters = pd.read_csv(batters_csv, index_col=False)

Load the validation pitch data from 2024

In [4]:
# Saved NumPy arrays for the validation split: inputs and outcome labels.
# (File name suggests the inputs are already normalised -- TODO confirm.)
x_data_file, y_data_file = (
    "data/preprocessed/X_val_norm.npy",
    "data/preprocessed/Y_val.npy",
)

In [5]:
# Wrap the validation arrays in the project's Dataset class.
full_dataset = BaseballDataset(x_data_file, y_data_file)

# Fixed batch order (no shuffling): this loader is only used for evaluation.
test_loader = DataLoader(
    dataset=full_dataset,
    batch_size=512,
    shuffle=False,
    num_workers=4,
)

# Load model
Instantiate our pitch grading MLP model with the same parameters as during training

In [13]:
# --- Architecture settings -------------------------------------------------
# These are meant to mirror the values used when the checkpoint was trained.
hidden_dim = 64
# Size of the first axis of the first sample's input tensor.
# NOTE: a later cell rebinds `pitch_features` to a list of column names, so
# re-run this cell before rebuilding the model.
pitch_features = full_dataset[0][0].shape[0]
output_dim = 3  # three outcome classes: strike, ball, in-play

# --- Training-time settings ------------------------------------------------
# Not read anywhere else in this notebook; kept as a record of the training run.
batch_size = 512
epochs = 100
train_split = 0.8
test_split = 0.2
learning_rate = 0.001
verbose = True

# Use the GPU when one is visible, otherwise fall back to CPU so the notebook
# still runs end-to-end on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = PitchGraderMLP(
    pitch_features,
    len(batters), len(pitchers),  # one embedding row per known batter / pitcher (presumably -- see PitchGraderMLP)
    hidden_dim,
    output_dim
)

Load our checkpoint after training

In [14]:
checkpoint_path = 'models/checkpoints/pitch_grader_500.pt'

# `map_location` remaps the stored tensors onto the configured device, so a
# checkpoint written on a GPU can still be opened when `device` is different.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(state_dict)

model = model.to(device)
# Stored as a plain attribute on the model; presumably read by the grader
# helpers to place their inputs -- TODO confirm in utils.grader.
model.device = device

# Evaluation
Evaluate the model on the test dataset and print the classification report

Our classes are {0: strike, 1: ball, 2: in-play}

In [15]:
# Switch the network to evaluation mode and collect class predictions for the
# whole validation loader, then print per-class precision / recall / F1.
model.eval()

batch_preds = []
batch_labels = []
with torch.no_grad():  # inference only; no autograd bookkeeping needed
    for batch_x, batch_y in test_loader:
        logits = model(batch_x.to(device))
        # argmax over the raw logits: softmax is monotonic, so applying it
        # first would not change which class scores highest.
        batch_preds.append(logits.argmax(dim=1).cpu())
        # Labels are only compared on the host, so they never need to be
        # moved to the accelerator; .cpu() is a no-op when already there.
        batch_labels.append(batch_y.cpu())

# One concatenation per list instead of per-batch host/device round trips.
pred = torch.cat(batch_preds).numpy()
y_test = torch.cat(batch_labels).numpy()
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.71      0.61      0.65    354608
         1.0       0.71      0.91      0.80    272498
         2.0       0.44      0.36      0.40    134214

    accuracy                           0.67    761320
   macro avg       0.62      0.62      0.62    761320
weighted avg       0.66      0.67      0.66    761320



Not surprisingly, we can predict balls very well, as this is largely determined by the 'zone' feature. For most classification tasks an F1-score of 0.4 would be considered quite poor, but predicting whether a pitch will be put in play is a very difficult task with many variables outside the pitch itself. We consider this acceptable for now and will evaluate the model further by analyzing its correlation with other known statistics.

# Grade players pitches

We train the model to classify pitch outcomes to force the model to learn something about the quality of the pitches. 

We don't really care about the accuracy of predicting pitch outcomes, we care that the model can give us a good assessment of the quality, or hittability, of the pitches.

## Grade each player's pitches
For each player's pitches compute the average probability of their pitches being classified as ball, strike, or in-play.

In [16]:
# Seasons to grade (inclusive on both ends).
start_year, end_year = 2015, 2024

# Columns handed to the grader helpers. This rebinds `pitch_features`, which
# previously held the input width used to build the model.
pitch_features = [
    'batter',
    'pitcher',
    'release_pos_y',
    'release_pos_z',
    'release_spin_rate',
    'effective_speed',
    'sz_top',
    'sz_bot',
    'ay',
    'plate_z',
    'pfx_z',
    'zone',
    'arm_angle',
    'api_break_x_arm',
    'api_break_x_batter_in',
    'previous_pitch_speed',
    'previous_zone',
    'previous_plate_z',
    'balls',
    'strikes',
]

# Pitch-level table; `game_date` is parsed so pitches can be split by season.
df = pd.read_csv(
    "data/statcast/2015-2024_preproc_pitch_outcomes.csv",
    index_col=False,
)
df['game_date'] = pd.to_datetime(df['game_date'])

# Header-less two-column files: first column becomes the index, the second
# (label 1) holds the value. Presumably per-feature min/max used by the
# grader helpers for normalisation -- TODO confirm in utils.grader.
X_min = pd.read_csv("data/preprocessed/X_min.csv", header=None, index_col=0)[1]
X_max = pd.read_csv("data/preprocessed/X_max.csv", header=None, index_col=0)[1]

In [17]:
# Column of the grader output that backs each stored score. Insertion order
# (strikes, balls, in-play) matters: later cells label the columns positionally.
outcome_columns = {"strikes": 0, "balls": 1, "in-play": 2}

# pitcher id -> {year -> {"strikes": ..., "balls": ..., "in-play": ...}}
pitcher_grades = {pitcher_id: {} for pitcher_id in pitcher_map}

for year in range(start_year, end_year + 1):
    # Restrict the pitch table to one season before grading.
    df_year = df[df['game_date'].dt.year == year]
    for pitcher_id, pitcher_index in pitcher_map.items():
        grades = grade_pitcher(
            model, df_year, pitcher_index, pitch_features, X_min, X_max,
        )
        # Average each output column over all graded pitches.
        pitcher_grades[pitcher_id][year] = {
            label: grades[:, column].mean(dim=0).item()
            for label, column in outcome_columns.items()
        }

In [18]:
# Column of the grader output that backs each stored score. Insertion order
# (strikes, balls, in-play) matters: later cells label the columns positionally.
outcome_columns = {"strikes": 0, "balls": 1, "in-play": 2}

# batter id -> {year -> {"strikes": ..., "balls": ..., "in-play": ...}}
batter_grades = {batter_id: {} for batter_id in batter_map}

for year in range(start_year, end_year + 1):
    # Restrict the pitch table to one season before grading.
    df_year = df[df['game_date'].dt.year == year]
    for batter_id, batter_index in list(batter_map.items()):
        grades = grade_batter(
            model, df_year, batter_index, pitch_features, X_min, X_max
        )
        # Average each output column over all graded pitches.
        batter_grades[batter_id][year] = {
            label: grades[:, column].mean(dim=0).item()
            for label, column in outcome_columns.items()
        }

# Analyze results of pitch grading

## Add classical stats and build DataFrame

In [19]:
def _grades_to_frame(grades_by_player, player_info):
    """Flatten nested grades into one row per (player, year) and attach player info.

    `grades_by_player` is the {player id: {year: {score name: value}}} dict
    built by the grading cells; `player_info` is the matching lookup table
    keyed by `key_mlbam`.
    """
    frame = pd.DataFrame.from_dict(grades_by_player, orient='index')
    # stack() turns the year columns into a second index level; each cell is
    # a dict of three scores, which apply(pd.Series) expands into columns.
    frame = frame.stack().apply(pd.Series).reset_index()
    frame.columns = ['key_mlbam', 'year', 'Strikes', 'Balls', 'In-play']
    # Ids came from JSON keys (strings); cast so they join against the table.
    frame.key_mlbam = frame.key_mlbam.astype(np.int64)
    return frame.join(player_info.set_index('key_mlbam'), on='key_mlbam')


pitchers['fullname'] = pitchers.name_first + " " + pitchers.name_last
batters['fullname'] = batters.name_first + " " + batters.name_last

p_df = _grades_to_frame(pitcher_grades, pitchers)
b_df = _grades_to_frame(batter_grades, batters)


In [20]:
# Upper bound applied to the rate stats below (presumably to limit the pull
# of extreme values on the correlations -- TODO confirm the intent).
STAT_CAP = 5
# Only pitcher-seasons with more than this many innings pitched are kept.
MIN_INNINGS_PITCHED = 50

# NOTE: this cell is not idempotent -- it merges into `p_df`, so re-run the
# cell that builds `p_df` before re-running this one.
ps = pd.read_csv('data/stats/pitching_stats_2015_2024.csv', index_col=False)
ps = ps.rename(columns={'mlbID': 'key_mlbam'})
p_df = pd.merge(p_df, ps, on=['key_mlbam', 'year'], how='inner')

# Derived per-inning rates and the hittability ratio.
p_df['h_per_ip'] = p_df['H'] / p_df['IP']
p_df['bb_per_ip'] = p_df['BB'] / p_df['IP']
p_df['Hittability'] = p_df['In-play'] / p_df['Strikes']

# Vectorised cap; NaN stays NaN and +inf (division by zero IP) becomes the cap.
capped_columns = ['ERA', 'WHIP', 'bb_per_ip', 'h_per_ip']
p_df[capped_columns] = p_df[capped_columns].clip(upper=STAT_CAP)

p_df = p_df.dropna()
# .copy() detaches the filtered rows from the parent frame, so later cells
# can sort or add columns without pandas' SettingWithCopyWarning.
p_df = p_df[p_df['IP'] > MIN_INNINGS_PITCHED].copy()

In [21]:
# Only batter-seasons with more than this many plate appearances are kept.
MIN_PLATE_APPEARANCES = 400

# NOTE: this cell is not idempotent -- it merges into `b_df`, so re-run the
# cell that builds `b_df` before re-running this one.
bs = pd.read_csv("data/stats/batting_stats_2015_2024.csv", index_col=False)
bs = bs.rename(columns={'mlbID': 'key_mlbam'})
b_df = pd.merge(b_df, bs, on=['key_mlbam', 'year'], how='inner')

# Derived per-plate-appearance rates and the hittability ratio.
b_df['so_per_pa'] = b_df['SO'] / b_df['PA']
b_df['hr_per_pa'] = b_df['HR'] / b_df['PA']
b_df['Hittability'] = b_df['In-play'] / b_df['Strikes']

# Batting average pooled over every merged season of a player. It is computed
# before the plate-appearance filter below, so short seasons count too.
aggregates = b_df.groupby('key_mlbam').agg({'H': "sum", "AB": "sum"})
aggregates['BA_total'] = aggregates['H'] / aggregates['AB']
b_df = b_df.join(aggregates.drop(["H", "AB"], axis=1), on='key_mlbam')

# 1 for right-handed batters, 0 otherwise. The .str accessor passes missing
# values through instead of raising, and those rows are removed by dropna().
b_df['Handedness'] = b_df['Bats'].str.strip().eq('Right').astype(int)

b_df = b_df.dropna()
# .copy() detaches the filtered rows from the parent frame, so later cells
# can sort or add columns without pandas' SettingWithCopyWarning.
b_df = b_df[b_df['PA'] > MIN_PLATE_APPEARANCES].copy()

## Verify our model with correlation analysis
We expect our model to learn a latent understanding of the hittability of a pitch, we see how well it correlates to well-known stats to see if it learned what we intended.

Due to the nature of baseball and the unpredictability involved, we consider a correlation (pearson coefficient) around +-0.25 to be a strong relationship and a correlation above +-0.1 to be a moderate relationship

### Pitcher analysis
We expect pitchers with high 'Strikes' score to get a lot of swing and misses and correlate with the strikeouts per 9 innings (SO9)

We observe a strong positive correlation between the average 'Strikes' score and strikeouts per 9 innings, which matches our expectation.

In [35]:
# Pearson correlation (pandas' default, spelled out here) between the
# average 'Strikes' score and strikeouts per nine innings.
p_df['Strikes'].corr(p_df['SO9'], method='pearson')

np.float64(0.5083266154578803)

We expect pitchers with high 'balls' score to walk a lot of batters, let's look at their walks per inning

We observe a strong positive correlation between the average balls score and the number of walks per inning, this matches our expectation

In [31]:
# Pearson correlation (pandas' default, spelled out here) between the
# average 'Balls' score and walks per inning pitched.
p_df['Balls'].corr(p_df['bb_per_ip'], method='pearson')

np.float64(0.40584589957216427)

We expect pitchers who throw a lot of pitches that are put in play to give up more hits.

We observe a strong positive correlation between the average 'In-play' score and the number of hits per inning, which matches our expectation.

In [37]:
# Pearson correlation (pandas' default, spelled out here) between the
# average 'In-play' score and hits allowed per inning pitched.
p_df['In-play'].corr(p_df['h_per_ip'], method='pearson')

np.float64(0.303481883459113)

We define hittability as the ratio of 'In-play' score to 'Strikes' score. Intuitively we can understand it as what percentage of pitches around the strike zone are likely to be put in play.

We sort by hittability and view the top 5 and bottom 5 pitchers, notice the SO9 for the top 5 vs the bottom 5

In [33]:
# Ascending sort: the least hittable pitcher-seasons come first.
p_df = p_df.sort_values(by='Hittability')

pitcher_display_columns = ['name_last', 'name_first', 'Hittability', 'SO9']
p_df[pitcher_display_columns]

Unnamed: 0,name_last,name_first,Hittability,SO9
4046,kopech,michael,0.327511,11.5
1174,kimbrel,craig,0.352990,14.1
3544,hader,josh,0.366958,12.9
4388,díaz,alexis,0.371905,11.7
3543,hader,josh,0.375769,13.6
...,...,...,...,...
188,kintzler,brandon,0.883140,6.4
3565,suárez,ranger,0.911490,9.1
1058,alexander,scott,0.920258,7.6
439,price,david,0.932272,7.1


### Batter analysis
Let's look further at how these pitches impact the batters, we expect very good power hitters would get fewer hittable pitches. We will use the HR for a season to approximate good power hitters.

We observe a moderate negative correlation between HR and hittability, indicating that good power hitters may see fewer hittable pitches

In [26]:
# Pearson correlation (pandas' default, spelled out here) between a batter's
# hittability score and his home-run total for the season.
b_df['Hittability'].corr(b_df['HR'], method='pearson')

np.float64(-0.19715543396133325)

We expect hitters who lay down a lot of bunts are likely not as feared and may get more hittable pitches

We observe a moderate positive correlation between sacrifice bunts and hittability score

In [27]:
# Pearson correlation (pandas' default, spelled out here) between a batter's
# hittability score and his sacrifice-bunt (SH) total for the season.
b_df['Hittability'].corr(b_df['SH'], method='pearson')

np.float64(0.1575090698272386)

Now we sort by hittability and view the top 5 and bottom 5 batters; notice the HR and SLG for the top 5 vs. the bottom 5

In [38]:
# Ascending sort: batter-seasons that saw the least hittable pitches come first.
b_df = b_df.sort_values(by='Hittability')

batter_display_columns = ['year', 'name_last', 'name_first', 'BA', 'SLG', 'HR', 'Hittability']
b_df[batter_display_columns]

Unnamed: 0,year,name_last,name_first,BA,SLG,HR,Hittability
2692,2024,seager,corey,0.278,0.512,30,0.200747
2049,2024,judge,aaron,0.311,0.678,61,0.229150
3552,2024,wong,connor,0.280,0.425,13,0.266880
2691,2023,seager,corey,0.326,0.630,39,0.289148
2690,2022,seager,corey,0.245,0.455,33,0.304270
...,...,...,...,...,...,...,...
1420,2015,strange-gordon,dee,0.333,0.418,4,0.786084
2501,2015,panik,joe,0.312,0.455,8,0.792935
3178,2021,hampson,garrett,0.234,0.380,11,0.796280
1925,2015,gyorko,jedd,0.247,0.397,16,0.823698


# Adjusted Batting Average

The main idea for adjusting a player's batting average is to account for the average difficulty of pitches they faced. We expect a batter that faced harder pitches to have a lower batting average, so we will adjust it upwards to reflect their true skill. Similarly, we will adjust the batting average downwards for batters that faced easier pitches. 

We can apply this same idea to adjust other relevant batting statistics. 

We build a regression model where a batter's actual batting average is the dependent variable and the hittability score is an independent variable, along with other factors that influence batting average as control variables (the batter's overall batting average across the seasons in our data, handedness, batter age, and 'Balls' score). 

In [39]:
# Ordinary least squares: season batting average explained by the hittability
# score plus controls (pooled batting average, handedness as a categorical,
# 'Balls' score and age). `smf` is imported with the other imports at the top.
#
# NOTE(review): this rebinds `model`, which earlier cells use for the PyTorch
# pitch grader. After running this cell the grading cells cannot be re-run
# without reloading the network; consider renaming this variable (together
# with the cell that reads `model.params`).
model = smf.ols(
    'BA ~ Hittability + BA_total + C(Bats) + Balls + Age',
    data=b_df,
).fit()

# Fitted coefficient on the hittability score.
model.params['Hittability']

np.float64(0.0738124636327642)

To remove the effect of the average pitch hittability, we subtract from (or add to) the actual batting average the portion that can be explained by the batter facing pitches that were easier (or tougher) than the league average. 

In [40]:
# Regression slope: change in BA per unit of hittability.
beta_hittability = model.params['Hittability']

# How far each batter-season sits from the mean hittability of the rows
# currently in `b_df` (i.e. after the plate-appearance filter).
hittability_vs_mean = b_df['Hittability'] - b_df['Hittability'].mean()

# Remove the share of BA attributed to seeing easier/tougher pitches.
b_df['adjusted_ba'] = b_df['BA'] - (beta_hittability * hittability_vs_mean)

Now we compute the size of the adjustment for each season and the range of adjustments for each player across seasons

In [41]:
# Signed size of the adjustment for every batter-season.
b_df['adjustment_diff'] = b_df['adjusted_ba'] - b_df['BA']

# Spread between a player's largest and smallest adjustment across seasons.
diff_by_player = b_df.groupby('key_mlbam')['adjustment_diff']
maxes = diff_by_player.max()
mins = diff_by_player.min()
diffs = (maxes - mins).to_frame(name='adjustment_range')

# Broadcast the per-player range back onto each of that player's seasons.
# NOTE: re-running this cell merges a second `adjustment_range` column.
b_df = pd.merge(b_df, diffs, on=['key_mlbam'], how='left')

The level of adjustment can be used to identify batters who are seeing abnormal amounts of hittable pitches

We observe the top 5 and bottom 5 player seasons based on batting average adjustment:

In [55]:
# Ascending sort: the largest downward adjustments come first, the largest
# upward adjustments last.
b_df = b_df.sort_values(by='adjustment_diff')

adjustment_columns = [
    'key_mlbam', 'year', 'name_last', 'name_first', 'Hittability',
    'BA', 'BA_total', 'adjusted_ba', 'adjustment_diff',
]
b_df[adjustment_columns]

Unnamed: 0,key_mlbam,year,name_last,name_first,Hittability,BA,BA_total,adjusted_ba,adjustment_diff
1805,542255,2015,inciarte,ender,0.841735,0.303,0.280384,0.287296,-0.015704
1804,576397,2015,gyorko,jedd,0.823698,0.247,0.251245,0.232627,-0.014373
1803,641658,2021,hampson,garrett,0.796280,0.234,0.241228,0.221651,-0.012349
1802,605412,2015,panik,joe,0.792935,0.312,0.259466,0.299898,-0.012102
1801,543829,2015,strange-gordon,dee,0.786084,0.333,0.292492,0.321403,-0.011597
...,...,...,...,...,...,...,...,...,...
4,608369,2022,seager,corey,0.304270,0.245,0.289603,0.268967,0.023967
3,608369,2023,seager,corey,0.289148,0.326,0.289603,0.351083,0.025083
2,657136,2024,wong,connor,0.266880,0.280,0.255973,0.306727,0.026727
1,592450,2024,judge,aaron,0.229150,0.311,0.284971,0.340512,0.029512


We see the biggest negative adjustment is for Ender Inciarte's 2015 season, where he batted .303; after adjustment this falls to .287, which is close to his career average. Joe Panik and Dee Gordon have similar adjustments, indicating their career-best seasons were partially attributable to seeing easier pitches.

Jedd Gyorko and Garrett Hampson seem to be having an even worse year than their batting average indicates. One possible explanation for their high hittability score is that pitchers see them as a quick out and don't want to waste pitches by throwing outside the strike zone. 

On the other side of the spectrum, Aaron Judge's impressive 2024 is even more impressive after considering how tough the pitches he is seeing are. Corey Seager is not getting anything to hit after joining the Rangers.

Now we can look at the range of adjustments across players' seasons to see if there were any significant changes, possibly due to lineup change or trades.

Interestingly the player with the highest adjustment range is Corey Seager, indicating his time with the Dodgers saw more favorable pitches

In [67]:
# Ascending sort, so the player with the widest adjustment range ends up in
# the final row; show just that row.
b_df = b_df.sort_values(by='adjustment_range')

range_columns = ['key_mlbam', 'name_last', 'name_first', 'adjustment_range']
b_df[range_columns].tail(1)

Unnamed: 0,key_mlbam,name_last,name_first,adjustment_range
0,608369,seager,corey,0.033733


If we look at Seager's hittability score by year, we see a noticeable drop once he moves from the Dodgers to the Rangers. He also suffered a career low in batting average in 2022, but our adjustment indicates a large part of the dropoff was due to facing harder pitches.

In [72]:
# Chronological order, then one player's seasons: key_mlbam 608369 is the id
# listed for Corey Seager in the tables above.
b_df = b_df.sort_values(by='year')

seager_columns = [
    'year', 'Tm', 'Hittability', 'name_last', 'name_first',
    'BA', 'BA_total', 'OPS', 'adjusted_ba', 'adjustment_diff',
]
b_df.loc[b_df['key_mlbam'] == 608369, seager_columns]

Unnamed: 0,year,Tm,Hittability,name_last,name_first,BA,BA_total,OPS,adjusted_ba,adjustment_diff
505,2016,Los Angeles,0.594525,seager,corey,0.308,0.289603,0.877,0.310543,0.002543
824,2017,Los Angeles,0.622797,seager,corey,0.295,0.289603,0.854,0.295456,0.000456
1194,2019,Los Angeles,0.657179,seager,corey,0.272,0.289603,0.817,0.269918,-0.002082
1198,2021,Los Angeles,0.657752,seager,corey,0.292,0.289603,0.883,0.289876,-0.002124
4,2022,Texas,0.30427,seager,corey,0.245,0.289603,0.772,0.268967,0.023967
3,2023,Texas,0.289148,seager,corey,0.326,0.289603,1.028,0.351083,0.025083
0,2024,Texas,0.200747,seager,corey,0.278,0.289603,0.864,0.309608,0.031608


This example illustrates how a quantified adjustment for pitch difficulty can provide more context for evaluating a player's season. 

# Conclusion

To summarize our analysis so far, we use Statcast data and real outcomes to score pitch difficulty. We show this score correlates well with a pitcher's strikeout ability, which can be useful for evaluating pitchers based solely on Statcast data. Additionally, we use this score to quantify its impact on a specific batting statistic, batting average, and adjust the statistic to normalize for pitch difficulty. We look at some of the larger adjustments and find that the score can be used to identify flukey seasons and to see how a player's pitch difficulty is impacted by the team around him.