In [4]:
import pandas as pd
import numpy as np
from utils.text_mappings import pitch_types, events, pitch_outcomes, righty_lefty
from pybaseball import statcast

# Objective
We want to pull pitch-level Statcast data and see whether the outcome of a pitch can be predicted from it.

# Get pitch data

Starting from scratch, we need to pull the data with pybaseball's `statcast` function and save it to a CSV.

In [None]:
# One-time download: pull Statcast data for the date range through pybaseball
# and cache it as a CSV so later runs can skip the network call.
# NOTE(review): the file name reads '2020101' where the start date would give
# '20200101'; it is left untouched because the load cell reads this same path.
df = statcast(start_dt='2020-01-01', end_dt='2025-05-10')
df.to_csv('data/statcast/2020101_20250510.csv', index=False)

If we have already saved the CSV, we can skip the previous step and load it straight into a DataFrame. We will use the first 1 million pitches for our feature analysis.

In [5]:
# Load the cached Statcast CSV, keeping only the first 1,000,000 rows for the
# feature analysis.
# low_memory=False makes pandas infer each column's dtype from the whole
# column instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning that accompanies them).
df = pd.read_csv(
    'data/statcast/2020101_20250510.csv',
    nrows=1000000,
    low_memory=False,
)

  df = pd.read_csv('data/statcast/2020101_20250510.csv',nrows=1000000)


# Get pitch features
From the [Baseball Savant CSV documentation](https://baseballsavant.mlb.com/csv-docs) we read the column descriptions to figure out which columns are relevant to the pitch.

In [6]:
# Candidate Statcast columns to screen against the pitch outcome.
# The order is kept stable because later cells iterate over this list.
pitch_cols = [
    # pitch identity and sequencing
    "spin_axis", "pitch_number", "pitch_type", "at_bat_number",
    # release point, speed and spin
    "release_speed", "release_pos_y", "release_pos_x", "release_pos_z",
    "release_extension", "release_spin_rate", "effective_speed",
    # strike-zone bounds
    "sz_top", "sz_bot",
    # ax/ay/az and vx0/vy0/vz0 components
    "ax", "ay", "az",
    "vx0", "vy0", "vz0",
    # game situation
    "outs_when_up", "inning", "on_1b", "on_2b", "on_3b",
    # plate location, movement, zone and arm angle
    "plate_x", "plate_z", "pfx_x", "pfx_z", "zone", "arm_angle",
    # break columns
    "api_break_z_with_gravity", "api_break_x_arm", "api_break_x_batter_in",
    # count and handedness
    "balls", "strikes", "p_throws", "stand",
]

We hypothesize that data about the preceding pitch is relevant to the outcome. Statcast does not provide these columns, so we will add them ourselves.

In [7]:
# Append the engineered previous-pitch and delta columns to the candidate
# list (in place, so the same list object keeps growing).
# NOTE(review): 'previous_pitch_type' is computed further down but is not
# listed here, so it is never screened - confirm whether that is intended.
pitch_cols += [
    'previous_pitch_speed', 'previous_zone',
    'previous_plate_x', 'previous_plate_z',
    'delta_speed', 'delta_plate_x', 'delta_plate_z',
]

We built mappings in `utils/text_mappings.py` to convert text values to integers (friendlier for machine learning).

In [8]:
# Encode the text columns as integers using the lookup tables imported from
# utils.text_mappings. Columns are addressed with df[...] rather than
# attribute access, which is the form pandas recommends for assignment and
# avoids any clash between a column name such as `type` and an attribute.
column_mappings = {
    'pitch_type': pitch_types,
    'type': pitch_outcomes,
    'p_throws': righty_lefty,
    'stand': righty_lefty,
}
for column, mapping in column_mappings.items():
    # Skip columns that are already numeric so that re-running this cell does
    # not push the encoded integers through the mapping a second time, which
    # would turn them into NaN (assuming the mappings are keyed by the
    # original text values).
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    df[column] = df[column].map(mapping)

Now let's get the previous pitch values and the delta (changing speeds and eye level)

In [9]:
# Order pitches chronologically inside each at-bat, then group by at-bat so
# that shift(1) only ever looks back at a pitch from the same plate appearance.
df = df.sort_values(by=['game_pk', 'at_bat_number', 'pitch_number'])
df_grouped = df.groupby(['game_pk', 'at_bat_number'])

# New lagged column -> source column it is shifted from.
# The first pitch of every at-bat has no predecessor, so its value is NaN.
lagged_sources = {
    'previous_pitch_speed': 'release_speed',
    'previous_pitch_type': 'pitch_type',
    'previous_zone': 'zone',
    'previous_plate_x': 'plate_x',
    'previous_plate_z': 'plate_z',
}
for lagged_col, source_col in lagged_sources.items():
    df[lagged_col] = df_grouped[source_col].shift(1)

# Change from the previous pitch to the current one: (new column, current, previous).
delta_specs = (
    ('delta_speed', 'release_speed', 'previous_pitch_speed'),
    ('delta_plate_x', 'plate_x', 'previous_plate_x'),
    ('delta_plate_z', 'plate_z', 'previous_plate_z'),
)
for delta_col, current_col, previous_col in delta_specs:
    df[delta_col] = df[current_col] - df[previous_col]

Clean up missing data by replacing every missing value with 0

In [10]:
# Replace every missing value in every column of df with 0, in place.
# NOTE(review): this also zero-fills the previous_* and delta_* columns, which
# are NaN on the first pitch of each at-bat because shift(1) has nothing to
# look back to. A 0 there reads as a real value (e.g. a 0 mph previous pitch)
# and may be inflating those features' correlations below; consider leaving
# them as NaN, since Series.corr should skip missing pairs. TODO confirm.
df.fillna(0,inplace=True)

# Correlation Analysis
The 'type' column tells us the outcome (strike, ball, or in-play). Let's see which features are correlated with it.

In [12]:
# Correlation of each candidate column with the encoded outcome column 'type'
# (Series.corr with its default method).
# Best effort: a column that cannot be correlated is reported and skipped
# rather than aborting the whole scan.
correlations = {}
outcome = df['type']
for column in pitch_cols:
    try:
        coefficient = outcome.corr(df[column])
    except Exception as e:
        print(column, e)
    else:
        correlations[column] = coefficient

We treat a feature as relevant for predicting pitch outcomes when the absolute value of its Pearson coefficient is greater than 0.01

In [14]:
# Keep the columns whose absolute correlation with the outcome exceeds 0.01.
# A NaN coefficient fails the comparison and is therefore dropped.
best_pitch_features = [
    name for name, coefficient in correlations.items()
    if abs(coefficient) > 0.01
]
# Report each retained feature alongside its coefficient.
for name in best_pitch_features:
    print(name, correlations[name])

pitch_number 0.07330644098533459
release_speed -0.029638011497940896
release_pos_y -0.026305739977135388
release_pos_z -0.021160093206034403
release_extension -0.025276291714847073
release_spin_rate -0.03862248748520818
effective_speed -0.02949614794427336
sz_top -0.028797222524809835
sz_bot -0.027833055741728677
ay -0.03217957319285074
az -0.012062423280205748
vy0 0.02972906974223812
plate_z -0.03388905875089911
pfx_z -0.019994139157613295
zone 0.11357738760429625
arm_angle -0.01123327317239702
api_break_x_arm 0.024716332121180674
api_break_x_batter_in -0.02145324748575749
balls 0.035821581564858394
strikes 0.09080537101589876
previous_pitch_speed 0.07173248928225641
previous_zone 0.048646983747355994
previous_plate_z 0.04774956104294738


Some of these features may be redundant. Let's try to remove the ones that are also highly correlated with each other.

In [15]:
# Pairwise Pearson correlations between the shortlisted features, displayed
# so that redundant (highly inter-correlated) ones can be spotted.
df.loc[:, best_pitch_features].corr(method='pearson')

Unnamed: 0,pitch_number,release_speed,release_pos_y,release_pos_z,release_extension,release_spin_rate,effective_speed,sz_top,sz_bot,ay,...,pfx_z,zone,arm_angle,api_break_x_arm,api_break_x_batter_in,balls,strikes,previous_pitch_speed,previous_zone,previous_plate_z
pitch_number,1.0,0.063505,0.054093,0.032298,0.063818,0.020978,0.060194,0.039173,0.025245,0.04813,...,-0.000828,0.026212,0.011415,-0.006815,0.003697,0.809934,0.790961,0.63795,0.503296,0.473129
release_speed,0.063505,1.0,0.866676,0.721727,0.736119,0.468739,0.914878,0.792583,0.745629,0.852738,...,0.437309,0.156209,0.108799,0.317946,0.009238,0.076629,0.030516,0.190303,0.139596,0.128826
release_pos_y,0.054093,0.866676,1.0,0.814058,0.738852,0.504755,0.782041,0.900045,0.841675,0.617808,...,0.09877,0.25866,0.080976,0.050923,-0.016433,0.043064,0.049193,0.19571,0.15123,0.148718
release_pos_z,0.032298,0.721727,0.814058,1.0,0.573423,0.400213,0.646742,0.738416,0.684591,0.524372,...,0.163005,0.197322,0.157549,0.01845,-0.005631,0.026307,0.028602,0.15071,0.111007,0.118306
release_extension,0.063818,0.736119,0.738852,0.573423,1.0,0.464044,0.827633,0.686132,0.660352,0.573013,...,0.170699,0.192333,0.263833,0.105255,-0.007078,0.056809,0.049872,0.170125,0.128323,0.126319
release_spin_rate,0.020978,0.468739,0.504755,0.400213,0.464044,1.0,0.512332,0.441886,0.425126,0.385974,...,0.013301,0.103208,0.144469,-0.211702,0.060757,0.014435,0.022828,0.088433,0.059078,0.064817
effective_speed,0.060194,0.914878,0.782041,0.646742,0.827633,0.512332,1.0,0.714784,0.676842,0.773496,...,0.407579,0.139246,0.140204,0.293838,0.010266,0.072588,0.028615,0.175232,0.128827,0.118442
sz_top,0.039173,0.792583,0.900045,0.738416,0.686132,0.441886,0.714784,1.0,0.933187,0.576079,...,0.114426,0.24952,0.062039,0.063431,-0.00812,0.032208,0.035615,0.1688,0.137004,0.133266
sz_bot,0.025245,0.745629,0.841675,0.684591,0.660352,0.425126,0.676842,0.933187,1.0,0.542509,...,0.100031,0.267033,0.118586,0.055712,-0.011927,0.022393,0.022142,0.149241,0.123579,0.119141
ay,0.04813,0.852738,0.617808,0.524372,0.573013,0.385974,0.773496,0.576079,0.542509,1.0,...,0.51115,0.06806,0.128317,0.387441,0.020981,0.069494,0.013753,0.141995,0.096987,0.091019


- Drop pitch_number - highly correlated with balls and strikes
- Drop release_speed, release_extension, and vy0 - highly correlated with release_pos and effective_speed
- Drop az - highly correlated with pfx_z and less correlated with the pitch outcome

In [16]:
# Features judged redundant from the correlation matrix above.
redundant_features = {
    'pitch_number', 'release_speed', 'vy0', 'az', 'release_extension',
}
# Everything that survived the outcome-correlation screen, minus the
# redundant ones, in the original order.
final_pitch_features = [
    feat for feat in best_pitch_features
    if feat not in redundant_features
]

# Final features

In [17]:
# Show the final feature list as the cell output.
final_pitch_features

['release_pos_y',
 'release_pos_z',
 'release_spin_rate',
 'effective_speed',
 'sz_top',
 'sz_bot',
 'ay',
 'plate_z',
 'pfx_z',
 'zone',
 'arm_angle',
 'api_break_x_arm',
 'api_break_x_batter_in',
 'balls',
 'strikes',
 'previous_pitch_speed',
 'previous_zone',
 'previous_plate_z']