In [1]:
!pip install pybaseball pandas

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting lxml>=4.2.1 (from pybaseball)
  Downloading lxml-6.0.2-cp312-cp312-win_amd64.whl.metadata (3.7 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading pygithub-2.8.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading pynacl-1.6.1-cp38-abi3-win_amd64.whl.metadata (10 kB)
Collecting pyjwt>=2.4.0 (from pyjwt[crypto]>=2.4.0->pygithub>=1.51->pybaseball)
  Downloading PyJWT-2.10.1-py3-none-any.whl.metadata (4.0 kB)
Collecting cryptography>=3.4.0 (from pyjwt[crypto]>=2.4.0->pygithub>=1.51->pybaseball)
  Downloading cryptography-46.0.3-cp311-abi3-win_amd64.whl.metadata (5.7 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
Downloading lxml-6.0.2-cp312-cp312-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------- 4.0/4.0 MB 30.0 MB/s  0:00:00
Downlo

In [3]:
from pybaseball import statcast
import pandas as pd

# Download Statcast data for 2023 regular season (March 30 to Oct 1)
data = statcast(start_dt="2023-03-30", end_dt="2023-10-01")

print(f"✅ Download complete! Number of pitches: {len(data)}")
data.head()

This is a large query, it may take a moment to complete


That's a nice request you got there. It'd be a shame if something were to happen to it.
We strongly recommend that you enable caching before running this. It's as simple as `pybaseball.cache.enable()`.
Since the Statcast requests can take a *really* long time to run, if something were to happen, like: a disconnect;
gremlins; computer repair by associates of Rudy Giuliani; electromagnetic interference from metal trash cans; etc.;
you could lose a lot of progress. Enabling caching will allow you to immediately recover all the successful
subqueries if that happens.
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[col

✅ Download complete! Number of pitches: 720684


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
2446,CH,2023-10-01,89.0,-2.8,5.59,"Robertson, Nick",677008,687798,field_out,hit_into_play,...,,2.55,1.53,-1.53,31.7,1.676715,-1.896554,41.830979,30.714944,26.41202
2450,FF,2023-10-01,96.9,-2.4,5.9,"Robertson, Nick",677008,687798,,foul,...,,1.09,0.76,-0.76,47.4,8.715532,3.692542,40.551342,33.656454,26.020583
2552,CH,2023-10-01,90.0,-2.93,5.56,"Robertson, Nick",677008,687798,,ball,...,,2.47,1.65,-1.65,30.3,,,,,
2645,ST,2023-10-01,82.2,-3.09,5.55,"Robertson, Nick",677008,687798,,ball,...,,3.14,-1.43,1.43,28.9,,,,,
2850,CH,2023-10-01,89.2,-2.87,5.58,"Robertson, Nick",677008,687798,,swinging_strike,...,,2.57,1.49,-1.49,34.3,20.169759,-7.584644,37.675911,44.236969,36.187039


In [5]:
data.to_csv("data/statcast_2023.csv", index=False)
print("Saved as statcast_2023.csv")

Saved as statcast_2023.csv


In [None]:
# Basic info
print("Number of rows:", len(data))
print("Number of columns:", len(data.columns))
print("\nColumn names:\n", data.columns.tolist()[:20])

In [None]:
# Check missing values for the most important features
key_columns = ['pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z',
               'pitcher', 'batter', 'balls', 'strikes', 'outs_when_up', 'inning']
print(data[key_columns].isna().sum())

In [None]:
# How many of each pitch type
print(data['pitch_type'].value_counts())

In [None]:
# Columns we will use for the first model
model_cols = [
    'pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z',
    'pitcher', 'batter', 'balls', 'strikes', 'outs_when_up', 'inning',
    'stand', 'p_throws'  # pitcher/batter handedness
]

# Keep only these columns and drop rows with missing pitch_type or release_speed
df_model = data[model_cols].dropna(subset=['pitch_type', 'release_speed']).copy()

print("Rows after dropping missing values:", len(df_model))
print("Columns used:", df_model.columns.tolist())
df_model.head()

**Preliminary Visualizations and Baseline**

In [None]:
import matplotlib.pyplot as plt

# Count of each pitch type
pitch_counts = df_model['pitch_type'].value_counts()
pitch_counts.plot(kind='bar', figsize=(10,5), title="Pitch Type Distribution")
plt.xlabel("Pitch Type")
plt.ylabel("Count")
plt.show()

In [None]:
import seaborn as sns

plt.figure(figsize=(12,6))
sns.boxplot(x='pitch_type', y='release_speed', data=df_model)
plt.title("Pitch Speed by Pitch Type")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Take a random sample of 5000 pitches for plotting
sample = df_model.sample(5000, random_state=42)
plt.figure(figsize=(8,6))
sns.scatterplot(x='release_pos_x', y='release_pos_z', hue='pitch_type', data=sample, alpha=0.7)
plt.title("Release Position Scatter (X vs Z)")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Baseline: always predict the most common pitch
most_common_pitch = df_model['pitch_type'].mode()[0]
baseline_preds = [most_common_pitch] * len(df_model)

accuracy = accuracy_score(df_model['pitch_type'], baseline_preds)
print(f"Most common pitch type: {most_common_pitch}")
print(f"Baseline accuracy: {accuracy:.4f}")