In [1]:
pip install pybaseball pandas numpy matplotlib seaborn scikit-learn sqlalchemy psycopg2-binary jupyter notebook

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading uri_template-1.3.0-py3-none-any.whl.metadata (8.8 kB)
Collecting webcolors>=24.6.0 (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading webcolors-24.11.1-py3-none-any.whl.metadata (2.2 kB)
Downloading psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:

In [12]:
# Dependencies
import pandas as pd
import pybaseball 
from pybaseball import statcast, cache
import requests 
import glob
import time
import datetime
from datetime import timedelta
import seaborn
import sqlalchemy
import sqlite3
import os



In [7]:
# Turn on pybaseball's cache before issuing any statcast() requests.
cache.enable()

# (start, end) date ranges, roughly one per month, so that each statcast()
# call covers a limited window instead of the whole season at once.
months = [
    ('2024-03-28', '2024-04-30'),
    ('2024-05-01', '2024-05-31'),
    ('2024-06-01', '2024-06-30'),
    ('2024-07-01', '2024-07-31'),
    ('2024-08-01', '2024-08-31'),
    ('2024-09-01', '2024-09-30'),
    ('2024-10-01', '2024-10-30')
]

# Per-month frames are collected in a list and concatenated once after the
# loop; concatenating onto a growing DataFrame inside the loop would re-copy
# every previously fetched row on each iteration.
monthly_frames = []
# Date ranges whose fetch/save raised, so gaps in the dataset are visible.
failed_ranges = []

# Pull data month-by-month and save each month to its own CSV.
for start, end in months:
    print(f"Fetching data from {start} to {end}...")
    try:
        df = statcast(start, end)
        if not df.empty:
            monthly_frames.append(df)

            # File name encodes the date range: statcast_<start>_to_<end>.csv
            filename = f'statcast_{start}_to_{end}.csv'
            df.to_csv(filename, index=False)
            print(f"Saved: {filename}")
    except Exception as e:
        # Best-effort: report the failure and continue with the other months.
        print(f"Error fetching data from {start} to {end}: {e}")
        failed_ranges.append((start, end))
    time.sleep(1)  # short pause between consecutive requests

if failed_ranges:
    print(f"Warning: {len(failed_ranges)} date range(s) failed and are missing from the dataset: {failed_ranges}")

# `all_data` is read by later cells, so it is always defined, even when
# nothing could be fetched.
if monthly_frames:
    all_data = pd.concat(monthly_frames, ignore_index=True)
else:
    all_data = pd.DataFrame()

# Save the full dataset to a local SQLite database. Skipped when nothing was
# fetched so that an existing 'statcast_2024' table is not replaced by an
# empty frame.
if all_data.empty:
    print("No data fetched; skipping SQLite save.")
else:
    print("Saving to SQLite database...")
    conn = sqlite3.connect('pitch_2024.db')
    try:
        all_data.to_sql('statcast_2024', conn, if_exists='replace', index=False)
    finally:
        # Release the database handle even if the write fails.
        conn.close()
    print("✅ Data saved to 'pitch_2024.db' in table 'statcast_2024'.")

Fetching data from 2024-03-28 to 2024-04-30...
This is a large query, it may take a moment to complete


100%|██████████| 34/34 [00:01<00:00, 17.19it/s]


Saved: statcast_2024-03-28_to_2024-04-30.csv
Fetching data from 2024-05-01 to 2024-05-31...
This is a large query, it may take a moment to complete


100%|██████████| 31/31 [00:01<00:00, 19.11it/s]


Saved: statcast_2024-05-01_to_2024-05-31.csv
Fetching data from 2024-06-01 to 2024-06-30...
This is a large query, it may take a moment to complete


100%|██████████| 30/30 [00:01<00:00, 19.80it/s]


Saved: statcast_2024-06-01_to_2024-06-30.csv
Fetching data from 2024-07-01 to 2024-07-31...
This is a large query, it may take a moment to complete


100%|██████████| 31/31 [00:01<00:00, 18.50it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Saved: statcast_2024-07-01_to_2024-07-31.csv
Fetching data from 2024-08-01 to 2024-08-31...
This is a large query, it may take a moment to complete


100%|██████████| 31/31 [00:01<00:00, 19.76it/s]


Saved: statcast_2024-08-01_to_2024-08-31.csv
Fetching data from 2024-09-01 to 2024-09-30...
This is a large query, it may take a moment to complete


100%|██████████| 30/30 [00:01<00:00, 19.05it/s]


Saved: statcast_2024-09-01_to_2024-09-30.csv
Fetching data from 2024-10-01 to 2024-10-30...
This is a large query, it may take a moment to complete


100%|██████████| 30/30 [00:01<00:00, 18.69it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Saved: statcast_2024-10-01_to_2024-10-30.csv
Saving to SQLite database...
✅ Data saved to 'pitch_2024.db' in table 'statcast_2024'.


In [9]:
# Display the column labels of the combined frame built in the fetch cell.
all_data.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description',
       ...
       'n_thruorder_pitcher', 'n_priorpa_thisgame_player_at_bat',
       'pitcher_days_since_prev_game', 'batter_days_since_prev_game',
       'pitcher_days_until_next_game', 'batter_days_until_next_game',
       'api_break_z_with_gravity', 'api_break_x_arm', 'api_break_x_batter_in',
       'arm_angle'],
      dtype='object', length=113)

In [10]:
# Preview the first five rows of the combined frame as a quick sanity check.
all_data.head(n=5)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,...,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length,estimated_slg_using_speedangle,delta_pitcher_run_exp,hyper_speed,home_score_diff,bat_score_diff,home_win_exp,bat_win_exp,age_pit_legacy,age_bat_legacy,age_pit,age_bat,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,CH,2024-04-30,81.2,-2.75,5.42,"Crismatt, Nabil",572233,622503,home_run,hit_into_play,,,,,13,Christian Walker homers (7) on a fly ball to l...,R,R,R,AZ,LAD,X,,fly_ball,1,1,2024,-0.77,0.19,-0.9,2.31,,606466,,0,10,Bot,5.58,73.43,,...,6.0,81,3,Changeup,2,3,2,3,3,4,4,3,Standard,Standard,248,0.562,1.015,77.7,7.6,3.839,-1.015,109.3,-1,-1,0.438,0.438,29,33,30,33,1,4,2,1,12,1,3.26,0.77,0.77,30.6
1,CH,2024-04-30,81.4,-2.73,5.47,"Crismatt, Nabil",572233,622503,,ball,,,,,14,Christian Walker homers (7) on a fly ball to l...,R,R,R,AZ,LAD,B,,,0,1,2024,-0.43,0.49,0.01,1.44,,606466,,0,10,Bot,,,,...,,81,2,Changeup,2,3,2,3,3,2,2,3,Standard,Standard,243,0.0,0.036,,,,-0.036,,-1,-1,0.438,0.438,29,33,30,33,1,4,2,1,12,1,2.94,0.43,0.43,30.0
2,CH,2024-04-30,82.0,-2.63,5.6,"Crismatt, Nabil",572233,622503,,called_strike,,,,,5,Christian Walker homers (7) on a fly ball to l...,R,R,R,AZ,LAD,S,,,0,0,2024,-0.68,0.33,0.25,2.38,,606466,,0,10,Bot,,,,...,,81,1,Changeup,2,3,2,3,3,2,2,3,Standard,Standard,245,0.0,-0.035,,,,0.035,,-1,-1,0.438,0.438,29,33,30,33,1,4,2,1,12,1,3.04,0.68,0.68,34.1
3,SL,2024-04-30,87.8,-1.57,6.03,"McGough, Scott",606192,543518,field_out,hit_into_play,,,,,2,Teoscar Hernández pops out to first baseman Ch...,R,R,R,AZ,LAD,X,3.0,popup,1,1,2024,0.36,0.32,-0.17,2.96,,660271,518692.0,2,10,Top,148.97,172.87,,...,3.0,80,3,Slider,2,3,3,2,3,2,3,2,Standard,Standard,141,0.069,-0.263,73.6,7.8,0.001,0.263,88.0,-1,1,0.369,0.631,34,31,35,32,1,4,1,1,3,1,2.62,-0.36,-0.36,46.3
4,FF,2024-04-30,93.6,-1.49,6.03,"McGough, Scott",606192,543518,,ball,,,,,11,Teoscar Hernández pops out to first baseman Ch...,R,R,R,AZ,LAD,B,,,0,1,2024,-0.66,1.61,-1.62,3.34,,660271,518692.0,2,10,Top,,,,...,,80,2,4-Seam Fastball,2,3,3,2,3,2,3,2,Standard,Standard,207,0.0,0.033,,,,-0.033,,-1,1,0.369,0.631,34,31,35,32,1,4,1,1,3,1,1.0,0.66,0.66,45.2


In [13]:
# Folder (the notebook directory) that holds the per-month CSV exports.
csv_folder = './'

# Folder that receives the final combined CSV.
output_folder = '../data'

# Match only the 'statcast_<start>_to_<end>.csv' files written by the fetch
# cell, so unrelated CSVs in the notebook folder are not merged in. The list
# is sorted because glob order is platform-dependent; the ISO dates in the
# file names make the sorted order chronological.
csv_files = sorted(glob.glob(os.path.join(csv_folder, 'statcast_*.csv')))

# Fail with a clear message rather than pandas' "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(
        f"No 'statcast_*.csv' files found in {os.path.abspath(csv_folder)}; "
        "run the fetch cell first."
    )

# Merge the monthly CSV files into one frame with a fresh 0..n-1 index.
combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

# Make sure the destination folder exists before writing into it.
os.makedirs(output_folder, exist_ok=True)

# Output file path
output_file = os.path.join(output_folder, 'mlb_pitch_data_2024.csv')

# Save combined DataFrame to CSV
combined_df.to_csv(output_file, index=False)

print(f'Combined CSV created at {output_file}')

Combined CSV created at ../data\mlb_pitch_data_2024.csv
