In [1]:
import pandas as pd
import sys
import os
from pathlib import Path
import sqlite3
import warnings
warnings.filterwarnings("ignore")

# Repository root: two directory levels above the current working directory.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
# Register it on sys.path (only once) so the project-local `src` imports below resolve.
if project_root not in sys.path:
    sys.path.append(project_root)
    
from src.data.utils import DBConnection
from src.config import DBConfig

In [2]:
# Lift pandas' display truncation: a limit of None shows every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [3]:
class NotebookDBConnection(DBConnection):
    """DBConnection bound to the project's ``data/pitcher_stats.db`` file.

    Takes no arguments: the database path is built from the notebook-level
    ``project_root`` and handed to ``DBConnection.__init__``.
    """

    def __init__(self):
        super().__init__(os.path.join(project_root, 'data', 'pitcher_stats.db'))

In [5]:
# Ask the SQLite catalog (sqlite_master) for the name of every table in the database.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

# The query selects a single column, so the name is the first field of each row.
print([row[0] for row in tables])

['statcast_pitchers', 'statcast_batters', 'mlb_api', 'game_level_pitchers', 'game_level_batters', 'game_level_team_stats', 'game_level_starter_stats', 'train_features', 'test_features', 'pitcher_mapping', 'team_mapping', 'mlb_boxscores', 'team_batting', 'statcast_starters', 'statcast_starting_pitchers']


In [None]:
# Inspect the column layout of the team_mapping table.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("PRAGMA table_info(team_mapping)")
    columns = cursor.fetchall()

# PRAGMA table_info rows hold the column name in their second field (index 1).
print([info_row[1] for info_row in columns])

In [8]:
# Tables whose column lists we want to compare.
table_names = ['statcast_pitchers', 'statcast_batters', 'mlb_boxscores']
# Maps table name -> list of that table's column names.
schemas = {}

with NotebookDBConnection() as conn:
    for table in table_names:
        # PRAGMA table_info returns one row per column; only its 'name' field is kept.
        column_names = pd.read_sql_query(f"PRAGMA table_info('{table}');", conn)['name'].tolist()
        schemas[table] = column_names
        print(f"Columns for {table}: {column_names}\n")

Columns for statcast_pitchers: ['pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'player_name', 'batter', 'pitcher', 'events', 'description', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des', 'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk', 'fielder_2', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 'babip_value

In [10]:
# Load at most 20,000 rows of statcast_batters instead of the whole table.
batters_sample_sql = "SELECT * FROM statcast_batters LIMIT 20000"
with NotebookDBConnection() as conn:
    df = pd.read_sql_query(sql=batters_sample_sql, con=conn)

# Preview the last five rows of the loaded sample.
df.tail(5)


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length,estimated_slg_using_speedangle,delta_pitcher_run_exp,hyper_speed,home_score_diff,bat_score_diff,home_win_exp,bat_win_exp,age_pit_legacy,age_bat_legacy,age_pit,age_bat,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,season
19995,CH,2016-04-25 00:00:00,85.9,2.0,5.89,"Locke, Jeff",448602,502046,,ball,,,,,13.0,Mark Reynolds doubles (5) on a ground ball to ...,R,R,L,COL,PIT,B,,,2,2,2016,0.49,0.33,-1.68,0.37,,,,0,2,Bot,,,,,,160425_191712,-9.835752,-125.217548,-6.460226,6.624,20.435,-27.738,3.58,1.63,,,,86.1,1930.0,6.1,447158,465041,444379,543281,501896,474568,459964,516782,570256,54.5,,,,,,,,14,6,Changeup,0,0,0,0,0,0,0,0,Infield shift,Strategic,118.0,0.0,0.11,,,,-0.11,,0,0,0.55,0.55,28,32,29,33,1,0,5.0,1.0,6.0,1.0,2.78,0.49,-0.49,,2016
19996,SI,2016-04-25 00:00:00,92.2,1.93,5.96,"Locke, Jeff",448602,502046,,foul,,,,,12.0,Mark Reynolds doubles (5) on a ground ball to ...,R,R,L,COL,PIT,S,,,2,2,2016,0.77,1.23,0.86,3.19,,,,0,2,Bot,,,,,,160425_191648,-4.683657,-135.067893,-3.342976,10.043,25.489,-16.816,3.58,1.63,,,,91.9,2211.0,5.7,447158,465041,444379,543281,501896,474568,459964,516782,570256,54.5,,,,,,,,14,5,Sinker,0,0,0,0,0,0,0,0,Infield shift,Strategic,166.0,0.0,0.0,,,,0.0,,0,0,0.55,0.55,28,32,29,33,1,0,5.0,1.0,6.0,1.0,1.46,0.77,-0.77,,2016
19997,SI,2016-04-25 00:00:00,92.5,2.1,5.87,"Locke, Jeff",448602,502046,,ball,,,,,14.0,Mark Reynolds doubles (5) on a ground ball to ...,R,R,L,COL,PIT,B,,,1,2,2016,0.77,1.18,1.33,1.37,,,,0,2,Bot,,,,,,160425_191632,-3.92972,-135.282208,-7.665945,9.941,25.539,-16.544,3.36,1.54,,,,91.9,2149.0,5.6,447158,465041,444379,543281,501896,474568,459964,516782,570256,54.5,,,,,,,,14,4,Sinker,0,0,0,0,0,0,0,0,Infield shift,Strategic,156.0,0.0,0.045,,,,-0.045,,0,0,0.55,0.55,28,32,29,33,1,0,5.0,1.0,6.0,1.0,1.5,0.77,-0.77,,2016
19998,SI,2016-04-25 00:00:00,92.2,1.92,6.05,"Locke, Jeff",448602,502046,,swinging_strike,,,,,3.0,Mark Reynolds doubles (5) on a ground ball to ...,R,R,L,COL,PIT,S,,,1,1,2016,0.91,0.74,0.56,2.85,,,,0,2,Bot,,,,,,160425_191616,-5.753914,-134.951432,-3.171103,11.926,25.036,-22.718,3.36,1.54,,,,91.9,1979.0,5.7,447158,465041,444379,543281,501896,474568,459964,516782,570256,54.5,,,,,,,,14,3,Sinker,0,0,0,0,0,0,0,0,Infield shift,Strategic,148.0,0.0,-0.064,,,,0.064,,0,0,0.55,0.55,28,32,29,33,1,0,5.0,1.0,6.0,1.0,1.95,0.91,-0.91,,2016
19999,SI,2016-04-25 00:00:00,91.7,1.84,6.02,"Locke, Jeff",448602,502046,,called_strike,,,,,9.0,Mark Reynolds doubles (5) on a ground ball to ...,R,R,L,COL,PIT,S,,,1,0,2016,1.04,0.78,0.51,1.86,,,,0,2,Bot,,,,,,160425_191603,-5.962889,-134.168599,-5.61372,13.343,24.273,-21.945,3.36,1.54,,,,91.7,1956.0,5.8,447158,465041,444379,543281,501896,474568,459964,516782,570256,54.5,,,,,,,,14,2,Sinker,0,0,0,0,0,0,0,0,Infield shift,Strategic,141.0,0.0,-0.052,,,,0.052,,0,0,0.55,0.55,28,32,29,33,1,0,5.0,1.0,6.0,1.0,1.94,1.04,-1.04,,2016


In [None]:
# Write the current `df` to a CSV in the working directory; the row index is
# written as the first column (pandas' default, spelled out here).
df.to_csv('statcast_batters_subset.csv', index=True)

In [24]:
# NOTE(review): the recorded output (3445363, 51) cannot come from the
# `LIMIT 20000` statcast_batters query earlier in this notebook, so `df` was
# rebound by a cell that is no longer present. On a fresh kernel this cell
# will report the shape of the statcast_batters sample instead.
df.shape

(3445363, 51)

In [25]:
# NOTE(review): no cell in this notebook loads the frame that used to be exported
# here. Run top to bottom, `df` is the statcast_batters sample, so batter rows
# would be written to a file named after starting pitchers. Read the source table
# explicitly so the export no longer depends on leftover kernel state.
# Presumably the intended table is statcast_starting_pitchers (it matches the
# output file name) — TODO confirm.
with NotebookDBConnection() as conn:
    starting_pitchers_subset = pd.read_sql_query(
        "SELECT * FROM statcast_starting_pitchers LIMIT 20000", conn
    )

# Same destination and CSV layout as before (row index written as the first column).
starting_pitchers_subset.to_csv('statcast_starting_pitchers_subset.csv')