In [1]:
import pandas as pd
import sys
import os
from pathlib import Path
import sqlite3

project_root = os.path.abspath('../../')
if project_root not in sys.path:
    sys.path.append(project_root)
    
from src.data.utils import DBConnection
from config import DBConfig

In [2]:
class NotebookDBConnection(DBConnection):
    def __init__(self):
        db_path = os.path.join(project_root, 'data', 'pitcher_stats.db')
        super().__init__(db_name=db_path)

In [3]:
pd.set_option('display.max_columns', 150)

In [4]:
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print([table[0] for table in tables])

['pitcher_mapping', 'statcast_pitchers', 'team_batting']


In [5]:
with NotebookDBConnection() as conn:
    df_batters = pd.read_sql_query("SELECT * FROM statcast_batters LIMIT 25000;", conn)
    df_pitchers = pd.read_sql_query("SELECT * FROM statcast_pitchers LIMIT 25000;", conn)
    df_team_batting = pd.read_sql_query("SELECT * FROM team_batting;", conn)
    

DatabaseError: Execution failed on sql 'SELECT * FROM statcast_batters LIMIT 25000;': no such table: statcast_batters

In [28]:
with NotebookDBConnection() as conn:
    df_game_level = pd.read_sql_query("SELECT * FROM game_level_pitchers LIMIT 10000;", conn)
    df_pred = pd.read_sql_query("SELECT * FROM predictive_pitch_features LIMIT 10000;", conn)

In [33]:
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute('''SELECT pitcher_id, game_pk, game_date, COUNT(*)
        FROM game_level_pitchers
        GROUP BY pitcher_id, game_pk, game_date
        HAVING COUNT(*) > 1;''')
    print(cursor.fetchall())

[]


In [13]:
df_game_level.to_csv('game_level_pitchers.csv')

In [14]:
df_pred.to_csv('predictive_pitch_features.csv')

In [22]:
df_pitchers.sort_values(by="game_date").tail()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length,estimated_slg_using_speedangle,delta_pitcher_run_exp,hyper_speed,home_score_diff,bat_score_diff,home_win_exp,bat_win_exp,age_pit_legacy,age_bat_legacy,age_pit,age_bat,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,pitcher_id,season
12193,FC,2025-04-06,94.4,-0.75,6.1,"Burnes, Corbin",669743,669203,,ball,,,,,14.0,Alex Call walks. Josh Bell to 2nd.,R,R,R,WSH,AZ,B,,,2,1,2025,0.3,0.97,2.01,1.4,,,605137.0,2,3,Bot,,,,,,,6.546529,-137.170018,-8.737907,2.406804,29.625059,-18.224947,3.54,1.69,,,,95.0,2688.0,6.7,778429,645444,647304,641658,553993,672695,666971,677950,682998,53.84,,,,,,,,29,4,Cutter,3,2,3,2,2,3,3,2,Standard,Standard,195.0,0.0,0.078,,,,-0.078,,1,1,0.65,0.65,30,30,31,31,2,1,5.0,1.0,,,1.59,-0.3,-0.3,42.9,669203,2025
12192,FC,2025-04-06,94.4,-0.77,6.17,"Burnes, Corbin",669743,669203,,called_strike,,,,,5.0,Alex Call walks. Josh Bell to 2nd.,R,R,R,WSH,AZ,S,,,3,1,2025,0.33,0.98,0.11,2.73,,,605137.0,2,3,Bot,,,,,,,1.555764,-137.495965,-5.526066,3.854207,27.660734,-18.77462,3.52,1.68,,,,95.2,2729.0,6.5,778429,645444,647304,641658,553993,672695,666971,677950,682998,54.01,,,,,,,,29,5,Cutter,3,2,3,2,2,3,3,2,Standard,Standard,199.0,0.0,-0.065,,,,0.065,,1,1,0.65,0.65,30,30,31,31,2,1,5.0,1.0,,,1.56,-0.33,-0.33,44.7,669203,2025
12191,FC,2025-04-06,94.3,-0.65,6.22,"Burnes, Corbin",669743,669203,walk,ball,,,,,14.0,Alex Call walks. Josh Bell to 2nd.,R,R,R,WSH,AZ,B,,,3,2,2025,0.38,0.91,1.43,2.0,,,605137.0,2,3,Bot,,,,,,,4.566706,-137.20785,-7.421722,3.978941,26.194283,-19.351877,3.44,1.69,,,,95.4,2703.0,6.6,778429,645444,647304,641658,553993,672695,666971,677950,682998,53.89,,0.698324,0.7,1.0,0.0,0.0,,29,6,Cutter,3,2,3,2,2,3,3,2,Standard,Standard,187.0,0.018,0.217,,,,-0.217,,1,1,0.65,0.65,30,30,31,31,2,1,5.0,1.0,,,1.63,-0.38,-0.38,45.4,669203,2025
12166,SI,2025-04-06,95.4,-0.7,6.19,"Burnes, Corbin",657557,669203,strikeout,called_strike,,,,,7.0,Paul DeJong called out on strikes.,R,R,R,WSH,AZ,S,2.0,,1,2,2025,-0.74,1.34,-0.59,1.94,,,669743.0,2,5,Bot,,,,,,,1.968546,-138.69841,-8.571953,-9.930258,33.643851,-13.07398,3.37,1.56,,,,95.6,2637.0,6.7,778429,645444,647304,641658,553993,672695,666971,677950,682998,53.83,,0.0,0.0,1.0,0.0,0.0,,48,5,Sinker,4,3,4,3,3,4,4,3,Standard,Standard,231.0,-0.022,-0.196,,,,0.196,,1,1,0.689,0.689,30,31,31,32,3,2,5.0,1.0,,,1.18,0.74,0.74,44.6,669203,2025
12178,FC,2025-04-06,93.7,-1.01,6.03,"Burnes, Corbin",663993,669203,,ball,,,,,7.0,Nathaniel Lowe lines out to left fielder Lourd...,R,L,R,WSH,AZ,B,,,1,1,2025,0.33,0.99,-0.55,1.57,686611.0,,660688.0,2,4,Bot,,,,,,,0.447677,-136.309475,-7.967766,4.017236,31.104747,-18.279211,3.4,1.46,,,,93.7,2690.0,6.4,778429,645444,647304,641658,553993,672695,666971,677950,682998,54.11,,,,,,,,38,3,Cutter,4,2,4,2,2,4,4,2,Infield shade,Standard,195.0,0.0,0.024,,,,-0.024,,2,2,0.794,0.794,30,29,31,30,3,2,5.0,1.0,,,1.62,-0.33,0.33,39.6,669203,2025


In [24]:
df_team_batting.to_csv('team_batting.csv')
df_batters.to_csv('statcast_batters_subset.csv')
df_pitchers.to_csv('statcast_pitchers_subset.csv')

In [34]:
df = pd.read_csv('X_train_all.csv')

In [35]:
df.head()

Unnamed: 0,rolling_3g_k9,rolling_3g_k_pct,rolling_3g_swstr_pct,rolling_3g_velocity,rolling_3g_K_std,rolling_5g_k9,rolling_5g_k_pct,rolling_5g_swstr_pct,rolling_5g_velocity,rolling_5g_K_std,rolling_10g_k9,rolling_10g_k_pct,rolling_10g_swstr_pct,rolling_10g_velocity,rolling_10g_K_std,rolling_3g_fastball_pct,rolling_5g_fastball_pct,rolling_3g_breaking_pct,rolling_5g_breaking_pct,rolling_3g_offspeed_pct,rolling_5g_offspeed_pct,career_k9,career_k_pct,is_home,K_last_game,days_since_last_game,rest_days_4_less,rest_days_5,rest_days_6_more,is_month_3,is_month_4,is_month_5,is_month_6,is_month_7,is_month_8,is_month_9,is_month_10,recent_vs_career_k9,throws_right,opp_team_k_percent,opp_team_bb_k_ratio,opp_team_zone_percent,opp_team_o_swing_percent,opp_team_z_contact_percent,opp_team_contact_percent,opp_team_swstr_percent,opp_team_wfb_c,opp_team_wsl_c,opp_team_wct_c,opp_team_wcb_c,opp_team_wch_c,opp_team_wsf_c,opp_lineup_avg_k_pct,opp_lineup_avg_swstr_pct,opp_lineup_avg_chase_pct,opp_lineup_avg_zone_contact,opp_lineup_right_handed,opp_lineup_left_handed,opp_lineup_handedness_ratio
0,3.375,0.1875,0.089286,84.142857,3.464102,3.375,0.1875,0.089286,84.142857,4.549725,3.375,0.1875,0.089286,84.142857,5.108816,0.625,0.625,0.25,0.25,0.125,0.125,7.109131,0.498127,0,8.0,5.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0.474742,1.0,0.223,0.36,0.423,0.315,0.844,0.766,0.11,-0.13,-0.16,0.12,0.13,0.63,-0.94,7.765334,11.348896,0.130627,0.524376,17.0,8.0,2.124734
1,8.217391,0.538462,0.113792,82.903343,10.606602,8.217391,0.538462,0.113792,82.903343,10.606602,8.217391,0.538462,0.113792,82.903343,10.606602,0.583777,0.583777,0.321809,0.321809,0.094415,0.094415,7.109131,0.498127,0,3.0,5.0,0.0,1.0,0.0,0,1,0,0,0,0,0,0,1.155892,1.0,0.24,0.4,0.422,0.302,0.841,0.751,0.115,0.21,-0.84,-0.37,-0.36,0.32,0.22,8.515733,11.291578,0.128028,0.683393,12.0,11.0,1.09081
2,9.947368,0.677419,0.118998,82.896347,9.643651,9.947368,0.677419,0.118998,82.896347,9.643651,9.947368,0.677419,0.118998,82.896347,9.643651,0.593106,0.593106,0.328264,0.328264,0.07863,0.07863,7.109131,0.498127,0,18.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,1.399238,1.0,0.23,0.37,0.419,0.32,0.851,0.764,0.111,0.06,-0.19,0.03,0.13,0.23,-0.43,5.864516,7.852246,0.074875,0.719629,15.0,7.0,2.142551
3,12.42,0.945205,0.12257,82.459728,6.244998,11.172414,0.808989,0.114249,82.88051,11.224972,11.172414,0.808989,0.114249,82.88051,11.224972,0.584773,0.594829,0.348264,0.323698,0.066963,0.081472,7.109131,0.498127,0,21.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,1.571558,1.0,0.223,0.36,0.423,0.315,0.844,0.766,0.11,-0.13,-0.16,0.12,0.13,0.63,-0.94,6.814869,9.949506,0.107184,0.774933,12.0,8.0,1.499813
4,11.25,0.925926,0.113854,82.875834,4.582576,10.409639,0.8,0.113829,82.886838,10.084642,10.409639,0.8,0.113829,82.886838,10.084642,0.606414,0.597359,0.329208,0.326248,0.064378,0.076393,7.109131,0.498127,0,30.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,1.464263,1.0,0.23,0.4,0.432,0.312,0.838,0.764,0.112,-0.13,-0.23,0.13,0.13,0.67,1.47,7.081475,11.917241,0.132379,0.727696,9.0,10.0,0.89991
