In [1]:
import pandas as pd
import sys
import os
from pathlib import Path
import sqlite3

# Resolve the directory two levels above the current working directory and
# make sure it is on sys.path (appended once) so the `src.*` imports resolve.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
from src.data.utils import DBConnection
from src.config import DBConfig

In [2]:
class NotebookDBConnection(DBConnection):
    """DBConnection preconfigured with this project's database path.

    The parent initializer receives ``<project_root>/data/pitcher_stats.db``
    as ``db_name``; the path is built from the notebook-level ``project_root``
    rather than from the current working directory.
    """

    def __init__(self):
        super().__init__(
            db_name=os.path.join(project_root, 'data', 'pitcher_stats.db')
        )

In [3]:
# Let wide frames render up to 150 columns before pandas truncates the display.
pd.options.display.max_columns = 150

In [4]:
# Print the name of every table registered in the database's sqlite_master catalog.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    # Each fetched row holds a single selected column: the table name.
    table_names = [row[0] for row in tables]
    print(table_names)

['pitcher_mapping', 'statcast_pitchers', 'team_batting', 'statcast_batters', 'game_level_pitchers', 'game_level_batters', 'train_predictive_pitch_features', 'test_predictive_pitch_features', 'train_batter_predictive_features', 'test_batter_predictive_features', 'team_season_features', 'train_combined_features', 'test_combined_features']


In [6]:
# Read the whole train_combined_features table into a DataFrame and preview it.
train_features_sql = "SELECT * FROM train_combined_features;"

with NotebookDBConnection() as conn:
    df_train = pd.read_sql_query(sql=train_features_sql, con=conn)

df_train.head()

Unnamed: 0,pitcher_id,player_name,game_date,game_pk,home_team,away_team,p_throws,season,strikeouts,batters_faced,total_pitches,avg_velocity,max_velocity,avg_spin_rate,avg_horizontal_break,avg_vertical_break,zone_percent,swinging_strike_percent,innings_pitched,k_per_9,k_percent,fastball_percent,breaking_percent,offspeed_percent,rolling_3g_k9,rolling_3g_k_pct,rolling_3g_swstr_pct,rolling_3g_velocity,rolling_3g_K_std,rolling_5g_k9,rolling_5g_k_pct,rolling_5g_swstr_pct,rolling_5g_velocity,rolling_5g_K_std,rolling_10g_k9,rolling_10g_k_pct,rolling_10g_swstr_pct,rolling_10g_velocity,rolling_10g_K_std,rolling_3g_fastball_pct,rolling_5g_fastball_pct,rolling_3g_breaking_pct,rolling_5g_breaking_pct,rolling_3g_offspeed_pct,rolling_5g_offspeed_pct,career_k9,career_k_pct,is_home,K_last_game,days_since_last_game,rest_days_4_less,rest_days_5,rest_days_6_more,game_month,is_month_3,is_month_4,is_month_5,is_month_6,is_month_7,is_month_8,is_month_9,is_month_10,recent_vs_career_k9,throws_right,lag_1_fastball_percent,lag_2_fastball_percent,lag_1_breaking_percent,lag_2_breaking_percent,lag_1_offspeed_percent,lag_2_offspeed_percent,inning,score_differential,is_close_game,is_playoff,rolling_3g_k_per_9_std_lag1,rolling_5g_k_per_9_std_lag1,rolling_10g_k_per_9_std_lag1,ewma_5g_k_per_9_lag1,rolling_3g_k_percent_std_lag1,rolling_5g_k_percent_std_lag1,rolling_10g_k_percent_std_lag1,ewma_5g_k_percent_lag1,k_percent_change_lag1,rolling_3g_swinging_strike_percent_std_lag1,rolling_5g_swinging_strike_percent_std_lag1,rolling_10g_swinging_strike_percent_std_lag1,ewma_5g_swinging_strike_percent_lag1,swinging_strike_percent_change_lag1,rolling_3g_avg_velocity_std_lag1,rolling_5g_avg_velocity_std_lag1,rolling_10g_avg_velocity_std_lag1,ewma_10g_avg_velocity_lag1,rolling_3g_fastball_percent_std_lag1,rolling_5g_fastball_percent_std_lag1,rolling_10g_fastball_percent_std_lag1,ewma_10g_fastball_percent_lag1,fastball_percent_change_lag1,rolling_3g_breaking_percent_std_lag1,rolling_5g_breaking_percent_std_lag1,ro
lling_10g_breaking_percent_std_lag1,ewma_10g_breaking_percent_lag1,breaking_percent_change_lag1,rolling_3g_offspeed_percent_std_lag1,rolling_5g_offspeed_percent_std_lag1,rolling_10g_offspeed_percent_std_lag1,ewma_10g_offspeed_percent_lag1,offspeed_percent_change_lag1
0,112526,"Colon, Bartolo",2016-04-03 00:00:00,446873,KC,NYM,R,2016,1,5,20,88.52,92.3,2101.85,-1.205,1.102,0.5,0.0,1.333333,6.75,0.2,0.9,0.273684,0.1,8.790698,0.214286,0.103791,88.874259,1.154701,8.735294,0.214876,0.104787,88.867773,1.516575,8.720497,0.215909,0.105494,88.867494,1.75119,0.578947,0.578618,0.273208,0.273344,0.140936,0.140365,8.690867,0.214844,0,3.0,5.0,0.0,0.0,0.0,4,0,1,0,0,0,0,0,0,1.001896,1.0,0.580645,0.581395,0.273684,0.273684,0.142857,0.142857,5,0,1,0,4.058325,4.346211,4.566326,8.660661,0.104973,0.115597,0.121217,0.218048,0.0,0.041796,0.045939,0.048802,0.105082,0.0,0.878334,0.990321,1.105162,88.945918,0.069624,0.078508,0.086142,0.576559,0.0,0.059841,0.067788,0.075454,0.270909,0.0,0.04542,0.051604,0.058719,0.13374,-0.000227
1,112526,"Colon, Bartolo",2016-04-09 00:00:00,446936,NYM,PHI,R,2016,7,21,90,88.296667,93.0,2070.350649,-1.215889,1.011667,0.5,0.122222,5.333333,11.8125,0.333333,0.933333,0.011111,0.055556,6.75,0.2,0.0,88.52,1.154701,6.75,0.2,0.0,88.52,1.516575,6.75,0.2,0.0,88.52,1.75119,0.9,0.9,0.273208,0.273344,0.1,0.1,6.75,0.2,0,1.0,6.0,0.0,0.0,1.0,4,0,1,0,0,0,0,0,0,1.0,1.0,0.9,0.581395,0.273684,0.273684,0.1,0.142857,5,0,1,0,4.058325,4.346211,4.566326,8.660661,0.104973,0.115597,0.121217,0.218048,0.0,0.041796,0.045939,0.048802,0.105082,0.0,0.878334,0.990321,1.105162,88.945918,0.069624,0.078508,0.086142,0.576559,0.0,0.059841,0.067788,0.075454,0.270909,0.0,0.04542,0.051604,0.058719,0.13374,-0.000227
2,112526,"Colon, Bartolo",2016-04-15 00:00:00,447017,CLE,NYM,R,2016,5,26,97,87.334021,92.4,2148.086957,-0.796804,0.94268,0.57732,0.051546,5.333333,8.4375,0.192308,0.824742,0.082474,0.092784,10.8,0.307692,0.061111,88.408333,4.242641,10.8,0.307692,0.061111,88.408333,4.242641,10.8,0.307692,0.061111,88.408333,4.242641,0.916667,0.916667,0.011111,0.011111,0.077778,0.077778,10.8,0.307692,0,7.0,6.0,0.0,0.0,1.0,4,0,1,0,0,0,0,0,0,1.0,1.0,0.933333,0.9,0.011111,0.273684,0.055556,0.1,5,0,1,0,3.579728,3.579728,4.566326,8.4375,0.094281,0.094281,0.121217,0.244444,0.133333,0.086424,0.086424,0.048802,0.040741,0.122222,0.157921,0.157921,1.105162,88.945918,0.02357,0.02357,0.086142,0.576559,0.033333,0.059841,0.067788,0.075454,0.270909,0.0,0.031427,0.031427,0.058719,0.13374,-0.044444
3,112526,"Colon, Bartolo",2016-04-20 00:00:00,447096,PHI,NYM,R,2016,4,23,90,87.996667,94.0,2123.579545,-1.082667,1.093889,0.577778,0.111111,6.0,6.0,0.173913,0.777778,0.055556,0.166667,9.75,0.25,0.057923,88.050229,3.05505,9.75,0.25,0.057923,88.050229,3.05505,9.75,0.25,0.057923,88.050229,3.05505,0.886025,0.886025,0.046793,0.046793,0.08278,0.08278,9.75,0.25,0,5.0,5.0,0.0,1.0,0.0,4,0,1,0,0,0,0,0,0,1.0,1.0,0.824742,0.933333,0.082474,0.011111,0.092784,0.055556,5,0,1,0,2.577699,2.577699,4.566326,8.4375,0.079294,0.079294,0.121217,0.227066,-0.141026,0.06136,0.06136,0.048802,0.044343,-0.070676,0.630226,0.630226,1.105162,88.945918,0.055628,0.055628,0.086142,0.576559,-0.108591,0.050461,0.050461,0.075454,0.270909,0.071363,0.023851,0.023851,0.058719,0.13374,0.037228
4,112526,"Colon, Bartolo",2016-04-26 00:00:00,447164,NYM,CIN,R,2016,4,22,88,86.763218,91.8,2018.565789,-1.148851,1.049195,0.522727,0.056818,4.333333,8.307692,0.181818,0.806818,0.102273,0.068182,8.64,0.228571,0.09496,87.875785,1.527525,8.5,0.226667,0.07122,88.036838,2.5,8.5,0.226667,0.07122,88.036838,2.5,0.845284,0.858963,0.049714,0.049714,0.105002,0.103751,8.5,0.226667,0,4.0,6.0,0.0,0.0,1.0,4,0,1,0,0,0,0,0,0,1.0,1.0,0.777778,0.824742,0.055556,0.082474,0.166667,0.092784,5,0,1,0,2.918824,2.584509,4.566326,7.625,0.087218,0.07312,0.121217,0.209348,-0.018395,0.038005,0.056721,0.048802,0.066599,0.059565,0.492576,0.515274,1.105162,88.945918,0.079786,0.070657,0.086142,0.576559,-0.046964,0.036038,0.036038,0.075454,0.270909,-0.026919,0.056554,0.046244,0.058719,0.13374,0.073883


In [8]:
# Export the training features to train.csv in the current working directory.
# index=False: df_train was loaded without an index column, so its index is
# only the default positional one; writing it would add an unnamed first
# column that comes back as "Unnamed: 0" when the file is read again.
df_train.to_csv('train.csv', index=False)

In [None]:
# Pull at most 25,000 rows from each statcast table plus the full team_batting
# table. The LIMIT queries have no ORDER BY, so which rows come back is
# whatever order the database returns them in.
batters_sql = "SELECT * FROM statcast_batters LIMIT 25000;"
pitchers_sql = "SELECT * FROM statcast_pitchers LIMIT 25000;"
team_batting_sql = "SELECT * FROM team_batting;"

with NotebookDBConnection() as conn:
    df_batters = pd.read_sql_query(sql=batters_sql, con=conn)
    df_pitchers = pd.read_sql_query(sql=pitchers_sql, con=conn)
    df_team_batting = pd.read_sql_query(sql=team_batting_sql, con=conn)
    

In [8]:
# Print the most recent game_date stored in statcast_pitchers.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT MAX(game_date) FROM statcast_pitchers")
    latest_game_date_rows = cursor.fetchall()
    print(latest_game_date_rows)

[('2025-04-08',)]


In [9]:
from pybaseball import schedule_and_record

In [11]:
# Fetch the 2025 schedule-and-record table for team code 'LAD' via pybaseball.
dod = schedule_and_record(season=2025, team='LAD')

http://www.baseball-reference.com/teams/LAD/2025-schedule-scores.shtml


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Attendance'].replace(r'^Unknown$', np.nan, regex=True, inplace = True) # make this a NaN so the column can benumeric


In [13]:
# Preview up to the first 20 rows of the schedule frame.
dod.head(n=20)

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,cLI,Streak,Orig. Scheduled
1,"Tuesday, Mar 18",LAD,@,CHC,W,4.0,1.0,9.0,1-0,1.0,up 0.5,Yamamoto,Brown,Scott,2:38,N,42365.0,0.97,1.0,
2,"Wednesday, Mar 19",LAD,@,CHC,W,6.0,3.0,9.0,2-0,1.0,up 1.0,Knack,Steele,Vesia,2:45,N,42367.0,0.95,2.0,
3,"Thursday, Mar 27",LAD,Home,DET,W,5.0,4.0,9.0,3-0,1.0,up 1.0,Snell,Skubal,Treinen,2:32,D,53595.0,0.98,3.0,
4,"Friday, Mar 28",LAD,Home,DET,W-wo,8.0,5.0,10.0,4-0,1.0,up 1.0,García,Brieske,,2:52,N,52029.0,0.98,4.0,
5,"Saturday, Mar 29",LAD,Home,DET,W,7.0,3.0,9.0,5-0,1.0,up 1.0,Banda,Olson,,2:32,N,51788.0,1.02,5.0,
6,"Monday, Mar 31",LAD,Home,ATL,W,6.0,1.0,9.0,6-0,1.0,up 0.5,Glasnow,Holmes,,2:26,N,50816.0,1.07,6.0,
7,"Tuesday, Apr 1",LAD,Home,ATL,W,3.0,1.0,9.0,7-0,1.0,up 0.5,Banda,Sale,Scott,2:31,N,50182.0,1.1,7.0,
8,"Wednesday, Apr 2",LAD,Home,ATL,W-wo,6.0,5.0,9.0,8-0,1.0,up 0.5,Dreyer,Iglesias,,2:44,N,50281.0,1.1,8.0,
9,"Friday, Apr 4",LAD,@,PHI,L,2.0,3.0,9.0,8-1,1.0,up 0.5,Luzardo,Yamamoto,Romano,2:29,N,43024.0,1.13,-1.0,
10,"Saturday, Apr 5",LAD,@,PHI,W,3.0,1.0,9.0,9-1,1.0,up 1.0,Banda,Nola,Scott,2:26,D,44404.0,1.13,1.0,
