In [99]:
import pandas as pd
import sys
import os
from pathlib import Path
import sqlite3

# Resolve the directory two levels above the current working directory and
# make sure it is on sys.path so the `src` package imported below is found.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
from src.data.utils import DBConnection
from src.config import DBConfig

In [2]:
class NotebookDBConnection(DBConnection):
    """DBConnection preconfigured for this notebook's SQLite file.

    The database path is ``<project_root>/data/pitcher_stats.db`` and is
    passed to the parent class as ``db_name``. The notebook uses instances
    as context managers (``with NotebookDBConnection() as conn``).
    """

    def __init__(self):
        # project_root is the notebook-level path computed in the first cell.
        path_parts = (project_root, 'data', 'pitcher_stats.db')
        super().__init__(db_name=os.path.join(*path_parts))

In [3]:
# Show up to 150 columns when rendering wide feature tables.
pd.options.display.max_columns = 150

In [4]:
# Load the raw multi-season umpire assignments from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='umpire_data_all_multiseason.csv')

In [9]:
# Work on an independent copy of the columns of interest so the raw frame
# `df` is left untouched by the cleaning steps that follow.
df_ump = df.loc[:, ['Date', 'Umpire', 'Home', 'Away', 'R [H]']].copy()

In [34]:
# Rename by label instead of by position. The frame built above has five
# columns ('R [H]' included), so assigning a four-element list to
# `df_ump.columns` raises a length-mismatch ValueError on a fresh run.
# `rename` ignores labels that are absent, which also makes this cell safe
# to re-run; the trailing selection keeps only the four columns that are
# written to the `umpire_data` table.
umpire_column_names = {
    'Date': 'game_date',
    'Umpire': 'umpire',
    'Home': 'home_team',
    'Away': 'away_team',
}
df_ump = df_ump.rename(columns=umpire_column_names)[list(umpire_column_names.values())]

In [13]:
# Drop the stray column labelled 'nan' if it is present.
# NOTE(review): no cell shown here creates a column with that label, so the
# unconditional drop raises KeyError when the notebook is run top to bottom;
# `errors='ignore'` turns it into a no-op in that case. Presumably the column
# came from an earlier, since-deleted renaming step - confirm and remove this
# cell if it is no longer needed.
df_ump = df_ump.drop(columns='nan', errors='ignore')

In [20]:
# Exclude every game whose date string starts with '2015'.
# The date column is named 'game_date' after the rename cell above; the
# previous 'date' label no longer exists and raised KeyError.
df_ump = df_ump[~df_ump['game_date'].str.startswith('2015')]

In [35]:
# Preview the five earliest games to confirm the date filter took effect.
df_ump.sort_values('game_date', ascending=True).head(n=5)

Unnamed: 0,game_date,umpire,home_team,away_team
20430,2016-04-03,Gerry Davis,KC,NYM
20429,2016-04-03,Mike Everitt,TB,TOR
20428,2016-04-03,Jerry Layne,PIT,STL
20419,2016-04-04,Mike Winters,BAL,MIN
20418,2016-04-04,Brian Gorman,MIL,SF


In [19]:
# Inspect the column dtypes of the cleaned umpire frame.
# NOTE(review): the saved output below lists columns date/home/away, i.e. it
# was produced before the rename to game_date/home_team/away_team
# (out-of-order execution) - re-run to refresh.
df_ump.dtypes

date      object
umpire    object
home      object
away      object
dtype: object

In [36]:
# Persist the cleaned umpire frame as the `umpire_data` table, replacing any
# existing table of that name and omitting the DataFrame index.
with NotebookDBConnection() as conn:
    df_ump.to_sql(
        name='umpire_data',
        con=conn,
        if_exists='replace',
        index=False,
    )

In [112]:
# List the names of all tables registered in sqlite_master.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    # Each fetched row carries the table name as its first field.
    print([name for name, *_ in tables])

['statcast_pitchers', 'statcast_batters', 'team_mapping', 'pitcher_mapping', 'prediction_features', 'umpire_data', 'team_batting', 'mlb_api', 'pitcher_features', 'combined_features', 'batter_features', 'game_level_pitchers', 'game_level_batters', 'game_level_team_stats', 'advanced_opponent_game_features', 'team_season_features', 'train_combined_features', 'test_combined_features']


In [85]:
# Report how many rows the training feature table currently holds.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM train_combined_features")
    train_row_count = cursor.fetchall()
    print(train_row_count)


[(76667,)]


In [98]:
# Attach the umpire to every statcast pitcher row by matching on game date
# and the home/away team pair.
# NOTE(review): (game_date, home_team, away_team) is presumably not unique on
# doubleheader days, in which case this inner join would duplicate pitch
# rows - confirm against umpire_data before relying on the row count.
with NotebookDBConnection() as conn:
    pitchers = pd.read_sql_query('''
    SELECT statcast_pitchers.*, umpire_data.umpire 
    FROM statcast_pitchers 
    JOIN umpire_data 
    ON statcast_pitchers.game_date = umpire_data.game_date
    AND statcast_pitchers.home_team = umpire_data.home_team
    AND statcast_pitchers.away_team = umpire_data.away_team
    ''', conn)

# The preview must be the cell's trailing top-level expression to be
# rendered; nested inside the `with` body its result was silently discarded.
pitchers.head()


In [103]:
# (rows, columns) of the joined statcast/umpire frame.
pitchers.shape

(7788466, 116)

In [117]:
# Pull the most recent prediction-feature rows (newest game_date first,
# capped at 100) and display them.
with NotebookDBConnection() as conn:
    df1 = pd.read_sql_query(
        sql="SELECT * FROM prediction_features ORDER BY game_date DESC LIMIT 100",
        con=conn,
    )
df1

Unnamed: 0,gamePk,game_date,pitcher_id,team_id,opponent_team_id,is_home,rolling_3g_k9,rolling_3g_k_pct,rolling_3g_swstr_pct,rolling_3g_velocity,rolling_3g_K_std,rolling_5g_k9,rolling_5g_k_pct,rolling_5g_swstr_pct,rolling_5g_velocity,rolling_5g_K_std,rolling_10g_k9,rolling_10g_k_pct,rolling_10g_swstr_pct,rolling_10g_velocity,rolling_10g_K_std,rolling_3g_fastball_pct,rolling_5g_fastball_pct,rolling_3g_breaking_pct,rolling_5g_breaking_pct,rolling_3g_offspeed_pct,rolling_5g_offspeed_pct,career_k9,career_k_pct,K_last_game,days_since_last_game,rest_days_4_less,rest_days_5,rest_days_6_more,is_month_3,is_month_4,is_month_5,is_month_6,is_month_7,is_month_8,is_month_9,is_month_10,recent_vs_career_k9,throws_right,lag_1_fastball_percent,lag_2_fastball_percent,lag_1_breaking_percent,lag_2_breaking_percent,lag_1_offspeed_percent,lag_2_offspeed_percent,rolling_3g_k_per_9_std_lag1,rolling_5g_k_per_9_std_lag1,rolling_10g_k_per_9_std_lag1,ewma_5g_k_per_9_lag1,rolling_3g_k_percent_std_lag1,rolling_5g_k_percent_std_lag1,rolling_10g_k_percent_std_lag1,ewma_5g_k_percent_lag1,k_percent_change_lag1,rolling_3g_swinging_strike_percent_std_lag1,rolling_5g_swinging_strike_percent_std_lag1,rolling_10g_swinging_strike_percent_std_lag1,ewma_5g_swinging_strike_percent_lag1,swinging_strike_percent_change_lag1,rolling_3g_avg_velocity_std_lag1,rolling_5g_avg_velocity_std_lag1,rolling_10g_avg_velocity_std_lag1,ewma_10g_avg_velocity_lag1,rolling_3g_fastball_percent_std_lag1,rolling_5g_fastball_percent_std_lag1,rolling_10g_fastball_percent_std_lag1,ewma_10g_fastball_percent_lag1,fastball_percent_change_lag1,rolling_3g_breaking_percent_std_lag1,rolling_5g_breaking_percent_std_lag1,rolling_10g_breaking_percent_std_lag1,ewma_10g_breaking_percent_lag1,breaking_percent_change_lag1,rolling_3g_offspeed_percent_std_lag1,rolling_5g_offspeed_percent_std_lag1,rolling_10g_offspeed_percent_std_lag1,ewma_10g_offspeed_percent_lag1,offspeed_percent_change_lag1,opp_base_team_k_percent,opp_base_team_bb_k_ratio,opp_base_t
eam_zone_percent,opp_base_team_o_swing_percent,opp_base_team_z_contact_percent,opp_base_team_contact_percent,opp_base_team_swstr_percent,opp_base_team_wfb_c,opp_base_team_wsl_c,opp_base_team_wct_c,opp_base_team_wcb_c,opp_base_team_wch_c,opp_base_team_wsf_c
0,778348,2025-04-12,663978,142,116,1,7.615385,0.183333,0.10009,87.306052,2.081666,6.3,0.147368,0.085823,87.067364,2.167948,7.448276,0.192308,0.094763,87.265356,2.211083,0.457672,0.462938,0.322574,0.300756,0.219754,0.236306,8.130112,0.200495,2.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,0.774897,1.0,0.494382,0.455556,0.325843,0.244444,0.179775,0.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,778347,2025-04-12,571945,138,143,1,6.1875,0.166667,0.070344,88.844536,0.57735,7.253731,0.183673,0.082947,88.678388,0.547723,7.328571,0.184466,0.068581,88.622194,1.229273,0.527809,0.525945,0.384637,0.389116,0.071482,0.060482,6.504049,0.165049,3.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,1.115264,1.0,0.475,0.565217,0.4125,0.358696,0.0875,0.065217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,778347,2025-04-12,650911,143,138,0,9.6,0.246154,0.142738,90.259978,1.527525,10.253165,0.263158,0.137461,90.100078,1.414214,9.252809,0.247967,0.136207,89.943282,1.66333,0.483184,0.488245,0.160278,0.145944,0.356538,0.365811,8.139706,0.205514,7.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,1.259648,0.0,0.462366,0.465909,0.107527,0.181818,0.430108,0.352273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,778355,2025-04-12,663855,137,147,0,9.0,0.228571,0.095594,91.669754,2.886751,7.2,0.177778,0.067883,92.32568,2.50998,6.612245,0.162162,0.080669,91.122466,1.75119,0.664304,0.720113,0.258962,0.258272,0.053544,0.053544,8.473846,0.203187,6.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,0.849673,1.0,0.722222,0.62069,0.26087,0.26087,0.041667,0.068966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,778346,2025-04-12,656849,121,133,0,11.117647,0.241379,0.100322,89.073389,3.785939,8.816327,0.202532,0.105585,89.147633,3.34664,8.612069,0.20904,0.128247,89.420941,2.983287,0.645018,0.634142,0.175908,0.201023,0.179074,0.164835,8.038168,0.197302,9.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,1.096808,0.0,0.550562,0.759494,0.213483,0.139241,0.235955,0.101266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,778362,2025-04-12,670102,141,110,0,7.56,0.212121,0.088505,85.992929,1.154701,6.230769,0.177966,0.10078,86.115391,2.04939,8.526316,0.25641,0.112627,86.962401,2.905933,0.574326,0.558342,0.261192,0.253466,0.164482,0.188192,8.125392,0.222738,4.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,0.766827,1.0,0.571429,0.591549,0.197802,0.295775,0.230769,0.112676,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,778349,2025-04-12,527048,145,111,1,11.045455,0.28125,0.104691,85.906783,3.0,8.766234,0.235849,0.100605,86.013135,2.738613,8.28,0.211009,0.102931,85.300001,2.674987,0.658774,0.642004,0.072769,0.083046,0.268456,0.274951,7.974093,0.192568,9.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,1.099339,0.0,0.655914,0.703125,0.053763,0.078125,0.290323,0.21875,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,778353,2025-04-12,656876,139,144,1,10.0,0.294118,0.08683,92.821227,1.154701,10.945946,0.3125,0.110034,93.504879,1.0,9.703125,0.27381,0.133466,93.743118,1.251666,0.939683,0.958254,0.071429,0.071429,0.169272,0.16967,10.57732,0.285714,4.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,1.034851,1.0,0.885714,0.933333,0.071429,0.26087,0.166667,0.166667,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,778353,2025-04-12,700363,144,139,0,8.37931,0.1875,0.113748,88.11745,1.732051,7.815789,0.15942,0.094927,88.136165,1.5,6.402062,0.150327,0.100025,88.236255,1.509231,0.447729,0.486437,0.236155,0.228398,0.316117,0.285165,8.37931,0.1875,4.0,6.0,0.0,0.0,1.0,0,1,0,0,0,0,0,0,0.932749,1.0,0.285714,0.517241,0.363636,0.137931,0.350649,0.344828,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,778354,2025-04-12,645261,146,120,1,5.4,0.125,0.0,89.037821,0.707107,2.7,0.058824,0.0,88.670547,0.57735,11.571429,0.272727,0.095134,86.047244,0.834523,0.491453,0.447635,0.354701,0.436467,0.307692,0.173846,8.735294,0.222798,2.0,5.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,1.0,1.0,0.552083,0.552786,0.26087,0.26087,0.166667,0.166667,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [119]:
# (rows, columns) of the fetched prediction features. The saved output shows
# 19 rows, fewer than the LIMIT of 100 used in the query that built df1.
df1.shape

(19, 96)

In [118]:
# Distinct values of one lagged rolling-std feature in the fetched rows.
# The saved output is a single 0, i.e. every fetched row has the same value.
# NOTE(review): presumably the lag/std features are not being populated for
# prediction rows - confirm in the feature pipeline.
df1['rolling_5g_k_per_9_std_lag1'].unique()

array([0], dtype=int64)

In [83]:
# Sample ten combined-feature rows that have no umpire attached, skipping the
# first 2500 such rows in game_date order.
# The result gets its own name: rebinding `df` here clobbered the raw umpire
# CSV frame that the `df_ump` selection cell reads from, so re-running that
# cell afterwards failed. The unused cursor has been removed as well.
with NotebookDBConnection() as conn:
    combined_missing_umpire = pd.read_sql_query("SELECT * FROM combined_features WHERE umpire IS NULL ORDER BY game_date LIMIT 10 OFFSET 2500", conn)

combined_missing_umpire

Unnamed: 0,pitcher_id,player_name,game_date,game_pk,home_team,away_team,p_throws,season,strikeouts,batters_faced,total_pitches,avg_velocity,max_velocity,avg_spin_rate,avg_horizontal_break,avg_vertical_break,zone_percent,swinging_strike_percent,innings_pitched,k_per_9,k_percent,fastball_percent,breaking_percent,offspeed_percent,rolling_3g_k9,rolling_3g_k_pct,rolling_3g_swstr_pct,rolling_3g_velocity,rolling_3g_K_std,rolling_5g_k9,rolling_5g_k_pct,rolling_5g_swstr_pct,rolling_5g_velocity,rolling_5g_K_std,rolling_10g_k9,rolling_10g_k_pct,rolling_10g_swstr_pct,rolling_10g_velocity,rolling_10g_K_std,rolling_3g_fastball_pct,rolling_5g_fastball_pct,rolling_3g_breaking_pct,rolling_5g_breaking_pct,rolling_3g_offspeed_pct,rolling_5g_offspeed_pct,career_k9,career_k_pct,is_home,K_last_game,days_since_last_game,rest_days_4_less,rest_days_5,rest_days_6_more,game_month,is_month_3,is_month_4,is_month_5,is_month_6,is_month_7,is_month_8,is_month_9,is_month_10,recent_vs_career_k9,throws_right,lag_1_fastball_percent,lag_2_fastball_percent,lag_1_breaking_percent,lag_2_breaking_percent,lag_1_offspeed_percent,lag_2_offspeed_percent,inning,score_differential,is_close_game,is_playoff,rolling_3g_k_per_9_std_lag1,rolling_5g_k_per_9_std_lag1,rolling_10g_k_per_9_std_lag1,ewma_5g_k_per_9_lag1,rolling_3g_k_percent_std_lag1,rolling_5g_k_percent_std_lag1,rolling_10g_k_percent_std_lag1,ewma_5g_k_percent_lag1,k_percent_change_lag1,rolling_3g_swinging_strike_percent_std_lag1,rolling_5g_swinging_strike_percent_std_lag1,rolling_10g_swinging_strike_percent_std_lag1,ewma_5g_swinging_strike_percent_lag1,swinging_strike_percent_change_lag1,rolling_3g_avg_velocity_std_lag1,rolling_5g_avg_velocity_std_lag1,rolling_10g_avg_velocity_std_lag1,ewma_10g_avg_velocity_lag1,rolling_3g_fastball_percent_std_lag1,rolling_5g_fastball_percent_std_lag1,rolling_10g_fastball_percent_std_lag1,ewma_10g_fastball_percent_lag1,fastball_percent_change_lag1,rolling_3g_breaking_percent_std_lag1,rolling_5g_breaking_percent_std_lag1,ro
lling_10g_breaking_percent_std_lag1,ewma_10g_breaking_percent_lag1,breaking_percent_change_lag1,rolling_3g_offspeed_percent_std_lag1,rolling_5g_offspeed_percent_std_lag1,rolling_10g_offspeed_percent_std_lag1,ewma_10g_offspeed_percent_lag1,offspeed_percent_change_lag1,split,umpire
0,663765,"Woodford, Jake",2017-03-06,509787,MIN,STL,R,2017,0,6,9,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.0,1.0,0.0,0.0,0.577778,0.272727,0.147727,7.714286,0.2,0.027778,88.238889,1.414214,6.48,0.26087,0.051322,87.101317,1.914854,7.156627,0.285714,0.065099,88.369681,2.603417,0.666667,0.57041,0.222222,0.261834,0.111111,0.167756,0.0,0.0,0,0.0,5.0,0.0,1.0,0.0,3,1,0,0,0,0,0,0,0,inf,1.0,0.577778,0.578313,0.272727,0.272727,0.147727,0.147541,5,0,1,0,4.487559,4.679496,4.977887,8.735585,0.19245,0.217307,0.231774,0.387245,0.0,0.049076,0.056021,0.067425,0.112759,0.0,0.88854,0.997723,1.116884,88.987456,0.070806,0.079586,0.088049,0.578343,0.0,0.060685,0.068328,0.076676,0.270045,0.0,0.046111,0.052014,0.059604,0.1348,-0.000208,train,
1,664285,"Valdez, Framber",2017-03-06,509684,HOU,BOS,L,2017,1,7,12,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.25,1.666667,5.4,0.142857,0.577778,0.272727,0.147727,10.8,0.285714,0.254167,90.346667,1.414214,9.0,0.235294,0.262711,91.111795,0.816497,6.75,0.192308,0.166766,90.639816,1.054093,0.333333,0.397436,0.333333,0.358974,0.266667,0.210256,18.0,0.5,0,2.0,4.0,1.0,0.0,0.0,3,1,0,0,0,0,0,0,0,0.5,0.0,0.577778,0.578313,0.272727,0.272727,0.147727,0.147541,5,0,1,0,4.487559,4.679496,4.977887,8.735585,0.19245,0.217307,0.231774,0.387245,0.0,0.049076,0.056021,0.067425,0.112759,0.0,0.88854,0.997723,1.116884,88.987456,0.070806,0.079586,0.088049,0.578343,0.0,0.060685,0.068328,0.076676,0.270045,0.0,0.046111,0.052014,0.059604,0.1348,-0.000208,train,
2,282332,"Sabathia, CC",2017-03-07,509824,NYY,TB,L,2017,2,9,16,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.0,2.0,9.0,0.222222,0.577778,0.272727,0.147727,7.5,0.384615,0.087744,86.943854,6.0,8.258824,0.429752,0.103889,86.523843,4.774935,9.411429,0.48996,0.112244,86.808962,6.425643,0.710714,0.673666,0.250321,0.255124,0.038965,0.07121,7.915966,0.37072,0,16.0,159.0,0.0,0.0,1.0,3,1,0,0,0,0,0,0,0,1.043312,0.0,0.685714,0.714286,0.257143,0.252747,0.057143,0.032967,5,0,1,0,4.173084,3.15162,4.54354,8.191621,0.230769,0.190715,0.228243,0.436796,0.461538,0.032055,0.032415,0.037934,0.101135,0.046886,0.872154,1.431098,1.641463,86.764077,0.023419,0.102548,0.129145,0.689759,-0.028571,0.008306,0.027289,0.0857,0.254431,0.004396,0.016043,0.07675,0.061124,0.061148,0.024176,train,
3,425794,"Wainwright, Adam",2017-03-07,509948,STL,MIA,R,2017,2,10,14,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.357143,2.333333,7.714286,0.2,0.577778,0.272727,0.147727,10.875,0.453125,0.204523,85.248403,5.859465,8.261194,0.353448,0.145545,84.910273,4.816638,8.665541,0.390947,0.118957,85.126822,4.143268,0.66466,0.682094,0.260485,0.246866,0.014854,0.03073,7.828125,0.359914,0,3.0,5.0,0.0,1.0,0.0,3,1,0,0,0,0,0,0,0,1.055322,1.0,0.577778,0.64,0.272727,0.22,0.147727,0.02,5,0,1,0,1.992725,3.977634,2.863648,10.66611,0.103355,0.144069,0.133963,0.391472,-0.205128,0.236012,0.185737,0.129564,0.20428,0.42619,1.002108,0.73354,0.8567,85.037388,0.034875,0.029366,0.041571,0.694135,0.0,0.057255,0.039078,0.032639,0.246239,0.0,0.007277,0.019235,0.026408,0.03237,-0.000208,train,
4,432934,"Young, Chris",2017-03-07,509617,CIN,KC,R,2017,5,13,29,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.344828,3.0,15.0,0.384615,0.577778,0.272727,0.147727,13.5,0.608696,0.219088,84.625769,2.309401,11.571429,0.418605,0.160203,84.965072,2.607681,11.8125,0.405797,0.1385,85.445135,2.149935,0.392857,0.413616,0.592857,0.551637,0.028571,0.046329,9.856401,0.418651,0,2.0,5.0,0.0,1.0,0.0,3,1,0,0,0,0,0,0,0,1.174001,1.0,0.577778,0.285714,0.272727,0.685714,0.147727,0.028571,5,0,1,0,3.637307,6.383024,7.955517,11.283276,0.39707,0.408337,0.435264,0.510203,-0.777778,0.118678,0.117401,0.089502,0.160163,-0.088889,0.121296,0.473597,0.91046,85.357689,0.151523,0.094949,0.121737,0.453176,0.0,0.13132,0.10751,0.104326,0.517087,0.0,0.046111,0.041497,0.059604,0.041286,-0.000208,train,
5,434378,"Verlander, Justin",2017-03-07,509671,DET,PHI,R,2017,2,15,26,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.038462,3.0,6.0,0.133333,0.577778,0.272727,0.147727,11.659091,0.584615,0.102165,90.458788,11.372481,12.182927,0.637931,0.113948,90.701049,9.011104,11.679775,0.633745,0.114762,91.162777,6.736303,0.587013,0.606611,0.366017,0.338177,0.027922,0.045688,10.061505,0.51655,0,0.0,5.0,0.0,1.0,0.0,3,1,0,0,0,0,0,0,0,1.210845,1.0,0.577778,0.628571,0.272727,0.304762,0.147727,0.028571,5,0,1,0,7.391832,6.599496,4.76991,8.398043,0.421203,0.347247,0.246019,0.457088,-0.592593,0.089085,0.065401,0.045969,0.093506,-0.142857,0.2717,0.443191,0.776331,90.862468,0.058773,0.054138,0.047869,0.602016,0.0,0.086628,0.070406,0.052507,0.339372,0.0,0.000918,0.020629,0.020824,0.051679,-0.000208,train,
6,434672,"Smith, Chris",2017-03-07,509512,AZ,OAK,R,2017,0,3,13,83.036364,89.1,2015.0,-0.651818,1.351818,0.538462,0.153846,1.0,0.0,0.0,0.384615,0.230769,0.153846,8.307692,0.333333,0.113072,83.062255,3.05505,9.391304,0.457143,0.122465,83.410959,2.683282,11.089286,0.547619,0.126,83.089638,3.405877,0.490686,0.496919,0.213725,0.283859,0.295588,0.292297,10.87013,0.488189,0,2.0,6.0,0.0,0.0,1.0,3,1,0,0,0,0,0,0,0,0.863955,1.0,0.577778,0.514706,0.272727,0.294118,0.147727,0.191176,5,0,1,0,6.076049,5.230503,4.906984,8.601822,0.232939,0.313707,0.24057,0.379773,-0.175824,0.022894,0.058725,0.050036,0.117467,0.029412,0.606587,0.562239,0.615986,83.170584,0.033969,0.033813,0.066411,0.503237,0.0,0.113692,0.177149,0.12529,0.252863,0.0,0.147661,0.104567,0.069682,0.280449,-0.000208,train,
7,448802,"García, Jaime",2017-03-07,509877,PIT,ATL,L,2017,4,13,27,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.222222,2.666667,13.5,0.307692,0.577778,0.272727,0.147727,7.941176,0.357143,0.049645,87.191388,5.773503,10.607143,0.468085,0.082367,87.110395,5.176872,9.918367,0.431138,0.090061,86.738161,5.593647,0.557244,0.542545,0.294326,0.255885,0.14843,0.201569,8.362832,0.386978,0,0.0,6.0,0.0,0.0,1.0,3,1,0,0,0,0,0,0,0,1.268367,0.0,0.577778,0.47619,0.272727,0.333333,0.147727,0.190476,5,0,1,0,6.495191,9.426399,7.001161,5.529984,0.444116,0.50018,0.356631,0.292527,0.0,0.085988,0.083505,0.059807,0.058705,0.0,0.923896,0.583033,1.161922,87.118389,0.114627,0.069104,0.060323,0.572021,0.0,0.055164,0.070867,0.064783,0.226,0.0,0.059463,0.07844,0.075638,0.195598,-0.000208,train,
8,459987,"Ramos, Cesár",2017-03-07,509671,DET,PHI,L,2017,1,7,9,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.111111,2.0,4.5,0.142857,0.577778,0.272727,0.147727,2.25,0.05,0.040714,84.353901,0.57735,3.521739,0.076923,0.044429,84.635224,0.894427,4.736842,0.10101,0.058956,84.228633,1.054093,0.355398,0.415794,0.284082,0.342041,0.36052,0.242165,6.352941,0.149813,0,0.0,4.0,1.0,0.0,0.0,3,1,0,0,0,0,0,0,0,0.554348,0.0,0.577778,0.37037,0.272727,0.185185,0.147727,0.444444,5,0,1,0,2.226922,3.449933,4.070865,1.739371,0.057735,0.068213,0.116285,0.041855,0.0,0.042672,0.046758,0.047391,0.036234,-0.037037,1.243706,1.517479,1.330902,84.12029,0.021174,0.13645,0.123032,0.416857,0.0,0.139861,0.118162,0.093622,0.345561,0.0,0.118687,0.165054,0.146269,0.254462,-0.000208,train,
9,474699,"Worley, Vance",2017-03-07,510015,WSH,BOS,R,2017,1,7,15,88.939903,95.0,2237.0,-0.189537,0.671216,0.0,0.2,1.0,9.0,0.142857,0.577778,0.272727,0.147727,7.448276,0.275862,0.011299,88.308051,4.618802,5.294118,0.196078,0.00678,88.47391,3.464102,5.256637,0.217822,0.013947,88.539178,2.573368,0.940678,0.868037,0.118644,0.168704,0.145556,0.021739,5.893657,0.26,0,0.0,4.0,1.0,0.0,0.0,3,1,0,0,0,0,0,0,0,0.898274,1.0,0.577778,0.881356,0.272727,0.118644,0.147727,0.147541,5,0,1,0,6.928203,5.460082,4.520746,3.963636,0.329914,0.249969,0.24467,0.191098,-0.571429,0.019571,0.01516,0.018802,0.010757,-0.033898,0.342168,0.672489,0.757206,88.532417,0.083894,0.100048,0.102565,0.853322,0.0,0.060685,0.060056,0.089557,0.174597,0.0,0.046111,0.052014,0.059604,0.029179,-0.000208,train,


In [6]:
# Load the full `mlb_api` table (schedule and probable pitchers per the
# columns displayed below) into memory.
with NotebookDBConnection() as conn:
    df_pred = pd.read_sql_query(sql="SELECT * FROM mlb_api", con=conn)

In [7]:
# Display the rows loaded from the `mlb_api` table.
df_pred

Unnamed: 0,gamePk,game_date,home_team_id,home_team_name,home_team_abbr,away_team_id,away_team_name,away_team_abbr,home_probable_pitcher_id,home_probable_pitcher_name,away_probable_pitcher_id,away_probable_pitcher_name
0,778348,2025-04-12,142,Minnesota Twins,MIN,116,Detroit Tigers,DET,663978,Chris Paddack,695549,Jackson Jobe
1,778347,2025-04-12,138,St. Louis Cardinals,STL,143,Philadelphia Phillies,PHI,571945,Miles Mikolas,650911,Cristopher Sánchez
2,778355,2025-04-12,147,New York Yankees,NYY,137,San Francisco Giants,SF,701542,Will Warren,663855,Jordan Hicks
3,778346,2025-04-12,133,Athletics,ATH,121,New York Mets,NYM,669372,J.T. Ginn,656849,David Peterson
4,778362,2025-04-12,110,Baltimore Orioles,BAL,141,Toronto Blue Jays,TOR,608372,Tomoyuki Sugano,670102,Bowden Francis
5,778349,2025-04-12,145,Chicago White Sox,CWS,111,Boston Red Sox,BOS,527048,Martín Pérez,690916,Richard Fitts
6,778353,2025-04-12,139,Tampa Bay Rays,TB,144,Atlanta Braves,ATL,656876,Drew Rasmussen,700363,AJ Smith-Shawver
7,778354,2025-04-12,146,Miami Marlins,MIA,120,Washington Nationals,WSH,645261,Sandy Alcantara,592866,Trevor Williams
8,778359,2025-04-12,114,Cleveland Guardians,CLE,118,Kansas City Royals,KC,682847,Luis L. Ortiz,547179,Michael Lorenzen
9,778357,2025-04-12,113,Cincinnati Reds,CIN,134,Pittsburgh Pirates,PIT,671096,Andrew Abbott,571760,Andrew Heaney


In [122]:
# Read the first 20000 rows of each source table over a single connection;
# the three frames are exported as CSV subsets in the next cell.
with NotebookDBConnection() as conn:
    df_pitch, df_batter, df_team = [
        pd.read_sql_query(subset_query, conn)
        for subset_query in (
            "SELECT * FROM statcast_pitchers LIMIT 20000",
            "SELECT * FROM statcast_batters LIMIT 20000",
            "SELECT * FROM team_batting LIMIT 20000",
        )
    ]

In [123]:
# Write each subset frame to a CSV file in the current working directory.
# The DataFrame index is written too (pandas default).
df_pitch.to_csv(path_or_buf='statcast_pitcher_subset.csv')
df_batter.to_csv(path_or_buf='statcast_batter_subset.csv')
df_team.to_csv(path_or_buf='team_batting.csv')