In [1]:
import pandas as pd
import sys
import os
from pathlib import Path
import sqlite3

# Resolve the directory two levels above the current working directory and
# register it on sys.path (once) so the `src` package imports below resolve.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)
    
from src.data.utils import DBConnection
from src.config import DBConfig

In [2]:
# Lift the column cap so wide frames are shown without elided columns.
pd.options.display.max_columns = None

In [4]:
class NotebookDBConnection(DBConnection):
    """``DBConnection`` preconfigured for this notebook's database file.

    Passes ``<project_root>/data/pitcher_stats.db`` to the parent class as
    ``db_name`` so cells can write ``with NotebookDBConnection() as conn:``
    without repeating the path.
    """

    def __init__(self):
        # project_root is the notebook-level variable defined in the setup cell.
        notebook_db_file = os.path.join(project_root, 'data', 'pitcher_stats.db')
        super().__init__(db_name=notebook_db_file)

In [5]:
# Print the name of every table registered in the database's sqlite_master catalog.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    # Each fetched row starts with the table name; keep just that field.
    print([name for name, *_ in tables])

['statcast_pitchers', 'statcast_batters', 'pitcher_mapping', 'umpire_data', 'team_batting', 'pitcher_features', 'combined_features', 'batter_features', 'game_level_batters', 'advanced_opponent_game_features', 'team_season_features', 'train_combined_features', 'test_combined_features', 'prediction_features', 'daily_predictions', 'game_level_pitchers', 'game_level_team_stats', 'train_features', 'test_features', 'mlb_api', 'team_mapping', 'train_features_advanced']


In [None]:
# Report how many rows the train_combined_features table holds.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM train_combined_features")
    count_rows = cursor.fetchall()
    print(count_rows)


[(76667,)]


In [18]:
# Load every row of daily_predictions into a DataFrame and display it.
# pd.read_sql_query works directly on the connection, so no cursor is needed.
with NotebookDBConnection() as conn:
    df = pd.read_sql_query("SELECT * FROM daily_predictions", conn)

df

Unnamed: 0,prediction_run_date,game_date,gamePk,pitcher_id,pitcher_name,team_abbr,opponent_abbr,predicted_strikeouts,model_version,actual_strikeouts
0,2025-04-15,2025-04-15,778302,689017,Landon Knack,LAD,COL,4.27,test_strikeout_model_20250415_175930,
1,2025-04-15,2025-04-15,778302,663372,Ryan Feltner,COL,LAD,4.3,test_strikeout_model_20250415_175930,
2,2025-04-15,2025-04-15,778305,684007,Shota Imanaga,CHC,SD,4.35,test_strikeout_model_20250415_175930,
3,2025-04-15,2025-04-15,778305,681190,Randy Vásquez,SD,CHC,3.5,test_strikeout_model_20250415_175930,
4,2025-04-15,2025-04-15,778306,641816,Tyler Mahle,TEX,LAA,4.66,test_strikeout_model_20250415_175930,
5,2025-04-15,2025-04-15,778306,579328,Yusei Kikuchi,LAA,TEX,6.41,test_strikeout_model_20250415_175930,
6,2025-04-15,2025-04-15,778307,656731,Tylor Megill,NYM,MIN,4.89,test_strikeout_model_20250415_175930,
7,2025-04-15,2025-04-15,778307,641927,Bailey Ober,MIN,NYM,5.52,test_strikeout_model_20250415_175930,
8,2025-04-15,2025-04-15,778308,607200,Erick Fedde,STL,HOU,3.16,test_strikeout_model_20250415_175930,
9,2025-04-15,2025-04-15,778308,686613,Hunter Brown,HOU,STL,6.05,test_strikeout_model_20250415_175930,


In [None]:
# Read the full mlb_api table into df_pred for inspection in the next cell.
with NotebookDBConnection() as conn:
    df_pred = pd.read_sql_query("SELECT * FROM mlb_api", conn)

In [None]:
# Display the rows loaded from the mlb_api table.
df_pred

Unnamed: 0,gamePk,game_date,home_team_id,home_team_name,home_team_abbr,away_team_id,away_team_name,away_team_abbr,home_probable_pitcher_id,home_probable_pitcher_name,away_probable_pitcher_id,away_probable_pitcher_name
0,778348,2025-04-12,142,Minnesota Twins,MIN,116,Detroit Tigers,DET,663978,Chris Paddack,695549,Jackson Jobe
1,778347,2025-04-12,138,St. Louis Cardinals,STL,143,Philadelphia Phillies,PHI,571945,Miles Mikolas,650911,Cristopher Sánchez
2,778355,2025-04-12,147,New York Yankees,NYY,137,San Francisco Giants,SF,701542,Will Warren,663855,Jordan Hicks
3,778346,2025-04-12,133,Athletics,ATH,121,New York Mets,NYM,669372,J.T. Ginn,656849,David Peterson
4,778362,2025-04-12,110,Baltimore Orioles,BAL,141,Toronto Blue Jays,TOR,608372,Tomoyuki Sugano,670102,Bowden Francis
5,778349,2025-04-12,145,Chicago White Sox,CWS,111,Boston Red Sox,BOS,527048,Martín Pérez,690916,Richard Fitts
6,778353,2025-04-12,139,Tampa Bay Rays,TB,144,Atlanta Braves,ATL,656876,Drew Rasmussen,700363,AJ Smith-Shawver
7,778354,2025-04-12,146,Miami Marlins,MIA,120,Washington Nationals,WSH,645261,Sandy Alcantara,592866,Trevor Williams
8,778359,2025-04-12,114,Cleveland Guardians,CLE,118,Kansas City Royals,KC,682847,Luis L. Ortiz,547179,Michael Lorenzen
9,778357,2025-04-12,113,Cincinnati Reds,CIN,134,Pittsburgh Pirates,PIT,671096,Andrew Abbott,571760,Andrew Heaney


In [None]:
# Pull a capped number of rows from each raw table for offline inspection.
# NOTE(review): the queries have no ORDER BY, so which rows come back is
# whatever SQLite happens to return first, not a defined sample.
SUBSET_ROW_LIMIT = 20000  # maximum rows fetched per table

with NotebookDBConnection() as conn:
    df_pitch = pd.read_sql_query(f"SELECT * FROM statcast_pitchers LIMIT {SUBSET_ROW_LIMIT}", conn)
    df_batter = pd.read_sql_query(f"SELECT * FROM statcast_batters LIMIT {SUBSET_ROW_LIMIT}", conn)
    df_team = pd.read_sql_query(f"SELECT * FROM team_batting LIMIT {SUBSET_ROW_LIMIT}", conn)

In [None]:
# Write each subset to a CSV file; the relative names resolve against the
# current working directory, and to_csv is called with its default arguments.
for frame, csv_name in (
    (df_pitch, 'statcast_pitcher_subset.csv'),
    (df_batter, 'statcast_batter_subset.csv'),
    (df_team, 'team_batting.csv'),
):
    frame.to_csv(csv_name)

In [None]:
# Preview the first rows of the statcast_pitchers subset held in df_pitch.
df_pitch.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length,estimated_slg_using_speedangle,delta_pitcher_run_exp,hyper_speed,home_score_diff,bat_score_diff,home_win_exp,bat_win_exp,age_pit_legacy,age_bat_legacy,age_pit,age_bat,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,pitcher_id,season
0,CH,2016-09-20,91.2,-2.66,5.68,"Fernández, José",502517,605228,field_out,hit_into_play,,,,,13.0,"Daniel Murphy grounds out, second baseman Dee ...",R,L,R,MIA,WSH,X,4.0,ground_ball,1,2,2016,-1.27,0.52,-0.97,1.32,572191.0,,571718.0,2,8,Top,149.23,158.53,,,,160920_210323,7.439148,-133.556559,-5.258917,-16.269,28.524,-25.05,3.3,1.58,33.0,79.9,0.0,92.0,1493.0,6.8,449130,592663,500743,543829,445988,588751,542303,592885,400085,54.5,0.28,0.243,0.0,1.0,0.0,0.0,2.0,54,4,Changeup,1,0,0,1,0,1,0,1,Standard,Standard,244.0,0.119,-0.197,,,0.326,0.197,88.0,1,-1,0.752,0.248,23,31,24,31,3,0,6.0,1.0,,12.0,2.26,1.27,-1.27,,605228,2016
1,FF,2016-09-20,96.3,-2.39,5.99,"Fernández, José",502517,605228,,foul,,,,,11.0,"Daniel Murphy grounds out, second baseman Dee ...",R,L,R,MIA,WSH,S,,,1,1,2016,-0.72,1.38,-0.04,3.62,572191.0,,571718.0,2,8,Top,,,,,,160920_210233,8.161504,-141.023424,-3.274408,-11.13,33.356,-13.724,3.3,1.58,,,,97.0,2353.0,6.8,449130,592663,500743,543829,445988,588751,542303,592885,400085,54.5,,,,,,,,54,3,4-Seam Fastball,1,0,0,1,0,1,0,1,Standard,Standard,199.0,0.0,-0.068,,,,0.068,,1,-1,0.752,0.248,23,31,24,31,3,0,6.0,1.0,,12.0,1.13,0.72,-0.72,,605228,2016
2,FF,2016-09-20,95.6,-2.26,5.99,"Fernández, José",502517,605228,,ball,,,,,14.0,"Daniel Murphy grounds out, second baseman Dee ...",R,L,R,MIA,WSH,B,,,0,1,2016,-0.72,1.42,1.39,2.19,572191.0,,571718.0,2,8,Top,,,,,,160920_210210,11.4735,-139.462135,-7.054365,-11.535,30.023,-12.651,3.36,1.76,,,,96.4,2498.0,6.8,449130,592663,500743,543829,445988,588751,542303,592885,400085,54.5,,,,,,,,54,2,4-Seam Fastball,1,0,0,1,0,1,0,1,Standard,Standard,214.0,0.0,0.028,,,,-0.028,,1,-1,0.752,0.248,23,31,24,31,3,0,6.0,1.0,,12.0,1.12,0.72,-0.72,,605228,2016
3,FF,2016-09-20,96.3,-2.42,6.02,"Fernández, José",502517,605228,,called_strike,,,,,5.0,"Daniel Murphy grounds out, second baseman Dee ...",R,L,R,MIA,WSH,S,,,0,0,2016,-0.95,1.62,-0.17,2.42,572191.0,,571718.0,2,8,Top,,,,,,160920_210149,8.495281,-140.82199,-7.148967,-14.162,33.45,-9.821,3.28,1.81,,,,96.7,2492.0,6.7,449130,592663,500743,543829,445988,588751,542303,592885,400085,54.5,,,,,,,,54,1,4-Seam Fastball,1,0,0,1,0,1,0,1,Standard,Standard,222.0,0.0,-0.043,,,,0.043,,1,-1,0.752,0.248,23,31,24,31,3,0,6.0,1.0,,12.0,0.9,0.95,-0.95,,605228,2016
4,CH,2016-09-20,91.4,-2.7,5.59,"Fernández, José",457787,605228,strikeout,swinging_strike,,,,,13.0,Danny Espinosa strikes out swinging.,R,L,R,MIA,WSH,S,2.0,,1,2,2016,-1.32,-0.23,-1.35,1.21,572191.0,,571718.0,1,8,Top,,,,,,160920_210056,6.741996,-133.953855,-3.432552,-16.813,30.627,-34.025,3.6,1.7,,,,92.2,1682.0,6.9,449130,592663,500743,543829,445988,588751,542303,592885,400085,54.5,,0.0,0.0,1.0,0.0,0.0,,53,4,Changeup,1,0,0,1,0,1,0,1,Standard,Standard,278.0,0.155,-0.285,,,,0.285,,1,-1,0.597,0.403,23,29,24,29,3,2,6.0,1.0,,1.0,3.01,1.32,-1.32,,605228,2016


In [None]:
import  pickle
from pathlib import Path
import os
from sys import path

In [None]:
# Anchor on the directory the kernel is currently running in.
notebook_dir = Path.cwd()

# The project root is two directories above that; saved model artefacts are
# looked up in its 'models' folder (kept as a plain string path).
project_root = notebook_dir.parent.parent
models_dir = str(project_root / 'models')

In [None]:
# Find the most recently modified 'test_feature_columns_*.pkl' file in
# `models_dir`, unpickle it into `feature_columns`, and print what was loaded.
# On any failure a message is printed and `feature_columns` is set to None
# instead of raising.
try:
    encoding_files = [
        entry
        for entry in os.listdir(models_dir)
        if entry.startswith('test_feature_columns_') and entry.endswith('.pkl')
    ]
    if encoding_files:
        # "Latest" is decided by filesystem modification time.
        latest_file = max(
            encoding_files,
            key=lambda entry: os.path.getmtime(os.path.join(models_dir, entry)),
        )
        file_path = os.path.join(models_dir, latest_file)
        print(f"Loading feature columns from: {file_path}")

        try:
            # NOTE(review): unpickling can execute code embedded in the file;
            # only point this at pickles produced by this project.
            with open(file_path, 'rb') as f:
                feature_columns = pickle.load(f)

            # Summarise the result, one column name per line.
            print(f"\nLoaded {len(feature_columns)} feature columns.")
            print("\nFeature Columns:")
            for col in feature_columns:
                print(f"- {col}")
        except FileNotFoundError:
            print(f"Error: File not found at {file_path}")
            feature_columns = None
        except Exception as load_error:
            print(f"An error occurred loading the file: {load_error}")
            feature_columns = None
    else:
        print(f"Error: No 'test_feature_columns_*.pkl' files found in {models_dir}")
        feature_columns = None
except Exception as find_error:
    print(f"An error occurred finding the file: {find_error}")
    feature_columns = None


Loading feature columns from: C:\Users\kekoa\Documents\DataScience\mlb_pred\models\test_feature_columns_20250415_133304.pkl

Loaded 59 feature columns.

Feature Columns:
- rolling_3g_k9
- rolling_3g_k_pct
- rolling_3g_swstr_pct
- rolling_3g_velocity
- rolling_3g_K_std
- rolling_5g_k_pct
- rolling_5g_swstr_pct
- rolling_5g_velocity
- rolling_5g_K_std
- rolling_10g_k_pct
- rolling_10g_swstr_pct
- rolling_10g_K_std
- rolling_5g_fastball_pct
- career_k9
- career_k_pct
- K_last_game
- days_since_last_game
- recent_vs_career_k9
- lag_1_fastball_percent
- lag_2_fastball_percent
- rolling_3g_k_per_9_std_lag1
- rolling_5g_k_per_9_std_lag1
- rolling_10g_k_per_9_std_lag1
- rolling_3g_k_percent_std_lag1
- rolling_5g_k_percent_std_lag1
- rolling_10g_k_percent_std_lag1
- k_percent_change_lag1
- rolling_3g_swinging_strike_percent_std_lag1
- rolling_5g_swinging_strike_percent_std_lag1
- rolling_10g_swinging_strike_percent_std_lag1
- ewma_5g_swinging_strike_percent_lag1
- swinging_strike_percent_change