In [3]:
import pandas as pd
import sys
import os
from pathlib import Path
import sqlite3

# Resolve the directory two levels above the current working directory and
# put it on sys.path (once) so the `src` and `config` imports below resolve.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)
    
from src.data.utils import DBConnection
from config import DBConfig

In [4]:
class NotebookDBConnection(DBConnection):
    """DBConnection bound to this notebook's database file.

    The database path is built as ``<project_root>/data/pitcher_stats.db``
    and handed to the parent class as ``db_name``, so cells can open the
    database with a bare ``NotebookDBConnection()``.
    """

    def __init__(self):
        super().__init__(
            db_name=os.path.join(project_root, 'data', 'pitcher_stats.db')
        )

In [5]:
# Show up to 150 columns when rendering wide DataFrames.
pd.options.display.max_columns = 150

In [6]:
# Print the name of every table registered in the database's sqlite_master.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    # Each fetched row holds a single column: the table name.
    print([name for (name,) in tables])

['pitcher_mapping', 'statcast_pitchers', 'team_batting', 'statcast_batters', 'game_level_pitchers', 'predictive_pitch_features']


In [27]:
# Load capped samples of the two statcast tables plus all of team_batting.
# NOTE: the LIMIT queries have no ORDER BY, so which 25,000 rows come back
# is whatever SQLite happens to return first.
with NotebookDBConnection() as conn:
    df_batters = pd.read_sql_query(
        sql="SELECT * FROM statcast_batters LIMIT 25000;",
        con=conn,
    )
    df_pitchers = pd.read_sql_query(
        sql="SELECT * FROM statcast_pitchers LIMIT 25000;",
        con=conn,
    )
    df_team_batting = pd.read_sql_query(
        sql="SELECT * FROM team_batting;",
        con=conn,
    )
    

In [11]:
# Load at most 10,000 rows from each of the game-level and predictive tables.
# NOTE: no ORDER BY is given, so the rows selected by LIMIT are not
# guaranteed to be the same subset on every run.
with NotebookDBConnection() as conn:
    df_game_level = pd.read_sql_query(
        sql="SELECT * FROM game_level_pitchers LIMIT 10000;",
        con=conn,
    )
    df_pred = pd.read_sql_query(
        sql="SELECT * FROM predictive_pitch_features LIMIT 10000;",
        con=conn,
    )

In [12]:
# Print the total row count of predictive_pitch_features.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM predictive_pitch_features;")
    # fetchall() yields a list with one 1-tuple, printed as-is.
    count_rows = cursor.fetchall()
    print(count_rows)

[(21919,)]


In [13]:
# Write the game-level sample to the working directory. index=True is the
# pandas default: the DataFrame index is written as a leading, unnamed column.
df_game_level.to_csv('game_level_pitchers.csv', index=True)

In [14]:
# Write the predictive-features sample to the working directory. index=True
# is the pandas default: the index is written as a leading, unnamed column.
df_pred.to_csv('predictive_pitch_features.csv', index=True)

In [15]:
# Preview the first five rows of the predictive-features sample.
df_pred.head(5)

Unnamed: 0,pitcher_id,player_name,game_date,game_pk,home_team,away_team,p_throws,season,strikeouts,batters_faced,total_pitches,avg_velocity,max_velocity,avg_spin_rate,avg_horizontal_break,avg_vertical_break,zone_percent,swinging_strike_percent,innings_pitched,k_per_9,k_percent,fastball_percent,breaking_percent,offspeed_percent,rolling_3g_strikeouts,rolling_3g_innings_pitched,rolling_5g_strikeouts,rolling_5g_innings_pitched,rolling_10g_strikeouts,rolling_10g_innings_pitched,rolling_3g_K_std,rolling_5g_K_std,rolling_10g_K_std,days_since_last_game,rest_days_4_less,rest_days_5,rest_days_6_more,throws_right
0,425794,"Wainwright, Adam",2021-04-08 00:00:00,634568,STL,MIL,R,2021,18,16.0,168.0,84.142857,91.8,2227.196429,0.127321,0.302321,0.5,0.089286,8.0,3.375,0.1875,0.625,0.25,0.125,3.0,8.0,3.0,8.0,3.0,8.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,1.0
1,425794,"Wainwright, Adam",2021-04-14 00:00:00,634541,STL,WSH,R,2021,21,23.0,282.0,81.66383,91.8,2452.387097,0.467553,0.049787,0.361702,0.138298,15.0,10.8,0.782609,0.542553,0.393617,0.06383,10.5,11.5,10.5,11.5,10.5,11.5,10.606602,10.606602,10.606602,5.0,0.0,1.0,0.0,1.0
2,425794,"Wainwright, Adam",2021-04-20 00:00:00,634509,WSH,STL,R,2021,30,23.0,255.0,82.882353,92.4,2435.333333,0.203647,0.252118,0.4,0.129412,15.0,12.6,0.913043,0.611765,0.341176,0.047059,14.0,12.666667,14.0,12.666667,14.0,12.666667,9.643651,9.643651,9.643651,6.0,0.0,0.0,1.0,1.0
3,425794,"Wainwright, Adam",2021-04-26 00:00:00,634345,STL,PHI,R,2021,24,27.0,300.0,82.833,92.4,2400.35,0.2053,0.2649,0.41,0.1,20.0,13.5,1.111111,0.6,0.31,0.09,23.0,16.666667,18.0,14.5,18.0,14.5,6.244998,11.224972,11.224972,6.0,0.0,0.0,1.0,1.0
4,425794,"Wainwright, Adam",2021-05-03 00:00:00,634292,STL,NYM,R,2021,15,31.0,321.0,82.91215,92.0,2503.915888,0.343271,0.168411,0.53271,0.11215,25.0,8.64,0.774194,0.607477,0.336449,0.056075,25.0,20.0,19.2,16.6,19.2,16.6,4.582576,10.084642,10.084642,6.0,0.0,0.0,1.0,1.0


In [24]:
# Write the remaining frames to CSV in the working directory, in the same
# order as before; each file keeps the DataFrame index as its first column.
for frame, csv_name in (
    (df_team_batting, 'team_batting.csv'),
    (df_batters, 'statcast_batters_subset.csv'),
    (df_pitchers, 'statcast_pitchers_subset.csv'),
):
    frame.to_csv(csv_name)