In [1]:
import pandas as pd
import sys
import os
from pathlib import Path
import sqlite3
import warnings
warnings.filterwarnings("ignore")

# Resolve the directory two levels above the current working directory and
# put it on sys.path so the `src` package imported below can be found.
project_root = os.path.abspath('../../')
# Append only when absent, so re-running the cell never adds a duplicate entry.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
from src.data.utils import DBConnection
from src.config import DBConfig

In [2]:
# Render every column of a DataFrame (None disables the column limit).
pd.options.display.max_columns = None

In [3]:
class NotebookDBConnection(DBConnection):
    """DBConnection preset for this notebook.

    Passes ``<project_root>/data/pitcher_stats.db`` as ``db_name`` to
    ``DBConnection`` so cells can use ``with NotebookDBConnection() as conn:``
    without repeating the path.
    """

    def __init__(self):
        # Build the path from project_root so it does not depend on which
        # sub-directory the notebook is executed from.
        super().__init__(
            db_name=os.path.join(project_root, 'data', 'pitcher_stats.db')
        )

In [81]:
# Inventory of the tables defined in the database.
with NotebookDBConnection() as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    # Only `name` is selected, so the table name is the first field of each row.
    table_names = [row[0] for row in tables]
    print(table_names)

    # Uncomment to list the columns of a single table instead:
    # cursor.execute("PRAGMA table_info(master_schedule)")
    # columns = cursor.fetchall()
    # print([column[1] for column in columns])

['statcast_pitchers', 'statcast_batters', 'pitcher_mapping', 'mlb_api', 'train_features_advanced', 'test_features_advanced', 'prediction_features', 'prediction_features_advanced', 'team_mapping', 'master_schedule', 'game_level_pitchers', 'game_level_team_stats', 'train_features', 'test_features']


In [None]:
# NOTE(review): this cell was saved mid-edit with an unterminated string
# literal, which is a SyntaxError and stops "Restart & Run All" here.
# The query below is a small placeholder preview of a table that exists in the
# database -- TODO confirm which table/query was actually intended.
with NotebookDBConnection() as conn:
    df = pd.read_sql_query("SELECT * FROM master_schedule LIMIT 5;", conn)

In [78]:
with NotebookDBConnection() as conn:
    tbls = ['statcast_pitchers', 'statcast_batters', 'master_schedule']

    # Collect one (table name, [column names]) pair per table of interest.
    schema = []
    for tbl in tbls:
        # PRAGMA table_info yields one row per column; keep only the `name` field.
        table_info = pd.read_sql_query(f"PRAGMA table_info('{tbl}');", conn)
        cols = table_info['name'].tolist()
        schema.append((tbl, cols))

In [79]:
# Dump the (table, column names) pairs collected in `schema`.
print(schema)

[('statcast_pitchers', ['pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'player_name', 'batter', 'pitcher', 'events', 'description', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des', 'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated', 'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk', 'fielder_2', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 'babip_value', 'iso_

In [35]:
# Load the umpire-assignment CSV; the path is relative to the working directory.
df = pd.read_csv('ump.csv')

In [36]:
# Preview the first rows of the frame read from ump.csv.
df.head()

Unnamed: 0,game_date,home_team,away_team,home_plate_umpire,first_base_umpire,second_base_umpire,third_base_umpire
0,2016-04-01,Miami Marlins,New York Yankees,Mike Estabrook,Manny Gonzalez,Derek Mollica,Angel Hernandez
1,2016-04-03,Pittsburgh Pirates,St. Louis Cardinals,Jerry Layne,Hunter Wendelstedt,Tripp Gibson,Clint Fagan
2,2016-04-03,Tampa Bay Rays,Toronto Blue Jays,Mike Everitt,Paul Emmel,Tim Timmons,Laz Diaz
3,2016-04-03,Kansas City Royals,New York Mets,Gerry Davis,Sam Holbrook,Rob Drake,Carlos Torres
4,2016-04-04,Milwaukee Brewers,San Francisco Giants,Brian Gorman,Mark Carlson,Mike DiMuro,Quinn Wolcott


In [8]:
# (rows, columns) of the frame read from ump.csv.
df.shape

(20614, 7)

In [28]:
# Abbreviation -> full club name for the 30 franchises used in this project.
teams = {
    "ARI": "Arizona Diamondbacks",
    "ATL": "Atlanta Braves",
    "ATH": "Oakland Athletics",
    "BAL": "Baltimore Orioles",
    "BOS": "Boston Red Sox",
    "CHC": "Chicago Cubs",
    "CHW": "Chicago White Sox",
    "CIN": "Cincinnati Reds",
    "CLE": "Cleveland Guardians",
    "COL": "Colorado Rockies",
    "DET": "Detroit Tigers",
    "HOU": "Houston Astros",
    "KCR": "Kansas City Royals",
    "LAA": "Los Angeles Angels",
    "LAD": "Los Angeles Dodgers",
    "MIA": "Miami Marlins",
    "MIL": "Milwaukee Brewers",
    "MIN": "Minnesota Twins",
    "NYM": "New York Mets",
    "NYY": "New York Yankees",
    "PHI": "Philadelphia Phillies",
    "PIT": "Pittsburgh Pirates",
    "SEA": "Seattle Mariners",
    "SDP": "San Diego Padres",
    "SFG": "San Francisco Giants",
    "STL": "St. Louis Cardinals",
    "TBR": "Tampa Bay Rays",
    "TEX": "Texas Rangers",
    "TOR": "Toronto Blue Jays",
    "WSN": "Washington Nationals"
}

# Two-column frame (abbr, team) with one row per dict entry, in dict order.
df1 = pd.DataFrame({
    "abbr": list(teams.keys()),
    "team": list(teams.values()),
})

In [42]:
# Read the team lookup table back from the database.
with NotebookDBConnection() as conn:
    # One-off seeding step, intentionally left disabled.
    # NOTE(review): df1 only has `abbr` and `team`; with if_exists='replace'
    # this would rewrite team_mapping from df1 alone, and the table displayed
    # elsewhere in this notebook also has a `ballpark` column -- confirm
    # before re-enabling.
    # df1.to_sql('team_mapping', conn, if_exists='replace', index = False)
    df_test = pd.read_sql_query("SELECT * FROM team_mapping", conn)

# Jupyter only renders a bare expression when it is the last top-level
# statement of the cell; nested inside the `with` block it showed nothing.
df_test

In [57]:
# Load the umpire-assignment CSV into df_ump, the frame the cells below add
# abbreviation columns to; the path is relative to the working directory.
df_ump = pd.read_csv('ump.csv')

In [58]:
# Preview the first rows of the freshly loaded umpire frame.
df_ump.head()

Unnamed: 0,game_date,home_team,away_team,home_plate_umpire,first_base_umpire,second_base_umpire,third_base_umpire
0,2016-04-01,Miami Marlins,New York Yankees,Mike Estabrook,Manny Gonzalez,Derek Mollica,Angel Hernandez
1,2016-04-03,Pittsburgh Pirates,St. Louis Cardinals,Jerry Layne,Hunter Wendelstedt,Tripp Gibson,Clint Fagan
2,2016-04-03,Tampa Bay Rays,Toronto Blue Jays,Mike Everitt,Paul Emmel,Tim Timmons,Laz Diaz
3,2016-04-03,Kansas City Royals,New York Mets,Gerry Davis,Sam Holbrook,Rob Drake,Carlos Torres
4,2016-04-04,Milwaukee Brewers,San Francisco Giants,Brian Gorman,Mark Carlson,Mike DiMuro,Quinn Wolcott


In [64]:
# All-Star side names; df_ump rows whose home_team matches one of these are
# filtered out in a separate cell.
team_names = ['American All-Stars', 'National All-Stars']

In [67]:
import re

def clean(name: str) -> str:
    """Return *name* without bracketed footnotes or periods, single-spaced and trimmed."""
    without_footnotes = re.sub(r"\[.*?\]", "", name)
    without_periods = without_footnotes.replace(".", "")
    single_spaced = re.sub(r"\s+", " ", without_periods)
    return single_spaced.strip()

def normalize(name: str) -> str:
    """Clean *name*, then rewrite legacy aliases to the current full team name.

    - a cleaned name ending in "Indians" has "Indians" replaced by "Guardians"
    - the bare nickname "Athletics" becomes "Oakland Athletics"
    - anything else is returned as cleaned
    """
    cleaned = clean(name)
    # The two aliases are mutually exclusive, so each can return directly.
    if cleaned == "Athletics":
        return "Oakland Athletics"
    if cleaned.endswith("Indians"):
        return cleaned.replace("Indians", "Guardians")
    return cleaned

# Lookup keyed by the normalized full team name, valued by the abbreviation.
name_to_abbr = dict(zip(df_test['team'].map(normalize), df_test['abbr']))

# Add abbreviation columns next to the name columns, which are left as they are.
# Names without an entry in the lookup come back as NaN from .map().
home_normalized = df_ump['home_team'].map(normalize)
df_ump['home_abbr'] = home_normalized.map(name_to_abbr)
away_normalized = df_ump['away_team'].map(normalize)
df_ump['away_abbr'] = away_normalized.map(name_to_abbr)

df_ump.head()


Unnamed: 0,game_date,home_team,away_team,home_plate_umpire,first_base_umpire,second_base_umpire,third_base_umpire,home_abbr,away_abbr
0,2016-04-01,Miami Marlins,New York Yankees,Mike Estabrook,Manny Gonzalez,Derek Mollica,Angel Hernandez,MIA,NYY
1,2016-04-03,Pittsburgh Pirates,St. Louis Cardinals,Jerry Layne,Hunter Wendelstedt,Tripp Gibson,Clint Fagan,PIT,STL
2,2016-04-03,Tampa Bay Rays,Toronto Blue Jays,Mike Everitt,Paul Emmel,Tim Timmons,Laz Diaz,TBR,TOR
3,2016-04-03,Kansas City Royals,New York Mets,Gerry Davis,Sam Holbrook,Rob Drake,Carlos Torres,KCR,NYM
4,2016-04-04,Milwaukee Brewers,San Francisco Giants,Brian Gorman,Mark Carlson,Mike DiMuro,Quinn Wolcott,MIL,SFG


In [65]:
# Drop rows whose home side is one of the All-Star names in team_names.
# NOTE(review): only home_team is tested -- presumably the away side of those
# rows is the other All-Star team; confirm against the data.
is_all_star_home = df_ump['home_team'].isin(team_names)
df_ump = df_ump.loc[~is_all_star_home]

In [74]:
# Preview df_ump.
# NOTE(review): the saved output of this cell shows home_team/away_team already
# holding abbreviations and moved to the end of the frame, which no cell
# visible in this notebook does -- likely produced by a cell that was later
# edited or deleted; confirm and restore that step.
df_ump.head()

Unnamed: 0,game_date,home_plate_umpire,first_base_umpire,second_base_umpire,third_base_umpire,home_team,away_team
0,2016-04-01,Mike Estabrook,Manny Gonzalez,Derek Mollica,Angel Hernandez,MIA,NYY
1,2016-04-03,Jerry Layne,Hunter Wendelstedt,Tripp Gibson,Clint Fagan,PIT,STL
2,2016-04-03,Mike Everitt,Paul Emmel,Tim Timmons,Laz Diaz,TBR,TOR
3,2016-04-03,Gerry Davis,Sam Holbrook,Rob Drake,Carlos Torres,KCR,NYM
4,2016-04-04,Brian Gorman,Mark Carlson,Mike DiMuro,Quinn Wolcott,MIL,SFG


In [61]:
import re

def clean(name: str) -> str:
    """Strip bracketed footnotes and periods, then collapse and trim whitespace."""
    name = re.sub(r"\[.*?\]", "", name)
    name = name.replace(".", "")
    return re.sub(r"\s+", " ", name).strip()

def normalize(name: str) -> str:
    """Clean *name* and map old-name aliases to the current full team name.

    Kept in agreement with the other ``normalize`` definition in this
    notebook: because this cell sits later in the file, a top-to-bottom run
    leaves this version in effect, so it has to handle the same aliases.

    Args:
        name: raw team name as it appears in the source data.

    Returns:
        The cleaned name, with a trailing "Indians" rewritten to "Guardians"
        and the bare nickname "Athletics" expanded to "Oakland Athletics".
    """
    n = clean(name)
    # alias the Indians -> Guardians so both key to 'CLE'
    if n.endswith("Indians"):
        n = n.replace("Indians", "Guardians")
    # nickname-only "Athletics" -> the full name used in team_mapping;
    # without this alias such rows find no key in the lookup and map to NaN
    if n == "Athletics":
        n = "Oakland Athletics"
    return n

# Lookup from normalized team name to abbreviation, filled row by row.
name_to_abbr = {}
for abbr, team in zip(df_test['abbr'], df_test['team']):
    name_to_abbr[normalize(team)] = abbr

# Normalize each side's names, then translate them through the lookup into new
# columns; the home_team / away_team columns themselves are not modified.
home_names = df_ump['home_team'].apply(normalize)
away_names = df_ump['away_team'].apply(normalize)
df_ump['home_abbr'] = home_names.map(name_to_abbr)
df_ump['away_abbr'] = away_names.map(name_to_abbr)


In [59]:
import re

# 1) regex-based cleaning function
def clean(name: str) -> str:
    """Drop "[n]" footnotes and periods from *name*, then collapse whitespace.

    Example: "St. Louis" becomes "St Louis".
    """
    no_footnotes = re.sub(r"\[.*?\]", "", name)
    no_periods = no_footnotes.replace(".", "")
    return re.sub(r"\s+", " ", no_periods).strip()

# 2) lookup table: cleaned full team name -> abbreviation
# NOTE(review): this cell applies clean() only, so any name that has no entry
# in team_mapping comes back as NaN from .map(). The normalize() helper defined
# in other cells of this notebook additionally aliases "Indians" and
# "Athletics" -- TODO confirm which version of the mapping should remain.
name_to_abbr = dict(zip(df_test['team'].apply(clean), df_test['abbr']))

# 3) add the abbreviation columns; the original name columns are kept
for source_col, target_col in (('home_team', 'home_abbr'), ('away_team', 'away_abbr')):
    df_ump[target_col] = df_ump[source_col].apply(clean).map(name_to_abbr)

# df_ump now carries home_team, away_team, home_abbr and away_abbr
df_ump.head()


Unnamed: 0,game_date,home_team,away_team,home_plate_umpire,first_base_umpire,second_base_umpire,third_base_umpire,home_abbr,away_abbr
0,2016-04-01,Miami Marlins,New York Yankees,Mike Estabrook,Manny Gonzalez,Derek Mollica,Angel Hernandez,MIA,NYY
1,2016-04-03,Pittsburgh Pirates,St. Louis Cardinals,Jerry Layne,Hunter Wendelstedt,Tripp Gibson,Clint Fagan,PIT,STL
2,2016-04-03,Tampa Bay Rays,Toronto Blue Jays,Mike Everitt,Paul Emmel,Tim Timmons,Laz Diaz,TBR,TOR
3,2016-04-03,Kansas City Royals,New York Mets,Gerry Davis,Sam Holbrook,Rob Drake,Carlos Torres,KCR,NYM
4,2016-04-04,Milwaukee Brewers,San Francisco Giants,Brian Gorman,Mark Carlson,Mike DiMuro,Quinn Wolcott,MIL,SFG


In [47]:
# Display the team_mapping rows that were read from the database into df_test.
df_test

Unnamed: 0,abbr,team,ballpark
0,ARI,Arizona Diamondbacks,Chase Field
1,ATL,Atlanta Braves,Truist Park
2,ATH,Oakland Athletics,Oakland Coliseum
3,BAL,Baltimore Orioles,Oriole Park at Camden Yards
4,BOS,Boston Red Sox,Fenway Park
5,CHC,Chicago Cubs,Wrigley Field
6,CHW,Chicago White Sox,Guaranteed Rate Field
7,CIN,Cincinnati Reds,Great American Ball Park
8,CLE,Cleveland Guardians,Progressive Field
9,COL,Colorado Rockies,Coors Field
