In [44]:
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Load environment
load_dotenv()

# Fail early, with a clear message, when a connection setting is absent:
# interpolating os.getenv() directly would place the literal text "None"
# into the URL and only surface later as an obscure connection error.
_missing = [name for name in ('DB_USER', 'DB_HOST', 'DB_PORT', 'DB_NAME') if not os.getenv(name)]
if _missing:
    raise RuntimeError(f"Missing database settings in the environment: {', '.join(_missing)}")

# Percent-encode the credentials before placing them in the URL; characters
# such as '@', ':', '/' or '%' in a user name or password would otherwise be
# parsed as URL delimiters. The password is optional and omitted when unset.
_credentials = quote(os.environ['DB_USER'], safe='')
_password = os.getenv('DB_PASSWORD')
if _password:
    _credentials += ':' + quote(_password, safe='')

engine = create_engine(
    f"postgresql://{_credentials}@{os.environ['DB_HOST']}:{os.environ['DB_PORT']}/{os.environ['DB_NAME']}"
)

# Query all needed columns: one row per pitch from pitch_data, skipping
# rows whose pitch_type is NULL.
query = """
SELECT 
    pitcher,
    stand,
    balls,
    strikes,
    inning,
    outs_when_up,
    on_1b,
    on_2b,
    on_3b,
    pitch_type,
    home_score,
    away_score,
    inning_topbot
FROM pitch_data
WHERE pitch_type IS NOT NULL
"""
df = pd.read_sql(query, engine)


In [45]:
from sklearn.preprocessing import LabelEncoder

# NOTE: this cell consumes the score columns and overwrites df, so it cannot
# be re-run on its own; re-run the load cell first.

# Encode batter stance as integers. LabelEncoder numbers classes in sorted
# order, so with only 'L' and 'R' present this gives L=0, R=1.
le_stand = LabelEncoder()
df['stand'] = le_stand.fit_transform(df['stand'])

# Compute score differential (positive = pitcher's team is winning).
# Rows with inning_topbot == 'Top' use home minus away; all other rows use
# away minus home. Done column-wise instead of a row-wise apply(), which is
# very slow on a pitch-level table.
home_margin = df['home_score'] - df['away_score']
away_margin = df['away_score'] - df['home_score']
df['score_diff'] = home_margin.where(df['inning_topbot'] == 'Top', away_margin)

# Encode baserunners: 1 when the base column holds a value, 0 when it is missing.
runner_cols = ['on_1b', 'on_2b', 'on_3b']
df[runner_cols] = df[runner_cols].notna().astype(int)

# Drop the columns that were only needed to derive score_diff.
df = df.drop(columns=['home_score', 'away_score', 'inning_topbot'])


In [49]:
# Remove, in place, every row that still has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

# Keep the model inputs plus the raw pitch_type target, in a fixed order.
# copy() gives df_final its own data so later edits do not touch df.
feature_columns = [
    'pitcher', 'stand', 'balls', 'strikes', 'inning', 'outs_when_up',
    'on_1b', 'on_2b', 'on_3b', 'score_diff', 'pitch_type',
]
df_final = df.loc[:, feature_columns].copy()


In [50]:
# Group pitch types: each raw pitch code belongs to one coarse family.
pitch_groups = {
    'Fastball': ('FF', 'FT', 'SI', 'FC', 'FS'),
    'Breaking': ('SL', 'ST', 'CU', 'KC'),
    'Offspeed': ('CH', 'SC'),
    'Other': ('KN', 'EP', 'FO'),
}

# Invert the table into the code -> family lookup used by Series.map.
pitch_type_map = {
    code: family
    for family, codes in pitch_groups.items()
    for code in codes
}

# Attach the family label, then discard rows whose code is not in the lookup
# (map() leaves those as NaN).
df_final = (
    df_final
    .assign(pitch_type_group=df_final['pitch_type'].map(pitch_type_map))
    .dropna(subset=['pitch_type_group'])
)


In [51]:
# Expose the pitcher column as pitcher_id, the name the input API uses.
df_final = df_final.rename(columns={'pitcher': 'pitcher_id'})

# Write the cleaned feature table to disk without the row index.
output_path = "../data/processed_features.csv"
df_final.to_csv(output_path, index=False)
print("✅ Saved cleaned processed_features.csv")

✅ Saved cleaned processed_features.csv


In [52]:
# Preview the first 50 rows of the working frame (df, not df_final).
df.head(n=50)

Unnamed: 0,pitcher,stand,balls,strikes,inning,outs_when_up,on_1b,on_2b,on_3b,pitch_type,score_diff
0,687863,1,1,1,9,1,0,0,0,ST,3
1,687863,1,0,1,9,1,0,0,0,ST,3
2,687863,1,0,0,9,1,0,0,0,ST,3
3,687863,0,0,1,9,0,0,0,0,FF,3
4,687863,0,0,0,9,0,0,0,0,ST,3
5,660813,1,0,2,9,2,1,1,0,FF,-3
6,660813,1,0,1,9,2,1,1,0,SI,-3
7,660813,1,0,0,9,2,1,1,0,SI,-3
8,660813,0,2,1,9,2,1,0,0,SI,-3
9,660813,0,1,1,9,2,1,0,0,FC,-3


In [53]:
# Show the column names of the working frame as a plain list.
print(list(df.columns))


['pitcher', 'stand', 'balls', 'strikes', 'inning', 'outs_when_up', 'on_1b', 'on_2b', 'on_3b', 'pitch_type', 'score_diff']


In [54]:
# Sum the baserunner flags per base; they are 0/1, so each sum is a row count.
runner_flags = df[['on_1b', 'on_2b', 'on_3b']]
print(runner_flags.sum())


on_1b    218773
on_2b    134248
on_3b     66914
dtype: int64
