In [68]:
import pandas as pd
import random

In [69]:
dfs = []
for year in range(14, 26):
    if year == 24:
        continue
    file_path = f"pitcher_log_20{year}.csv"
    year_df = pd.read_csv(file_path, parse_dates=['Date'])
    year_df = year_df[~year_df['Gcar'].isna()]
    dfs.append(year_df)
df = pd.concat(dfs, ignore_index=True)

df.dropna(subset='Id', inplace=True)
df['Date'] = pd.to_datetime(df['Date'].astype(str).str.split().str[0])

df['opener'] = df['Inngs'].astype(str).str.contains('GS', na=False)

# game_id: sorted teams + date
df['ateam'] = df[['Team', 'Opp']].min(axis=1)
df['zteam'] = df[['Team', 'Opp']].max(axis=1)
df['game_id'] = df['ateam'] + '_' + df['zteam']  + '_' + df['Date'].dt.strftime('%Y-%m-%d')

df = df.sort_values(['Id', 'Date'])

df['prev_game'] = df.groupby('Id')['Date'].shift(1)
df['freshness'] = df['Date'] - df['prev_game']

df['FIP'] = df['FIP'].apply(pd.to_numeric, errors='coerce')
    

In [None]:
game_cols = ["game_id", "1_opener_fip", "1_opener_freshness", "1_others_fip", "1_others_freshness", "1_home", "1_team", "2_opener_fip", "2_opener_freshness", "2_others_fip", "2_others_freshness", "2_home", "2_team"]
game_rows = []

for game_id, game_group in df.groupby('game_id'):
    teams_data = []
    for team, team_group in game_group.groupby('Team'):
        opener_df = team_group[team_group['opener']]
        others_df = team_group[~team_group['opener']]

        if opener_df.empty or others_df.empty:
            break 

        opener_fip = opener_df['FIP'].mean()
        opener_fresh = opener_df['freshness'].mean()

        # We really want top 3 or something here, should switch later
        # Also, we are looking not at the bench here but the other pitchers in the game. Not what we want.
        others_fip = others_df['FIP'].mean()
        others_fresh = others_df['freshness'].mean()

        team_data = {
            "opener_fip": opener_fip,
            "opener_freshness": opener_fresh,
            "others_fip": others_fip,
            "others_freshness": others_fresh,
            "home": 1 if team_group.iloc[0]['Home'] == '@' else 0,
            "team": team,
        }
        teams_data.append(team_data)

    if len(teams_data) != 2:
        continue

    random.shuffle(teams_data)
    team1, team2 = teams_data

    game_data = [
    game_id,
    team1["opener_fip"], team1["opener_freshness"],
    team1["others_fip"], team1["others_freshness"],
    team1["home"],       team1["team"],
    team2["opener_fip"], team2["opener_freshness"],
    team2["others_fip"], team2["others_freshness"],
    team2["home"],       team2["team"],
    ]

    game_rows.append(dict(zip(game_cols, game_data)))

games_df = pd.DataFrame(game_rows)

In [71]:
games_df.head()

Unnamed: 0,game_id,1_opener_fip,1_opener_freshness,1_others_fip,1_others_freshness,1_home,1_team,2_opener_fip,2_opener_freshness,2_others_fip,2_others_freshness,2_home,2_team
0,ARI_ATH_2025-08-02,4.91,6 days,6.426,7 days 09:36:00,0,ATH,4.67,6 days,3.09,5 days 12:00:00,1,ARI
1,ARI_ATH_2025-08-03,4.59,6 days,5.13,1 days 16:00:00,1,ARI,3.71,4 days,4.036667,3 days 08:00:00,0,ATH
2,ARI_ATL_2014-06-06,3.72,7 days,2.126667,3 days 16:00:00,1,ATL,3.85,3 days,3.892,1 days 09:36:00,0,ARI
3,ARI_ATL_2014-06-07,4.4,6 days,3.87,1 days 18:00:00,0,ARI,3.27,7 days,2.49,2 days 09:36:00,1,ATL
4,ARI_ATL_2014-06-08,4.6,5 days,3.9525,1 days 06:00:00,0,ARI,2.98,7 days,3.74,3 days 16:00:00,1,ATL
