In [None]:
import pandas as pd
import random

In [None]:
# Two-digit season suffixes to load. 2024 is skipped exactly as in the
# original loop; the reason is not recorded in this notebook -- TODO confirm.
SEASONS = [year for year in range(14, 26) if year != 24]

# Read one pitcher game log per season, keeping only rows that have a 'Gcar' value.
dfs = []
for year in SEASONS:
    year_df = pd.read_csv(f"pitcher_log_20{year}.csv", parse_dates=['Date'])
    dfs.append(year_df[year_df['Gcar'].notna()])

# Rows without a pitcher 'Id' cannot be grouped per pitcher below, so drop them.
# Reassigning (instead of inplace=True) keeps the cell free of in-place mutation.
df = pd.concat(dfs, ignore_index=True).dropna(subset=['Id'])

# Keep only the first whitespace-separated token of the date text and re-parse it,
# so anything following the calendar date in the raw value is discarded.
df['Date'] = pd.to_datetime(df['Date'].astype(str).str.split().str[0])

# Flag appearances whose 'Inngs' text contains 'GS'
# (presumably "game started" -- verify against the CSV schema).
df['opener'] = df['Inngs'].astype(str).str.contains('GS', na=False)

# game_id: sorted teams + date
# Sorting the two team codes gives both teams' rows for one game the same key.
# NOTE(review): two games between the same teams on the same date would share
# one game_id, because only the calendar date is kept above -- confirm whether
# the logs contain such games.
df['ateam'] = df[['Team', 'Opp']].min(axis=1)
df['zteam'] = df[['Team', 'Opp']].max(axis=1)
df['game_id'] = df['ateam'] + '_' + df['zteam']  + '_' + df['Date'].dt.strftime('%Y-%m-%d')

df = df.sort_values(['Id', 'Date'])

# freshness = time elapsed since the same pitcher's previous row in the combined
# log (NaT for a pitcher's first row). The gap is taken across the whole
# concatenated frame, so it also spans season boundaries and the skipped season.
df['prev_game'] = df.groupby('Id')['Date'].shift(1)
df['freshness'] = df['Date'] - df['prev_game']

# Vectorized conversion; values that cannot be parsed as numbers become NaN.
df['FIP'] = pd.to_numeric(df['FIP'], errors='coerce')
    

In [None]:
# Sanity check: list the distinct leading characters of the 'Result' column,
# i.e. the keys the label mapping further down has to handle.
results = df['Result']
letters = [res[0] for res in results]
print(set(letters))

In [22]:
game_cols = ["game_id", "1_opener_fip", "1_opener_freshness", "1_others_fip", "1_others_freshness", "1_home", "1_team", "2_opener_fip", "2_opener_freshness", "2_others_fip", "2_others_freshness", "2_home", "2_team", "result"]

# Maps the first character of a team's 'Result' string to a class label.
label_map = {'W': 1,
            'L': 0,
            'T': 2}

# The two teams of a game are assigned to slot 1 / slot 2 at random. A dedicated,
# seeded generator makes that assignment (and therefore the feature table and
# its label) identical on every run of this cell.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)


def summarize_team(team, team_group):
    """Aggregate one team's pitcher rows for a single game.

    Parameters
    ----------
    team : value of the 'Team' column shared by all rows in ``team_group``.
    team_group : DataFrame holding that team's rows for one game_id.

    Returns
    -------
    dict with the mean FIP and mean freshness (in hours) of the rows flagged
    ``opener`` and of the remaining rows, plus the home flag, the team and the
    encoded result; or ``None`` when either of the two row sets is empty.

    Raises
    ------
    KeyError if the first character of 'Result' is not a key of ``label_map``.
    """
    opener_df = team_group[team_group['opener']]
    others_df = team_group[~team_group['opener']]

    if opener_df.empty or others_df.empty:
        return None

    # 'Home' and 'Result' are read from the team's first row only.
    first_row = team_group.iloc[0]

    # We really want top 3 or something here, should switch later
    # Also, we are looking not at the bench here but the other pitchers in the game. Not what we want.
    return {
        "opener_fip": opener_df['FIP'].mean(),
        # Timedelta -> hours
        "opener_freshness": opener_df['freshness'].mean().total_seconds() / 3600,
        "others_fip": others_df['FIP'].mean(),
        "others_freshness": others_df['freshness'].mean().total_seconds() / 3600,
        # NOTE(review): in Baseball-Reference-style logs '@' usually marks the
        # AWAY team, which would make this flag inverted -- confirm against the
        # CSV before relying on the column name.
        "home": 1 if first_row['Home'] == '@' else 0,
        "team": team,
        "result": label_map[first_row['Result'][0]]
    }


game_rows = []

for game_id, game_group in df.groupby('game_id'):
    teams_data = []
    for team, team_group in game_group.groupby('Team'):
        team_data = summarize_team(team, team_group)
        if team_data is None:
            # One unusable team makes the whole game unusable; stop early.
            break
        teams_data.append(team_data)

    # Keep only games where exactly two teams produced a summary.
    if len(teams_data) != 2:
        continue

    # Randomize which team lands in slot 1 so the label is not tied to the
    # alphabetical order in which groupby yields the teams.
    shuffle_rng.shuffle(teams_data)
    team1, team2 = teams_data

    game_data = [
        game_id,
        team1["opener_fip"], team1["opener_freshness"],
        team1["others_fip"], team1["others_freshness"],
        team1["home"],       team1["team"],
        team2["opener_fip"], team2["opener_freshness"],
        team2["others_fip"], team2["others_freshness"],
        team2["home"],       team2["team"],
        # The label is the result of the team in slot 1.
        team1["result"]
    ]

    game_rows.append(dict(zip(game_cols, game_data)))

# Passing the column list keeps the schema even if no game qualified.
games_df = pd.DataFrame(game_rows, columns=game_cols)

In [23]:
# Preview the first rows of the per-game feature table built in the previous cell.
games_df.head()

Unnamed: 0,game_id,1_opener_fip,1_opener_freshness,1_others_fip,1_others_freshness,1_home,1_team,2_opener_fip,2_opener_freshness,2_others_fip,2_others_freshness,2_home,2_team,result
0,ARI_ATH_2025-08-02,4.91,144.0,6.426,177.6,0,ATH,4.67,144.0,3.09,132.0,1,ARI,0
1,ARI_ATH_2025-08-03,4.59,144.0,5.13,40.0,1,ARI,3.71,96.0,4.036667,80.0,0,ATH,1
2,ARI_ATL_2014-06-06,3.85,72.0,3.892,33.6,0,ARI,3.72,168.0,2.126667,88.0,1,ATL,0
3,ARI_ATL_2014-06-07,4.4,144.0,3.87,42.0,0,ARI,3.27,168.0,2.49,57.6,1,ATL,1
4,ARI_ATL_2014-06-08,2.98,168.0,3.74,88.0,1,ATL,4.6,120.0,3.9525,30.0,0,ARI,0


In [24]:
from sklearn.model_selection import train_test_split

# Identifier columns and the label itself are excluded from the feature matrix.
non_feature_cols = ['game_id', '1_team', '2_team', 'result']
X = games_df.drop(columns=non_feature_cols)
y = games_df['result']

# Default split proportions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# A random forest is stochastic; fixing random_state makes the fitted model
# and the metrics printed below reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

# Evaluate on the held-out split: confusion matrix plus the per-class
# precision / recall / f1 report.
CM = confusion_matrix(y_test, pred)
CR = classification_report(y_test, pred)
print(CM)
print(CR)

[[2085  871]
 [1004 2055]]
              precision    recall  f1-score   support

           0       0.67      0.71      0.69      2956
           1       0.70      0.67      0.69      3059

    accuracy                           0.69      6015
   macro avg       0.69      0.69      0.69      6015
weighted avg       0.69      0.69      0.69      6015

