In [92]:
import pandas as pd
import random

In [93]:
dfs = []
for year in range(14, 26):
    if year == 24:
        continue
    file_path = f"data/pitcher_log_20{year}.csv"
    year_df = pd.read_csv(file_path, parse_dates=['Date'])
    year_df = year_df[~year_df['Gcar'].isna()]
    dfs.append(year_df)
df = pd.concat(dfs, ignore_index=True)

df.dropna(subset='Id', inplace=True)
df['Date'] = pd.to_datetime(df['Date'].astype(str).str.split().str[0])
df['year'] = df['Date'].dt.year

df['opener'] = df['Inngs'].astype(str).str.contains('GS', na=False)

# game_id: sorted teams + date
df['ateam'] = df[['Team', 'Opp']].min(axis=1)
df['zteam'] = df[['Team', 'Opp']].max(axis=1)
df['game_id'] = df['ateam'] + '_' + df['zteam']  + '_' + df['Date'].dt.strftime('%Y-%m-%d')

df = df.sort_values(['Id', 'Date'])

df['prev_game'] = df.groupby('Id')['Date'].shift(1)
df['freshness'] = df['Date'] - df['prev_game']

df['FIP'] = df['FIP'].apply(pd.to_numeric, errors='coerce')

df.rename(columns={'Team': 'team',}, inplace=True)

rid_ath = lambda x: x if x != "ATH" else "OAK"
df['team'] = df['team'].apply(rid_ath)

In [94]:
team_map = {
    'Angels': 'LAA',
    'Astros': 'HOU',
    'Athletics': 'OAK',
    'Blue Jays': 'TOR',
    'Braves': 'ATL',
    'Brewers': 'MIL',
    'Cardinals': 'STL',
    'Cubs': 'CHC',
    'D-backs': 'ARI',
    'Dodgers': 'LAD',
    'Giants': 'SFG',
    'Guardians': 'CLE',
    'Mariners': 'SEA',
    'Marlins': 'MIA',
    'Mets': 'NYM',
    'Nationals': 'WSN',
    'Orioles': 'BAL',
    'Padres': 'SDP',
    'Phillies': 'PHI',
    'Pirates': 'PIT',
    'Rangers': 'TEX',
    'Rays': 'TBR',
    'Red Sox': 'BOS',
    'Reds': 'CIN',
    'Rockies': 'COL',
    'Royals': 'KCR',
    'Tigers': 'DET',
    'Twins': 'MIN',
    'White Sox': 'CHW',
    'Yankees': 'NYY'
}
switcher = lambda x: team_map[x]



metadf = pd.read_csv('data/cleaned_team_data.csv')
metadf = metadf[['team', 'year', 'outs_above_average', 'woba', 'is_wildcard']]
metadf['team'] = metadf['team'].apply(switcher)

In [95]:
df = df.merge(metadf, on=['team', 'year'])


In [96]:
game_cols = ["game_id", "1_opener_fip", "1_opener_freshness", "1_others_fip", "1_others_freshness", "1_woba", "1_ooa", "1_home", "1_team", "1_is_wildcard", "2_opener_fip", "2_opener_freshness", "2_others_fip", "2_others_freshness", "2_woba", "w_ooa", "2_team", "2_is_wildcard", "is_playoff", "result"]
game_rows = []

label_map = {'W': 1,
            'L': 0,
            'T': 2}

for game_id, game_group in df.groupby('game_id'):
    teams_data = []
    for team, team_group in game_group.groupby('team'):
        opener_df = team_group[team_group['opener']]
        others_df = team_group[~team_group['opener']]

        if opener_df.empty or others_df.empty:
            break 

        opener_fip = opener_df['FIP'].mean()
        opener_fresh = opener_df['freshness'].mean()

        # We really want top 3 or something here, should switch later
        # Also, we are looking not at the bench here but the other pitchers in the game. Not what we want.
        others_fip = others_df['FIP'].mean()
        others_fresh = others_df['freshness'].mean()

        team_name = team if team != "ATH" else "OAK"

        team_data = {
            "opener_fip": opener_fip,
            "opener_freshness": opener_fresh.total_seconds() / 3600,
            "others_fip": others_fip,
            "others_freshness": others_fresh.total_seconds() / 3600,
            "home": 1 if team_group.iloc[0]['Home'] == '@' else 0,
            "team": team,
            "result": label_map[team_group.iloc[0]['Result'][0]],
            "woba": team_group.iloc[0]['woba'],
            "ooa": team_group.iloc[0]['outs_above_average'],
            "is_wildcard": 0
        }
        teams_data.append(team_data)

    if len(teams_data) != 2:
        continue

    random.shuffle(teams_data)
    team1, team2 = teams_data

    is_playoff = 0

    game_data = [
    game_id,
    team1["opener_fip"], team1["opener_freshness"],
    team1["others_fip"], team1["others_freshness"],
    team1["woba"],       team1["ooa"],
    team1["home"],       team1["team"],
    team1["is_wildcard"],
    team2["opener_fip"], team2["opener_freshness"],
    team2["others_fip"], team2["others_freshness"],
    team2["woba"],       team2["ooa"],
    team2["team"],
    team2["is_wildcard"], 
    is_playoff,
    team1["result"]
    ]

    game_rows.append(dict(zip(game_cols, game_data)))

games_df = pd.DataFrame(game_rows)

In [97]:
len(game_cols)

20

In [98]:
games_df.columns

Index(['game_id', '1_opener_fip', '1_opener_freshness', '1_others_fip',
       '1_others_freshness', '1_woba', '1_ooa', '1_home', '1_team',
       '1_is_wildcard', '2_opener_fip', '2_opener_freshness', '2_others_fip',
       '2_others_freshness', '2_woba', 'w_ooa', '2_team', '2_is_wildcard',
       'is_playoff', 'result'],
      dtype='object')

In [99]:
from sklearn.model_selection import train_test_split

X, y = games_df.drop(columns=['game_id', '1_team', '2_team', 'result']), games_df['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [100]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

CM = confusion_matrix(y_test, pred)
CR = classification_report(y_test, pred)
print(CM)
print(CR)

[[1703  702]
 [ 770 1692]]
              precision    recall  f1-score   support

           0       0.69      0.71      0.70      2405
           1       0.71      0.69      0.70      2462

    accuracy                           0.70      4867
   macro avg       0.70      0.70      0.70      4867
weighted avg       0.70      0.70      0.70      4867



In [101]:
games_df.to_csv('data/GAMES.csv', index=False)