In [None]:
import sys
!{sys.executable} -m pip install pandas pybaseball

In [2]:
import pandas as pd
from pybaseball import team_pitching, pitching_stats

In [1]:
# Lookup table: Retrosheet team code -> Baseball Savant team code.
# Insertion order (division by division) is kept stable because other
# cells iterate over this mapping directly.

rs_to_bs = dict([
    # AL West
    ('HOU', 'HOU'), ('TEX', 'TEX'), ('ANA', 'LAA'), ('OAK', 'OAK'), ('SEA', 'SEA'),
    # AL Central
    ('KCA', 'KC'), ('DET', 'DET'), ('CLE', 'CLE'), ('MIN', 'MIN'), ('CHA', 'CWS'),
    # AL East
    ('NYA', 'NYY'), ('BOS', 'BOS'), ('TBA', 'TB'), ('TOR', 'TOR'), ('BAL', 'BAL'),
    # NL West
    ('LAN', 'LAD'), ('SDN', 'SD'), ('COL', 'COL'), ('ARI', 'AZ'), ('SFN', 'SF'),
    # NL Central
    ('MIL', 'MIL'), ('PIT', 'PIT'), ('SLN', 'STL'), ('CIN', 'CIN'), ('CHN', 'CHC'),
    # NL East
    ('NYN', 'NYM'), ('ATL', 'ATL'), ('PHI', 'PHI'), ('WAS', 'WSH'), ('MIA', 'MIA'),
])

In [11]:
# Lookup table: FanGraphs team code -> Baseball Savant team code.
# Entries are listed division by division; insertion order is preserved.

fg_to_bs = dict([
    # AL West
    ('HOU', 'HOU'), ('TEX', 'TEX'), ('LAA', 'LAA'), ('OAK', 'OAK'), ('SEA', 'SEA'),
    # AL Central
    ('KCR', 'KC'), ('DET', 'DET'), ('CLE', 'CLE'), ('MIN', 'MIN'), ('CHW', 'CWS'),
    # AL East
    ('NYY', 'NYY'), ('BOS', 'BOS'), ('TBR', 'TB'), ('TOR', 'TOR'), ('BAL', 'BAL'),
    # NL West
    ('LAD', 'LAD'), ('SDP', 'SD'), ('COL', 'COL'), ('ARI', 'AZ'), ('SFG', 'SF'),
    # NL Central
    ('MIL', 'MIL'), ('PIT', 'PIT'), ('STL', 'STL'), ('CIN', 'CIN'), ('CHC', 'CHC'),
    # NL East
    ('NYM', 'NYM'), ('ATL', 'ATL'), ('PHI', 'PHI'), ('WSN', 'WSH'), ('MIA', 'MIA'),
])

In [31]:
### DON'T NEED TO RUN AGAIN!!!
### but double check the csv file with Baseball-Reference data

# Run through the gamelogs (seasons 2015 through 2023) to calculate, for every
# team in rs_to_bs and every season:
#   close_games   - number of games decided by exactly one run
#   close_win_pct - share of those games the team won (NaN when there are none)
#   run_diff      - runs scored minus runs allowed over all of the team's games
# Games that end level add to run_diff but are neither a close win nor a
# close loss.

team_metrics = {
    'team': [],
    'year': [],
    'close_games': [],
    'close_win_pct': [],
    'run_diff': []
}

for year in range(2015, 2024):
    gamelog_df = pd.read_csv(f'data/gamelogs/gl{year}.txt')

    # Margin of every game from the away side's point of view, computed once
    # per season rather than re-walking the gamelog row by row for each team.
    away_margin = gamelog_df['away_score'] - gamelog_df['home_score']

    for team in rs_to_bs:
        is_away = gamelog_df['away_team'] == team
        is_home = gamelog_df['home_team'] == team

        # Margin from this team's point of view: the away margin when the team
        # was the visitor, otherwise the negated margin (team was at home).
        margins = away_margin.where(is_away, -away_margin)[is_away | is_home]

        is_close = margins.abs() <= 1
        close_wins = int((is_close & (margins > 0)).sum())
        close_losses = int((is_close & (margins < 0)).sum())
        close_games = close_wins + close_losses

        team_metrics['team'].append(rs_to_bs[team])
        team_metrics['year'].append(year)
        team_metrics['close_games'].append(close_games)
        # Guard the ratio: a team with no one-run decisions in the gamelog
        # would otherwise raise ZeroDivisionError and abort the whole cell.
        team_metrics['close_win_pct'].append(
            close_wins / close_games if close_games else float('nan')
        )
        # skipna=False keeps a missing score visible as NaN in run_diff
        # instead of silently dropping that game from the total.
        team_metrics['run_diff'].append(margins.sum(skipna=False))

In [33]:
# Persist the per-team, per-season metrics collected in team_metrics
# (one column per dictionary key, no index column in the file).

team_metrics_df = pd.DataFrame(data=team_metrics)
team_metrics_df.to_csv(path_or_buf='data/team_metrics.csv', index=False)

In [8]:
### DON'T NEED TO RUN AGAIN!!!
### but double check the accuracy

# Combine team_xwoba and team_xwoba_against data (save the difference).
# Rows are paired on ('player_name', 'year'); the 'player_name' value is what
# gets stored under 'team' in the result. xwoba_diff is the xwoba from
# team_xwoba.csv minus the xwoba from team_xwoba_against.csv.

team_xwoba_df = pd.read_csv('data/team_xwoba.csv')
team_xwoba_against_df = pd.read_csv('data/team_xwoba_against.csv')

# An inner merge replaces the nested row-by-row scan of both frames.
# Rows with a missing key are dropped first: merge would pair NaN keys with
# each other, whereas a plain != comparison never treats NaN as a match.
xwoba_pairs = pd.merge(
    team_xwoba_df[['player_name', 'year', 'xwoba']].dropna(subset=['player_name', 'year']),
    team_xwoba_against_df[['player_name', 'year', 'xwoba']].dropna(subset=['player_name', 'year']),
    on=['player_name', 'year'],
    how='inner',
    suffixes=('_for', '_against'),
)

# Kept as a dict of plain lists so it can still be handed to pd.DataFrame(...)
team_xwoba_diff = {
    'team': xwoba_pairs['player_name'].tolist(),
    'year': xwoba_pairs['year'].tolist(),
    'xwoba_diff': (xwoba_pairs['xwoba_for'] - xwoba_pairs['xwoba_against']).tolist(),
}

In [10]:
# Persist the xwOBA differences collected in team_xwoba_diff
# (one column per dictionary key, no index column in the file).

team_xwoba_diff_df = pd.DataFrame(data=team_xwoba_diff)
team_xwoba_diff_df.to_csv(path_or_buf='data/team_xwoba_diff.csv', index=False)

In [9]:
### DON'T NEED TO RUN AGAIN!!!
### but double check the csv file with FanGraphs data

# Download pitching tables through pybaseball and cache them as csv files.

# team-level pitching table, called with 2015 and 2023 as the season bounds
team_xfip_df = team_pitching(2015, 2023)
team_xfip_df.to_csv(path_or_buf='data/team_xfip.csv', index=False)

# player-level pitching table for the same bounds; qual=20 is passed straight
# through to pitching_stats
player_xfip_df = pitching_stats(2015, 2023, qual=20)
player_xfip_df.to_csv(path_or_buf='data/player_xfip.csv', index=False)

In [8]:
# training data - game results from 2017-2022
# Every column starts out as its own empty list: the eight away-side columns
# first, followed by the matching eight home-side columns.

train_data = {
    column: []
    for column in (
        'away_team',
        'away_score',
        'away_team_xwoba',
        'away_lineup_xwoba',
        'away_team_xfip',
        'away_starter_xfip',
        'away_close_win_pct',
        'away_run_diff',
        'home_team',
        'home_score',
        'home_team_xwoba',
        'home_lineup_xwoba',
        'home_team_xfip',
        'home_starter_xfip',
        'home_close_win_pct',
        'home_run_diff',
    )
}

In [10]:
# enter individual game data into the dictionary

def write_game_data(data, year):
    """Append one entry per game of a season to every column list in ``data``.

    Reads ``data/gamelogs/gl{year}.txt`` and, for each game row, appends the
    away-side values followed by the home-side values to the matching
    ``away_*`` / ``home_*`` lists, so all lists grow by one element per game.

    Parameters
    ----------
    data : dict
        Maps column names (``away_team``, ``away_score``, ...,
        ``home_run_diff``) to lists; mutated in place.
    year : int
        Season whose gamelog file is read; also forwarded to the lookup
        helpers.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a team code in the gamelog is missing from ``rs_to_bs``, or if
        ``data`` lacks one of the expected column lists.
    """
    gamelog_df = pd.read_csv(f'data/gamelogs/gl{year}.txt')

    for _, row in gamelog_df.iterrows():
        # Both sides of a game are recorded the same way, so one loop body
        # serves the away team and then the home team.
        for side in ('away', 'home'):
            retro_team = row[f'{side}_team']
            savant_team = rs_to_bs[retro_team]
            lineup = [row[f'{side}_{slot}_name'] for slot in range(1, 10)]

            # append (not assign): assigning would replace each column list
            # with a single scalar and keep only the last game read
            data[f'{side}_team'].append(savant_team)
            data[f'{side}_score'].append(row[f'{side}_score'])
            data[f'{side}_team_xwoba'].append(get_team_xwoba(year, savant_team))
            # NOTE(review): the lineup lookup goes through get_team_xwoba with
            # a list of nine player names; this looks like it should be a
            # dedicated lineup helper - confirm once the helpers exist.
            data[f'{side}_lineup_xwoba'].append(get_team_xwoba(year, lineup))
            # NOTE(review): the remaining helpers receive the raw gamelog team
            # code; verify which code style each one expects.
            data[f'{side}_team_xfip'].append(get_team_xfip(year, retro_team))
            data[f'{side}_starter_xfip'].append(get_starter_xfip(year, row[f'{side}_starter']))
            data[f'{side}_close_win_pct'].append(get_close_win_pct(year, retro_team))
            data[f'{side}_run_diff'].append(get_run_diff(year, retro_team))

In [None]:
# Reload the cached csv files so later cells can work from disk instead of
# recomputing or re-downloading anything.

# written by the xwoba-difference cell earlier in this notebook
team_xwoba_diff_df = pd.read_csv('data/team_xwoba_diff.csv')
# NOTE(review): no visible cell in this notebook writes player_xwoba.csv -
# presumably it is supplied externally; confirm its source
player_xwoba_df = pd.read_csv('data/player_xwoba.csv')
# written by the pybaseball download cell earlier in this notebook
team_xfip_df = pd.read_csv('data/team_xfip.csv')
player_xfip_df = pd.read_csv('data/player_xfip.csv')
# written by the gamelog metrics cell earlier in this notebook
team_metrics_df = pd.read_csv('data/team_metrics.csv')