In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import time

# Inclusive range of ESPN gameIds to scrape; the loop further down iterates
# range(starting_number, ending_number + 1).
starting_number = 401695202
ending_number = 401695235

# URL prefix of a box score page; scrape_game appends the gameId to it.
base_url = "https://www.espn.com/mlb/boxscore/_/gameId/"

# Pool of browser User-Agent strings; scrape_game picks one at random for each request.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
]

def parse_table(table):
    """Convert a parsed HTML ``<table>`` element into a DataFrame of cell text.

    Column names are the stripped text of the ``<th>`` cells inside the
    table's ``<thead>``. When there is no ``<thead>``, placeholder names
    ``Col1``, ``Col2``, ... are generated, one per ``<td>`` of the first body
    row.

    Each ``<tr>`` inside ``<tbody>`` becomes one row holding the stripped text
    of its ``<td>`` cells. Rows narrower than the header are right-padded with
    empty strings. If any row is wider than the header, extra placeholder
    columns (``Col<n>``, numbered by position) are appended so that no cell is
    dropped and DataFrame construction cannot fail on a column-count mismatch.

    Args:
        table: An element exposing ``find(name)`` / ``find_all(name)`` whose
            cells expose ``get_text(strip=True)`` (e.g. a BeautifulSoup tag).

    Returns:
        pandas.DataFrame: one row per body ``<tr>``; empty (but with the
        header columns) when the body has no rows.

    Raises:
        ValueError: if the table has no ``<tbody>`` element.
    """
    tbody = table.find('tbody')
    if tbody is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("table has no <tbody> element")

    rows = [
        [td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in tbody.find_all('tr')
    ]

    thead = table.find('thead')
    if thead is not None:
        headers = [th.get_text(strip=True) for th in thead.find_all('th')]
    else:
        # No header section: name columns after the width of the first body row.
        first_width = len(rows[0]) if rows else 0
        headers = ['Col' + str(i + 1) for i in range(first_width)]

    # Make headers and every row the same width: widen the header for
    # over-long rows, pad short rows with empty strings.
    width = max([len(headers)] + [len(row) for row in rows])
    headers = headers + ['Col' + str(i + 1) for i in range(len(headers), width)]
    rows = [row + [''] * (width - len(row)) for row in rows]

    return pd.DataFrame(rows, columns=headers)

def scrape_game(game_id, timeout=10):
    """Fetch one box score page and return its batting tables as one DataFrame.

    The page at ``base_url + game_id`` is requested with a randomly chosen
    User-Agent. Tables 2-5 (of those with CSS class ``Table``) are parsed as
    away names, away stats, home names and home stats respectively, joined
    side by side with ``away_`` / ``home_`` column prefixes, and followed by
    one marker row whose first cell is ``"gameId: <game_id>"`` and whose other
    cells are empty strings.

    Args:
        game_id: Identifier appended to ``base_url`` to form the page URL.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the whole scraping run.

    Returns:
        pandas.DataFrame on success; ``None`` when the request fails, the
        status code is not 200, fewer than 10 tables are found, or parsing
        raises. A message is printed in each failure case.
    """
    url = f"{base_url}{game_id}"
    headers = {"User-Agent": random.choice(user_agents)}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Network errors/timeouts skip this game instead of aborting the run.
        print(f"Failed to fetch gameId {game_id} — {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch gameId {game_id} — Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table', class_='Table')

    # NOTE(review): only tables[2..5] are read below; the threshold of 10 is
    # kept as-is — presumably it filters out pages without a full box score.
    if len(tables) < 10:
        print(f"Skipping gameId {game_id} — Only found {len(tables)} tables, need at least 10")
        return None

    try:
        # ESPN lists away team first
        away_names = parse_table(tables[2])
        away_stats = parse_table(tables[3])
        home_names = parse_table(tables[4])
        home_stats = parse_table(tables[5])

        away_df = pd.concat([away_names, away_stats], axis=1)
        home_df = pd.concat([home_names, home_stats], axis=1)

        # Pad the shorter side with NaN rows so both sides have equal length.
        max_len = max(len(home_df), len(away_df))
        home_df = home_df.reindex(range(max_len)).reset_index(drop=True)
        away_df = away_df.reindex(range(max_len)).reset_index(drop=True)

        # Prefix columns by side; away comes first, matching the page order.
        away_df.columns = [f"away_{col}" for col in away_df.columns]
        home_df.columns = [f"home_{col}" for col in home_df.columns]

        merged = pd.concat([away_df, home_df], axis=1)

        # Add gameId as a trailing marker row (first cell only).
        empty_row = [''] * len(merged.columns)
        id_row = pd.DataFrame([empty_row], columns=merged.columns)
        id_row.iloc[0, 0] = f"gameId: {game_id}"

        merged_with_id = pd.concat([merged, id_row], ignore_index=True)
        return merged_with_id

    except Exception as e:
        # Best-effort scraping: report the problem and skip this game.
        print(f"Error parsing gameId {game_id}: {e}")
        return None

# Loop through the configured gameId range and collect one DataFrame per game.
all_data = []

for game_id in range(starting_number, ending_number + 1):
    print(f"Scraping gameId {game_id}...")
    game_df = scrape_game(game_id)
    if game_df is not None:  # None means the game was skipped or failed
        all_data.append(game_df)
    time.sleep(1)  # pause between requests

# Combine the per-game frames into one.
if all_data:
    final_df = pd.concat(all_data, ignore_index=True)
    print(final_df.head(20))
else:
    # Always bind final_df: otherwise later cells either raise NameError or
    # silently reuse a stale final_df left in the kernel by an earlier run.
    final_df = pd.DataFrame()
    print("No data scraped.")


Scraping gameId 401695202...
Scraping gameId 401695203...
Scraping gameId 401695204...
Scraping gameId 401695205...
Skipping gameId 401695205 — Only found 7 tables, need at least 10
Scraping gameId 401695206...
Skipping gameId 401695206 — Only found 4 tables, need at least 10
Scraping gameId 401695207...
Skipping gameId 401695207 — Only found 7 tables, need at least 10
Scraping gameId 401695208...
Scraping gameId 401695209...
Scraping gameId 401695210...
Scraping gameId 401695211...
Skipping gameId 401695211 — Only found 7 tables, need at least 10
Scraping gameId 401695212...
Scraping gameId 401695213...
Scraping gameId 401695214...
Skipping gameId 401695214 — Only found 7 tables, need at least 10
Scraping gameId 401695215...
Scraping gameId 401695216...
Scraping gameId 401695217...
Skipping gameId 401695217 — Only found 7 tables, need at least 10
Scraping gameId 401695218...
Scraping gameId 401695219...
Skipping gameId 401695219 — Only found 7 tables, need at least 10
Scraping gameId 

In [2]:
# Bare last expression: shows the combined DataFrame as this cell's output.
final_df

Unnamed: 0,away_hitters,away_AB,away_R,away_H,away_RBI,away_HR,away_BB,away_K,away_AVG,away_OBP,...,home_AB,home_R,home_H,home_RBI,home_HR,home_BB,home_K,home_AVG,home_OBP,home_SLG
0,C. CarrollRF,6,0,2,1,0,0,3,.321,.394,...,5,2,3,4,1,0,1,.253,.333,.358
1,G. PerdomoSS,5,1,2,1,0,0,1,.324,.425,...,5,2,3,2,1,0,0,.318,.419,.636
2,P. SmithDH,2,0,1,0,0,1,0,.408,.500,...,5,1,1,1,1,0,1,.284,.372,.541
3,aR. GrichukPH-DH,2,2,2,2,0,0,0,.296,.367,...,5,0,3,1,0,0,1,.306,.375,.583
4,J. Naylor1B,4,2,2,0,0,1,0,.333,.414,...,5,0,1,0,0,0,1,.170,.221,.341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,M. MaldonadoC,4,0,2,0,0,0,0,.257,.278,...,4,1,1,0,0,0,1,.232,.295,.429
328,team,33,3,10,2,1,4,4,,,...,3,0,1,1,0,0,0,.179,.258,.250
329,,,,,,,,,,,...,1,0,0,0,0,0,1,.143,.182,.222
330,,,,,,,,,,,...,34,2,7,2,0,1,11,,,


In [3]:
# Earlier variant, kept for reference: add a blank leading column and write
# without the DataFrame index.
# final_df.insert(0, 'index', '')  # Add a blank column at the start
# final_df.to_csv('../stats/batter_stats_2025.csv', mode='a', header=False, index=False)


# Append this batch to the CSV: no header row (header=False), DataFrame index
# written as the first column (index=True).
# NOTE(review): mode='a' appends, so re-running this cell writes the same rows
# again — confirm that is intended before re-executing.
final_df.to_csv('../stats/batter_stats_2025.csv', mode='a', header=False, index=True)
