In [3]:
!pip install pybaseball pandas numpy tqdm --quiet

In [47]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import requests
import os, time, numpy as np, pandas as pd, requests
from tqdm import tqdm
from pybaseball import team_batting, playerid_lookup, season_game_logs

# HTTP headers identifying this client to the sites being scraped.
# The contact address inside User-Agent previously read "rutgers,edu" (comma
# instead of dot), which made the advertised contact unusable.
HEADERS = {
    "User-Agent": "Academic-Research/1.0 (contact: kh844@scarletmail.rutgers.edu)",
    "From": "kh844@scarletmail.rutgers.edu",  # optional but polite
}

from pybaseball import (
    schedule_and_record,   # team schedule by year
    team_batting,          # team-season batting totals
    playerid_lookup,       # map name -> bbref id
    season_game_logs       # per-player game logs for a season
)

# Show up to 100 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 100


In [17]:
# Run configuration: tune these before executing the data-collection cells.
YEARS = list(range(2019, 2024))   # seasons 2019-2023 inclusive
TOP_N_BATTERS_PER_TEAM = 10       # hitters kept per team-season
PAUSE_S = 1.0                     # seconds slept between requests
OUTDIR = "data"                   # directory that receives the per-year CSVs
os.makedirs(OUTDIR, exist_ok=True)


In [19]:
# Three-letter team codes iterated over for every per-team request.
# NOTE(review): the previous comment said these codes "work for 2023+", while
# YEARS requests earlier seasons as well - confirm each code is valid for
# every season being pulled.
MLB_TEAMS = [
    "ARI", "ATL", "BAL", "BOS", "CHC", "CHW",
    "CIN", "CLE", "COL", "DET", "HOU", "KCR",
    "LAA", "LAD", "MIA", "MIL", "MIN", "NYM",
    "NYY", "OAK", "PHI", "PIT", "SDP", "SEA",
    "SFG", "STL", "TBR", "TEX", "TOR", "WSN",
]

# These repeat the values already set in the configuration cell; they are kept
# so this cell still defines them when run on its own.
PAUSE_S = 1.0   # polite delay (seconds) between requests
OUTDIR = "data"
os.makedirs(OUTDIR, exist_ok=True)

In [51]:
def get_team_schedule(team_code: str, year: int) -> pd.DataFrame:
    """
    Fetch one team's schedule for a season as a tidy table of game dates.

    Calls ``schedule_and_record(year, team_code)`` (season first, team second).

    The raw ``Date`` strings may carry a weekday prefix ("Thursday, Mar 28"),
    a trailing doubleheader marker ("(1)" / "(2)"), an asterisk, and no year.
    Those decorations are stripped and ``year`` is appended when the text does
    not already contain a four-digit year, so every parsed date falls in the
    requested season.

    Parameters
    ----------
    team_code : team abbreviation passed to ``schedule_and_record``.
    year      : season to fetch; also used to complete year-less dates.

    Returns
    -------
    DataFrame with columns ``game_date`` (datetime), ``team``, ``team_code``
    and ``team_played`` (always 1), de-duplicated so a doubleheader yields a
    single row for that date. On any failure a warning is printed and an empty
    frame with the same columns is returned.
    """
    columns = ["game_date", "team", "team_code", "team_played"]
    try:
        sched = schedule_and_record(year, team_code).rename(columns={"Tm": "team"}).copy()

        date_text = (
            sched["Date"].astype(str)
            .str.replace("*", "", regex=False)                  # stray asterisks
            .str.replace(r"\s*\(\d+\)\s*$", "", regex=True)     # doubleheader "(1)"/"(2)"
            .str.replace(r"^\s*[A-Za-z]+,\s*", "", regex=True)  # weekday prefix
            .str.strip()
        )
        # Only append the season when the text has no year of its own.
        has_year = date_text.str.contains(r"\b\d{4}\b", regex=True)
        date_text = date_text.where(has_year, date_text + f", {year}")

        sched["game_date"] = pd.to_datetime(date_text, errors="coerce")
        sched = sched.dropna(subset=["game_date"]).copy()
        sched["team_code"] = team_code
        sched["team_played"] = 1
        return sched[columns].drop_duplicates()
    except Exception as e:
        print(f"[warn] schedule_and_record failed for {team_code} {year}: {e}")
        return pd.DataFrame(columns=columns)


def get_top_batters(team_code: str, year: int, top_n: int) -> pd.DataFrame:
    """
    Return the ``top_n`` hitters with the most plate appearances for a team-season.

    The table from ``team_batting(year, team_code)`` is filtered to rows with
    PA > 0 (missing PA counts as 0), names are stripped of surrounding
    whitespace, and rows are ranked by PA descending.

    NOTE(review): ``team_batting`` is called with the team code as its second
    positional argument - confirm against the pybaseball signature that this
    position is a team filter and that the result is player-level, not
    team-level.

    Returns a frame with columns ``player_name``, ``PA``, ``team_code``. On any
    failure a warning is printed and an empty frame with those columns is
    returned.
    """
    wanted = ["player_name", "PA", "team_code"]
    try:
        batting = team_batting(year, team_code).rename(columns={"Name": "player_name"})
        has_pa = batting["PA"].fillna(0) > 0
        hitters = batting.loc[has_pa].copy()
        hitters["player_name"] = hitters["player_name"].str.strip()
        leaders = hitters.sort_values("PA", ascending=False).head(top_n)
        leaders["team_code"] = team_code
        return leaders[wanted]
    except Exception as e:
        print(f"[warn] team_batting failed for {team_code} {year}: {e}")
        return pd.DataFrame(columns=wanted)


def name_to_bbref_id(player_name: str) -> str:
    """
    Map a display name ('First Last') to a Baseball-Reference id (``key_bbref``).

    The last whitespace-separated token is treated as the surname and
    everything before it as the given name. Generational suffixes
    (Jr., Sr., II, III, IV) are dropped first so that "Ronald Acuna Jr." is
    looked up as first="Ronald", last="Acuna" instead of last="Jr.".

    When several players match, the one with the most recent
    ``mlb_played_last`` (then ``mlb_played_first``) is preferred.

    Returns
    -------
    The id string, or "" when the name is not a usable string, has fewer than
    two name tokens, the lookup raises, or no match has a non-empty id. Lookup
    errors are reported with a warning rather than raised, matching the other
    fetch helpers in this notebook.
    """
    if not isinstance(player_name, str):
        return ""

    suffixes = {"jr", "sr", "ii", "iii", "iv"}
    parts = player_name.split()
    while parts and parts[-1].rstrip(".").lower() in suffixes:
        parts.pop()
    if len(parts) < 2:
        return ""

    last = parts[-1]
    first = " ".join(parts[:-1])
    try:
        matches = playerid_lookup(last, first)
    except Exception as e:
        print(f"[warn] playerid_lookup failed for {player_name}: {e}")
        return ""

    if matches is None or matches.empty or "key_bbref" not in matches.columns:
        return ""

    # Prefer the most recently active player among namesakes.
    sort_cols = [c for c in ("mlb_played_last", "mlb_played_first") if c in matches.columns]
    if sort_cols:
        matches = matches.sort_values(sort_cols, ascending=[False] * len(sort_cols))

    ids = matches["key_bbref"].dropna().astype(str).str.strip()
    ids = ids[ids != ""]
    return ids.iloc[0] if len(ids) else ""


def fetch_player_game_logs(bbref_id: str, year: int) -> pd.DataFrame:
    """
    Fetch one player's per-game log for a season and normalise it.

    The frame from ``season_game_logs(bbref_id, year)`` has Date/Tm/Opp renamed
    to game_date/team/opp, dates parsed (unparseable rows dropped), and the
    constant columns ``appeared`` = 1 and ``player_id_bbref`` added. ``PA`` is
    carried through when present and filled with NaN otherwise.

    NOTE(review): confirm that pybaseball's ``season_game_logs`` really accepts
    (player id, season) - if its signature differs, every call lands in the
    except branch and an empty frame is returned.

    Returns a de-duplicated frame with columns player_id_bbref, game_date,
    team, opp, appeared, PA. On any failure a warning is printed and an empty
    frame with those columns is returned.
    """
    base_cols = ["player_id_bbref", "game_date", "team", "opp", "appeared"]
    try:
        renames = {"Date": "game_date", "Tm": "team", "Opp": "opp"}
        logs = season_game_logs(bbref_id, year).rename(columns=renames)

        logs["game_date"] = pd.to_datetime(logs["game_date"], errors="coerce")
        logs = logs.dropna(subset=["game_date"])
        logs["appeared"] = 1
        logs["player_id_bbref"] = bbref_id

        has_pa = "PA" in logs.columns
        selected = (base_cols + ["PA"]) if has_pa else base_cols
        logs = logs[selected].drop_duplicates()
        if not has_pa:
            logs["PA"] = np.nan
        return logs
    except Exception as e:
        print(f"[warn] logs failed for {bbref_id} {year}: {e}")
        return pd.DataFrame(columns=base_cols + ["PA"])


In [53]:
def _build_rest_panel(games: pd.DataFrame, sched: pd.DataFrame, year: int) -> pd.DataFrame:
    """
    Combine player game logs with team schedules into a rest-day panel.

    Parameters
    ----------
    games : player logs with columns player_id_bbref, game_date, team,
            appeared and PA (one row per logged game).
    sched : team schedules with columns game_date, team, team_played.
    year  : season label written to the ``season`` column.

    Returns
    -------
    One row per (player, team, team game date) with columns
    player_id_bbref, team, game_date, team_played, appeared, rest_flag,
    days_since_last_game, prev_day_was_rest, PA, season.
    """
    # Collapse to one row per player/date. Doubleheader games logged with
    # different PA would otherwise each survive drop_duplicates and duplicate
    # the panel row for that date; PA is summed across games on the same day.
    per_day = (
        games.assign(PA=pd.to_numeric(games["PA"], errors="coerce"))
             .groupby(["player_id_bbref", "game_date"], as_index=False)
             .agg(appeared=("appeared", "max"), PA=("PA", "sum"))
    )

    # Every team each player logged a game for this season.
    # TODO(review): a player who changed teams gets the full-season schedule of
    # each team, so dates outside a stint are counted as rest days.
    player_teams = (
        games[["player_id_bbref", "team"]]
        .dropna(subset=["team"])
        .drop_duplicates()
    )

    # For each (player, team), every date that team played.
    player_dates = player_teams.merge(
        sched[["game_date", "team", "team_played"]].drop_duplicates(),
        on="team", how="left"
    ).dropna(subset=["game_date"])

    panel = player_dates.merge(per_day, on=["player_id_bbref", "game_date"], how="left")
    panel["appeared"] = panel["appeared"].fillna(0).astype(int)
    panel["PA"] = panel["PA"].fillna(0)   # 0 plate appearances when the player did not play
    panel["rest_flag"] = ((panel["team_played"] == 1) & (panel["appeared"] == 0)).astype(int)

    panel = panel.sort_values(["player_id_bbref", "game_date"]).reset_index(drop=True)
    by_player = panel["player_id_bbref"]

    # Days since the player's own previous appearance. A plain diff over panel
    # rows would only measure the gap between consecutive team game dates.
    played_on = panel["game_date"].where(panel["appeared"] == 1)
    prev_played = played_on.groupby(by_player).shift(1).groupby(by_player).ffill()
    panel["days_since_last_game"] = (panel["game_date"] - prev_played).dt.days

    # Rest flag of the previous panel row, i.e. the previous team game date
    # (not necessarily the previous calendar day).
    panel["prev_day_was_rest"] = (
        panel.groupby("player_id_bbref")["rest_flag"].shift(1).fillna(0).astype(int)
    )

    panel["season"] = year
    return panel[[
        "player_id_bbref", "team", "game_date", "team_played", "appeared",
        "rest_flag", "days_since_last_game", "prev_day_was_rest", "PA", "season",
    ]]


# Per-season panels, appended in YEARS order; each is also written to its own CSV.
all_years_panel = []

for year in YEARS:
    print(f"\n================= YEAR {year} =================")

    # 1) Schedules: one request per team, pausing between requests.
    sched_list = []
    for code in MLB_TEAMS:
        s = get_team_schedule(code, year)
        if not s.empty:
            sched_list.append(s)
        time.sleep(PAUSE_S)

    if not sched_list:
        print(f"[warn] no schedules for {year}, skipping")
        continue

    sched = pd.concat(sched_list, ignore_index=True).drop_duplicates()

    # 2) Top batters per team
    roster_list = []
    for code in MLB_TEAMS:
        tb = get_top_batters(code, year, TOP_N_BATTERS_PER_TEAM)
        if not tb.empty:
            roster_list.append(tb)
        time.sleep(PAUSE_S)

    if not roster_list:
        print(f"[warn] no roster batting for {year}, skipping")
        continue

    roster = (
        pd.concat(roster_list, ignore_index=True)
          .drop_duplicates(subset=["player_name","team_code"])
          .reset_index(drop=True)
    )

    # 3) Resolve IDs; names that do not resolve (empty id) are dropped.
    tqdm.pandas(desc=f"Resolve IDs {year}")
    roster["player_id_bbref"] = roster["player_name"].progress_apply(name_to_bbref_id)
    roster = roster[roster["player_id_bbref"].astype(bool)].reset_index(drop=True)
    print(f"  Resolved {roster['player_id_bbref'].nunique()} players for {year}")

    # 4) Player game logs: one request per unique player id.
    logs_list = []
    for pid in tqdm(roster["player_id_bbref"].unique(), desc=f"Logs {year}"):
        df_p = fetch_player_game_logs(pid, year)
        if not df_p.empty:
            logs_list.append(df_p)
        time.sleep(PAUSE_S)

    if not logs_list:
        print(f"[warn] no logs fetched for {year}, skipping")
        continue

    games = pd.concat(logs_list, ignore_index=True)

    # 5) Infer rest days and fatigue features
    panel = _build_rest_panel(games, sched, year)

    year_path = os.path.join(OUTDIR, f"mlb_{year}_rest_days.csv")
    panel.to_csv(year_path, index=False)
    print(f"  Saved {year_path} ({len(panel):,} rows)")

    all_years_panel.append(panel)



http://www.baseball-reference.com/teams/ARI/2019-schedule-scores.shtml
[warn] schedule_and_record failed for ARI 2019: Data cannot be retrieved for this team/year combo. Please verify that your team abbreviation is accurate and that the team existed during the season you are searching for.
http://www.baseball-reference.com/teams/ATL/2019-schedule-scores.shtml
[warn] schedule_and_record failed for ATL 2019: Data cannot be retrieved for this team/year combo. Please verify that your team abbreviation is accurate and that the team existed during the season you are searching for.
http://www.baseball-reference.com/teams/BAL/2019-schedule-scores.shtml
[warn] schedule_and_record failed for BAL 2019: Data cannot be retrieved for this team/year combo. Please verify that your team abbreviation is accurate and that the team existed during the season you are searching for.
http://www.baseball-reference.com/teams/BOS/2019-schedule-scores.shtml
[warn] schedule_and_record failed for BOS 2019: Data ca

KeyboardInterrupt: 

In [49]:
# Earlier variant of the per-season pipeline: same steps as the cell that uses
# get_team_schedule, but schedules come from get_team_schedule_bref.
# NOTE(review): get_team_schedule_bref is not defined anywhere in the visible
# notebook - presumably it lived in a cell that was since removed; on a fresh
# kernel this cell would raise NameError. Confirm before relying on it.
all_years_panel = []

for year in YEARS:
    print(f"\n================= YEAR {year} =================")

    # --- 1) Team schedules (scraped) ---
    sched_list = []
    for code in MLB_TEAMS:
        s = get_team_schedule_bref(code, year)
        if not s.empty:
            sched_list.append(s)
        time.sleep(PAUSE_S)

    if not sched_list:
        print(f"[warn] no schedules for {year}, skipping")
        continue

    sched = pd.concat(sched_list, ignore_index=True).drop_duplicates()

    # --- 2) Top batters per team ---
    roster_list = []
    for code in MLB_TEAMS:
        tb = get_top_batters(code, year, TOP_N_BATTERS_PER_TEAM)
        if not tb.empty:
            roster_list.append(tb)
        time.sleep(PAUSE_S)

    if not roster_list:
        print(f"[warn] no roster batting for {year}, skipping")
        continue

    roster = (
        pd.concat(roster_list, ignore_index=True)
          .drop_duplicates(subset=["player_name","team_code"])
          .reset_index(drop=True)
    )

    # --- 3) Resolve player IDs ---
    # Names that fail to resolve come back as "" and are filtered out below.
    tqdm.pandas(desc=f"Resolve IDs {year}")
    roster["player_id_bbref"] = roster["player_name"].progress_apply(name_to_bbref_id)
    roster = roster[roster["player_id_bbref"].astype(bool)].reset_index(drop=True)
    print(f"  Resolved {roster['player_id_bbref'].nunique()} players for {year}")

    # --- 4) Fetch game logs for those players ---
    logs_list = []
    for pid in tqdm(roster["player_id_bbref"].unique(), desc=f"Logs {year}"):
        df_p = fetch_player_game_logs(pid, year)
        if not df_p.empty:
            logs_list.append(df_p)
        time.sleep(PAUSE_S)

    if not logs_list:
        print(f"[warn] no logs fetched for {year}, skipping")
        continue

    games = pd.concat(logs_list, ignore_index=True)

    # --- 5) Infer rest days ---
    # Find which teams each player played for in that season
    # (the min() is only a vehicle to get the unique player/team pairs).
    player_teams = (
        games.groupby(["player_id_bbref","team"])["game_date"]
             .min().reset_index()[["player_id_bbref","team"]]
    )

    # For each (player,team), get all team game dates
    player_dates = player_teams.merge(
        sched[["game_date","team","team_played"]],
        on="team", how="left"
    ).dropna(subset=["game_date"])

    appearances = games[["player_id_bbref","game_date","appeared"]].drop_duplicates()

    # Left merge: team game dates with no matching log row get appeared = NaN -> 0.
    panel = player_dates.merge(
        appearances,
        on=["player_id_bbref","game_date"],
        how="left"
    )

    panel["appeared"] = panel["appeared"].fillna(0).astype(int)
    # Rest day = the team played on that date but the player has no log entry.
    panel["rest_flag"] = ((panel["team_played"] == 1) & (panel["appeared"] == 0)).astype(int)

    # Sort & add fatigue features
    panel = panel.sort_values(["player_id_bbref","game_date"]).reset_index(drop=True)
    # NOTE(review): the panel holds every team game date, so this diff is the gap
    # between consecutive team game dates, not between the player's own appearances.
    panel["days_since_last_game"] = (
        panel.groupby("player_id_bbref")["game_date"].diff().dt.days
    )
    # rest_flag of the previous panel row for the same player (0 for the first row).
    panel["prev_day_was_rest"] = (
        panel.groupby("player_id_bbref")["rest_flag"].shift(1).fillna(0).astype(int)
    )

    # Attach PA (0 when DNP)
    # NOTE(review): two log rows on the same date with different PA both survive
    # drop_duplicates and would duplicate the panel row for that date.
    perf = games[["player_id_bbref","game_date","PA"]].drop_duplicates()
    panel = panel.merge(perf, on=["player_id_bbref","game_date"], how="left")
    panel["PA"] = panel["PA"].fillna(0)

    panel["season"] = year

    # Save per-year CSV
    year_path = os.path.join(OUTDIR, f"mlb_{year}_rest_days.csv")
    panel.to_csv(year_path, index=False)
    print(f"  Saved {year_path} ({len(panel):,} rows)")

    all_years_panel.append(panel)



[warn] schedule scrape failed for ARI 2019: 403 Client Error: Forbidden for url: https://www.baseball-reference.com/teams/ARI/2019-schedule-scores.shtml
[warn] schedule scrape failed for ATL 2019: 403 Client Error: Forbidden for url: https://www.baseball-reference.com/teams/ATL/2019-schedule-scores.shtml
[warn] schedule scrape failed for BAL 2019: 403 Client Error: Forbidden for url: https://www.baseball-reference.com/teams/BAL/2019-schedule-scores.shtml
[warn] schedule scrape failed for BOS 2019: 403 Client Error: Forbidden for url: https://www.baseball-reference.com/teams/BOS/2019-schedule-scores.shtml
[warn] schedule scrape failed for CHC 2019: 403 Client Error: Forbidden for url: https://www.baseball-reference.com/teams/CHC/2019-schedule-scores.shtml
[warn] schedule scrape failed for CHW 2019: 403 Client Error: Forbidden for url: https://www.baseball-reference.com/teams/CHW/2019-schedule-scores.shtml
[warn] schedule scrape failed for CIN 2019: 403 Client Error: Forbidden for url: 

KeyboardInterrupt: 

In [55]:
!pip install pybaseball pandas --quiet

import pandas as pd
from pybaseball import schedule_and_record

pd.set_option("display.max_columns", 50)

# Single sanity check
bal_2019 = schedule_and_record("BAL", 2019)
display(bal_2019.head())
print(bal_2019.shape)


AttributeError: 'int' object has no attribute 'upper'