In [None]:
import pandas as pd

P_BASE = "data/mlb_pitch_data_2021_2023.csv"

pitches = pd.read_csv(P_BASE, low_memory=False)
pitches['game_year'] = pitches['game_year'].astype(int)

print("Rows by year:")
print(pitches['game_year'].value_counts().sort_index())

if 'game_date' in pitches.columns:
    pitches['game_date'] = pd.to_datetime(pitches['game_date'], errors='coerce')
    by_month_23 = (pitches[pitches['game_year']==2023]
                   .groupby(pitches['game_date'].dt.month).size().sort_index())
    print("\n2023 rows by month:")
    print(by_month_23)


In [None]:
import pandas as pd

P_BASE = "mlb_pitch_data_2021_2023.csv"

pitches = pd.read_csv(P_BASE, low_memory=False)
pitches['game_year'] = pitches['game_year'].astype(int)

print("Rows by year:")
print(pitches['game_year'].value_counts().sort_index())

if 'game_date' in pitches.columns:
    pitches['game_date'] = pd.to_datetime(pitches['game_date'], errors='coerce')
    by_month_23 = (pitches[pitches['game_year']==2023]
                   .groupby(pitches['game_date'].dt.month).size().sort_index())
    print("\n2023 rows by month:")
    print(by_month_23)


In [None]:
import pandas as pd
from datetime import datetime, timedelta
from pybaseball import statcast

P = "mlb_pitch_data_2021_2023.csv"
THRESHOLD = 10_000
def month_bounds(year:int, month:int):
    start = datetime(year, month, 1)
    nxt   = (start.replace(day=28) + timedelta(days=4)).replace(day=1)
    end   = nxt - timedelta(days=1)
    return start, end

def week_ranges(start_dt:datetime, end_dt:datetime):
    cur = start_dt
    out = []
    while cur <= end_dt:
        wk_end = min(cur + timedelta(days=6), end_dt)
        out.append((cur.strftime("%Y-%m-%d"), wk_end.strftime("%Y-%m-%d")))
        cur = wk_end + timedelta(days=1)
    return out

def fetch_range(s, e):
    try:
        df = statcast(s, e)
        if df is None or len(df) == 0:
            return pd.DataFrame()
        return df
    except Exception:
        return pd.DataFrame()

def fetch_month_with_retries(year:int, month:int):
    s_dt, e_dt = month_bounds(year, month)
    full = fetch_range(s_dt.strftime("%Y-%m-%d"), e_dt.strftime("%Y-%m-%d"))
    if len(full):
        return full[full.get("game_year", year) == year]

    mid = s_dt + timedelta(days=(e_dt - s_dt).days // 2)
    halves = []
    for s,e in [(s_dt, mid), (mid + timedelta(days=1), e_dt)]:
        part = fetch_range(s.strftime("%Y-%m-%d"), e.strftime("%Y-%m-%d"))
        if len(part):
            halves.append(part)
    if halves:
        out = pd.concat(halves, ignore_index=True)
        return out[out.get("game_year", year) == year]

    weeks = []
    for s_str, e_str in week_ranges(s_dt, e_dt):
        part = fetch_range(s_str, e_str)
        if len(part):
            weeks.append(part)
    if weeks:
        out = pd.concat(weeks, ignore_index=True)
        return out[out.get("game_year", year) == year]

    return pd.DataFrame()

def dedupe_by_pitch(df):
    keys = ["game_pk","at_bat_number","pitch_number"]
    if all(k in df.columns for k in keys):
        before = len(df)
        df = df.drop_duplicates(subset=keys)
        after  = len(df)
        if after != before:
            print(f"  De-duplicated: {before:,} → {after:,}")
    return df

p = pd.read_csv(P, low_memory=False)
p["game_year"] = pd.to_numeric(p["game_year"], errors="coerce").astype("Int64")
p["game_date"] = pd.to_datetime(p.get("game_date"), errors="coerce")

print("Before (by year):")
print(p["game_year"].value_counts().sort_index())

p23 = p[p["game_year"] == 2023].copy()
month_counts = p23.groupby(p23["game_date"].dt.month).size().to_dict()
months = list(range(3, 11)) todo = [m for m in months if month_counts.get(m, 0) < THRESHOLD]

print("\n2023 month counts before:", month_counts)
print("Months to fetch (threshold < 10k rows):", todo)

for m in todo:
    print(f"\nFetching 2023-{m:02d} ...")
    dfm = fetch_month_with_retries(2023, m)
    print(f"  fetched rows: {len(dfm):,}")
    if len(dfm) == 0:
        print("  WARNING: still empty; try re-running later or lowering date window.")
        continue
    dfm = dfm[dfm.get("game_year", 2023) == 2023]
    p = pd.concat([p, dfm], ignore_index=True)
    p = dedupe_by_pitch(p)
    p.to_csv(P, index=False)

    p = pd.read_csv(P, low_memory=False)
    p["game_year"] = pd.to_numeric(p["game_year"], errors="coerce").astype("Int64")
    p["game_date"] = pd.to_datetime(p.get("game_date"), errors="coerce")
    p23 = p[p["game_year"] == 2023]
    month_counts = p23.groupby(p23["game_date"].dt.month).size().to_dict()
    print("  Updated 2023 month counts:", month_counts)

print("\nFinal (by year):")
print(p["game_year"].value_counts().sort_index())
print("Final 2023 month counts:", p[p["game_year"]==2023].groupby(p["game_date"].dt.month).size().to_dict())


In [None]:

PITCHES_PATH = "mlb_pitch_data_2021_2023.csv"
INJURY_PATH  = "rosterresource_injuries_cleaned.csv"

pitches = pd.read_csv(PITCHES_PATH, low_memory=False)
pitches['game_year'] = pd.to_numeric(pitches['game_year'], errors='coerce').astype('Int64')

inj = pd.read_csv(INJURY_PATH)

keep = [c for c in ["MLBAMID","Injury / Surgery Date","Injury / Surgery"] if c in inj.columns]
inj = inj[keep].copy()

pitches['pitcher'] = pd.to_numeric(pitches['pitcher'], errors='coerce').astype('Int64')
inj['MLBAMID']     = pd.to_numeric(inj['MLBAMID'], errors='coerce').astype('Int64')

merged = pitches.merge(inj, left_on='pitcher', right_on='MLBAMID', how='left')

merged['injured'] = merged['Injury / Surgery'].notna().astype(int)

merged.to_csv("data/pitches_with_injuries_2021_2023.csv", index=False)
print("Merged file saved.")
print("Year counts:\n", merged['game_year'].value_counts().sort_index())
print("Injury flags:\n", merged['injured'].value_counts())


In [None]:
import pandas as pd

PITCHES_PATH = "mlb_pitch_data_2021_2023.csv"
INJURY_PATH  = "rosterresource_injuries_cleaned.csv"

pitches = pd.read_csv(PITCHES_PATH, low_memory=False)
pitches['game_year'] = pd.to_numeric(pitches['game_year'], errors='coerce').astype('Int64')

inj = pd.read_csv(INJURY_PATH)

keep = [c for c in ["MLBAMID","Injury / Surgery Date","Injury / Surgery"] if c in inj.columns]
inj = inj[keep].copy()

pitches['pitcher'] = pd.to_numeric(pitches['pitcher'], errors='coerce').astype('Int64')
inj['MLBAMID']     = pd.to_numeric(inj['MLBAMID'], errors='coerce').astype('Int64')

merged = pitches.merge(inj, left_on='pitcher', right_on='MLBAMID', how='left')

merged['injured'] = merged['Injury / Surgery'].notna().astype(int)

merged.to_csv("pitches_with_injuries_2021_2023.csv", index=False)
print("Merged file saved.")
print("Year counts:\n", merged['game_year'].value_counts().sort_index())
print("Injury flags:\n", merged['injured'].value_counts())

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

ROOT = Path.cwd() if (Path.cwd()/ "data").exists() else (Path.cwd()/ "pitcher-injury-predictor")
DATA = ROOT / "data"
DATA.mkdir(exist_ok=True, parents=True)

p_main   = DATA / "mlb_pitch_data_2021_2023.csv"
p_merged = ROOT / "pitches_with_injuries_2021_2023.csv"         p_gz_out = DATA / "pitches_with_injuries_2021_2023.csv.gz"      p_gz_in  = DATA / "pitches_with_injuries_2021_2023.csv.gz"      
p_inj    = DATA / "rosterresource_injuries_2023.csv"

def load_statcast():
    df = pd.read_csv(p_main, low_memory=False)
    if "game_date" in df.columns:
        df["game_date"] = pd.to_datetime(df["game_date"], errors="coerce")
    if "game_year" not in df.columns and "game_date" in df.columns:
        df["game_year"] = df["game_date"].dt.year
    dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in df.columns]
    if dedup_keys:
        before = len(df)
        df = df.drop_duplicates(subset=dedup_keys, keep="first")
        print(f"Dedup pitches: {before} -> {len(df)}")
    else:
        print("[WARN] Dedup keys not all present; skipped de-duplication.")
    return df

def pick_spin_col(df):
    if "release_spin_rate" in df.columns:
        df["spin_rate"] = df["release_spin_rate"]
    elif "spin_rate" in df.columns:
        pass 
    else:
        df["spin_rate"] = np.nan
    return df

def ensure_injured(df):
    """If 'injured' column missing, try to create 2023 season-level label using injuries file.
       We mark pitcher-season injured=1 for all pitches in 2023 if the pitcher appears in injuries_2023 file.
       For 2021-2022 we leave as 0 if no data is available."""
    if "injured" in df.columns:
        return df

    if not p_inj.exists():
        print("[WARN] injuries_2023.csv not found. Creating placeholder injured=0.")
        df["injured"] = 0
        return df

    inj = pd.read_csv(p_inj, low_memory=False)
    cols = {c.lower(): c for c in inj.columns}
    id_candidates = ["mlbam_id","player_id_mlbam","playerid","mlb_id","player_id","mlbamid","mlbam"]
    name_candidates = ["player_name","name","Player","PLAYER","Name"]

    id_col = next((cols[c] for c in id_candidates if c in cols), None)
    name_col = next((cols[c] for c in name_candidates if c in cols), None)

    injured_ids = set()
    injured_names = set()

    if id_col is not None:
        injured_ids = set(inj[id_col].dropna().astype(str).unique().tolist())
    if name_col is not None:
        injured_names = set(inj[name_col].dropna().astype(str).str.strip().unique().tolist())

    if "pitcher" in df.columns:
        df["_pitcher_str"] = df["pitcher"].astype(str)
    else:
        df["_pitcher_str"] = ""

    name_cols_df = [c for c in df.columns if c.lower() in ["pitcher_name","player_name","name"]]
    if name_cols_df:
        df["_pname"] = df[name_cols_df[0]].astype(str).str.strip()
    else:
        df["_pname"] = ""

    df["injured"] = 0
    if "game_year" not in df.columns and "game_date" in df.columns:
        df["game_year"] = pd.to_datetime(df["game_date"], errors="coerce").dt.year

    mask_2023 = (df["game_year"] == 2023) if "game_year" in df.columns else pd.Series(False, index=df.index)
    if len(injured_ids) > 0 and df["_pitcher_str"].notna().any():
        df.loc[mask_2023 & df["_pitcher_str"].isin(injured_ids), "injured"] = 1
    if len(injured_names) > 0 and df["_pname"].notna().any():
        df.loc[mask_2023 & df["_pname"].isin(injured_names), "injured"] = 1

    df = df.drop(columns=[c for c in ["_pitcher_str","_pname"] if c in df.columns])
    return df

if p_merged.exists():
    print(f"Loading existing merged (root) CSV: {p_merged.name}")
    dfm = pd.read_csv(p_merged, low_memory=False)
elif p_gz_in.exists():
    print(f"Loading existing merged (gz) CSV: {p_gz_in.name}")
    dfm = pd.read_csv(p_gz_in, low_memory=False, compression="gzip")
else:
    print("No merged file found; loading statcast and constructing season-level injury label from injuries_2023.csv")
    dfm = load_statcast()
    dfm = pick_spin_col(dfm)
    dfm = ensure_injured(dfm)

if "game_date" in dfm.columns:
    dfm["game_date"] = pd.to_datetime(dfm["game_date"], errors="coerce")
if "game_year" not in dfm.columns and "game_date" in dfm.columns:
    dfm["game_year"] = dfm["game_date"].dt.year
dfm = pick_spin_col(dfm)

dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in dfm.columns]
if dedup_keys:
    before = len(dfm)
    dfm = dfm.drop_duplicates(subset=dedup_keys, keep="first")
    print(f"Final dedup: {before} -> {len(dfm)}")
else:
    print("[WARN] Dedup keys not all present; skipped de-duplication.")

if "game_year" in dfm.columns:
    print("\nPitch counts by year (post-merge):")
    print(dfm["game_year"].value_counts().sort_index())
if "injured" in dfm.columns:
    print("\nPitch-level injury flag counts (post-merge):")
    print(dfm["injured"].value_counts())

dfm.to_csv(p_gz_out, index=False, compression="gzip")
print(f"\nSaved canonical merged to: {p_gz_out.resolve()}")


In [None]:
from pathlib import Path

TARGET_FILE = "mlb_pitch_data_2021_2023.csv"

def find_root_and_data():
    cwd = Path.cwd().resolve()
    print("CWD:", cwd)

    for p in [cwd, *cwd.parents]:
        candidate = p / "data" / TARGET_FILE
        if candidate.exists():
            print("Found data file at:", candidate)
            return p, p / "data"

    if cwd.name == "data" and (cwd / TARGET_FILE).exists():
        print("Notebook appears to be in /data; using its parent as ROOT.")
        return cwd.parent, cwd

    for base in [cwd, cwd.parent]:
        try:
            hit = next(base.rglob(TARGET_FILE))
            print("Found by scanning:", hit)
            root = hit.parent.parent if hit.parent.name == "data" else hit.parents[2]
            return root, root / "data"
        except StopIteration:
            pass

    raise FileNotFoundError(f"Could not locate {TARGET_FILE} by walking parents or light scan.")

ROOT, DATA = find_root_and_data()
print("ROOT:", ROOT)
print("DATA:", DATA)

try:
    listing = sorted(DATA.iterdir())[:10]
    print("\n/data contents (first 10):")
    for x in listing:
        print(" -", x.name)
except Exception as e:
    print("Could not list /data:", e)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

try:
    ROOT, DATA
except NameError:
    from pathlib import Path
    TARGET_FILE = "mlb_pitch_data_2021_2023.csv"
    def _find():
        cwd = Path.cwd().resolve()
        for p in [cwd, *cwd.parents]:
            if (p / "data" / TARGET_FILE).exists():
                return p, p / "data"
        if cwd.name == "data" and (cwd / TARGET_FILE).exists():
            return cwd.parent, cwd
        raise FileNotFoundError("Cannot auto-detect project root.")
    ROOT, DATA = _find()

DATA.mkdir(exist_ok=True, parents=True)

p_main   = DATA / "mlb_pitch_data_2021_2023.csv"
p_merged = ROOT / "pitches_with_injuries_2021_2023.csv"        
p_gz_out = DATA / "pitches_with_injuries_2021_2023.csv.gz"
p_gz_in  = DATA / "pitches_with_injuries_2021_2023.csv.gz"
p_inj    = DATA / "rosterresource_injuries_2023.csv"

print("Using:")
print("  ROOT:", ROOT)
print("  DATA:", DATA)
print("  main:", p_main.exists(), p_main)
print("merged:", p_merged.exists(), p_merged)
print(" merged.gz exists:", p_gz_in.exists(), p_gz_in)
print("injuries:", p_inj.exists(), p_inj)

def load_statcast():
    if not p_main.exists():
        raise FileNotFoundError(f"Expected Statcast at {p_main}")
    df = pd.read_csv(p_main, low_memory=False)
    if "game_date" in df.columns:
        df["game_date"] = pd.to_datetime(df["game_date"], errors="coerce")
    if "game_year" not in df.columns and "game_date" in df.columns:
        df["game_year"] = df["game_date"].dt.year
    dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in df.columns]
    if dedup_keys:
        before = len(df)
        df = df.drop_duplicates(subset=dedup_keys, keep="first")
        print(f"Dedup pitches: {before} -> {len(df)}")
    else:
        print("[WARN] Dedup keys not all present; skipped de-duplication.")
    return df

def pick_spin_col(df):
    if "release_spin_rate" in df.columns and "spin_rate" not in df.columns:
        df["spin_rate"] = df["release_spin_rate"]
    elif "spin_rate" not in df.columns:
        df["spin_rate"] = np.nan
    return df

def ensure_injured(df):
    """If 'injured' column missing, try to create 2023 season-level label using injuries file.
       Mark pitcher-season injured=1 for all 2023 pitches if pitcher appears in injuries_2023 file."""
    if "injured" in df.columns:
        return df

    if not p_inj.exists():
        print("[WARN] injuries_2023.csv not found. Creating placeholder injured=0.")
        df["injured"] = 0
        return df

    inj = pd.read_csv(p_inj, low_memory=False)
    cols = {c.lower(): c for c in inj.columns}
    id_candidates = ["mlbam_id","player_id_mlbam","playerid","mlb_id","player_id","mlbamid","mlbam"]
    name_candidates = ["player_name","name","player","Name","PLAYER"]

    id_col = next((cols[c] for c in id_candidates if c in cols), None)
    name_col = next((cols[c] for c in name_candidates if c in cols), None)

    injured_ids = set(inj[id_col].dropna().astype(str)) if id_col else set()
    injured_names = set(inj[name_col].dropna().astype(str).str.strip()) if name_col else set()

    if "game_year" not in df.columns and "game_date" in df.columns:
        df["game_year"] = pd.to_datetime(df["game_date"], errors="coerce").dt.year

    df["_pitcher_str"] = df.get("pitcher", pd.Series(index=df.index, dtype="float")).astype(str)
    name_cols_df = [c for c in df.columns if c.lower() in ["pitcher_name","player_name","name"]]
    df["_pname"] = df[name_cols_df[0]].astype(str).str.strip() if name_cols_df else ""

    df["injured"] = 0
    mask_2023 = (df["game_year"] == 2023) if "game_year" in df.columns else pd.Series(False, index=df.index)

    if injured_ids:
        df.loc[mask_2023 & df["_pitcher_str"].isin(injured_ids), "injured"] = 1
    if injured_names:
        df.loc[mask_2023 & df["_pname"].isin(injured_names), "injured"] = 1

    return df.drop(columns=[c for c in ["_pitcher_str","_pname"] if c in df.columns])

if p_merged.exists():
    print(f"\nLoading existing merged (root) CSV: {p_merged.name}")
    dfm = pd.read_csv(p_merged, low_memory=False)
elif p_gz_in.exists():
    print(f"\nLoading existing merged (gz) CSV: {p_gz_in.name}")
    dfm = pd.read_csv(p_gz_in, low_memory=False, compression="gzip")
else:
    print("\nNo merged file found; loading statcast and constructing 2023 season-level injury label from injuries_2023.csv")
    dfm = load_statcast()
    dfm = pick_spin_col(dfm)
    dfm = ensure_injured(dfm)

if "game_date" in dfm.columns:
    dfm["game_date"] = pd.to_datetime(dfm["game_date"], errors="coerce")
if "game_year" not in dfm.columns and "game_date" in dfm.columns:
    dfm["game_year"] = dfm["game_date"].dt.year
dfm = pick_spin_col(dfm)

dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in dfm.columns]
if dedup_keys:
    before = len(dfm)
    dfm = dfm.drop_duplicates(subset=dedup_keys, keep="first")
    print(f"Final dedup: {before} -> {len(dfm)}")
else:
    print("[WARN] Dedup keys not all present; skipped de-duplication.")

if "game_year" in dfm.columns:
    print("\nPitch counts by year (post-merge):")
    print(dfm["game_year"].value_counts().sort_index())
if "injured" in dfm.columns:
    print("\nPitch-level injury flag counts (post-merge):")
    print(dfm["injured"].value_counts())

dfm.to_csv(p_gz_out, index=False, compression="gzip")
print(f"\nSaved canonical merged to: {p_gz_out.resolve()}")


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

ROOT = Path.cwd() if (Path.cwd()/ "data").exists() else (Path.cwd()/ "pitcher-injury-predictor")
DATA = ROOT / "data"
p_gz = DATA / "pitches_with_injuries_2021_2023.csv.gz"
assert p_gz.exists(), f"Missing {p_gz}"

df = pd.read_csv(p_gz, low_memory=False, compression="gzip")
if "game_date" in df.columns:
    df["game_date"] = pd.to_datetime(df["game_date"], errors="coerce")
if "game_year" not in df.columns and "game_date" in df.columns:
    df["game_year"] = df["game_date"].dt.year

dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in df.columns]
if dedup_keys:
    df = df.drop_duplicates(subset=dedup_keys, keep="first")

df23 = df[df["game_year"] == 2023].copy()
if df23.empty:
    raise ValueError("No 2023 data found after filtering.")

if "release_spin_rate" in df23.columns and "spin_rate" not in df23.columns:
    df23["spin_rate"] = df23["release_spin_rate"]
elif "spin_rate" not in df23.columns:
    df23["spin_rate"] = np.nan

for col in ["pitcher","pitcher_name","game_pk","game_date","pitch_type","release_speed","spin_rate"]:
    if col not in df23.columns:
        df23[col] = np.nan

gcols = ["pitcher","pitcher_name","game_pk","game_date"]
per_game = (df23
            .dropna(subset=["pitcher"])
            .groupby(gcols, dropna=False)
            .size()
            .reset_index(name="pitches_in_game"))

per_game = per_game.sort_values(["pitcher","game_date"])
per_game["rest_days"] = (per_game.groupby("pitcher")["game_date"]
                         .diff().dt.days)

rest_agg = (per_game.groupby(["pitcher","pitcher_name"], dropna=False)
            .agg(
                games_pitched=("game_pk","nunique"),
                total_pitches=("pitches_in_game","sum"),
                avg_pitches_per_game=("pitches_in_game","mean"),
                max_pitches_in_game=("pitches_in_game","max"),
                median_rest_days=("rest_days","median"),
                mean_rest_days=("rest_days","mean"),
                short_rest_games=("rest_days", lambda s: np.sum(s<=3) if s.notna().any() else 0),
            )
            .reset_index())

def pct95(x): 
    try: 
        return np.nanpercentile(x, 95) 
    except Exception: 
        return np.nan

velo_spin = (df23.groupby(["pitcher","pitcher_name"], dropna=False)
             .agg(
                 avg_velocity=("release_speed","mean"),
                 p95_velocity=("release_speed", pct95),
                 avg_spin=("spin_rate","mean"),
             )
             .reset_index())

fastballs = {"FF","FA","FT","SI","FC"}
breaking  = {"SL","CU","KC","SV","SC"}
offspeed  = {"CH","FS","KN"}

tmp = df23.copy()
tmp["pitch_type"] = tmp["pitch_type"].astype(str).str.upper()
tmp["family"] = np.where(tmp["pitch_type"].isin(fastballs), "fastball",
                 np.where(tmp["pitch_type"].isin(breaking),  "breaking",
                 np.where(tmp["pitch_type"].isin(offspeed),  "offspeed","other")))
mix = (tmp.groupby(["pitcher","pitcher_name","family"], dropna=False)
       .size().reset_index(name="cnt"))
mix_tot = mix.groupby(["pitcher","pitcher_name"], dropna=False)["cnt"].sum().rename("total").reset_index()
mix = mix.merge(mix_tot, on=["pitcher","pitcher_name"], how="left")
mix["pct"] = mix["cnt"] / mix["total"].replace(0, np.nan)

mix_pivot = (mix.pivot_table(index=["pitcher","pitcher_name"], columns="family", values="pct", fill_value=0)
               .reset_index())
mix_pivot.columns = ["pitcher","pitcher_name"] + [f"mix_{c}" for c in mix_pivot.columns.tolist()[2:]]

if "injured" not in df23.columns:
    df23["injured"] = 0
label_agg = (df23.groupby(["pitcher","pitcher_name"], dropna=False)["injured"]
             .max().reset_index().rename(columns={"injured":"injury_2023"}))

feat = (rest_agg
        .merge(velo_spin, on=["pitcher","pitcher_name"], how="outer")
        .merge(mix_pivot, on=["pitcher","pitcher_name"], how="outer")
        .merge(label_agg, on=["pitcher","pitcher_name"], how="left"))

for c in ["mix_fastball","mix_breaking","mix_offspeed","mix_other"]:
    if c not in feat.columns:
        feat[c] = 0.0
num_cols = feat.select_dtypes(include=[np.number]).columns
feat[num_cols] = feat[num_cols].replace([np.inf,-np.inf], np.nan)

p_feat = DATA / "features_2023.csv"
feat.to_csv(p_feat, index=False)
print(f"Saved 2023 feature table: {p_feat.resolve()}")
print("Rows (pitchers):", len(feat))
print("\nLabel distribution (injury_2023):")
print(feat["injury_2023"].fillna(0).astype(int).value_counts())


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

MERGED_GZ_NAME = "pitches_with_injuries_2021_2023.csv.gz"
MERGED_CSV_NAME = "pitches_with_injuries_2021_2023.csv"
MAIN_PITCHES = "mlb_pitch_data_2021_2023.csv"

def find_root_and_data():
    cwd = Path.cwd().resolve()

    if cwd.name == "data":
        root = cwd.parent
        data = cwd
        return root, data

    if (cwd / "data").exists():
        return cwd, cwd / "data"

    for p in cwd.parents:
        if (p / "data").exists():
            return p, p / "data"

    return cwd, cwd / "data"

def find_file(name):
    """Find file by walking up parents and doing a short downward scan."""
    root, data = find_root_and_data()
    p1 = data / name
    if p1.exists():
        return root, data, p1
    p2 = root / name
    if p2.exists():
        return root, data, p2
    try:
        hit = next((p for p in root.rglob(name)))
        if hit.parent.name == "data":
            root2 = hit.parent.parent
            data2 = hit.parent
            return root2, data2, hit
        return root, data, hit
    except StopIteration:
        return root, data, None

ROOT, DATA, p_gz = find_file(MERGED_GZ_NAME)
_, _, p_csv = find_file(MERGED_CSV_NAME)

if p_gz is None and p_csv is None:
    _, _, p_main = find_file(MAIN_PITCHES)
    raise FileNotFoundError(
        f"Could not find {MERGED_GZ_NAME} or {MERGED_CSV_NAME}.\n"
        f"Check that step (B) saved the merged file.\n"
        f"Detected ROOT={ROOT}, DATA={DATA}. "
        f"Main pitches present? {bool(p_main)} at {p_main}"
    )

DATA.mkdir(parents=True, exist_ok=True)
if p_gz is None and p_csv is not None:
    print(f"Found merged CSV at {p_csv}. Compressing to {DATA / MERGED_GZ_NAME} ...")
    df_tmp = pd.read_csv(p_csv, low_memory=False)
    (DATA / MERGED_GZ_NAME).write_text("") 
    df_tmp.to_csv(DATA / MERGED_GZ_NAME, index=False, compression="gzip")
    p_gz = DATA / MERGED_GZ_NAME
    print("Compressed.")

print("ROOT:", ROOT)
print("DATA:", DATA)
print("Using merged file:", p_gz)

df = pd.read_csv(p_gz, low_memory=False, compression="gzip")

if "game_date" in df.columns:
    df["game_date"] = pd.to_datetime(df["game_date"], errors="coerce")
if "game_year" not in df.columns and "game_date" in df.columns:
    df["game_year"] = df["game_date"].dt.year

dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in df.columns]
if dedup_keys:
    before = len(df)
    df = df.drop_duplicates(subset=dedup_keys, keep="first")
    print(f"Dedup pitches: {before} -> {len(df)}")

df23 = df[df["game_year"] == 2023].copy()
if df23.empty:
    raise ValueError("No 2023 data found after filtering.")

if "release_spin_rate" in df23.columns and "spin_rate" not in df23.columns:
    df23["spin_rate"] = df23["release_spin_rate"]
elif "spin_rate" not in df23.columns:
    df23["spin_rate"] = np.nan

for col in ["pitcher","pitcher_name","game_pk","game_date","pitch_type","release_speed","spin_rate","injured"]:
    if col not in df23.columns:
        df23[col] = np.nan if col != "injured" else 0

gcols = ["pitcher","pitcher_name","game_pk","game_date"]
per_game = (df23
            .dropna(subset=["pitcher"])
            .groupby(gcols, dropna=False)
            .size()
            .reset_index(name="pitches_in_game"))
per_game = per_game.sort_values(["pitcher","game_date"])
per_game["rest_days"] = (per_game.groupby("pitcher")["game_date"]
                         .diff().dt.days)

rest_agg = (per_game.groupby(["pitcher","pitcher_name"], dropna=False)
            .agg(
                games_pitched=("game_pk","nunique"),
                total_pitches=("pitches_in_game","sum"),
                avg_pitches_per_game=("pitches_in_game","mean"),
                max_pitches_in_game=("pitches_in_game","max"),
                median_rest_days=("rest_days","median"),
                mean_rest_days=("rest_days","mean"),
                short_rest_games=("rest_days", lambda s: np.sum(s<=3) if s.notna().any() else 0),
            )
            .reset_index())

def pct95(x): 
    try: 
        return np.nanpercentile(x, 95) 
    except Exception: 
        return np.nan

velo_spin = (df23.groupby(["pitcher","pitcher_name"], dropna=False)
             .agg(
                 avg_velocity=("release_speed","mean"),
                 p95_velocity=("release_speed", pct95),
                 avg_spin=("spin_rate","mean"),
             )
             .reset_index())

fastballs = {"FF","FA","FT","SI","FC"}
breaking  = {"SL","CU","KC","SV","SC"}
offspeed  = {"CH","FS","KN"}

tmp = df23.copy()
tmp["pitch_type"] = tmp["pitch_type"].astype(str).str.upper()
tmp["family"] = np.where(tmp["pitch_type"].isin(fastballs), "fastball",
                 np.where(tmp["pitch_type"].isin(breaking),  "breaking",
                 np.where(tmp["pitch_type"].isin(offspeed),  "offspeed","other")))
mix = (tmp.groupby(["pitcher","pitcher_name","family"], dropna=False)
       .size().reset_index(name="cnt"))
mix_tot = mix.groupby(["pitcher","pitcher_name"], dropna=False)["cnt"].sum().rename("total").reset_index()
mix = mix.merge(mix_tot, on=["pitcher","pitcher_name"], how="left")
mix["pct"] = mix["cnt"] / mix["total"].replace(0, np.nan)
mix_pivot = (mix.pivot_table(index=["pitcher","pitcher_name"], columns="family", values="pct", fill_value=0)
               .reset_index())
mix_pivot.columns = ["pitcher","pitcher_name"] + [f"mix_{c}" for c in mix_pivot.columns.tolist()[2:]]

label_agg = (df23.groupby(["pitcher","pitcher_name"], dropna=False)["injured"]
             .max().reset_index().rename(columns={"injured":"injury_2023"}))

feat = (rest_agg
        .merge(velo_spin, on=["pitcher","pitcher_name"], how="outer")
        .merge(mix_pivot, on=["pitcher","pitcher_name"], how="outer")
        .merge(label_agg, on=["pitcher","pitcher_name"], how="left"))

for c in ["mix_fastball","mix_breaking","mix_offspeed","mix_other"]:
    if c not in feat.columns:
        feat[c] = 0.0

num_cols = feat.select_dtypes(include=[np.number]).columns
feat[num_cols] = feat[num_cols].replace([np.inf,-np.inf], np.nan)

p_feat = (DATA / "features_2023.csv")
feat.to_csv(p_feat, index=False)
print(f"Saved 2023 feature table: {p_feat.resolve()}")
print("Rows (pitchers):", len(feat))
print("\nLabel distribution (injury_2023):")
print(feat["injury_2023"].fillna(0).astype(int).value_counts())


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

FEATURES_NAME = "features_2023.csv"

def find_root_and_data():
    cwd = Path.cwd().resolve()
    if cwd.name == "data":
        return cwd.parent, cwd
    if (cwd / "data").exists():
        return cwd, cwd / "data"
    for p in cwd.parents:
        if (p / "data").exists():
            return p, p / "data"
    return cwd, cwd / "data"

def find_file(name):
    root, data = find_root_and_data()
    p1 = data / name
    if p1.exists():
        return root, data, p1
    p2 = root / name
    if p2.exists():
        return root, data, p2
    try:
        hit = next(root.rglob(name))
        if hit.parent.name == "data":
            return hit.parent.parent, hit.parent, hit
        return root, data, hit
    except StopIteration:
        return root, data, None

ROOT, DATA, p_feat = find_file(FEATURES_NAME)
assert p_feat is not None and p_feat.exists(), f"Missing {FEATURES_NAME}; run step (C). Found ROOT={ROOT}, DATA={DATA}"

print("Using features file:", p_feat)

df = pd.read_csv(p_feat)

y = df.get("injury_2023", pd.Series(0, index=df.index)).fillna(0).astype(int)
drop_cols = ["injury_2023","pitcher","pitcher_name"]
X = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

X = X.select_dtypes(include=[np.number]).copy()
X = X.loc[:, X.notna().any(axis=0)]
X = X.fillna(X.median(numeric_only=True))

print("X shape:", X.shape, "| y positives:", int(y.sum()), "/", len(y))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs"))
])
lr_pipe.fit(X_train, y_train)

y_pred = lr_pipe.predict(X_test)
y_proba = lr_pipe.predict_proba(X_test)[:, 1]
auc_lr = roc_auc_score(y_test, y_proba)

print("\n=== Logistic Regression: Classification Report ===")
print(classification_report(y_test, y_pred, digits=3))
print(f"ROC AUC (LR): {auc_lr:.3f}")

baseline_lr_report = classification_report(y_test, y_pred, digits=3, output_dict=True)
baseline_lr_auc = float(auc_lr)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import re

def find_root_and_data():
    cwd = Path.cwd().resolve()
    if cwd.name == "data":
        return cwd.parent, cwd
    if (cwd / "data").exists():
        return cwd, cwd / "data"
    for p in cwd.parents:
        if (p / "data").exists():
            return p, p / "data"
    return cwd, cwd / "data"

ROOT, DATA = find_root_and_data()
p_feat = DATA / "features_2023.csv"
p_gz   = DATA / "pitches_with_injuries_2021_2023.csv.gz"
p_inj  = DATA / "rosterresource_injuries_2023.csv"

print("ROOT:", ROOT)
print("DATA:", DATA)
print("Exists:", {"features": p_feat.exists(), "merged_gz": p_gz.exists(), "injuries": p_inj.exists()})

df_feat = pd.read_csv(p_feat) if p_feat.exists() else None
if df_feat is not None:
    vc = df_feat["injury_2023"].fillna(0).astype(int).value_counts(dropna=False)
    print("\nfeatures_2023 label counts (injury_2023):")
    print(vc)

if p_inj.exists():
    inj = pd.read_csv(p_inj, low_memory=False)
    print("\nInjuries columns:", list(inj.columns)[:20])
    print("Injuries rows:", len(inj))
else:
    inj = None

if p_gz.exists():
    dfm = pd.read_csv(p_gz, nrows=200_000, low_memory=False, compression="gzip")  
    print("\nMerged sample columns:", list(dfm.columns)[:25])
    print("Has 'pitcher' ID column?", "pitcher" in dfm.columns)
    print("Has 'pitcher_name'?", "pitcher_name" in dfm.columns)
    if "pitcher_name" in dfm.columns:
        names = dfm["pitcher_name"].dropna().astype(str).unique()[:10]
        print("Sample pitcher_name values:", names)
else:
    dfm = None


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import unicodedata

def normalize_name(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = "".join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c))
    s = s.lower()
    s = re.sub(r"\b(jr|sr|ii|iii|iv|v)\b\.?", "", s)
    s = re.sub(r"[^a-z\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

ROOT, DATA = find_root_and_data()
p_feat = DATA / "features_2023.csv"
p_gz   = DATA / "pitches_with_injuries_2021_2023.csv.gz"
p_inj  = DATA / "rosterresource_injuries_2023.csv"

assert p_feat.exists(), f"Missing {p_feat}"
assert p_gz.exists(),   f"Missing {p_gz}"
assert p_inj.exists(),  f"Missing {p_inj}"

feat = pd.read_csv(p_feat)
dfm  = pd.read_csv(p_gz, low_memory=False, compression="gzip")
inj  = pd.read_csv(p_inj, low_memory=False)

inj_cols = {c.lower(): c for c in inj.columns}
id_candidates   = ["mlbam_id","player_id_mlbam","playerid","mlb_id","player_id","mlbamid","mlbam"]
name_candidates = ["player_name","name","player","Player","PLAYER","Name"]
inj_id_col   = next((inj_cols[c] for c in id_candidates if c in inj_cols), None)
inj_name_col = next((inj_cols[c] for c in name_candidates if c in inj_cols), None)

injured_ids = set()
injured_names_norm = set()
if inj_id_col is not None:
    injured_ids = set(inj[inj_id_col].dropna().astype(str).tolist())
if inj_name_col is not None:
    injured_names_norm = set(inj[inj_name_col].dropna().astype(str).map(normalize_name).tolist())

dfm["game_date"] = pd.to_datetime(dfm.get("game_date"), errors="coerce")
if "game_year" not in dfm.columns:
    dfm["game_year"] = dfm["game_date"].dt.year
df23 = dfm[dfm["game_year"] == 2023].copy()

pid_has = "pitcher" in df23.columns
pname_has = "pitcher_name" in df23.columns

if pid_has:
    df23["_pid_str"] = df23["pitcher"].astype(str)
else:
    df23["_pid_str"] = ""

if pname_has:
    df23["_pname_norm"] = df23["pitcher_name"].astype(str).map(normalize_name)
else:
    df23["_pname_norm"] = ""

df23["injured_tmp"] = 0
if injured_ids and pid_has:
    df23.loc[df23["_pid_str"].isin(injured_ids), "injured_tmp"] = 1
if injured_names_norm and pname_has:
    df23.loc[df23["_pname_norm"].isin(injured_names_norm), "injured_tmp"] = 1

lab = (df23.groupby("pitcher", dropna=False)["injured_tmp"]
       .max().reset_index().rename(columns={"injured_tmp":"injury_2023"}))

feat2 = feat.copy()
if "pitcher" in feat2.columns:
    feat2 = feat2.merge(lab, on="pitcher", how="left", suffixes=("", "_new"))
    if "injury_2023_new" in feat2.columns:
        feat2["injury_2023"] = feat2["injury_2023"].fillna(feat2["injury_2023_new"])
        feat2 = feat2.drop(columns=["injury_2023_new"])

if feat2["injury_2023"].isna().all() or (feat2["injury_2023"].fillna(0).sum() == 0):
    if "pitcher_name" in feat2.columns and pname_has:
        key_feat = feat2[["pitcher_name"]].copy()
        key_feat["_pname_norm"] = key_feat["pitcher_name"].astype(str).map(normalize_name)
        lab_name = (df23.groupby("_pname_norm")["injured_tmp"]
                    .max().reset_index().rename(columns={"injured_tmp":"injury_2023_name"}))
        feat2 = feat2.join(key_feat["_pname_norm"])
        feat2 = feat2.merge(lab_name, on="_pname_norm", how="left")
        feat2["injury_2023"] = feat2["injury_2023"].fillna(feat2["injury_2023_name"])
        feat2 = feat2.drop(columns=[c for c in ["_pname_norm","injury_2023_name"] if c in feat2.columns], errors="ignore")

feat2["injury_2023"] = feat2["injury_2023"].fillna(0).astype(int)

print("Label counts after repair:")
print(feat2["injury_2023"].value_counts())

feat2.to_csv(p_feat, index=False)
print("Updated labels saved to:", p_feat)


In [None]:
import pandas as pd
import unicodedata, re
from pathlib import Path

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"

injury_files = {
    2021: DATA / "2021roster-resource__injury-report.xlsx.xlsx",
    2022: DATA / "2022roster-resource__injury-report.xlsx.xlsx",
    2023: DATA / "2023roster-resource__injury-report.xlsx.xlsx",
}

def normalize_name(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = "".join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c))
    s = s.lower()
    s = re.sub(r"\b(jr|sr|ii|iii|iv|v)\b\.?", "", s)
    s = re.sub(r"[^a-z\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

inj_dfs = []
for year, path in injury_files.items():
    if not path.exists():
        print(f"[WARN] Missing {path}")
        continue
    df = pd.read_excel(path)
    name_col = next((c for c in df.columns if "player" in c.lower() or "name" in c.lower()), None)
    if name_col is None:
        raise ValueError(f"No name column found in {path}")
    df["name_norm"] = df[name_col].astype(str).map(normalize_name)
    df["year"] = year
    inj_dfs.append(df[["name_norm","year"]])

inj_all = pd.concat(inj_dfs, ignore_index=True).drop_duplicates()
print("Unified injuries rows:", len(inj_all))
print("By year:")
print(inj_all.groupby("year").size())


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
p_gz = DATA / "pitches_with_injuries_2021_2023.csv.gz"
assert p_gz.exists(), f"Missing {p_gz}"

dfm = pd.read_csv(p_gz, low_memory=False, compression="gzip")

dfm["game_date"] = pd.to_datetime(dfm.get("game_date"), errors="coerce")
if "game_year" not in dfm.columns:
    dfm["game_year"] = dfm["game_date"].dt.year

if "player_name" in dfm.columns:
    dfm["_pname_norm"] = dfm["player_name"].astype(str).map(normalize_name)
elif "pitcher_name" in dfm.columns:
    dfm["_pname_norm"] = dfm["pitcher_name"].astype(str).map(normalize_name)
else:
    dfm["_pname_norm"] = ""

inj_set = set(zip(inj_all["name_norm"], inj_all["year"]))

dfm["injured"] = dfm.apply(
    lambda row: 1 if (row["_pname_norm"], row["game_year"]) in inj_set else 0,
    axis=1
)

print("Pitch-level injury counts by year:")
print(dfm.groupby("game_year")["injured"].sum())

p_labeled = DATA / "pitches_with_injuries_2021_2023_labeled.csv.gz"
dfm.to_csv(p_labeled, index=False, compression="gzip")
print(f"\nSaved labeled merged file to: {p_labeled}")


In [None]:
import pandas as pd
from pathlib import Path
import re

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"

inj_paths = [
    DATA / "2021roster-resource__injury-report.xlsx.xlsx",
    DATA / "2022roster-resource__injury-report.xlsx",
    DATA / "2023roster-resource__injury-report.xlsx.xlsx",
]

def preview_injury_file(p):
    if not p.exists():
        print(f"[MISS] {p}")
        return
    df = pd.read_excel(p, nrows=8)
    print(f"\n=== {p.name} ===")
    print("Columns:", list(df.columns))
    like_id = [c for c in df.columns if re.search(r"mlbam|player.*id|mlb.*id", str(c), flags=re.I)]
    like_name = [c for c in df.columns if re.search(r"(player|name)", str(c), flags=re.I)]
    show_cols = (like_id + like_name)[:8]
    if show_cols:
        print(df[show_cols].head(5))
    else:
        print(df.head(5))

for p in inj_paths:
    preview_injury_file(p)

p_gz = DATA / "pitches_with_injuries_2021_2023.csv.gz"
dfm = pd.read_csv(p_gz, nrows=5_000, low_memory=False, compression="gzip")
print("\n=== Statcast merged: sample columns ===")
print(list(dfm.columns))

pitcher_name_candidates = [c for c in dfm.columns if ("pitch" in c.lower() and "name" in c.lower())]
print("Pitcher-name candidates in Statcast:", pitcher_name_candidates)
print("Has 'player_name' (often batter):", "player_name" in dfm.columns)
print("Has 'pitcher' (MLBAM ID):", "pitcher" in dfm.columns)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
p_gz = DATA / "pitches_with_injuries_2021_2023.csv.gz"
assert p_gz.exists(), f"Missing {p_gz}"

dfm = pd.read_csv(p_gz, low_memory=False, compression="gzip")
dfm["game_date"] = pd.to_datetime(dfm.get("game_date"), errors="coerce")
if "game_year" not in dfm.columns:
    dfm["game_year"] = dfm["game_date"].dt.year

dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in dfm.columns]
if dedup_keys:
    dfm = dfm.drop_duplicates(subset=dedup_keys, keep="first")

inj_files = {
    2021: DATA / "2021roster-resource__injury-report.xlsx.xlsx",
    2022: DATA / "2022roster-resource__injury-report.xlsx.xlsx",
    2023: DATA / "2023roster-resource__injury-report.xlsx.xlsx",
}

inj_dfs = []
for yr, path in inj_files.items():
    if not path.exists():
        print(f"[WARN] Missing {path}")
        continue
    inj = pd.read_excel(path, dtype=str)
    if "MLBAMID" not in inj.columns:
        raise ValueError(f"No MLBAMID col in {path}")
    inj_tmp = inj[["MLBAMID"]].copy()
    inj_tmp["MLBAMID"] = inj_tmp["MLBAMID"].astype(str).str.strip()
    inj_tmp["year"] = yr
    inj_dfs.append(inj_tmp)

inj_all = pd.concat(inj_dfs, ignore_index=True).dropna()
inj_set = set(map(tuple, inj_all[["MLBAMID","year"]].values))

dfm["pitcher_str"] = dfm["pitcher"].astype(str)
dfm["injured"] = dfm.apply(
    lambda row: 1 if (row["pitcher_str"], row["game_year"]) in inj_set else 0,
    axis=1
)

print("Pitch-level injury counts by year (using MLBAM ID):")
print(dfm.groupby("game_year")["injured"].sum())

p_labeled = DATA / "pitches_with_injuries_2021_2023_labeled.csv.gz"
dfm.to_csv(p_labeled, index=False, compression="gzip")
print(f"\nSaved labeled merged file to: {p_labeled}")


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
p_labeled = DATA / "pitches_with_injuries_2021_2023_labeled.csv.gz"
assert p_labeled.exists(), f"Missing {p_labeled}"

dfm = pd.read_csv(p_labeled, low_memory=False, compression="gzip")
dfm["game_date"] = pd.to_datetime(dfm["game_date"], errors="coerce")

if "release_spin_rate" in dfm.columns and "spin_rate" not in dfm.columns:
    dfm["spin_rate"] = dfm["release_spin_rate"]

fastballs = {"FF","FA","FT","SI","FC"}
breaking  = {"SL","CU","KC","SV","SC"}
offspeed  = {"CH","FS","KN"}

def build_features_for_year(df, year):
    dfy = df[df["game_year"]==year].copy()
    if dfy.empty:
        return None
    dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in dfy.columns]
    if dedup_keys:
        dfy = dfy.drop_duplicates(subset=dedup_keys, keep="first")
    gcols = ["pitcher","game_pk","game_date"]
    per_game = dfy.groupby(gcols).size().reset_index(name="pitches_in_game")
    per_game = per_game.sort_values(["pitcher","game_date"])
    per_game["rest_days"] = per_game.groupby("pitcher")["game_date"].diff().dt.days
    rest_agg = per_game.groupby("pitcher").agg(
        games_pitched=("game_pk","nunique"),
        total_pitches=("pitches_in_game","sum"),
        avg_pitches=("pitches_in_game","mean"),
        median_rest=("rest_days","median"),
        mean_rest=("rest_days","mean"),
        short_rest_games=("rest_days", lambda s: np.sum(s<=3) if s.notna().any() else 0),
    ).reset_index()
    velo = dfy.groupby("pitcher").agg(
        avg_velocity=("release_speed","mean"),
        p95_velocity=("release_speed", lambda x: np.nanpercentile(x,95)),
        avg_spin=("spin_rate","mean"),
    ).reset_index()
    tmp = dfy.copy()
    tmp["pitch_type"] = tmp["pitch_type"].astype(str).str.upper()
    tmp["family"] = np.where(tmp["pitch_type"].isin(fastballs), "fastball",
                     np.where(tmp["pitch_type"].isin(breaking),  "breaking",
                     np.where(tmp["pitch_type"].isin(offspeed),  "offspeed","other")))
    mix = tmp.groupby(["pitcher","family"]).size().reset_index(name="cnt")
    tot = mix.groupby("pitcher")["cnt"].sum().reset_index(name="total")
    mix = mix.merge(tot,on="pitcher",how="left")
    mix["pct"] = mix["cnt"]/mix["total"]
    mix_pivot = mix.pivot(index="pitcher",columns="family",values="pct").fillna(0).reset_index()
    mix_pivot.columns = ["pitcher"] + [f"mix_{c}" for c in mix_pivot.columns.tolist()[1:]]
    lab = dfy.groupby("pitcher")["injured"].max().reset_index().rename(columns={"injured":"injury"})
    feat = rest_agg.merge(velo,on="pitcher",how="outer").merge(mix_pivot,on="pitcher",how="outer").merge(lab,on="pitcher",how="left")
    feat["injury"] = feat["injury"].fillna(0).astype(int)
    return feat

for yr in [2021,2022,2023]:
    feat = build_features_for_year(dfm, yr)
    if feat is not None:
        out = DATA / f"features_{yr}.csv"
        feat.to_csv(out,index=False)
        print(f"Saved {out}, rows={len(feat)}, positives={feat['injury'].sum()}")


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"

df21 = pd.read_csv(DATA/"features_2021.csv")
df22 = pd.read_csv(DATA/"features_2022.csv")
df23 = pd.read_csv(DATA/"features_2023.csv")

df21["year"] = 2021
df22["year"] = 2022
df23["year"] = 2023

train = pd.concat([df21,df22], ignore_index=True)
test = df23.copy()

y_train = train["injury"].fillna(0).astype(int)
y_test = test["injury"].fillna(0).astype(int)

drop_cols = ["injury","pitcher","year"]
X_train = train.drop(columns=[c for c in drop_cols if c in train.columns], errors="ignore")
X_test  = test.drop(columns=[c for c in drop_cols if c in test.columns], errors="ignore")

X_train = X_train.select_dtypes(include=[np.number]).copy()
X_test  = X_test.select_dtypes(include=[np.number]).copy()
X_train = X_train.fillna(X_train.median(numeric_only=True))
X_test  = X_test.fillna(X_train.median(numeric_only=True))

print("Train shape:", X_train.shape, "| positives:", y_train.sum())
print("Test shape:", X_test.shape,  "| positives:", y_test.sum())

lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs"))
])
lr_pipe.fit(X_train, y_train)

y_pred = lr_pipe.predict(X_test)
y_proba = lr_pipe.predict_proba(X_test)[:,1]
auc_lr = roc_auc_score(y_test, y_proba)

print("\n=== Logistic Regression (train 2021–22, test 2023) ===")
print(classification_report(y_test, y_pred, digits=3))
print(f"ROC AUC: {auc_lr:.3f}")

baseline_lr_report = classification_report(y_test, y_pred, digits=3, output_dict=True)
baseline_lr_auc = float(auc_lr)


In [10]:
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test, label=y_test)

neg, pos = (y_train==0).sum(), (y_train==1).sum()
scale_pos_weight = neg/pos if pos > 0 else 1

params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.1,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": scale_pos_weight,
    "seed": 42,
}

bst = xgb.train(params, dtrain, num_boost_round=200)

y_proba_xgb = bst.predict(dtest)
y_pred_xgb = (y_proba_xgb >= 0.5).astype(int)

auc_xgb = roc_auc_score(y_test, y_proba_xgb)

print("\n=== XGBoost (train 2021–22, test 2023) ===")
print(classification_report(y_test, y_pred_xgb, digits=3))
print(f"ROC AUC: {auc_xgb:.3f}")

xgb_auc = float(auc_xgb)
xgb_report = classification_report(y_test, y_pred_xgb, digits=3, output_dict=True)


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/kavehnaini/miniconda3/envs/pitcherinjury/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <6984A3F0-3899-36C4-A85D-20B5520FF130> /Users/kavehnaini/miniconda3/envs/pitcherinjury/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/Users/kavehnaini/miniconda3/envs/pitcherinjury/lib/python3.10/lib-dynload/../../libomp.dylib' (no such file), '/Users/kavehnaini/miniconda3/envs/pitcherinjury/bin/../lib/libomp.dylib' (no such file)"]


In [1]:
import xgboost
print("XGBoost version:", xgboost.__version__)

XGBoost version: 3.0.4


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

need_rebuild = False
try:
    _ = (X_train, X_test, y_train, y_test)
except NameError:
    need_rebuild = True

if need_rebuild:
    DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
    df21 = pd.read_csv(DATA/"features_2021.csv"); df21["year"] = 2021
    df22 = pd.read_csv(DATA/"features_2022.csv"); df22["year"] = 2022
    df23 = pd.read_csv(DATA/"features_2023.csv"); df23["year"] = 2023
    train = pd.concat([df21, df22], ignore_index=True)
    test  = df23.copy()
    y_train = train["injury"].fillna(0).astype(int)
    y_test  = test["injury"].fillna(0).astype(int)
    drop_cols = ["injury","pitcher","year"]
    X_train = train.drop(columns=[c for c in drop_cols if c in train.columns], errors="ignore").select_dtypes(np.number)
    X_test  = test.drop(columns=[c for c in drop_cols if c in test.columns], errors="ignore").select_dtypes(np.number)
    X_train = X_train.fillna(X_train.median(numeric_only=True))
    X_test  = X_test.fillna(X_train.median(numeric_only=True))

neg, pos = (y_train==0).sum(), (y_train==1).sum()
scale_pos_weight = (neg / pos) if pos > 0 else 1.0
print(f"Train positives: {pos} / {neg+pos} | scale_pos_weight={scale_pos_weight:.1f}")

xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="auc",
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

xgb.fit(X_train, y_train)

y_proba = xgb.predict_proba(X_test)[:, 1]
y_pred  = (y_proba >= 0.5).astype(int)
auc_xgb = roc_auc_score(y_test, y_proba)

from sklearn.metrics import classification_report
print("\n=== XGBoost (train 2021–22, test 2023) ===")
print(classification_report(y_test, y_pred, digits=3))
print(f"ROC AUC: {auc_xgb:.3f}")

xgb_auc = float(auc_xgb)
xgb_report = classification_report(y_test, y_pred, digits=3, output_dict=True)

try:
    importances = pd.Series(xgb.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    topk = importances.head(10)
    print("\nTop 10 features by gain (approx.):")
    print(topk.to_string())
except Exception as e:
    print("\n[INFO] Could not compute feature importances:", e)


Train positives: 21 / 2300 | scale_pos_weight=108.5

=== XGBoost (train 2021–22, test 2023) ===
              precision    recall  f1-score   support

           0      0.986     1.000     0.993      1226
           1      0.000     0.000     0.000        17

    accuracy                          0.986      1243
   macro avg      0.493     0.500     0.497      1243
weighted avg      0.973     0.986     0.980      1243

ROC AUC: 0.665

Top 10 features by gain (approx.):
mix_other        0.098793
total_pitches    0.088697
avg_pitches      0.087084
avg_spin         0.086576
mean_rest        0.086567
p95_velocity     0.085501
mix_fastball     0.080656
mix_breaking     0.078048
median_rest      0.073236
games_pitched    0.068969


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [3]:
import numpy as np
from sklearn.metrics import (
    classification_report, roc_auc_score, average_precision_score,
    confusion_matrix, precision_recall_curve, roc_curve
)

proba = y_proba 
y_true = y_test.values if hasattr(y_test, "values") else y_test

auc = roc_auc_score(y_true, proba)
ap  = average_precision_score(y_true, proba)  # PR-AUC
print(f"ROC AUC: {auc:.3f} | PR AUC: {ap:.3f}")

pred_050 = (proba >= 0.50).astype(int)
print("\n--- Threshold = 0.50 ---")
print(classification_report(y_true, pred_050, digits=3, zero_division=0))
print("Confusion matrix [tn fp; fn tp]:\n", confusion_matrix(y_true, pred_050))

fpr, tpr, thr = roc_curve(y_true, proba)
j_idx = np.argmax(tpr - fpr)
thr_j = thr[j_idx]
pred_j = (proba >= thr_j).astype(int)
print(f"\n--- Youden's J threshold = {thr_j:.3f} ---")
print(classification_report(y_true, pred_j, digits=3, zero_division=0))
print("Confusion matrix [tn fp; fn tp]:\n", confusion_matrix(y_true, pred_j))

ths = np.unique(np.concatenate([thr, np.linspace(0.05,0.95,91)]))
best_f1, best_thr = -1, 0.5
for t in ths:
    p = (proba >= t).astype(int)
    tp = np.sum((p==1) & (y_true==1))
    fp = np.sum((p==1) & (y_true==0))
    fn = np.sum((p==0) & (y_true==1))
    prec = tp/(tp+fp) if (tp+fp)>0 else 0.0
    rec  = tp/(tp+fn) if (tp+fn)>0 else 0.0
    f1 = 2*prec*rec/(prec+rec) if (prec+rec)>0 else 0.0
    if f1 > best_f1:
        best_f1, best_thr = f1, t
pred_f1 = (proba >= best_thr).astype(int)
print(f"\n--- Best-F1 threshold = {best_thr:.3f} ---")
print(classification_report(y_true, pred_f1, digits=3, zero_division=0))
print("Confusion matrix [tn fp; fn tp]:\n", confusion_matrix(y_true, pred_f1))

k = int(np.sum(y_true==1))
order = np.argsort(-proba)  
pred_topk = np.zeros_like(y_true)
pred_topk[order[:k]] = 1
print(f"\n--- Top-k (k={k}, matches prevalence) ---")
print(classification_report(y_true, pred_topk, digits=3, zero_division=0))
print("Confusion matrix [tn fp; fn tp]:\n", confusion_matrix(y_true, pred_topk))

chosen_threshold = float(best_thr)
print(f"\n[Chosen threshold for later use] {chosen_threshold:.3f}")


ROC AUC: 0.665 | PR AUC: 0.031

--- Threshold = 0.50 ---
              precision    recall  f1-score   support

           0      0.986     1.000     0.993      1226
           1      0.000     0.000     0.000        17

    accuracy                          0.986      1243
   macro avg      0.493     0.500     0.497      1243
weighted avg      0.973     0.986     0.980      1243

Confusion matrix [tn fp; fn tp]:
 [[1226    0]
 [  17    0]]

--- Youden's J threshold = 0.001 ---
              precision    recall  f1-score   support

           0      0.995     0.471     0.640      1226
           1      0.021     0.824     0.041        17

    accuracy                          0.476      1243
   macro avg      0.508     0.647     0.340      1243
weighted avg      0.982     0.476     0.632      1243

Confusion matrix [tn fp; fn tp]:
 [[578 648]
 [  3  14]]

--- Best-F1 threshold = 0.016 ---
              precision    recall  f1-score   support

           0      0.990     0.923     0.955

In [4]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
p_labeled = DATA / "pitches_with_injuries_2021_2023_labeled.csv.gz"
assert p_labeled.exists(), f"Missing {p_labeled}"

df = pd.read_csv(p_labeled, low_memory=False, compression="gzip")
df["game_date"] = pd.to_datetime(df["game_date"], errors="coerce")
if "game_year" not in df.columns:
    df["game_year"] = df["game_date"].dt.year
if "release_spin_rate" in df.columns and "spin_rate" not in df.columns:
    df["spin_rate"] = df["release_spin_rate"]
if "pitch_type" not in df.columns:
    df["pitch_type"] = np.nan

dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in df.columns]
if dedup_keys:
    df = df.drop_duplicates(subset=dedup_keys, keep="first")

fastballs = {"FF","FA","FT","SI","FC"}
breaking  = {"SL","CU","KC","SV","SC"}
offspeed  = {"CH","FS","KN"}

def build_base(dfy):
    gcols = ["pitcher","game_pk","game_date"]
    per_game = dfy.groupby(gcols).size().reset_index(name="pitches_in_game")
    per_game = per_game.sort_values(["pitcher","game_date"])
    per_game["rest_days"] = per_game.groupby("pitcher")["game_date"].diff().dt.days
    rest = per_game.groupby("pitcher").agg(
        games_pitched=("game_pk","nunique"),
        total_pitches=("pitches_in_game","sum"),
        avg_pitches=("pitches_in_game","mean"),
        median_rest=("rest_days","median"),
        mean_rest=("rest_days","mean"),
        short_rest_games=("rest_days", lambda s: np.sum(s<=3) if s.notna().any() else 0),
    ).reset_index()
    vs = dfy.groupby("pitcher").agg(
        avg_velocity=("release_speed","mean"),
        p95_velocity=("release_speed", lambda x: np.nanpercentile(x,95)),
        avg_spin=("spin_rate","mean"),
    ).reset_index()
    tmp = dfy.copy()
    tmp["pitch_type"] = tmp["pitch_type"].astype(str).str.upper()
    tmp["family"] = np.where(tmp["pitch_type"].isin(fastballs), "fastball",
                      np.where(tmp["pitch_type"].isin(breaking),  "breaking",
                      np.where(tmp["pitch_type"].isin(offspeed),  "offspeed","other")))
    mix = tmp.groupby(["pitcher","family"]).size().reset_index(name="cnt")
    tot = mix.groupby("pitcher")["cnt"].sum().reset_index(name="total")
    mix = mix.merge(tot,on="pitcher",how="left")
    mix["pct"] = mix["cnt"]/mix["total"]
    mix_pivot = mix.pivot(index="pitcher",columns="family",values="pct").fillna(0).reset_index()
    mix_pivot.columns = ["pitcher"] + [f"mix_{c}" for c in mix_pivot.columns.tolist()[1:]]
    lab = dfy.groupby("pitcher")["injured"].max().reset_index().rename(columns={"injured":"injury"})
    return rest.merge(vs,on="pitcher",how="outer").merge(mix_pivot,on="pitcher",how="outer").merge(lab,on="pitcher",how="left")

def add_enhancements(df_all, year, base):
    dfy = df_all[df_all["game_year"]==year].copy()
    prev = df_all[df_all["game_year"]==year-1]
    vel_prev = prev.groupby("pitcher")["release_speed"].mean().rename("avg_velocity_prev").reset_index()
    vel_season = dfy.groupby("pitcher")["release_speed"].mean().rename("season_velocity").reset_index()
    dfy["month"] = dfy["game_date"].dt.month
    early = dfy[dfy["month"].isin([3,4])]
    vel_early = early.groupby("pitcher")["release_speed"].mean().rename("early_velocity").reset_index()
    vel = vel_season.merge(vel_prev, on="pitcher", how="left").merge(vel_early, on="pitcher", how="left")
    vel["velocity_delta"] = np.where(vel["avg_velocity_prev"].notna(),
                                     vel["season_velocity"] - vel["avg_velocity_prev"],
                                     vel["season_velocity"] - vel["early_velocity"])
    daily = (dfy.groupby(["pitcher", dfy["game_date"].dt.date]).size()
             .reset_index(name="pitches"))
    daily = daily.rename(columns={"game_date":"date"})
    daily["date"] = pd.to_datetime(daily["date"])
    daily = daily.sort_values(["pitcher","date"])
    daily["roll14"] = daily.groupby("pitcher")["pitches"].transform(lambda s: s.rolling(14, min_periods=1).sum())
    spike = daily.groupby("pitcher")["roll14"].agg(max_14d="max", median_14d="median").reset_index()
    spike["workload_spike_14d"] = spike["max_14d"] / spike["median_14d"].replace(0, np.nan)
    def breaking_pct(d):
        if len(d)==0: return np.nan
        pt = d["pitch_type"].astype(str).str.upper()
        return (pt.isin(breaking)).mean()
    bb_early = early.groupby("pitcher").apply(breaking_pct).rename("breaking_pct_early").reset_index()
    bb_season = dfy.groupby("pitcher").apply(breaking_pct).rename("breaking_pct_season").reset_index()
    bb = bb_season.merge(bb_early, on="pitcher", how="left")
    bb["breaking_usage_delta"] = bb["breaking_pct_season"] - bb["breaking_pct_early"]
    enh = (base
           .merge(vel[["pitcher","velocity_delta"]], on="pitcher", how="left")
           .merge(spike[["pitcher","workload_spike_14d"]], on="pitcher", how="left")
           .merge(bb[["pitcher","breaking_usage_delta"]], on="pitcher", how="left"))
    return enh

enhanced_paths = {}
for yr in [2021, 2022, 2023]:
    base = build_base(df[df["game_year"]==yr])
    if base is None or base.empty:
        continue
    enh = add_enhancements(df, yr, base)
    enh["injury"] = enh["injury"].fillna(0).astype(int)
    num_cols = enh.select_dtypes(include=[np.number]).columns
    enh[num_cols] = enh[num_cols].replace([np.inf,-np.inf], np.nan)
    enh[num_cols] = enh[num_cols].fillna(enh[num_cols].median(numeric_only=True))
    out = DATA / f"features_{yr}_enhanced.csv"
    enh.to_csv(out, index=False)
    enhanced_paths[yr] = out
    print(f"Saved {out} | rows={len(enh)} | positives={enh['injury'].sum()}")

df21e = pd.read_csv(enhanced_paths[2021]); df21e["year"] = 2021
df22e = pd.read_csv(enhanced_paths[2022]); df22e["year"] = 2022
df23e = pd.read_csv(enhanced_paths[2023]); df23e["year"] = 2023

train_e = pd.concat([df21e, df22e], ignore_index=True)
test_e  = df23e.copy()

y_train_e = train_e["injury"].astype(int)
y_test_e  = test_e["injury"].astype(int)

drop_cols = ["injury","pitcher","year"]
X_train_e = train_e.drop(columns=[c for c in drop_cols if c in train_e.columns], errors="ignore").select_dtypes(np.number)
X_test_e  = test_e.drop(columns=[c for c in drop_cols if c in test_e.columns], errors="ignore").select_dtypes(np.number)

X_train_e = X_train_e.fillna(X_train_e.median(numeric_only=True))
X_test_e  = X_test_e.fillna(X_train_e.median(numeric_only=True))

lr_pipe_e = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs"))
])
lr_pipe_e.fit(X_train_e, y_train_e)
y_prob_lr_e = lr_pipe_e.predict_proba(X_test_e)[:,1]
y_pred_lr_e = (y_prob_lr_e >= 0.5).astype(int)
auc_lr_e = roc_auc_score(y_test_e, y_prob_lr_e)
print("\n=== Logistic Regression (ENHANCED) ===")
print(classification_report(y_test_e, y_pred_lr_e, digits=3, zero_division=0))
print(f"ROC AUC: {auc_lr_e:.3f}")

neg, pos = (y_train_e==0).sum(), (y_train_e==1).sum()
scale_pos_weight = (neg/pos) if pos>0 else 1.0
xgb_e = XGBClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    objective="binary:logistic", eval_metric="auc",
    n_jobs=-1, scale_pos_weight=scale_pos_weight, random_state=42
)
xgb_e.fit(X_train_e, y_train_e)
y_prob_xgb_e = xgb_e.predict_proba(X_test_e)[:,1]
y_pred_xgb_e = (y_prob_xgb_e >= 0.5).astype(int)
auc_xgb_e = roc_auc_score(y_test_e, y_prob_xgb_e)
print("\n=== XGBoost (ENHANCED) ===")
print(classification_report(y_test_e, y_pred_xgb_e, digits=3, zero_division=0))
print(f"ROC AUC: {auc_xgb_e:.3f}")

enh_lr_auc  = float(auc_lr_e)
enh_xgb_auc = float(auc_xgb_e)


  bb_early = early.groupby("pitcher").apply(breaking_pct).rename("breaking_pct_early").reset_index()
  bb_season = dfy.groupby("pitcher").apply(breaking_pct).rename("breaking_pct_season").reset_index()


Saved /Users/kavehnaini/pitcher-injury-predictor/data/features_2021_enhanced.csv | rows=909 | positives=14


  return _nanquantile_unchecked(
  bb_early = early.groupby("pitcher").apply(breaking_pct).rename("breaking_pct_early").reset_index()


TypeError: Index(...) must be called with a collection of some kind, 'breaking_pct_early' was passed

In [5]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
p_labeled = DATA / "pitches_with_injuries_2021_2023_labeled.csv.gz"
assert p_labeled.exists(), f"Missing {p_labeled}"

df = pd.read_csv(p_labeled, low_memory=False, compression="gzip")
df["game_date"] = pd.to_datetime(df["game_date"], errors="coerce")
if "game_year" not in df.columns:
    df["game_year"] = df["game_date"].dt.year
if "release_spin_rate" in df.columns and "spin_rate" not in df.columns:
    df["spin_rate"] = df["release_spin_rate"]
if "pitch_type" not in df.columns:
    df["pitch_type"] = np.nan

dedup_keys = [k for k in ["game_pk","at_bat_number","pitch_number"] if k in df.columns]
if dedup_keys:
    before = len(df)
    df = df.drop_duplicates(subset=dedup_keys, keep="first")
    print(f"Dedup pitches: {before} -> {len(df)}")

fastballs = {"FF","FA","FT","SI","FC"}
breaking  = {"SL","CU","KC","SV","SC"}
offspeed  = {"CH","FS","KN"}

def build_base(dfy):
    gcols = ["pitcher","game_pk","game_date"]
    per_game = dfy.groupby(gcols).size().reset_index(name="pitches_in_game")
    per_game = per_game.sort_values(["pitcher","game_date"])
    per_game["rest_days"] = per_game.groupby("pitcher")["game_date"].diff().dt.days
    rest = per_game.groupby("pitcher").agg(
        games_pitched=("game_pk","nunique"),
        total_pitches=("pitches_in_game","sum"),
        avg_pitches=("pitches_in_game","mean"),
        median_rest=("rest_days","median"),
        mean_rest=("rest_days","mean"),
        short_rest_games=("rest_days", lambda s: np.sum(s<=3) if s.notna().any() else 0),
    ).reset_index()

    vs = dfy.groupby("pitcher").agg(
        avg_velocity=("release_speed","mean"),
        p95_velocity=("release_speed", lambda x: np.nanpercentile(x, 95)),
        avg_spin=("spin_rate","mean"),
    ).reset_index()

    tmp = dfy.copy()
    tmp["pitch_type"] = tmp["pitch_type"].astype(str).str.upper()
    tmp["family"] = np.where(tmp["pitch_type"].isin(fastballs), "fastball",
                      np.where(tmp["pitch_type"].isin(breaking),  "breaking",
                      np.where(tmp["pitch_type"].isin(offspeed),  "offspeed","other")))
    mix = tmp.groupby(["pitcher","family"]).size().reset_index(name="cnt")
    tot = mix.groupby("pitcher")["cnt"].sum().reset_index(name="total")
    mix = mix.merge(tot, on="pitcher", how="left")
    mix["pct"] = mix["cnt"] / mix["total"].replace(0, np.nan)
    mix_pivot = mix.pivot(index="pitcher", columns="family", values="pct").fillna(0).reset_index()
    mix_pivot.columns = ["pitcher"] + [f"mix_{c}" for c in mix_pivot.columns.tolist()[1:]]

    lab = dfy.groupby("pitcher")["injured"].max().reset_index().rename(columns={"injured":"injury"})

    base = rest.merge(vs, on="pitcher", how="outer").merge(mix_pivot, on="pitcher", how="outer").merge(lab, on="pitcher", how="left")
    return base

def add_enhancements(df_all, year, base):
    dfy = df_all[df_all["game_year"]==year].copy()
    if dfy.empty:
        return base.assign(velocity_delta=np.nan, workload_spike_14d=np.nan, breaking_usage_delta=np.nan)

    first_date = dfy.groupby("pitcher")["game_date"].transform("min")
    early_cutoff = first_date + pd.Timedelta(days=30)
    early_mask = dfy["game_date"] <= early_cutoff
    early = dfy[early_mask].copy()

    prev = df_all[df_all["game_year"]==year-1]
    vel_prev = prev.groupby("pitcher")["release_speed"].mean().rename("avg_velocity_prev").reset_index()
    vel_season = dfy.groupby("pitcher")["release_speed"].mean().rename("season_velocity").reset_index()
    vel_early = early.groupby("pitcher")["release_speed"].mean().rename("early_velocity").reset_index()

    vel = vel_season.merge(vel_prev, on="pitcher", how="left").merge(vel_early, on="pitcher", how="left")
    vel["velocity_delta"] = np.where(vel["avg_velocity_prev"].notna(),
                                     vel["season_velocity"] - vel["avg_velocity_prev"],
                                     vel["season_velocity"] - vel["early_velocity"])

    daily = (dfy.groupby(["pitcher", dfy["game_date"].dt.date]).size()
             .reset_index(name="pitches"))
    daily = daily.rename(columns={"game_date":"date"})
    daily["date"] = pd.to_datetime(daily["date"])
    daily = daily.sort_values(["pitcher","date"])
    def _complete_dates(g):
        idx = pd.date_range(g["date"].min(), g["date"].max(), freq="D")
        gg = g.set_index("date").reindex(idx).fillna(0.0)
        gg.index.name = "date"
        gg = gg.rename_axis(["date"]).reset_index()
        gg["pitcher"] = g["pitcher"].iloc[0]
        return gg
    daily_full = daily.groupby("pitcher", group_keys=False).apply(_complete_dates)
    daily_full["roll14"] = daily_full.groupby("pitcher")["pitches"].transform(lambda s: s.rolling(14, min_periods=1).sum())
    spike = daily_full.groupby("pitcher")["roll14"].agg(max_14d="max", median_14d="median").reset_index()
    spike["workload_spike_14d"] = spike["max_14d"] / spike["median_14d"].replace(0, np.nan)

    def _pct_breaking(dd):
        pt = dd["pitch_type"].astype(str).str.upper()
        return (pt.isin(breaking)).mean()

    bb_season = dfy.groupby("pitcher").apply(_pct_breaking).rename("breaking_pct_season").reset_index()
    bb_early  = early.groupby("pitcher").apply(_pct_breaking).rename("breaking_pct_early").reset_index()
    bb = bb_season.merge(bb_early, on="pitcher", how="left")
    bb["breaking_usage_delta"] = bb["breaking_pct_season"] - bb["breaking_pct_early"]

    enh = (base
           .merge(vel[["pitcher","velocity_delta"]], on="pitcher", how="left")
           .merge(spike[["pitcher","workload_spike_14d"]], on="pitcher", how="left")
           .merge(bb[["pitcher","breaking_usage_delta"]], on="pitcher", how="left"))
    return enh

enhanced_paths = {}
for yr in [2021, 2022, 2023]:
    base = build_base(df[df["game_year"]==yr])
    if base is None or base.empty:
        print(f"[WARN] No data for {yr}")
        continue
    enh = add_enhancements(df, yr, base)
    enh["injury"] = enh["injury"].fillna(0).astype(int)
    num_cols = enh.select_dtypes(include=[np.number]).columns
    enh[num_cols] = enh[num_cols].replace([np.inf, -np.inf], np.nan)
    enh[num_cols] = enh[num_cols].fillna(enh[num_cols].median(numeric_only=True))
    out = DATA / f"features_{yr}_enhanced.csv"
    enh.to_csv(out, index=False)
    enhanced_paths[yr] = out
    print(f"Saved {out} | rows={len(enh)} | positives={enh['injury'].sum()}")

df21e = pd.read_csv(enhanced_paths[2021]); df21e["year"] = 2021
df22e = pd.read_csv(enhanced_paths[2022]); df22e["year"] = 2022
df23e = pd.read_csv(enhanced_paths[2023]); df23e["year"] = 2023

train_e = pd.concat([df21e, df22e], ignore_index=True)
test_e  = df23e.copy()

y_train_e = train_e["injury"].astype(int)
y_test_e  = test_e["injury"].astype(int)

drop_cols = ["injury","pitcher","year"]
X_train_e = train_e.drop(columns=[c for c in drop_cols if c in train_e.columns], errors="ignore").select_dtypes(np.number)
X_test_e  = test_e.drop(columns=[c for c in drop_cols if c in test_e.columns], errors="ignore").select_dtypes(np.number)

X_train_e = X_train_e.fillna(X_train_e.median(numeric_only=True))
X_test_e  = X_test_e.fillna(X_train_e.median(numeric_only=True))

lr_pipe_e = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs")),
])
lr_pipe_e.fit(X_train_e, y_train_e)
y_prob_lr_e = lr_pipe_e.predict_proba(X_test_e)[:, 1]
y_pred_lr_e = (y_prob_lr_e >= 0.5).astype(int)
auc_lr_e = roc_auc_score(y_test_e, y_prob_lr_e)
print("\n=== Logistic Regression (ENHANCED) ===")
print(classification_report(y_test_e, y_pred_lr_e, digits=3, zero_division=0))
print(f"ROC AUC: {auc_lr_e:.3f}")

neg, pos = (y_train_e==0).sum(), (y_train_e==1).sum()
scale_pos_weight = (neg/pos) if pos>0 else 1.0
xgb_e = XGBClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    objective="binary:logistic", eval_metric="auc",
    n_jobs=-1, scale_pos_weight=scale_pos_weight, random_state=42
)
xgb_e.fit(X_train_e, y_train_e)
y_prob_xgb_e = xgb_e.predict_proba(X_test_e)[:, 1]
y_pred_xgb_e = (y_prob_xgb_e >= 0.5).astype(int)
auc_xgb_e = roc_auc_score(y_test_e, y_prob_xgb_e)
print("\n=== XGBoost (ENHANCED) ===")
print(classification_report(y_test_e, y_pred_xgb_e, digits=3, zero_division=0))
print(f"ROC AUC: {auc_xgb_e:.3f}")

enh_lr_auc  = float(auc_lr_e)
enh_xgb_auc = float(auc_xgb_e)


Dedup pitches: 2254254 -> 2254254


  daily_full = daily.groupby("pitcher", group_keys=False).apply(_complete_dates)
  bb_season = dfy.groupby("pitcher").apply(_pct_breaking).rename("breaking_pct_season").reset_index()
  bb_early  = early.groupby("pitcher").apply(_pct_breaking).rename("breaking_pct_early").reset_index()


Saved /Users/kavehnaini/pitcher-injury-predictor/data/features_2021_enhanced.csv | rows=909 | positives=14


  return _nanquantile_unchecked(
  daily_full = daily.groupby("pitcher", group_keys=False).apply(_complete_dates)
  bb_season = dfy.groupby("pitcher").apply(_pct_breaking).rename("breaking_pct_season").reset_index()
  bb_early  = early.groupby("pitcher").apply(_pct_breaking).rename("breaking_pct_early").reset_index()


Saved /Users/kavehnaini/pitcher-injury-predictor/data/features_2022_enhanced.csv | rows=1391 | positives=7


  return _nanquantile_unchecked(
  daily_full = daily.groupby("pitcher", group_keys=False).apply(_complete_dates)
  bb_season = dfy.groupby("pitcher").apply(_pct_breaking).rename("breaking_pct_season").reset_index()
  bb_early  = early.groupby("pitcher").apply(_pct_breaking).rename("breaking_pct_early").reset_index()


Saved /Users/kavehnaini/pitcher-injury-predictor/data/features_2023_enhanced.csv | rows=1243 | positives=17

=== Logistic Regression (ENHANCED) ===
              precision    recall  f1-score   support

           0      0.992     0.686     0.811      1226
           1      0.025     0.588     0.049        17

    accuracy                          0.685      1243
   macro avg      0.509     0.637     0.430      1243
weighted avg      0.979     0.685     0.801      1243

ROC AUC: 0.641

=== XGBoost (ENHANCED) ===
              precision    recall  f1-score   support

           0      0.986     1.000     0.993      1226
           1      0.000     0.000     0.000        17

    accuracy                          0.986      1243
   macro avg      0.493     0.500     0.497      1243
weighted avg      0.973     0.986     0.980      1243

ROC AUC: 0.668


In [6]:
from pathlib import Path
import json

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
DATA.mkdir(parents=True, exist_ok=True)

def _get(v): 
    return float(globals()[v]) if v in globals() else None

metrics = {
    "baseline_lr_auc": _get("baseline_lr_auc"),
    "xgb_auc": _get("xgb_auc"),
    "enh_lr_auc": _get("enh_lr_auc"),
    "enh_xgb_auc": _get("enh_xgb_auc"),
}

def _fmt(x):
    return f"{x:.3f}" if isinstance(x, (int, float)) and x == x else "N/A"

best_auc = None
for k in ["enh_xgb_auc","xgb_auc","enh_lr_auc","baseline_lr_auc"]:
    if metrics.get(k) is not None:
        best_auc = metrics[k]; break

results_para = f"""**Results.** Using Statcast pitch-by-pitch data (2021–2023) with RosterResource injury labels, \
I built pitcher-season features (velocity, spin, workload, pitch mix) and trained models with a proper \
temporal split (train: 2021–2022; test: 2023). Baseline logistic regression achieved ROC AUC {_fmt(metrics['baseline_lr_auc'])}. \
Gradient boosting (XGBoost) improved discrimination to ROC AUC {_fmt(metrics['xgb_auc'])}, and with engineered features—\
velocity deltas, 14-day workload spikes, and breaking-usage change—the best model reached ROC AUC {_fmt(metrics['enh_xgb_auc'])} \
(on 2023 holdout). Due to extreme class imbalance, threshold tuning (e.g., Youden’s J / best-F1) is necessary to trade recall vs precision."""

limitations_para = """**Limitations.** Injury is a rare and partially noisy label; RosterResource coverage varies by year. \
Labels are season-level (did a pitcher go on IL this season), which ignores injury timing within the year. \
Feature scope is mostly kinematics/workload; external risk factors (medical history, biomechanics labs, conditioning) are absent. \
A richer label (injury date) and finer-grained rolling features should improve recall without spiking false positives."""

methods_outline = """**Methods (brief).**
- Data: MLB Statcast (pybaseball extract) 2021–2023; RosterResource injury lists (mapped by MLBAM ID).
- Labeling: pitcher-season injury = 1 if pitcher appears on that season’s injury list; pitch-level labels aggregated to season via max().
- Features: per-game workload & rest days; average & 95th-percentile velocity; average spin; pitch-mix proportions.
- Engineered: velocity delta (season vs prior-year or early-season), 14-day workload spike (max/median), breaking-usage delta (season–early).
- Split: train on 2021–2022, test on 2023 (no leakage).
- Models: Logistic Regression (balanced), XGBoost (scale_pos_weight); median imputation; standardization for LR; threshold tuning by ROC/PR analysis."""

common_app_150 = f"""Built MLB pitcher-injury model (Statcast+FanGraphs); engineered workload/velo-delta features; best ROC-AUC {_fmt(best_auc)} on 2023 holdout."""

(DATA/"RESULTS_AND_LIMITATIONS.md").write_text(results_para + "\n\n" + limitations_para)
(DATA/"METHODS.md").write_text(methods_outline)
(DATA/"COMMON_APP_LINE.txt").write_text(common_app_150)

print("Saved:")
print(" -", (DATA/"RESULTS_AND_LIMITATIONS.md").resolve())
print(" -", (DATA/"METHODS.md").resolve())
print(" -", (DATA/"COMMON_APP_LINE.txt").resolve())
print("\n150-char line:\n", common_app_150)


Saved:
 - /Users/kavehnaini/pitcher-injury-predictor/data/RESULTS_AND_LIMITATIONS.md
 - /Users/kavehnaini/pitcher-injury-predictor/data/METHODS.md
 - /Users/kavehnaini/pitcher-injury-predictor/data/COMMON_APP_LINE.txt

150-char line:
 Built MLB pitcher-injury model (Statcast+FanGraphs); engineered workload/velo-delta features; best ROC-AUC 0.668 on 2023 holdout.


In [7]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report, roc_auc_score

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
DATA.mkdir(parents=True, exist_ok=True)
MODELS = (Path.cwd().parent if Path.cwd().name=="data" else Path.cwd()) / "models"
MODELS.mkdir(parents=True, exist_ok=True)

try:
    import joblib
except Exception:
    from sklearn.externals import joblib  # fallback

use_enh = (DATA/"features_2021_enhanced.csv").exists() and (DATA/"features_2023_enhanced.csv").exists()

def load_features(enh: bool):
    if enh:
        df21 = pd.read_csv(DATA/"features_2021_enhanced.csv"); df21["year"]=2021
        df22 = pd.read_csv(DATA/"features_2022_enhanced.csv"); df22["year"]=2022
        df23 = pd.read_csv(DATA/"features_2023_enhanced.csv"); df23["year"]=2023
    else:
        df21 = pd.read_csv(DATA/"features_2021.csv"); df21["year"]=2021
        df22 = pd.read_csv(DATA/"features_2022.csv"); df22["year"]=2022
        df23 = pd.read_csv(DATA/"features_2023.csv"); df23["year"]=2023
    train = pd.concat([df21, df22], ignore_index=True)
    test = df23.copy()
    y_train = train["injury"].astype(int)
    y_test  = test["injury"].astype(int)
    drop_cols = ["injury","year"]
    X_train = train.drop(columns=[c for c in drop_cols if c in train.columns], errors="ignore")
    X_test  = test.drop(columns=[c for c in drop_cols if c in test.columns], errors="ignore")
    # Keep pitcher id for later merge
    pitcher_test = test["pitcher"] if "pitcher" in test.columns else pd.Series(np.arange(len(test)))
    # numeric only + impute
    X_train = X_train.select_dtypes(np.number).fillna(X_train.median(numeric_only=True))
    X_test  = X_test.select_dtypes(np.number).fillna(X_train.median(numeric_only=True))
    return X_train, y_train, X_test, y_test, pitcher_test

X_train, y_train, X_test, y_test, pitcher_ids = load_features(use_enh)

model_used = None
y_proba = None

try:
    if use_enh and 'xgb_e' in globals():
        y_proba = xgb_e.predict_proba(X_test)[:,1]
        model_used = "xgboost_enhanced"
    elif (not use_enh) and 'xgb' in globals():
        y_proba = xgb.predict_proba(X_test)[:,1]
        model_used = "xgboost_basic"
except Exception:
    pass

if y_proba is None:
    from xgboost import XGBClassifier
    neg, pos = (y_train==0).sum(), (y_train==1).sum()
    spw = (neg/pos) if pos>0 else 1.0
    xgb_tmp = XGBClassifier(
        n_estimators=500, learning_rate=0.05, max_depth=4,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        objective="binary:logistic", eval_metric="auc",
        n_jobs=-1, scale_pos_weight=spw, random_state=42
    )
    xgb_tmp.fit(X_train, y_train)
    y_proba = xgb_tmp.predict_proba(X_test)[:,1]
    model_used = "xgboost_enhanced_fitnow" if use_enh else "xgboost_basic_fitnow"
    try:
        joblib.dump(xgb_tmp, MODELS/f"{model_used}.joblib")
    except Exception as e:
        print("[WARN] Could not save model:", e)

auc = roc_auc_score(y_test, y_proba)
print(f"Model: {model_used} | Test ROC AUC (2023): {auc:.3f}")

fpr, tpr, thr = roc_curve(y_test, y_proba)
grid = np.unique(np.concatenate([thr, np.linspace(0.05,0.95,181)]))
best_f1, best_thr = -1, 0.5
for t in grid:
    pred = (y_proba >= t).astype(int)
    tp = ((pred==1)&(y_test==1)).sum()
    fp = ((pred==1)&(y_test==0)).sum()
    fn = ((pred==0)&(y_test==1)).sum()
    prec = tp/(tp+fp) if (tp+fp)>0 else 0.0
    rec  = tp/(tp+fn) if (tp+fn)>0 else 0.0
    f1 = 2*prec*rec/(prec+rec) if (prec+rec)>0 else 0.0
    if f1 > best_f1:
        best_f1, best_thr = f1, t

pred = (y_proba >= best_thr).astype(int)
print(f"Chosen threshold (best-F1): {best_thr:.3f}")

preds = pd.DataFrame({
    "pitcher": pitcher_ids.values,
    "injury_actual": y_test.values,
    "injury_prob": y_proba,
    "injury_pred": pred,
    "model": model_used,
    "threshold": best_thr,
})
preds = preds.sort_values("injury_prob", ascending=False).reset_index(drop=True)

out_full = DATA / f"predictions_2023_{model_used}.csv"
out_topk = DATA / f"predictions_2023_{model_used}_top25.csv"
preds.to_csv(out_full, index=False)
preds.head(25).to_csv(out_topk, index=False)
print("Saved:")
print(" -", out_full.resolve())
print(" -", out_topk.resolve())
print("\nTop 10 preview:")
print(preds.head(10))


Model: xgboost_enhanced_fitnow | Test ROC AUC (2023): 0.709
Chosen threshold (best-F1): 0.007
Saved:
 - /Users/kavehnaini/pitcher-injury-predictor/data/predictions_2023_xgboost_enhanced_fitnow.csv
 - /Users/kavehnaini/pitcher-injury-predictor/data/predictions_2023_xgboost_enhanced_fitnow_top25.csv

Top 10 preview:
   pitcher  injury_actual  injury_prob  injury_pred                    model  \
0   502171              0     0.222943            1  xgboost_enhanced_fitnow   
1   681911              0     0.149467            1  xgboost_enhanced_fitnow   
2   686294              0     0.118790            1  xgboost_enhanced_fitnow   
3   687396              0     0.117842            1  xgboost_enhanced_fitnow   
4   641540              0     0.108292            1  xgboost_enhanced_fitnow   
5   622072              0     0.095002            1  xgboost_enhanced_fitnow   
6   682989              0     0.085114            1  xgboost_enhanced_fitnow   
7   471911              0     0.076660      

In [8]:
from pathlib import Path
import textwrap

ROOT = Path.cwd().parent if Path.cwd().name=="data" else Path.cwd()
DATA = ROOT/"data"; DATA.mkdir(exist_ok=True)
SCRIPTS = ROOT/"scripts"; SCRIPTS.mkdir(exist_ok=True)
MODELS = ROOT/"models"; MODELS.mkdir(exist_ok=True)

readme = f"""# Pitcher Injury Predictor

End-to-end pipeline to predict MLB **pitcher-season** injury risk using Statcast (2021–2023) and RosterResource injuries.

## Highlights
- Features: velocity (avg/p95), spin, workload & rest, pitch mix.
- Engineered: **velocity delta**, **14-day workload spike**, **breaking-usage delta**.
- Temporal eval: **train 2021–2022 → test 2023**.
- Best model: gradient boosting (XGBoost). See `data/RESULTS_AND_LIMITATIONS.md`.

## Quickstart
```bash
conda activate pitcherinjury
jupyter lab


SyntaxError: incomplete input (1419805489.py, line 10)

In [9]:
from pathlib import Path
import textwrap

ROOT = Path.cwd().parent if Path.cwd().name=="data" else Path.cwd()
DATA = ROOT/"data"; DATA.mkdir(exist_ok=True)
SCRIPTS = ROOT/"scripts"; SCRIPTS.mkdir(exist_ok=True)
MODELS = ROOT/"models"; MODELS.mkdir(exist_ok=True)

readme = f"""# Pitcher Injury Predictor

End-to-end pipeline to predict MLB **pitcher-season** injury risk using Statcast (2021–2023) and RosterResource injuries.

## Highlights
- Features: velocity (avg/p95), spin, workload & rest, pitch mix.
- Engineered: **velocity delta**, **14-day workload spike**, **breaking-usage delta**.
- Temporal eval: **train 2021–2022 → test 2023**.
- Best model: gradient boosting (XGBoost). See `data/RESULTS_AND_LIMITATIONS.md`.

## Quickstart
```bash
conda activate pitcherinjury
jupyter lab


SyntaxError: incomplete input (1419805489.py, line 10)

In [10]:
from pathlib import Path
import textwrap

CWD = Path.cwd()
ROOT = CWD.parent if CWD.name == "data" else CWD
DATA = ROOT / "data"
SCRIPTS = ROOT / "scripts"
MODELS = ROOT / "models"
for p in (DATA, SCRIPTS, MODELS):
    p.mkdir(parents=True, exist_ok=True)

readme_text = textwrap.dedent("""
# Pitcher Injury Predictor

End-to-end pipeline to predict MLB **pitcher-season** injury risk using Statcast (2021–2023) and RosterResource injuries.

## Highlights
- Features: velocity (avg/p95), spin, workload & rest, pitch mix.
- Engineered: velocity delta, 14-day workload spike, breaking-usage delta.
- Temporal eval: train 2021–2022 → test 2023.
- Best model: gradient boosting (XGBoost). See `data/RESULTS_AND_LIMITATIONS.md`.

## Quickstart

```bash
conda activate pitcherinjury
jupyter lab


SyntaxError: incomplete input (4102527014.py, line 14)

In [15]:
from pathlib import Path
import textwrap

CWD = Path.cwd()
ROOT = CWD.parent if CWD.name == "data" else CWD
DATA = ROOT / "data"
SCRIPTS = ROOT / "scripts"
MODELS = ROOT / "models"
for p in (DATA, SCRIPTS, MODELS):
    p.mkdir(parents=True, exist_ok=True)

readme_text = textwrap.dedent("""
# Pitcher Injury Predictor

End-to-end pipeline to predict MLB **pitcher-season** injury risk using Statcast (2021–2023) and RosterResource injuries.

## Highlights
- Features: velocity (avg/p95), spin, workload & rest, pitch mix.
- Engineered: velocity delta, 14-day workload spike, breaking-usage delta.
- Temporal eval: train 2021–2022 → test 2023.
- Best model: gradient boosting (XGBoost). See `data/RESULTS_AND_LIMITATIONS.md`.

## Quickstart

```bash
conda activate pitcherinjury
jupyter lab """)


In [14]:
print("hi")

hi


In [16]:
python scripts/predict_2023.py


SyntaxError: invalid syntax (175848011.py, line 1)

In [17]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.metrics import average_precision_score

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"

cands = sorted(DATA.glob("predictions_2023_*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
assert cands, "No predictions_2023_*.csv found. Run G2 first."
pred_path = cands[0]
print("Using predictions file:", pred_path)

preds = pd.read_csv(pred_path)
preds = preds.sort_values("injury_prob", ascending=False).reset_index(drop=True)
preds["rank"] = np.arange(1, len(preds)+1)

y_true = preds["injury_actual"].astype(int).values
y_score = preds["injury_prob"].values
n_pos = int(y_true.sum())
n_all = len(y_true)
prevalence = n_pos / n_all if n_all else 0

print(f"Test size: {n_all} | Positives: {n_pos} ({prevalence:.3%})")

def precision_at_k(k):
    k = min(k, n_all)
    return float((preds.head(k)["injury_actual"] == 1).mean() if k>0 else 0.0)

for k in [5, 10, 25, 50, 100]:
    print(f"Precision@{k}: {precision_at_k(k):.3f}")

ap = average_precision_score(y_true, y_score) if n_pos > 0 else float("nan")
print(f"Average Precision (PR-AUC): {ap:.3f}")

pos_rows = preds[preds["injury_actual"] == 1][["pitcher","injury_prob","rank"]]
print("\nActual injured pitchers (2023) and their ranks (sorted by rank):")
print(pos_rows.sort_values("rank").to_string(index=False))


Using predictions file: /Users/kavehnaini/pitcher-injury-predictor/data/predictions_2023_xgboost_enhanced_fitnow_top25.csv
Test size: 25 | Positives: 1 (4.000%)
Precision@5: 0.000
Precision@10: 0.000
Precision@25: 0.040
Precision@50: 0.040
Precision@100: 0.040
Average Precision (PR-AUC): 0.056

Actual injured pitchers (2023) and their ranks (sorted by rank):
 pitcher  injury_prob  rank
  669721     0.047814    18


In [18]:
import pandas as pd, numpy as np
from pathlib import Path

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"

try:
    preds
except NameError:
    cands = sorted(DATA.glob("predictions_2023_*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
    assert cands, "No predictions_2023_*.csv found."
    preds = pd.read_csv(cands[0]).sort_values("injury_prob", ascending=False).reset_index(drop=True)

K = 25   # change this if you want a different list size
topk = preds.head(K).copy()

def add_names(df):
    try:
        from pybaseball import playerid_reverse_lookup
        ids = df["pitcher"].dropna().astype(int).unique().tolist()
        if not ids:
            return df
        names = playerid_reverse_lookup(ids, key_type="mlbam")
        if "name_first" in names.columns and "name_last" in names.columns:
            names["full_name"] = names["name_first"].str.title() + " " + names["name_last"].str.title()
        elif "name_use" in names.columns:
            names["full_name"] = names["name_use"]
        else:
            name_cols = [c for c in names.columns if "name" in c.lower()]
            if name_cols:
                names["full_name"] = names[name_cols[0]]
            else:
                names["full_name"] = ""
        key_col = "key_mlbam" if "key_mlbam" in names.columns else ("mlbam" if "mlbam" in names.columns else None)
        if key_col is None:
            return df
        out = df.merge(names[[key_col,"full_name"]], left_on="pitcher", right_on=key_col, how="left")
        out = out.drop(columns=[c for c in ["key_mlbam","mlbam"] if c in out.columns])
        return out
    except Exception as e:
        print("[INFO] Could not enrich names:", e)
        return df

topk_named = add_names(topk)
out_topk_named = DATA / "predictions_2023_top25_with_names.csv"
topk_named.to_csv(out_topk_named, index=False)
print("Saved:", out_topk_named.resolve())
print(topk_named.head(10))


Gathering player lookup table. This may take a moment.
Saved: /Users/kavehnaini/pitcher-injury-predictor/data/predictions_2023_top25_with_names.csv
   pitcher  injury_actual  injury_prob  injury_pred                    model  \
0   502171              0     0.222943            1  xgboost_enhanced_fitnow   
1   681911              0     0.149467            1  xgboost_enhanced_fitnow   
2   686294              0     0.118790            1  xgboost_enhanced_fitnow   
3   687396              0     0.117842            1  xgboost_enhanced_fitnow   
4   641540              0     0.108292            1  xgboost_enhanced_fitnow   
5   622072              0     0.095002            1  xgboost_enhanced_fitnow   
6   682989              0     0.085114            1  xgboost_enhanced_fitnow   
7   471911              0     0.076660            1  xgboost_enhanced_fitnow   
8   571510              0     0.075359            1  xgboost_enhanced_fitnow   
9   641329              0     0.064499            1 

In [19]:
from pathlib import Path
import pandas as pd, numpy as np
from sklearn.metrics import (
    roc_curve, auc, roc_auc_score, precision_recall_curve,
    average_precision_score, confusion_matrix
)
import textwrap

AUTHOR_NAME   = "Your Name"           # <-- EDIT
CONTACT_EMAIL = "you@example.com"     # <-- EDIT
GITHUB_URL    = "https://github.com/<you>/pitcher-injury-predictor"  # <-- EDIT if you want
DEMO_URL      = "streamlit: run locally; deploy optional"            # <-- optional

DATA = Path.cwd() if Path.cwd().name == "data" else Path.cwd()/"data"
DATA.mkdir(exist_ok=True, parents=True)

cands = sorted(DATA.glob("predictions_2023_*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
assert cands, "No predictions_2023_*.csv found. Run G2 first to export predictions."
pred_path = cands[0]
preds = pd.read_csv(pred_path).sort_values("injury_prob", ascending=False).reset_index(drop=True)

y_true = preds["injury_actual"].astype(int).values
y_score = preds["injury_prob"].values
n_pos = int(y_true.sum()); n_all = len(y_true)
prev = n_pos / n_all if n_all else 0

roc_auc = roc_auc_score(y_true, y_score) if n_pos>0 and n_all>0 else float("nan")
pr_auc  = average_precision_score(y_true, y_score) if n_pos>0 else float("nan")

fpr, tpr, thr = roc_curve(y_true, y_score)
grid = np.unique(np.concatenate([thr, np.linspace(0.01, 0.99, 199)]))
def f1_at(t):
    p = (y_score >= t).astype(int)
    tp = ((p==1)&(y_true==1)).sum(); fp = ((p==1)&(y_true==0)).sum(); fn = ((p==0)&(y_true==1)).sum()
    prec = tp/(tp+fp) if (tp+fp)>0 else 0.0
    rec  = tp/(tp+fn) if (tp+fn)>0 else 0.0
    return (2*prec*rec/(prec+rec)) if (prec+rec)>0 else 0.0
best_f1, best_thr = max((f1_at(t), t) for t in grid)
j_idx = np.argmax(tpr - fpr)
j_thr = thr[j_idx] if len(thr)>0 else 0.5

def prec_at_k(df, k):
    k = min(k, len(df))
    return float((df.head(k)["injury_actual"]==1).mean()) if k>0 else 0.0

p5 = prec_at_k(preds, 5)
p10 = prec_at_k(preds, 10)
p25 = prec_at_k(preds, 25)
p50 = prec_at_k(preds, 50)
p100 = prec_at_k(preds, 100)

def cm_at(t):
    p = (y_score >= t).astype(int)
    return confusion_matrix(y_true, p)

cm_best = cm_at(best_thr)
cm_j    = cm_at(j_thr)

topk = preds.head(25).copy()
def add_names(df):
    try:
        from pybaseball import playerid_reverse_lookup
        ids = df["pitcher"].dropna().astype(int).unique().tolist()
        names = playerid_reverse_lookup(ids, key_type="mlbam")
        if "name_first" in names.columns and "name_last" in names.columns:
            names["full_name"] = names["name_first"].str.title() + " " + names["name_last"].str.title()
        elif "name_use" in names.columns:
            names["full_name"] = names["name_use"]
        else:
            name_cols = [c for c in names.columns if "name" in c.lower()]
            names["full_name"] = names[name_cols[0]] if name_cols else ""
        key_col = "key_mlbam" if "key_mlbam" in names.columns else ("mlbam" if "mlbam" in names.columns else None)
        if key_col is None: 
            return df
        out = df.merge(names[[key_col,"full_name"]], left_on="pitcher", right_on=key_col, how="left")
        out = out.drop(columns=[c for c in ["key_mlbam","mlbam"] if c in out.columns])
        out["display_name"] = out["full_name"].fillna("").where(out["full_name"].notna() & (out["full_name"]!=""), out["pitcher"].astype(str))
        return out
    except Exception as e:
        df["display_name"] = df["pitcher"].astype(str)
        return df

topk = add_names(topk)
topk_out = DATA/"predictions_2023_top25_named.csv"
topk[["pitcher","display_name","injury_prob","injury_actual"]].to_csv(topk_out, index=False)

try:
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
except ImportError:
    raise SystemExit("matplotlib is required. Install it with: pip install matplotlib")

pdf_path = DATA / "pitcher_injury_summary.pdf"
with PdfPages(pdf_path) as pdf:
    fig = plt.figure(figsize=(8.5, 11))
    fig.text(0.1, 0.92, "MLB Pitcher Injury Prediction (2021–2023)", fontsize=20, weight="bold")
    fig.text(0.1, 0.88, f"Author: {AUTHOR_NAME}   •   Contact: {CONTACT_EMAIL}", fontsize=11)
    if GITHUB_URL: fig.text(0.1, 0.85, f"GitHub: {GITHUB_URL}", fontsize=11)
    if DEMO_URL:   fig.text(0.1, 0.82, f"Demo: {DEMO_URL}", fontsize=11)

    abstract = f"""
    We predict pitcher-season injury risk using MLB Statcast (2021–2023) and RosterResource IL lists.
    Features include workload & rest, velocity (avg / p95), spin, and pitch mix. Engineered features add
    velocity deltas vs prior year or early-season, 14-day workload spikes, and breaking-usage change.
    Trained on 2021–2022 and evaluated on 2023 holdout.

    Test ROC AUC (2023): {roc_auc:.3f}    •    PR-AUC: {pr_auc:.3f}    •    Prevalence: {prev:.2%}
    """
    abstract = textwrap.dedent(abstract).strip()
    fig.text(0.1, 0.74, abstract, fontsize=11, va="top")
    rl = (DATA/"RESULTS_AND_LIMITATIONS.md")
    if rl.exists():
        txt = rl.read_text()
        txt_wrapped = textwrap.fill(txt, width=100)
        fig.text(0.1, 0.68, "Results & Limitations (summary):", fontsize=13, weight="bold")
        fig.text(0.1, 0.66, txt_wrapped[:1800] + ("..." if len(txt_wrapped)>1800 else ""), fontsize=9, va="top")
    pdf.savefig(fig); plt.close(fig)

    fig = plt.figure(figsize=(8.5, 11))
    fig.text(0.1, 0.94, "Methods (brief)", fontsize=16, weight="bold")
    methods = """
    • Data: Statcast 2021–2023; RosterResource injuries (mapped by MLBAM ID).
    • Labels: season-level injury = 1 if the pitcher appears that season; aggregated pitch→season by max().
    • Features: per-game workload & rest; avg & p95 velocity; avg spin; pitch-mix %.
    • Engineered: velocity delta (vs prior-year or early-season), 14-day workload spike (max/median),
      breaking-usage delta (season – early).
    • Split: train 2021–2022 → test 2023 (no leakage).
    • Models: Logistic Regression (balanced) and XGBoost (scale_pos_weight); median imputation.
    • Thresholding: tune via ROC (Youden’s J) or Best-F1; also report precision@K.
    """
    fig.text(0.1, 0.90, textwrap.dedent(methods).strip(), fontsize=10, va="top")

    cols = ["K","Precision"]
    tbl = pd.DataFrame([
        (5,  p5), (10, p10), (25, p25), (50, p50), (100, p100)
    ], columns=cols)
    ax = fig.add_axes([0.10, 0.55, 0.35, 0.25])
    ax.axis("off")
    ax.set_title("Precision@K (2023)", fontsize=12, pad=8)
    ax.table(cellText=[(int(k), f"{v:.3f}") for k,v in zip(tbl["K"], tbl["Precision"])],
             colLabels=cols, loc="center")
    ax2 = fig.add_axes([0.55, 0.55, 0.35, 0.25])
    ax2.axis("off")
    ax2.set_title(f"Confusion @ Best-F1 (t={best_thr:.3f})", fontsize=12, pad=8)
    tn, fp, fn, tp = cm_best.ravel()
    ax2.table(cellText=[["TN", tn], ["FP", fp], ["FN", fn], ["TP", tp]],
              colLabels=["", "Count"], loc="center")
    pdf.savefig(fig); plt.close(fig)

    fig = plt.figure(figsize=(8.5, 11))
    fig.text(0.1, 0.94, "Discrimination Curves (2023)", fontsize=16, weight="bold")
    ax = fig.add_axes([0.12, 0.56, 0.75, 0.32])
    ax.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.3f}")
    ax.plot([0,1],[0,1],"--", alpha=0.5)
    ax.set_xlabel("False Positive Rate"); ax.set_ylabel("True Positive Rate"); ax.set_title("ROC")
    ax.legend(loc="lower right")
    prec, recall, _ = precision_recall_curve(y_true, y_score)
    ax2 = fig.add_axes([0.12, 0.12, 0.75, 0.32])
    ax2.plot(recall, prec, label=f"PR AUC = {pr_auc:.3f}")
    ax2.set_xlabel("Recall"); ax2.set_ylabel("Precision"); ax2.set_title("Precision–Recall")
    ax2.legend(loc="upper right")
    pdf.savefig(fig); plt.close(fig)

    fig = plt.figure(figsize=(8.5, 11))
    fig.text(0.1, 0.94, "Top-25 Predicted Injury Risk (2023)", fontsize=16, weight="bold")
    cols = ["Rank","Player","MLBAM","Prob","Actual"]
    tab = pd.DataFrame({
        "Rank": np.arange(1, min(26, len(topk))+1),
        "Player": topk["display_name"].fillna("").str.slice(0,24),
        "MLBAM": topk["pitcher"].astype(str),
        "Prob": topk["injury_prob"].map(lambda x: f"{x:.3f}"),
        "Actual": topk["injury_actual"].astype(int).astype(str),
    })
    half = int(np.ceil(len(tab)/2))
    left, right = tab.iloc[:half], tab.iloc[half:]
    ax = fig.add_axes([0.08, 0.12, 0.40, 0.78]); ax.axis("off"); ax.set_title("Top-25 (1–{})".format(half), fontsize=12)
    ax.table(cellText=left.values, colLabels=cols, loc="center")
    ax2 = fig.add_axes([0.52, 0.12, 0.40, 0.78]); ax2.axis("off"); ax2.set_title("Top-25 ({}–25)".format(half+1), fontsize=12)
    if not right.empty:
        ax2.table(cellText=right.values, colLabels=cols, loc="center")
    pdf.savefig(fig); plt.close(fig)

print("PDF saved to:", pdf_path.resolve())
print("Top-25 with names saved to:", topk_out.resolve())


PDF saved to: /Users/kavehnaini/pitcher-injury-predictor/data/pitcher_injury_summary.pdf
Top-25 with names saved to: /Users/kavehnaini/pitcher-injury-predictor/data/predictions_2023_top25_named.csv


In [20]:
from pathlib import Path
import textwrap, sys

ROOT = Path.cwd().parent if Path.cwd().name=="data" else Path.cwd()
app_py = ROOT / "app.py"
app_text = textwrap.dedent("""
import pandas as pd, numpy as np
from pathlib import Path
import streamlit as st
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix

st.set_page_config(page_title="Pitcher Injury Predictor", layout="wide")

DATA = Path(__file__).resolve().parent / "data"
cands = sorted(DATA.glob("predictions_2023_*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
if not cands:
    st.error("No predictions_2023_*.csv found. Run the notebook G2 cell first.")
    st.stop()

pred_path = cands[0]
preds = pd.read_csv(pred_path).sort_values("injury_prob", ascending=False).reset_index(drop=True)
st.caption(f"Using: {pred_path.name}")

# Optional: enrich names via pybaseball
def add_names(df):
    try:
        from pybaseball import playerid_reverse_lookup
        ids = df["pitcher"].dropna().astype(int).unique().tolist()
        names = playerid_reverse_lookup(ids, key_type="mlbam")
        if "name_first" in names.columns and "name_last" in names.columns:
            names["full_name"] = names["name_first"].str.title() + " " + names["name_last"].str.title()
        elif "name_use" in names.columns:
            names["full_name"] = names["name_use"]
        else:
            name_cols = [c for c in names.columns if "name" in c.lower()]
            names["full_name"] = names[name_cols[0]] if name_cols else ""
        key_col = "key_mlbam" if "key_mlbam" in names.columns else ("mlbam" if "mlbam" in names.columns else None)
        if key_col is None:
            df["player_name"] = df["pitcher"].astype(str)
            return df
        out = df.merge(names[[key_col,"full_name"]], left_on="pitcher", right_on=key_col, how="left")
        out["player_name"] = out["full_name"].fillna(out["pitcher"].astype(str))
        out = out.drop(columns=[c for c in ["key_mlbam","mlbam","full_name"] if c in out.columns])
        return out
    except Exception:
        df["player_name"] = df["pitcher"].astype(str)
        return df

preds = add_names(preds)

y_true = preds["injury_actual"].astype(int).values
y_score = preds["injury_prob"].values
if (y_true.sum() > 0) and (len(y_true) > 0):
    st.sidebar.metric("ROC AUC (2023)", f"{roc_auc_score(y_true, y_score):.3f}")
    try:
        st.sidebar.metric("PR AUC", f"{average_precision_score(y_true, y_score):.3f}")
    except Exception:
        pass

st.title("Pitcher Injury Predictor — 2023")
thr = st.sidebar.slider("Decision threshold", min_value=0.0, max_value=0.2, value=0.01, step=0.001)
k = st.sidebar.number_input("Show Top-K", min_value=5, max_value=200, value=25, step=5)

preds["pred"] = (preds["injury_prob"] >= thr).astype(int)
tn, fp, fn, tp = confusion_matrix(y_true, preds["pred"]).ravel()
st.sidebar.write("Confusion matrix")
st.sidebar.write(pd.DataFrame({"": ["TN","FP","FN","TP"], "Count":[tn,fp,fn,tp]}))

topk = preds.sort_values("injury_prob", ascending=False).head(int(k)).copy()
topk_display = topk[["player_name","pitcher","injury_prob","injury_actual","pred"]]
topk_display.columns = ["Player","MLBAM","Injury Prob","Actual","Pred"]
st.subheader(f"Top {int(k)} Highest-Risk Pitchers")
st.dataframe(topk_display.style.format({"Injury Prob":"{:.3f}"}), use_container_width=True)
""").strip()

app_py.write_text(app_text)
print("Wrote:", app_py.resolve())
print("\nNext steps:")
print("1) pip install streamlit")
print("2) streamlit run app.py")
print("   (It will open a local URL; move the threshold slider and Top-K)")


Wrote: /Users/kavehnaini/pitcher-injury-predictor/app.py

Next steps:
1) pip install streamlit
2) streamlit run app.py
   (It will open a local URL; move the threshold slider and Top-K)


SyntaxError: invalid syntax (2564574216.py, line 1)

In [24]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting altair!=5.4.0,!=5.4.1,<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Downloading cachetools-6.2.0-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Downloading protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading gitpython-3.1.45-py3-none-any.whl.metadata

In [25]:
streamlit run app.py

SyntaxError: invalid syntax (3737097518.py, line 1)

In [1]:
streamlit run app.py

SyntaxError: invalid syntax (3737097518.py, line 1)

In [2]:
# Run Streamlit from the notebook (served at http://localhost:8501)
import sys
from pathlib import Path

ROOT = Path.cwd().parent if Path.cwd().name == "data" else Path.cwd()
app_path = ROOT / "app.py"
assert app_path.exists(), f"app.py not found at {app_path}"

# Optional: sanity check
try:
    import streamlit as st
    print("Streamlit version:", st.__version__)
    print("Python:", sys.executable)
except Exception:
    raise SystemExit("Streamlit not installed. Run: pip install streamlit")

# Launch the app (cell will show logs; stop with the stop button or restart kernel)
!python -m streamlit run "{app_path}" --server.headless true --server.port 8501


Streamlit version: 1.49.1
Python: /Users/kavehnaini/miniconda3/envs/pitcherinjury/bin/python3.10

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.86.21:8501[0m
[34m  External URL: [0m[1mhttp://172.91.138.225:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
Gathering player lookup table. This may take a moment.
2025-08-31 13:02:41.957 Please replace `use_container_width` with `width`.

`use_container_width` will be removed after 2025-12-31.

For `use_container_width=True`, use `width='stretch'`. For `use_container_width=False`, use `width='content'`.
2025-08-31 13:02:45.875 Please replace `use_container_width` with `width`.

`use_container_width` will be removed after 2025

In [1]:
# === (G6) Write a Python .gitignore (then follow terminal steps below) ===
from pathlib import Path
ROOT = Path.cwd().parent if Path.cwd().name=="data" else Path.cwd()
gitignore = ROOT / ".gitignore"
gitignore.write_text("""
# Byte-compiled / cache
__pycache__/
*.py[cod]
*.ipynb_checkpoints/
.ipynb_checkpoints/

# Environments
.env
.venv
venv/
ENV/
env/
.conda/
*.egg-info/

# Jupyter
*/.ipynb_checkpoints/*

# OS
.DS_Store

# Data/model artifacts (keep CSVs you want to share)
data/*.csv.gz
data/*.joblib
models/
*.log

# Optional: include final CSVs and PDF
!data/predictions_2023_*.csv
!data/predictions_2023_*_top25.csv
!data/pitcher_injury_summary.pdf
!data/RESULTS_AND_LIMITATIONS.md
!data/METHODS.md
!data/COMMON_APP_LINE.txt
""".lstrip())
print("Wrote .gitignore at:", gitignore.resolve())


Wrote .gitignore at: /Users/kavehnaini/pitcher-injury-predictor/.gitignore
