In [6]:
#!/usr/bin/env python3
"""
Build team play-type feature table from nba_api SynergyPlayTypes.
Outputs two CSVs:
  - features_with_ids.csv : SEASON, TEAM_ID, TEAM_NAME + [PPP_* , POSS_PCT_*]
  - features_only.csv     : only [PPP_* , POSS_PCT_*]

Usage: python build_synergy_features.py
"""
from cache_manager import stats_cache                   # disk+RAM cache
from stats_getter import _retry_nba                     # retry+pacing helper
import time
import math
import sys
import pandas as pd
from nba_api.stats.endpoints import synergyplaytypes

# Throttle settings (more conservative).
# These are the default argument values of fetch_synergy_team_table().
GLOBAL_SLEEP_SECONDS = 1.2      # pause (seconds) after every successful fetch
MAX_TRIES = 6                   # attempts per (season, play_type) before giving up
BACKOFF_BASE = 2.0              # wait BACKOFF_BASE ** attempt_index seconds between attempts
BACKOFF_JITTER = 0.40           # extra BACKOFF_JITTER * attempt_index seconds (linear, not random)

from pathlib import Path

# Directory for per-season intermediate CSVs; created at import time if missing.
TMP_DIR = Path("tmp_synergy_features")   # temp per-season outputs
TMP_DIR.mkdir(parents=True, exist_ok=True)

RESUME = True   # if True, skip seasons whose temp files already exist


# Seasons, formatted as "<start year>-<last two digits of end year>"
SEASONS = [f"{y}-{str(y+1)[-2:]}" for y in range(2015, 2025)]  # 2015-16 ... 2024-25

# Verbose/Debug toggles
VERBOSE = True           # prints progress
DEBUG_RAW = True         # print brief info if payload is missing/odd

# Play-type identifiers sent to SynergyPlayTypes (as play_type_nullable).
# They are also the suffixes, and fix the order, of the PPP_* / POSS_PCT_* columns.
PLAY_TYPES = [
    "Transition",
    "Isolation",
    "Spotup",
    "PRBallHandler",
    "PRRollman",
    "Handoff",
    "Cut",
    "OffScreen",
    "OffRebound",
    "Postup",
    "Misc"
]
# Intended to collect (season, play_type, last_err_repr) tuples for failed calls.
FAILED_CALLS = []   # collects (season, play_type, last_err_repr)



# Column name builders
def ppp_col(pt: str) -> str:
    """Return the wide-table column name for the PPP value of play type *pt*."""
    prefix = "PPP"
    return f"{prefix}_{pt}"

def poss_col(pt: str) -> str:
    """Return the wide-table column name for the POSS_PCT value of play type *pt*."""
    prefix = "POSS_PCT"
    return f"{prefix}_{pt}"

def _empty_return(play_type: str) -> pd.DataFrame:
    # Minimal empty frame that keeps the pipeline alive
    return pd.DataFrame(columns=["TEAM_ID","TEAM_NAME","PLAY_TYPE","PPP","POSS_PCT"]).assign(PLAY_TYPE=play_type)

# Label passed as the first argument to stats_cache.get_or_fetch() for team-offense Synergy tables.
SYNERGY_TEAM_OFFENSE_ENDPOINT = "SynergyPlayTypes:TeamOffense"

def _skip_empty_synergy(df: pd.DataFrame) -> bool:
    """Don’t cache empties/odd frames (keeps cache clean)."""
    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        return True
    need = {"TEAM_ID","TEAM_NAME","PPP","POSS_PCT"}
    return not need.issubset(set(df.columns))

def get_synergy_team_offense_cached(
    season: str,
    play_type: str,
    season_type_all_star: str = "Regular Season",
    per_mode_simple: str = "Totals",
    timeout: float = 20.0,
) -> pd.DataFrame:
    """
    Fetch the team-level offensive Synergy table for one (season, play_type)
    through stats_cache.get_or_fetch().

    The actual request is made by the nested _fetch(), which runs the
    SynergyPlayTypes endpoint through _retry_nba and trims the result to
    TEAM_ID, TEAM_NAME, PPP and POSS_PCT.

    Args:
        season: Season label, e.g. "2015-16".
        play_type: Synergy play type, forwarded as play_type_nullable.
        season_type_all_star: Forwarded to the endpoint unchanged.
        per_mode_simple: Forwarded to the endpoint unchanged.
        timeout: Request timeout forwarded to _retry_nba and the endpoint.

    Returns:
        Whatever stats_cache.get_or_fetch() returns; presumably the trimmed
        DataFrame from _fetch (or a cached copy) -- TODO confirm against
        cache_manager.
    """
    def _fetch(season, play_type, season_type_all_star, per_mode_simple, timeout):
        # Inner callable handed to _retry_nba; `to` defaults to the outer timeout.
        # NOTE(review): how _retry_nba invokes this (positional vs. `to=`) is not
        # visible here -- keep the parameter name stable.
        def _call(to: float = timeout):
            resp = synergyplaytypes.SynergyPlayTypes(
                play_type_nullable=play_type,
                player_or_team_abbreviation='T',
                type_grouping_nullable='Offensive',
                season=season,
                season_type_all_star=season_type_all_star,
                per_mode_simple=per_mode_simple,
                timeout=to,
            )
            dfs = resp.get_data_frames()
            # First result set only; empty frame when the endpoint returned none.
            return dfs[0] if dfs else pd.DataFrame()

        df = _retry_nba(_call, endpoint="SynergyPlayTypes", timeout=timeout)
        if df is None or df.empty:
            return pd.DataFrame()
        keep = ["TEAM_ID","TEAM_NAME","PPP","POSS_PCT"]
        # Back-fill any missing expected column with NA so the selection below cannot fail.
        # NOTE(review): after this loop every non-empty frame has all four columns,
        # so the column check in _skip_empty_synergy can only reject empty frames;
        # a payload missing e.g. PPP would presumably be cached as all-NA -- confirm intent.
        for c in keep:
            if c not in df.columns:
                df[c] = pd.NA
        # Keep only what you need; PLAY_TYPE is filled by caller
        return df[keep].copy()

    # The keyword arguments below are forwarded to get_or_fetch along with _fetch;
    # presumably they form the cache key and are passed on to _fetch -- TODO confirm.
    return stats_cache.get_or_fetch(
        SYNERGY_TEAM_OFFENSE_ENDPOINT,
        _fetch,
        skip_if=_skip_empty_synergy,
        season=season,
        play_type=play_type,
        season_type_all_star=season_type_all_star,
        per_mode_simple=per_mode_simple,
        timeout=timeout,
    )


def fetch_synergy_team_table(
    season: str,
    play_type: str,
    max_tries: int = MAX_TRIES,
    sleep_between_calls: float = GLOBAL_SLEEP_SECONDS,
    backoff_base: float = BACKOFF_BASE,
    backoff_jitter: float = BACKOFF_JITTER
) -> pd.DataFrame:
    """
    Fetch one (season, play_type) team table, retrying on empty results or errors.

    Each attempt goes through get_synergy_team_offense_cached(). A result is
    accepted when it is a non-empty frame containing TEAM_ID; the accepted
    frame is copied and tagged with PLAY_TYPE=play_type.

    Args:
        season: Season label, e.g. "2015-16".
        play_type: Synergy play type to request.
        max_tries: Maximum number of attempts.
        sleep_between_calls: Seconds to pause after a successful attempt.
        backoff_base: Base of the exponential wait between failed attempts.
        backoff_jitter: Extra seconds added per attempt index (linear term).

    Returns:
        The tagged table on success. If every attempt fails, a zero-row frame
        from _empty_return(play_type); the failure is also recorded in
        FAILED_CALLS as (season, play_type, repr(last_error)).
    """
    last_err = None
    for attempt in range(max_tries):
        try:
            if VERBOSE:
                print(f"[CALL] {season} / {play_type} try {attempt+1}/{max_tries}")

            df0 = get_synergy_team_offense_cached(
                season=season,
                play_type=play_type,
                season_type_all_star='Regular Season',
                per_mode_simple='Totals',
                timeout=20.0,
            )
            if df0 is not None and not df0.empty and "TEAM_ID" in df0.columns:
                out = df0.copy()
                out["PLAY_TYPE"] = play_type  # normalize to the canonical play-type name
                time.sleep(sleep_between_calls)   # post-success throttle
                return out

            if DEBUG_RAW:
                print(f"[DEBUG] Empty/odd table for {season}/{play_type} on try {attempt+1}")

        except Exception as e:
            # Best-effort: any failure counts as a failed attempt; the last one is reported below.
            last_err = e

        # Back off before the next attempt only; waiting after the final
        # attempt would just delay the failure report.
        if attempt + 1 < max_tries:
            time.sleep((backoff_base ** attempt) + backoff_jitter * attempt)

    # Record the failure so callers can inspect what was missed after the run.
    FAILED_CALLS.append((season, play_type, repr(last_err)))
    print(f"[WARN] No valid Synergy table for SEASON={season} PLAY_TYPE={play_type} (last_err={repr(last_err)})")
    return _empty_return(play_type)




def build_season_long(season: str) -> pd.DataFrame:
    """Fetch every play type in PLAY_TYPES for *season* and stack them.

    Returns a long-format frame (one row per team and play type) with a
    SEASON column set to *season*.
    """
    per_play_type = []
    for play_type in PLAY_TYPES:
        table = fetch_synergy_team_table(season, play_type)
        if VERBOSE:
            print(f"[OK] {season}/{play_type}: rows={len(table)}")
        per_play_type.append(table)

    stacked = pd.concat(per_play_type, ignore_index=True)
    return stacked.assign(SEASON=season)




def make_feature_tables(panel_long: pd.DataFrame) -> pd.DataFrame:
    """
    Pivot the long play-type panel into one row per (SEASON, TEAM_ID, TEAM_NAME).

    Output columns, in order:
      SEASON, TEAM_ID, TEAM_NAME,
      PPP_{pt} for each pt in PLAY_TYPES,
      POSS_PCT_{pt} for each pt in PLAY_TYPES,
      USAGE_SUM (row-wise sum of the POSS_PCT_* columns, as a quality check).

    A feature column that is entirely absent after pivoting is created and
    filled with 0.0.
    """
    id_cols = ["SEASON", "TEAM_ID", "TEAM_NAME"]
    ppp_names = [ppp_col(pt) for pt in PLAY_TYPES]
    poss_names = [poss_col(pt) for pt in PLAY_TYPES]

    def _pivot(value_col, namer):
        # One wide block per metric: play types become columns, then get prefixed.
        table = panel_long.pivot_table(
            index=id_cols,
            columns="PLAY_TYPE",
            values=value_col,
            aggfunc="first",
        )
        return table.rename(columns={pt: namer(pt) for pt in PLAY_TYPES})

    blocks = [_pivot("PPP", ppp_col), _pivot("POSS_PCT", poss_col)]
    wide = pd.concat(blocks, axis=1).reset_index()

    # Guarantee the full feature set even when a play type never appeared.
    for name in ppp_names + poss_names:
        if name not in wide.columns:
            wide[name] = 0.0

    # Fixed column order: ids, PPP_*, POSS_PCT_*
    wide = wide[id_cols + ppp_names + poss_names]

    # Quality check: total of the POSS_PCT_* columns per row
    wide["USAGE_SUM"] = wide[poss_names].sum(axis=1)
    return wide

def write_season_csvs(season: str, season_long_df: pd.DataFrame):
    """
    Build the wide feature table for one season and save it under TMP_DIR.

    Two files are written:
      - features_with_ids_{season}.csv : every column of the wide table
      - features_only_{season}.csv     : only the PPP_* and POSS_PCT_* columns

    Returns the wide DataFrame so the caller can keep aggregating.
    """
    # Defensive filter: drop any rows that belong to another season.
    in_season = season_long_df["SEASON"] == season
    wide = make_feature_tables(season_long_df.loc[in_season].copy())

    ids_file = TMP_DIR / f"features_with_ids_{season}.csv"
    feats_file = TMP_DIR / f"features_only_{season}.csv"

    wide.to_csv(ids_file.as_posix(), index=False)

    feature_cols = [
        name for name in wide.columns
        if name.startswith(("PPP_", "POSS_PCT_"))
    ]
    wide[feature_cols].to_csv(feats_file.as_posix(), index=False)

    print(f"[SEASON SAVE] {season}: wrote {ids_file.name} (rows={len(wide)}) "
          f"and {feats_file.name}")

    return wide

def build_panel_long(seasons) -> pd.DataFrame:
    """
    Build the combined wide feature table, handling one season at a time.

    For each season, either:
      - reuse the per-season features_with_ids CSV from TMP_DIR (when RESUME is
        True and both per-season files exist), or
      - fetch the season, write its per-season CSVs, and keep the wide frame.

    Returns all per-season wide frames concatenated with a fresh index.
    """
    collected = []

    for season in seasons:
        ids_file = TMP_DIR / f"features_with_ids_{season}.csv"
        feats_file = TMP_DIR / f"features_only_{season}.csv"
        can_resume = RESUME and ids_file.exists() and feats_file.exists()

        if not can_resume:
            print(f"[INFO] Fetching season {season} ...", flush=True)
            long_rows = build_season_long(season)
            collected.append(write_season_csvs(season, long_rows))
            continue

        # Fast path: reload the saved with-ids table instead of refetching.
        print(f"[RESUME] Skipping fetch for {season} (found {ids_file.name})")
        collected.append(pd.read_csv(ids_file.as_posix()))

    return pd.concat(collected, ignore_index=True)

import time
import pandas as pd
from nba_api.stats.endpoints import leaguestandingsv3

def _fetch_wins_for_season(season: str, max_tries: int = 5, base_sleep: float = 0.9) -> pd.DataFrame:
    """
    Fetch regular-season win/loss records for one season via LeagueStandingsV3.

    Args:
        season: Season label, e.g. "2015-16".
        max_tries: Maximum number of attempts.
        base_sleep: Seconds to pause after a successful request, to stay
            polite to the API.

    Returns:
        DataFrame with columns TEAM_ID, WINS, LOSSES, WIN_PCT, SEASON.

    Raises:
        RuntimeError: if every attempt failed; chained to the last error.
    """
    last_err = None
    for attempt in range(max_tries):
        try:
            resp = leaguestandingsv3.LeagueStandingsV3(
                season=season,
                season_type="Regular Season",
                league_id="00"
            )
            # Normalize the V3 column names to the upper-case names used elsewhere
            standings = resp.get_data_frames()[0].rename(columns={
                "TeamID": "TEAM_ID",
                "Wins": "WINS",
                "Losses": "LOSSES",
                "WinPCT": "WIN_PCT",
            })
            out = standings[["TEAM_ID", "WINS", "LOSSES", "WIN_PCT"]].copy()
            out["SEASON"] = season
        except Exception as e:
            last_err = e
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the error.
            if attempt + 1 < max_tries:
                time.sleep((2.0 ** attempt) + 0.3 * attempt)
        else:
            # Be polite to the API
            time.sleep(base_sleep)
            return out
    raise RuntimeError(f"LeagueStandingsV3 failed for {season}: {last_err}") from last_err

def load_wins(csv_path: str, out_path: str | None = None) -> pd.DataFrame:
    """
    Add WINS/LOSSES/WIN_PCT to a features CSV and save the result.

    Args
    ----
    csv_path : str
        Path to the existing features CSV; it must contain SEASON and TEAM_ID.
    out_path : str | None
        Destination for the merged CSV. When None, `csv_path` is overwritten.

    Returns
    -------
    pd.DataFrame
        The input rows left-joined with the standings on (SEASON, TEAM_ID).

    Raises
    ------
    ValueError
        If the CSV lacks SEASON or TEAM_ID.
    """
    features = pd.read_csv(csv_path)

    absent = {"SEASON", "TEAM_ID"} - set(features.columns)
    if absent:
        raise ValueError(f"{csv_path} is missing required columns: {sorted(absent)}")

    # One standings request per distinct season present in the file.
    season_labels = sorted(features["SEASON"].dropna().unique().tolist())
    standings = []
    for label in season_labels:
        print(f"[WINS] Fetching standings for {label} ...")
        standings.append(_fetch_wins_for_season(label))
    wins = pd.concat(standings, ignore_index=True)

    merged = pd.merge(features, wins, how="left", on=["SEASON", "TEAM_ID"])

    # Rows without a standings match end up with NaN in WINS.
    unmatched = merged["WINS"].isna().sum()
    if unmatched:
        print(f"[WARN] {unmatched} team-season rows had no WINS match (check TEAM_ID/SEASON).")

    destination = csv_path if out_path is None else out_path
    merged.to_csv(destination, index=False)
    print(f"[DONE] Wrote wins to {destination} (rows={len(merged)})")

    return merged



load_wins("features_with_ids.csv", "features_with_ids_with_wins.csv")                  # writes to a separate file; the input CSV is not overwritten






[WINS] Fetching standings for 2015-16 ...
[WINS] Fetching standings for 2016-17 ...
[WINS] Fetching standings for 2017-18 ...
[WINS] Fetching standings for 2018-19 ...
[WINS] Fetching standings for 2019-20 ...
[WINS] Fetching standings for 2020-21 ...
[WINS] Fetching standings for 2021-22 ...
[WINS] Fetching standings for 2022-23 ...
[WINS] Fetching standings for 2023-24 ...
[WINS] Fetching standings for 2024-25 ...
[DONE] Wrote wins to features_with_ids_with_wins.csv (rows=300)


Unnamed: 0,SEASON,TEAM_ID,TEAM_NAME,PPP_Transition,PPP_Isolation,PPP_Spotup,PPP_PRBallHandler,PPP_PRRollman,PPP_Handoff,PPP_Cut,PPP_OffScreen,PPP_OffRebound,PPP_Postup,PPP_Misc,POSS_PCT_Transition,POSS_PCT_Isolation,POSS_PCT_Spotup,POSS_PCT_PRBallHandler,POSS_PCT_PRRollman,POSS_PCT_Handoff,POSS_PCT_Cut,POSS_PCT_OffScreen,POSS_PCT_OffRebound,POSS_PCT_Postup,POSS_PCT_Misc,USAGE_SUM,WINS,LOSSES,WIN_PCT
0,2015-16,1610612737,Atlanta Hawks,1.118,0.803,0.978,0.733,1.029,0.929,1.216,0.959,1.046,0.97,0.572,0.149,0.056,0.21,0.167,0.086,0.043,0.084,0.061,0.034,0.057,0.053,1.0,48,34,0.585
1,2015-16,1610612738,Boston Celtics,1.102,0.783,0.937,0.781,0.956,0.862,1.258,0.952,1.017,0.869,0.564,0.163,0.052,0.199,0.143,0.073,0.066,0.068,0.061,0.056,0.059,0.06,1.0,48,34,0.585
2,2015-16,1610612739,Cleveland Cavaliers,1.196,0.845,1.054,0.816,1.158,0.895,1.227,1.015,1.021,0.886,0.511,0.132,0.089,0.202,0.179,0.063,0.032,0.07,0.047,0.045,0.081,0.06,1.0,57,25,0.695
3,2015-16,1610612740,New Orleans Pelicans,1.14,0.797,0.932,0.847,1.034,0.914,1.236,1.07,1.002,0.83,0.519,0.128,0.073,0.189,0.205,0.086,0.037,0.06,0.05,0.053,0.064,0.056,1.001,30,52,0.366
4,2015-16,1610612741,Chicago Bulls,0.995,0.815,1.025,0.829,0.979,0.877,1.215,0.944,0.988,0.809,0.491,0.117,0.064,0.198,0.183,0.065,0.051,0.079,0.038,0.063,0.082,0.06,1.0,42,40,0.512
5,2015-16,1610612742,Dallas Mavericks,1.155,0.885,1.012,0.801,1.034,0.868,1.242,0.869,0.982,0.941,0.613,0.096,0.063,0.221,0.178,0.075,0.035,0.082,0.07,0.039,0.08,0.06,0.999,42,40,0.512
6,2015-16,1610612743,Denver Nuggets,1.133,0.71,0.904,0.754,1.045,0.894,1.19,1.023,1.021,0.874,0.593,0.134,0.062,0.194,0.172,0.077,0.05,0.087,0.029,0.064,0.068,0.063,1.0,33,49,0.402
7,2015-16,1610612744,Golden State Warriors,1.158,0.927,1.097,0.98,0.993,1.066,1.249,1.062,0.914,0.801,0.575,0.175,0.063,0.166,0.105,0.051,0.028,0.107,0.118,0.048,0.061,0.079,1.001,73,9,0.89
8,2015-16,1610612745,Houston Rockets,1.107,0.871,0.961,0.832,1.027,0.874,1.181,0.964,1.039,0.83,0.564,0.19,0.094,0.203,0.133,0.048,0.031,0.074,0.033,0.056,0.064,0.072,0.998,41,41,0.5
9,2015-16,1610612746,LA Clippers,1.144,0.921,1.023,0.879,1.047,0.85,1.302,1.046,1.037,0.815,0.606,0.128,0.095,0.17,0.184,0.083,0.056,0.06,0.068,0.04,0.044,0.073,1.001,53,29,0.646
