# In [33]:
import json
import os
import sys

import pandas as pd

import get_data


# In [34]:

# Pipeline folders, relative to the current working directory:
#   RAW_DIR  - where clean_nfl_team_stats looks for its raw JSON input
#   PROC_DIR - where clean_nfl_team_stats writes its cleaned CSV output
RAW_DIR, PROC_DIR = (os.path.join("data", leaf) for leaf in ("raw", "processed"))


def _ensure_dir(path: str) -> None:
    """Create ``path`` together with any missing parent directories.

    An already existing directory is left untouched and does not raise.
    """
    os.makedirs(name=path, exist_ok=True)

def clean_nfl_team_stats(season: int = 2024) -> None:
    """
    Clean the raw NFL team-season stats and export them as CSV.

    Loads ``team_season_stats_{season}.json`` from ``RAW_DIR/nfl``, flattens
    the JSON with ``pandas.json_normalize`` and writes two files into
    ``PROC_DIR``: the full league table and a Chicago Bears-only subset
    for comparison.

    Parameters
    ----------
    season : int, optional
        Season whose raw file should be cleaned (selects the file names).

    Returns
    -------
    NoneType
        Nothing is returned; the cleaned data is saved to CSV files. When
        the raw file is missing nothing is written, and when the data has
        no ``Team`` column only the league file is written. A message is
        printed in both cases.
    """
    json_path = os.path.join(RAW_DIR, "nfl", f"team_season_stats_{season}.json")
    if not os.path.exists(json_path):
        print(f"[NFL] Missing raw file: {json_path}")
        return

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Flatten nested JSON objects into dotted column names.
    df = pd.json_normalize(data)

    _ensure_dir(PROC_DIR)
    league_path = os.path.join(PROC_DIR, f"nfl_team_stats_{season}.csv")
    df.to_csv(league_path, index=False)
    print(f"[NFL] Saved league team stats to {league_path}")

    # Guard the Bears filter: without a "Team" column the lookup below would
    # raise KeyError after the league file has already been written.
    if "Team" not in df.columns:
        print(f"[NFL] No 'Team' column in {json_path}; skipping Bears-only export")
        return

    # The team may be identified by abbreviation or by full name.
    bears_mask = df["Team"].isin(["CHI", "Chicago Bears"])
    bears_df = df[bears_mask].copy()
    if bears_df.empty:
        # Still write the (header-only) file so downstream readers find it,
        # but make the empty result visible instead of failing silently.
        print(f"[NFL] Warning: no Bears rows found for season {season}")

    bears_path = os.path.join(PROC_DIR, f"bears_team_stats_{season}.csv")
    bears_df.to_csv(bears_path, index=False)
    print(f"[NFL] Saved Bears-only stats to {bears_path}")


def clean_cfb_player_stats(year: int = 2024) -> None:
    """
    Clean the raw college football player stats and export them as CSV.

    Loads ``data/raw/college/player_stats_{year}.json``, flattens it with
    ``pandas.json_normalize`` and pivots the long layout (one row per
    player and stat) into a wide table (one row per player, one column per
    stat), written to ``data/processed/cfb_player_stats_{year}.csv``.

    Parameters
    ----------
    year : int, optional
        Season whose raw file should be cleaned (selects the file names).

    Returns
    -------
    NoneType
        Nothing is returned; the cleaned data is saved to a CSV file. When
        the raw file is missing, or the stat/identifier columns cannot be
        found, a message is printed and no file is written.
    """
    raw_path = os.path.join("data", "raw", "college", f"player_stats_{year}.json")
    if not os.path.exists(raw_path):
        print(f"[CFB] Missing raw file: {raw_path}")
        return

    # Context manager so the file handle is closed deterministically.
    with open(raw_path, "r", encoding="utf-8") as f:
        df = pd.json_normalize(json.load(f))

    # The stat label and stat value columns go by different names depending
    # on the payload; take the first candidate that is actually present and
    # require BOTH, otherwise the pivot below would fail with a KeyError.
    stat_col = next((c for c in ("statName", "statType") if c in df.columns), None)
    value_col = next((c for c in ("statValue", "stat") if c in df.columns), None)
    if stat_col is None or value_col is None:
        print(f"[CFB] No stat name/value columns in {raw_path}; nothing written")
        return

    # Pivot only on the identifier columns the payload really contains.
    id_cols = [
        c
        for c in ("playerId", "player", "position", "team", "conference")
        if c in df.columns
    ]
    if not id_cols:
        print(f"[CFB] No player identifier columns in {raw_path}; nothing written")
        return

    # "sum" is only meaningful on numbers. JSON payloads may carry stat values
    # as strings (presumably the case for CFBD - confirm), which pandas would
    # otherwise concatenate ("10" + "5" -> "105") instead of adding.
    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")

    wide = df.pivot_table(
        index=id_cols,
        columns=stat_col,
        values=value_col,
        aggfunc="sum",
    ).reset_index()

    # to_csv does not create missing directories, so make sure it exists.
    out_dir = os.path.join("data", "processed")
    os.makedirs(out_dir, exist_ok=True)
    wide.to_csv(os.path.join(out_dir, f"cfb_player_stats_{year}.csv"), index=False)
    print("Saved clean wide-format CFB stats.")


def main() -> None:
    """Run both cleaning steps (NFL team stats, CFB player stats) for 2024."""
    target_year = 2024
    clean_nfl_team_stats(season=target_year)
    clean_cfb_player_stats(year=target_year)


# Run the cleaning steps only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Output:
# [NFL] Saved league team stats to data/processed/nfl_team_stats_2024.csv
# [NFL] Saved Bears-only stats to data/processed/bears_team_stats_2024.csv
# Saved clean wide-format CFB stats.
