# Recent Load Health

This notebook surfaces the latest ETL status at a glance. It automatically loads environment variables from the nearest `.env` file (including `apps/backend/.env`) and highlights weeks with missing or incomplete game data.

In [None]:
from __future__ import annotations

import os
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Notebook-wide pandas display settings: show at most 50 rows of any frame,
# but never truncate the column list (None removes the column limit).
for option_name, option_value in (
    ("display.max_rows", 50),
    ("display.max_columns", None),
):
    pd.set_option(option_name, option_value)


def locate_env(start: Path) -> Path | None:
    """Return the nearest ``.env`` file at or above ``start``.

    ``start`` and then each of its ancestors are visited closest-first. In
    every directory two locations are probed, in this order:

    1. ``<dir>/.env``
    2. ``<dir>/apps/backend/.env``

    Args:
        start: Directory where the upward search begins.

    Returns:
        The first candidate that is a regular file, or ``None`` when no
        candidate exists anywhere up to the filesystem root.
    """
    for parent in (start, *start.parents):
        for candidate in (parent / ".env", parent / "apps" / "backend" / ".env"):
            # is_file() rather than exists(): a *directory* called ".env"
            # (a common virtualenv folder name) must not end the search,
            # otherwise the real file further down the probe order is never
            # reached.
            if candidate.is_file():
                return candidate
    return None


# Look for a .env file starting from the notebook's working directory.
dotenv_path = locate_env(Path.cwd())
if dotenv_path is None:
    print("No .env file found; relying on existing environment variables.")
else:
    # override=False: values already present in the process environment win
    # over values read from the file.
    load_dotenv(dotenv_path=dotenv_path, override=False)
    print(f"Loaded environment from {dotenv_path}")

# The connection string is mandatory; stop here if it is missing or empty so
# later cells do not fail with a less obvious error.
database_url = os.getenv("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "DATABASE_URL is not set. Update your .env or environment variables before continuing."
    )

# Shared SQLAlchemy engine used by every query cell below.
engine = create_engine(database_url)


In [None]:
# One row per (season, week). The LEFT JOIN keeps weeks that have no games at
# all, so they are reported with total_games = 0 instead of being dropped.
# A game counts as "completed" once both home_points and away_points are set.
query = """
SELECT s.year AS season,
       w.week_number AS week,
       COUNT(g.game_id) AS total_games,
       SUM(CASE WHEN g.home_points IS NOT NULL AND g.away_points IS NOT NULL THEN 1 ELSE 0 END) AS completed_games,
       MAX(g.kickoff_ts) AS last_kickoff
FROM seasons s
JOIN weeks w ON w.season_id = s.season_id
LEFT JOIN games g ON g.week_id = w.week_id
GROUP BY s.year, w.week_number
ORDER BY s.year DESC, w.week_number DESC
"""

df = pd.read_sql(query, engine)
df["last_kickoff"] = pd.to_datetime(df["last_kickoff"])
# Games that exist for the week but still lack a home or away score.
df["missing_games"] = df["total_games"] - df["completed_games"]

# Classify each week with vectorized masks instead of a row-wise apply().
# Besides being the idiomatic form, this always yields a Series, even when
# the query returned no rows. The masks are applied in increasing priority:
# "No games loaded" goes last because a week with zero games also has zero
# missing games and must not be reported as "Healthy".
no_games_loaded = df["total_games"] == 0
fully_scored = df["missing_games"] == 0
df["status"] = (
    pd.Series("Incomplete", index=df.index, dtype=object)
    .mask(fully_scored, "Healthy")
    .mask(no_games_loaded, "No games loaded")
)
df.head(10)


In [None]:
# Headline summary of the week-level health frame built in the previous cell:
# latest kickoff, latest season, problem weeks, and the newest weeks of the
# latest season.
if df.empty:
    print("No season/week records found. Run a backfill before using this notebook.")
else:
    # last_kickoff is NaT for weeks without games; fall back to "N/A" when no
    # week has a kickoff at all.
    latest_kickoff = df["last_kickoff"].dropna().max()
    latest_kickoff_display = latest_kickoff.isoformat(timespec="seconds") if pd.notna(latest_kickoff) else "N/A"
    latest_season = df["season"].max()

    # Weeks that have games but at least one of them is missing a score.
    incomplete = df.loc[
        df["status"] == "Incomplete",
        ["season", "week", "total_games", "completed_games", "missing_games", "last_kickoff"],
    ].sort_values(["season", "week"], ascending=[False, False])
    # Weeks that exist in the calendar but have no game rows.
    missing = df.loc[
        df["status"] == "No games loaded",
        ["season", "week", "total_games", "completed_games", "last_kickoff"],
    ].sort_values(["season", "week"], ascending=[False, False])

    print(f"Latest kickoff: {latest_kickoff_display}")
    print(f"Latest season tracked: {latest_season}")
    print()

    if incomplete.empty and missing.empty:
        print("All tracked weeks have complete game data.")
    else:
        # Caption every table so the reader can tell the outputs apart; they
        # are otherwise rendered back to back with near-identical columns.
        if not incomplete.empty:
            print(f"Weeks with incomplete game data ({len(incomplete)}):")
            display(incomplete)
        if not missing.empty:
            print(f"Weeks with no games loaded ({len(missing)}):")
            display(missing)

    latest_season_weeks = df[df["season"] == latest_season].sort_values("week")
    print(f"Most recent weeks of season {latest_season}:")
    display(latest_season_weeks.tail(10))


In [None]:
# Row counts per (season, week) currently sitting in the staging table.
staging_query = """
SELECT season,
       week,
       COUNT(*) AS total_rows
FROM nfl_weekly_stats
GROUP BY season, week
ORDER BY season DESC, week DESC
"""

# Start from an empty frame so the cell still produces a message when the
# staging table cannot be queried; this lookup is best-effort.
staging_summary = pd.DataFrame()
try:
    staging_summary = pd.read_sql(staging_query, engine)
except Exception as exc:
    print(f"Unable to query staging table: {exc}")

if not staging_summary.empty:
    # (season, week) pairs that already have at least one completed game in
    # the week-level health frame.
    loaded_keys = set()
    if not df.empty:
        completed = df[df["completed_games"] > 0]
        loaded_keys = set(zip(completed["season"], completed["week"]))

    # Flag each staged week by whether its key is among the loaded pairs.
    staging_summary["loaded"] = [
        "Yes" if staged_key in loaded_keys else "No"
        for staged_key in zip(staging_summary["season"], staging_summary["week"])
    ]

    display(staging_summary.head(20))

    backlog = staging_summary.loc[staging_summary["loaded"] == "No"]
    if not backlog.empty:
        print("Weeks staged but not yet finalized:")
        display(backlog.head(20))
    else:
        print("All staged weeks are represented in production tables.")
else:
    print("Staging table is empty or unavailable.")


In [None]:
# Bar chart of completed games: one group of bars per week, one bar per season.
# Skipped when the health query returned no rows, since there is nothing to plot.
if not df.empty:
    pivot = df.pivot(index="week", columns="season", values="completed_games")
    pivot.plot.bar(figsize=(10, 4), title="Completed Games by Week")
