In [None]:
# ==============================================================================
# CMS Ownership — Build CHOW-ready panel (one highest-level snapshot per CCN-month)
# ==============================================================================
import os, re, json, hashlib
import numpy as np
import pandas as pd
from pathlib import Path

# ------------------------------------------------------------------------------
# 1) Paths
# ------------------------------------------------------------------------------
# Walk upward from the current working directory until a folder containing a
# `data/` directory is found. The loop also stops at the filesystem root
# (where a path equals its own parent), in which case PROJECT_ROOT is the root.
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

# The raw-data location can be overridden with the NH_DATA_DIR environment
# variable; otherwise it defaults to <PROJECT_ROOT>/data/raw.
RAW_DIR     = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
OWN_DIR     = RAW_DIR / "ownership-files"
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
# Both directories are created if absent (including the input directory, so a
# missing input surfaces as a missing-file error from read_csv below).
OWN_DIR.mkdir(parents=True, exist_ok=True)
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

# INPUT: the cleaned, combined ownership file.
IN_COMBINED = OWN_DIR / "ownership_combined.csv"
# OUTPUT: the CHOW-ready monthly panel (one row per CCN-month).
OUT_PANEL   = INTERIM_DIR / "ownership_ccn_month_chow_ready.csv"

print(f"[paths] OWN_DIR={OWN_DIR}")
print(f"[paths] INTERIM_DIR={INTERIM_DIR}")
print(f"[load]  {IN_COMBINED}")

# ------------------------------------------------------------------------------
# 2) Load
# ------------------------------------------------------------------------------
# Identifier and categorical text columns are read as pandas "string" dtype so
# leading zeros in the CCN survive; the two date columns are parsed on read.
df = pd.read_csv(
    IN_COMBINED,
    dtype={
        "cms_certification_number": "string",
        "role": "string",
        "owner_type": "string",
        "owner_name": "string",
    },
    low_memory=False,
    parse_dates=["association_date","processing_date"],
)
print(f"[loaded] rows={len(df):,}, cols={df.shape[1]}")

# Basic hygiene: strip every non-digit character from the CCN and left-pad
# with zeros to 6 characters.
# NOTE(review): this also removes letters. If any CCN in the input is
# alphanumeric, distinct facilities could collapse onto one key — confirm the
# upstream file only contains all-numeric CCNs.
df["cms_certification_number"] = (
    df["cms_certification_number"].astype("string").str.replace(r"\D","",regex=True).str.zfill(6)
)
# Keep only the three standardized roles. The match is case-sensitive, so rows
# whose role is not already upper-case are dropped here.
df = df[df["role"].isin(["DIRECT","INDIRECT","PARTNERSHIP"])].copy()

# Month key: truncate processing_date to month precision via a NumPy
# datetime64[M] cast, i.e. every date maps to the first day of its month.
df["month_ts"] = df["processing_date"].values.astype("datetime64[M]")

# ------------------------------------------------------------------------------
# 3) Name normalization & helpers
# ------------------------------------------------------------------------------
# Corporate-form tokens removed from owner names. This pattern is applied
# AFTER punctuation has been replaced by spaces, so in practice only the
# undotted alternatives can fire; the pattern text itself is left unchanged.
CORP_SUFFIX_RE = re.compile(
    r"\b(CORP(ORATION)?|INCORPORATED|INC|LLC|L\.L\.C\.|LP|L\.P\.|LLP|CO|COMPANY|HOLDINGS?)\b\.?",
    re.I
)
PUNCT_RE = re.compile(r"[,\.&]")
# Dotted spellings of the letter-by-letter suffixes ("L.L.C.", "L.L.P.",
# "L.P.", optionally with a space after each dot). They must be removed before
# PUNCT_RE runs, otherwise they become "L L C" / "L P" and survive in the
# normalized name. The look-arounds keep the match from firing inside longer
# dotted initials such as "A.L.P." or "L.P.N.".
_DOTTED_SUFFIX_RE = re.compile(
    r"(?<![\w.])(?:L\.\s?L\.\s?[CP]|L\.\s?P)\.?(?![\w.])",
    re.I
)

def norm_owner_name(name: str) -> str:
    """Return a canonical, upper-case form of an owner name for deduplication.

    Steps: upper-case and trim; drop dotted corporate suffixes ("L.L.C.",
    "L.P.", ...); replace commas, periods and ampersands with spaces; drop
    corporate-form tokens (INC, LLC, CORP, CO, HOLDINGS, ...); collapse runs
    of whitespace.

    Missing input (None, NaN, pd.NA) yields the empty string instead of the
    stringified sentinel ("NONE", "NAN", "<NA>"), so a missing name can never
    be confused with a real owner carrying that text.
    """
    if name is None or (not isinstance(name, str) and pd.isna(name)):
        return ""
    s = str(name).upper().strip()
    # Must precede PUNCT_RE: once the dots are gone the suffix is unrecognisable.
    s = _DOTTED_SUFFIX_RE.sub(" ", s)
    s = PUNCT_RE.sub(" ", s)
    s = CORP_SUFFIX_RE.sub("", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Role priority: pick a single *highest* level per month
ROLE_PRIORITY = {"INDIRECT": 0, "DIRECT": 1, "PARTNERSHIP": 2}

def choose_top_level(roles: pd.Series) -> str | None:
    if roles.empty:
        return None
    # Which roles are present?
    present = sorted(set(roles.dropna().astype(str).str.upper()), key=lambda r: ROLE_PRIORITY.get(r, 99))
    return present[0] if present else None

def json_list(values, as_date=False):
    """Serialize an iterable of scalars to a JSON array string.

    Parameters
    ----------
    values : iterable of scalars (list, pandas Series, NumPy array, ...).
    as_date : when True, every element is parsed with ``pd.to_datetime``
        (unparseable values become missing) and written as an ISO
        ``YYYY-MM-DD`` string.

    Missing values (None, NaN, NaT, pd.NA) are written as JSON ``null``.
    NumPy scalars are converted to their Python equivalents first, because
    ``json.dumps`` rejects types such as ``np.int64`` and ``np.bool_``.
    Non-ASCII characters are kept as-is (``ensure_ascii=False``).
    """
    if as_date:
        parsed = pd.to_datetime(values, errors="coerce")
        out = [None if pd.isna(v) else v.date().isoformat() for v in parsed]
    else:
        out = []
        for v in values:
            if pd.isna(v):
                out.append(None)
            elif isinstance(v, np.generic):
                # e.g. np.int64 / np.bool_ from iterating an ndarray
                out.append(v.item())
            else:
                out.append(v)
    return json.dumps(out, ensure_ascii=False)

def pct_fill_tag(pcts) -> str:
    """Classify how completely a set of ownership percentages is populated.

    *pcts* is any numeric array-like and may contain NaN/None. Returns
    "all_null" when there are no known values (including empty input),
    "none_null" when every value is known, and "some_null" otherwise.
    """
    series = pd.Series(pcts, dtype="float64")
    n_total = len(series)
    n_known = int(series.notna().sum())
    if n_known == 0:
        # Covers the empty case as well: zero elements means zero known values.
        return "all_null"
    return "none_null" if n_known == n_total else "some_null"

# ------------------------------------------------------------------------------
# 4) Build a single highest-level snapshot per (CCN, month)
#    - pick top role level per (CCN, month)
#    - normalize names; dedupe by normalized name (sum %s, min assoc_date)
#    - deterministic sort: by % desc, then name_norm
# ------------------------------------------------------------------------------
# Output schema, in column order. Passing it to the DataFrame constructor keeps
# the columns present even when no (CCN, month) group produced a row, so the
# sort below cannot fail with a KeyError on an empty result.
PANEL_COLUMNS = [
    "cms_certification_number",
    "month_ts",
    "processing_date",
    "used_level",
    "n_owners",
    "roles",
    "owner_types",
    "owner_names",
    "name_norms",
    "ownership_percentages",
    "association_dates",
    "pct_fill_tag",
    "total_pct",
    "has_percent",
]

def _most_common(s):
    """Return the modal value of *s*; fall back to its first element when
    the mode is empty (e.g. every value is missing)."""
    modes = s.mode()
    return modes.iloc[0] if not modes.empty else s.iloc[0]

def _sum_or_nan(s):
    """Sum *s* ignoring NaN, but return NaN when there is no known value.

    A plain "sum" aggregation returns 0.0 for an all-NaN group, which would
    make an owner with unknown percentage look like a known 0% stake and
    defeat the pct_fill_tag / has_percent flags computed below.
    """
    return s.sum(min_count=1)

rows = []
for (ccn, month), g in df.groupby(["cms_certification_number","month_ts"], sort=True):
    # Pick the single preferred level available this month
    used = choose_top_level(g["role"])
    if used is None:
        # no valid role found; skip this month
        continue
    gm = g[g["role"].str.upper()==used].copy()

    # Normalize names and coerce % to numeric (unparseable values become NaN)
    gm["name_norm"] = gm["owner_name"].map(norm_owner_name)
    gm["pct_num"]   = pd.to_numeric(gm["ownership_percentage"], errors="coerce")
    # If association_date is missing, fall back to processing_date
    if "association_date" in gm.columns:
        gm["association_date"] = gm["association_date"].fillna(gm["processing_date"])

    # Deduplicate by normalized name: sum %s (NaN if none known), keep the
    # earliest association date and the most frequent owner_type/role
    agg = (gm.groupby("name_norm", dropna=False, as_index=False)
             .agg(
                 owner_name=("owner_name","first"),  # keep an example raw name
                 owner_type=("owner_type", _most_common),
                 role=("role", _most_common),
                 pct_sum=("pct_num", _sum_or_nan),
                 assoc_min=("association_date","min"),
             ))

    # Order owners deterministically within the month: % descending (unknown
    # percentages last), then normalized name
    agg = agg.sort_values(["pct_sum","name_norm"], ascending=[False, True], kind="mergesort").reset_index(drop=True)

    # Pack JSON lists (aligned by the deterministic sort)
    roles_json   = json_list(agg["role"])
    types_json   = json_list(agg["owner_type"])
    names_json   = json_list(agg["owner_name"])
    pcts_list    = agg["pct_sum"].astype(float).tolist()
    pcts_json    = json.dumps([None if pd.isna(x) else float(x) for x in pcts_list], ensure_ascii=False)
    assoc_json   = json_list(agg["assoc_min"], as_date=True)
    names_norm_json = json_list(agg["name_norm"])

    tag = pct_fill_tag(pcts_list)
    total_pct = float(np.nansum(pcts_list)) if pcts_list else np.nan
    # True when at least one percentage is known and the largest is finite.
    # Computed on the known values only so an all-NaN month does not trigger
    # NumPy's "All-NaN slice" warning.
    known_pcts = [x for x in pcts_list if not pd.isna(x)]
    has_pct = bool(known_pcts) and bool(np.isfinite(max(known_pcts)))

    rows.append({
        "cms_certification_number": ccn,
        "month_ts": month,
        "processing_date": month.date().isoformat(),
        "used_level": used,               # which level this snapshot represents
        "n_owners": int(len(agg)),        # number of unique owners at this level
        "roles": roles_json,
        "owner_types": types_json,
        "owner_names": names_json,
        "name_norms": names_norm_json,    # helpful for name-based fallbacks
        "ownership_percentages": pcts_json,
        "association_dates": assoc_json,
        "pct_fill_tag": tag,
        "total_pct": total_pct,
        "has_percent": has_pct,
    })

panel = (pd.DataFrame(rows, columns=PANEL_COLUMNS)
           .sort_values(["cms_certification_number","month_ts"])
           .reset_index(drop=True))

# Strong dtypes
panel["cms_certification_number"] = panel["cms_certification_number"].astype(str).str.zfill(6)
panel["month_ts"] = pd.to_datetime(panel["month_ts"], errors="coerce")

# ------------------------------------------------------------------------------
# 5) Save + brief QC
# ------------------------------------------------------------------------------
# Write the panel to disk, then print a short sanity summary of what was saved.
panel.to_csv(OUT_PANEL, index=False)
print(f"[save] CHOW-ready panel → {OUT_PANEL}  (rows={len(panel):,})")

print("\n=== QC snapshot ===")
_n_ccns = panel["cms_certification_number"].nunique()
print("Distinct CCNs:", _n_ccns)
# Only the first ten distinct years are shown; the trailing "..." is always printed.
_years_seen = sorted(pd.to_datetime(panel["month_ts"]).dt.year.dropna().unique().tolist())
print("Years covered:", _years_seen[:10], "...")
print("\nHead:")
with pd.option_context("display.max_colwidth", 120):
    _preview = panel.head(3).to_string(index=False)
    print(_preview)

[paths] OWN_DIR=C:\Users\Owner\OneDrive\NursingHomeData\ownership-files
[paths] INTERIM_DIR=C:\Repositories\white-bowblis-nhmc\data\interim
[load]  C:\Users\Owner\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv
[loaded] rows=5,330,314, cols=7
