In [5]:
# ==============================================================================
# Facility Signatures (Wide, JSON lists) — Highest level + dedup across time
#   Input : data/raw/ownership-files/ownership_combined.csv
#   Output: data/interim/facility_signatures_grouped.jsonlists.csv
# ------------------------------------------------------------------------------
# For each CCN, we produce group1, group2, ... where each group has aligned JSON:
#   - groupN_owners      (JSON list)
#   - groupN_pcts        (JSON list, floats; uses MAX within-snapshot, not SUM)
#   - groupN_roles       (JSON list; all values identical, the chosen highest level)
#   - groupN_start       (ISO date, earliest association_date for the group)
#   - groupN_snapshot_id (hash of [level, normalized owner list, pct list])
#
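# We collapse repeats of the *same ownership composition* (same level + owners + pcts)
# within a CCN — whether they come from different processing runs or from different
# association dates — keeping only the earliest occurrence (by association_date).
# Note: a composition that reappears after an intervening change is also collapsed
# into its first occurrence.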
# ==============================================================================

import os, re, json, hashlib
import numpy as np
import pandas as pd
from pathlib import Path

# ---------------- Paths ----------------
# Climb from the working directory until a folder that contains "data/" is
# found; the climb stops at the filesystem root if no such folder exists.
PROJECT_ROOT = Path.cwd()
while True:
    if (PROJECT_ROOT / "data").is_dir() or PROJECT_ROOT == PROJECT_ROOT.parent:
        break
    PROJECT_ROOT = PROJECT_ROOT.parent

# The raw-data location can be overridden with the NH_DATA_DIR environment variable.
RAW_DIR     = Path(os.environ.get("NH_DATA_DIR", PROJECT_ROOT.joinpath("data", "raw")))
OWN_DIR     = RAW_DIR.joinpath("ownership-files")
INTERIM_DIR = PROJECT_ROOT.joinpath("data", "interim")
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

IN_COMBINED = OWN_DIR.joinpath("ownership_combined.csv")
OUT_SIG     = INTERIM_DIR.joinpath("facility_signatures_grouped.jsonlists.csv")

# Identifier / label columns are read as pandas "string"; both date columns
# are parsed while loading.
print(f"[load] {IN_COMBINED}")
df = pd.read_csv(
    IN_COMBINED,
    dtype=dict.fromkeys(
        ("cms_certification_number", "role", "owner_type", "owner_name"),
        "string",
    ),
    parse_dates=["association_date", "processing_date"],
    low_memory=False,
)
print(f"[loaded] rows={len(df):,}, cols={df.shape[1]}")

# ---------------- Helpers ----------------
ROLE_PRIORITY = {"INDIRECT": 0, "DIRECT": 1, "PARTNERSHIP": 2}

def normalize_role(x: str) -> str:
    """Map a free-text role description onto one canonical role label.

    The text is upper-cased and searched for marker substrings; the first
    marker found decides the label. Missing values and text without any
    marker yield "OTHER".
    """
    text = "" if pd.isna(x) else str(x).upper()
    # "INDIRECT" must be tested before "DIRECT" because it contains it.
    markers = (
        ("INDIRECT", "INDIRECT"),
        ("DIRECT", "DIRECT"),
        ("PARTNER", "PARTNERSHIP"),
    )
    for needle, label in markers:
        if needle in text:
            return label
    return "OTHER"

CORP_SUFFIX_RE = re.compile(
    r"\b(CORP(ORATION)?|INCORPORATED|INC|LLC|L\.L\.C\.|LP|L\.P\.|LLP|CO|COMPANY|HOLDINGS?|INVESTMENTS?)\b\.?",
    re.I
)

# Dotted spellings of the letter-only suffixes ("L.L.C", "L. P", "L.L.P").
# They have to be collapsed to "LLC" / "LP" / "LLP" *before* punctuation is
# stripped: once the periods become spaces the text reads "L L C" and
# CORP_SUFFIX_RE can no longer recognise it.
_DOTTED_SUFFIX_RE = re.compile(r"\bL\.\s*(?:L\.\s*[CP]|P)\b", re.I)

def norm_owner_name(name: str) -> str:
    """Return a normalized owner name used as a grouping / comparison key.

    Steps: upper-case and trim; collapse dotted corporate suffixes
    ("L.L.C." -> "LLC"); replace commas, periods and ampersands with spaces;
    remove the corporate suffixes matched by CORP_SUFFIX_RE; squeeze
    whitespace.

    Args:
        name: Raw owner name. May be missing (None / NaN / pd.NA).

    Returns:
        The normalized name, or "" when the input is missing, so a missing
        value never becomes the literal text "<NA>" or "NAN".
    """
    if pd.isna(name):
        return ""
    s = str(name).upper().strip()
    # Keep only the letters of a dotted suffix, e.g. "L. L. C" -> "LLC".
    s = _DOTTED_SUFFIX_RE.sub(lambda m: re.sub(r"[^A-Z]", "", m.group(0)), s)
    s = re.sub(r"[,\.&]", " ", s)
    s = CORP_SUFFIX_RE.sub("", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def json_list(values):
    """Serialize an iterable of scalars to a JSON array string.

    Missing entries (anything pd.isna reports as missing) are written as
    JSON null; non-ASCII characters are emitted as-is rather than escaped.
    """
    cleaned = [None if pd.isna(item) else item for item in values]
    return json.dumps(cleaned, ensure_ascii=False)

def make_snapshot_id(level: str, names_norm: list[str], pcts: list[float]) -> str:
    """Return a SHA-1 hex digest identifying one ownership composition.

    The digest is taken over the UTF-8 JSON encoding of the level, the
    normalized owner names and the percentages, in that key order. Missing
    percentages are encoded as null; all others are cast to float first.
    """
    clean_pcts = []
    for value in pcts:
        clean_pcts.append(float(value) if not pd.isna(value) else None)
    record = {"level": level, "names": names_norm, "pcts": clean_pcts}
    encoded = json.dumps(record, ensure_ascii=False).encode("utf-8")
    return hashlib.sha1(encoded).hexdigest()

# ---------------- Basic hygiene ----------------
# CCN: keep digits only, then left-pad with zeros to six characters.
# NOTE(review): removing every non-digit would also remove any letters in a
# CCN — confirm the source never contains alphanumeric CCNs.
df["cms_certification_number"] = (
    df["cms_certification_number"].astype("string").str.replace(r"\D", "", regex=True)
)
df["cms_certification_number"] = df["cms_certification_number"].str.zfill(6)

# Canonical role label and normalized owner key for every row.
df["role_norm"]  = df["role"].map(normalize_role)
df["owner_norm"] = df["owner_name"].map(norm_owner_name)

# Keep only roles we care about (the keys of ROLE_PRIORITY)
df = df.loc[df["role_norm"].isin(list(ROLE_PRIORITY))].copy()

# association_date falls back to processing_date wherever it is missing
# (or entirely, if the column is absent)
df["association_date"] = (
    df["association_date"].fillna(df["processing_date"])
    if "association_date" in df.columns
    else df["processing_date"]
)

# Robust % to numeric: drop "%" and thousands separators, trim, then coerce;
# anything that still fails to parse becomes NaN.
pct_raw = df["ownership_percentage"].astype(str)
pct_raw = pct_raw.str.replace("%", "", regex=False)
pct_raw = pct_raw.str.replace(",", "", regex=False)
pct_raw = pct_raw.str.strip()
df["pct_num"] = pd.to_numeric(pct_raw, errors="coerce")

# ---------------- Step 1: choose highest level per (CCN, association_date) ----------------
def highest_level_block(g: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *g* that belong to the highest ownership level present.

    Levels are tried in the order INDIRECT, DIRECT, PARTNERSHIP; a copy of
    the rows for the first level found is returned. If none of the three
    occurs, an empty slice of *g* (same columns) is returned.
    """
    roles = g["role_norm"]
    for level in ("INDIRECT", "DIRECT", "PARTNERSHIP"):
        at_level = roles == level
        if at_level.any():
            return g[at_level].copy()
    return g.iloc[0:0]  # empty

blocks = []
grouped = df.groupby(["cms_certification_number","association_date"], sort=True, as_index=False)
for (ccn, adate), snapshot_rows in grouped:
    top_rows = highest_level_block(snapshot_rows)
    if top_rows.empty:
        continue

    # One line per owner *within this snapshot/level*:
    # - pct: MAX observed (NOT sum), so repeated filings do not inflate it
    # - owner_name: first raw spelling encountered
    owners = top_rows.groupby(
        ["cms_certification_number","association_date","role_norm","owner_norm","owner_type"],
        dropna=False, as_index=False
    ).agg(
        pct=("pct_num","max"),
        owner_name=("owner_name","first"),
    )

    # Deterministic order so the packed lists stay aligned: % desc, then name asc
    owners = owners.sort_values(
        by=["pct","owner_norm"], ascending=[False, True], kind="mergesort"
    )

    level      = owners["role_norm"].iloc[0]   # every row shares this level
    names_norm = owners["owner_norm"].tolist()
    pcts_list  = owners["pct"].astype(float).tolist()

    # json_list() already writes missing percentages as null, and
    # snapshot_id hashes level + normalized names + percentages.
    blocks.append({
        "cms_certification_number": ccn,
        "association_date": pd.to_datetime(adate),
        "used_level": level,
        "owner_names": json_list(owners["owner_name"].tolist()),
        "name_norms": json_list(names_norm),
        "owner_types": json_list(owners["owner_type"].tolist()),
        "roles": json_list([level] * len(owners)),
        "ownership_percentages": json_list(pcts_list),
        "snapshot_id": make_snapshot_id(level, names_norm, pcts_list),
    })

snap = pd.DataFrame(blocks)
snap = snap.sort_values(["cms_certification_number","association_date"]).reset_index(drop=True)
print(f"[snapshots @ highest level] {len(snap):,} rows")

# ---------------- Step 2: drop exact duplicates except processing date ----------------
# "Exact duplicate composition" = same CCN + used_level + name_norms + ownership_percentages.
# Rows are ordered by date first, so keep="first" retains the earliest occurrence.
snap_unique = snap.sort_values(by=["cms_certification_number","association_date"])
snap_unique = snap_unique.drop_duplicates(
    subset=["cms_certification_number","used_level","name_norms","ownership_percentages"],
    keep="first",
)
snap_unique = snap_unique.reset_index(drop=True)
print(f"[after dedup composition] {len(snap_unique):,} rows")

# ---------------- Step 3: group numbering per CCN ----------------
# grp_n counts 1, 2, 3, ... within each CCN in association_date order.
snap_unique = snap_unique.sort_values(by=["cms_certification_number","association_date"])
snap_unique["grp_n"] = 1 + snap_unique.groupby("cms_certification_number").cumcount()

# ---------------- Step 4: pivot wide with grouped columns together ----------------
def pivot_block(s: pd.DataFrame, value_col: str, prefix: str):
    """Spread *value_col* into one column per group number.

    The result is indexed by cms_certification_number and has one column per
    distinct grp_n, labelled "<prefix><grp_n>".
    """
    table = s.pivot(index="cms_certification_number", columns="grp_n", values=value_col)
    labels = []
    for group_no in table.columns:
        labels.append(f"{prefix}{group_no}")
    table.columns = labels
    return table

# One wide table per snapshot field, all with columns "group1", "group2", ...
w_owners, w_pcts, w_roles, w_start, w_hash = (
    pivot_block(snap_unique, field, "group")
    for field in (
        "owner_names",
        "ownership_percentages",
        "roles",
        "association_date",
        "snapshot_id",
    )
)

def rename_block(df_block, suffix):
    """Return a copy of *df_block* with "_<suffix>" appended to every column name."""
    return df_block.rename(columns=lambda col: f"{col}_{suffix}")

# Tag each pivoted table's columns with its field name: group1 -> group1_owners, ...
owners_block = rename_block(w_owners, "owners")
pcts_block   = rename_block(w_pcts,   "pcts")
roles_block  = rename_block(w_roles,  "roles")
start_block  = rename_block(w_start,  "start")
id_block     = rename_block(w_hash,   "snapshot_id")

# Interleave so columns appear as group1_owners, group1_pcts, group1_roles, group1_start, group1_snapshot_id, group2_...
# The group numbers are read back from the owners column names ("group<N>_owners").
all_groups = sorted(
    {int(col.partition("_")[0].replace("group", "")) for col in owners_block.columns}
)
ordered_cols = ["cms_certification_number"]
for i in all_groups:
    ordered_cols.extend(
        f"group{i}_{field}"
        for field in ("owners", "pcts", "roles", "start", "snapshot_id")
    )

wide = pd.concat([owners_block, pcts_block, roles_block, start_block, id_block], axis=1)
wide = wide.reset_index()
# Keep only the columns we constructed above (in correct order)
present_cols = [c for c in ordered_cols if c in wide.columns]
wide = wide.reindex(columns=present_cols)

# ---------------- Save ----------------
# Datetime columns (the groupN_start values) are written as ISO dates.
wide.to_csv(OUT_SIG, index=False, date_format="%Y-%m-%d")
print(f"[save] {OUT_SIG}  (rows={len(wide):,})")

# ---------------- Quick sanity peek ----------------
# Show the first three facilities with cell text capped at 100 characters.
with pd.option_context("display.max_colwidth", 100):
    print("\n[head]")
    preview = wide.head(3)
    print(preview.to_string(index=False))

# ---------------- Optional QC: pct totals (uncomment to inspect) ----------------
# tmp = wide[["cms_certification_number"] + [c for c in wide.columns if c.endswith("_pcts")]].copy()
# def _totals(js):
#     if pd.isna(js): return np.nan
#     try:
#         arr = json.loads(js)
#         return float(np.nansum([float(x) if x is not None else np.nan for x in arr]))
#     except Exception:
#         return np.nan
# for c in [c for c in tmp.columns if c.endswith("_pcts")]:
#     tmp[c.replace("_pcts","_total")] = tmp[c].apply(_totals)
# keep_cols = ["cms_certification_number"] + [c for c in tmp.columns if c.endswith("_total")]
# print("\n[pct totals (first few facilities)]")
# print(tmp[keep_cols].head(5).to_string(index=False))

[load] C:\Users\Owner\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv
[loaded] rows=5,075,773, cols=7
[snapshots @ highest level] 40,927 rows
[after dedup composition] 39,601 rows
[save] C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_grouped.jsonlists.csv  (rows=14,075)

[head]
cms_certification_number                                                                                           group1_owners              group1_pcts                                     group1_roles group1_start                       group1_snapshot_id        group2_owners group2_pcts group2_roles group2_start                       group2_snapshot_id group3_owners group3_pcts group3_roles group3_start group3_snapshot_id group4_owners group4_pcts group4_roles group4_start group4_snapshot_id group5_owners group5_pcts group5_roles group5_start group5_snapshot_id group6_owners group6_pcts group6_roles group6_start group6_snapshot_id group7_owners group7_pcts group7_roles grou