In [1]:
# ==============================================================================
# Facility Signatures v3 — Stable ownership groups for CHOW analysis
#   Input : data/raw/ownership-files/ownership_combined.csv
#   Output: data/interim/facility_signatures_v3.jsonlists.csv
#
# Design:
#  - Prefer INDIRECT owners; when present, APPEND DIRECT owners (stable vs level churn)
#  - Collapse duplicate owners within snapshot; % = MAX; round; drop tiny stakes
#  - Across time: tolerant equality (owner set same & pct diffs ≤ PCT_TOL)
#  - New group only if turnover ≥ GROUP_CHANGE_THRESH (≥50% by default)
#  - Wide output with JSON lists: groupN_owners, groupN_pcts, groupN_roles, groupN_start, groupN_snapshot_id
# ==============================================================================

import os, re, json, hashlib
import numpy as np
import pandas as pd
from pathlib import Path

# ---------------- Configuration ----------------
# All percentages below are expressed in percentage points (0-100 scale).
PCT_ROUND = 1.0          # round each owner's pct to nearest X% (e.g., 1.0 or 0.5); None or <= 0 disables the rounding step
MIN_PCT   = 1.0          # drop owners with pct < MIN_PCT (applied after rounding)
PCT_TOL   = 2.0          # tolerant equality: max per-owner pct difference (in points) still treated as "same composition"
GROUP_CHANGE_THRESH = 50.0  # require turnover ≥ this (0-100) to promote a new group

# Role priority & merge behavior
# NOTE(review): ROLE_PRIORITY is not referenced anywhere in the visible part of this file;
# the INDIRECT > DIRECT > PARTNERSHIP preference is hard-coded in the snapshot loop's if/elif chain.
ROLE_PRIORITY = {"INDIRECT": 0, "DIRECT": 1, "PARTNERSHIP": 2}
APPEND_DIRECT_WHEN_INDIRECT = True  # if INDIRECT exists, append DIRECT into same snapshot

# ---------------- Paths ----------------
# Walk upward from the current working directory until a directory that
# contains a "data" sub-directory is found. The loop also stops at the
# filesystem root (where a path equals its own parent); in that case
# PROJECT_ROOT is left pointing at the root.
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

# NH_DATA_DIR (environment variable) overrides only the raw-data location;
# the interim directory is always resolved under PROJECT_ROOT.
RAW_DIR     = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
OWN_DIR     = RAW_DIR / "ownership-files"
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
# Side effect at import/run time: creates the interim directory if it is missing.
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

IN_COMBINED = OWN_DIR / "ownership_combined.csv"
OUT_SIG     = INTERIM_DIR / "facility_signatures_v3.jsonlists.csv"

print(f"[load] {IN_COMBINED}")
# Identifier and label columns are forced to pandas "string" dtype so they are
# not type-inferred as numbers; the two date columns are parsed to datetimes.
# ownership_percentage is left to pandas' inference and cleaned later.
df = pd.read_csv(
    IN_COMBINED,
    dtype={"cms_certification_number": "string", "role": "string",
           "owner_type": "string", "owner_name": "string"},
    parse_dates=["association_date","processing_date"],
    low_memory=False,
)
print(f"[loaded] rows={len(df):,}, cols={df.shape[1]}")

# ---------------- Helpers ----------------
def normalize_role(x: str) -> str:
    """Map a free-text role label onto one of the coarse ownership levels.

    Matching is case-insensitive and substring based. Returns "INDIRECT",
    "DIRECT", "PARTNERSHIP", or "OTHER" (also used for missing values).
    """
    if not pd.notna(x):
        return "OTHER"
    text = str(x).upper()
    # Order matters: "INDIRECT" contains "DIRECT", so it must be tested first.
    keyword_to_level = (
        ("INDIRECT", "INDIRECT"),
        ("DIRECT", "DIRECT"),
        ("PARTNER", "PARTNERSHIP"),
    )
    for keyword, level in keyword_to_level:
        if keyword in text:
            return level
    return "OTHER"

CORP_SUFFIX_RE = re.compile(
    r"\b(CORP(ORATION)?|INCORPORATED|INC|LLC|L\.L\.C\.|LP|L\.P\.|LLP|CO|COMPANY|HOLDINGS?|INVESTMENTS?)\b\.?",
    re.I
)
# Dotted legal-form abbreviations (L.L.C., L.L.P., L.P.). These must be removed
# BEFORE punctuation is stripped: once the dots become spaces the abbreviation
# turns into stray single letters ("L L C") that CORP_SUFFIX_RE cannot match.
# The look-arounds keep the match from firing inside longer dotted initials.
_DOTTED_SUFFIX_RE = re.compile(
    r"(?<![A-Z0-9.])(?:L\.\s?L\.\s?[CP]|L\.\s?P)\.?(?![A-Z0-9.])",
    re.I,
)


def norm_owner_name(name: str) -> str:
    """Return a canonical key for an owner name.

    Steps: upper-case and trim, drop dotted legal-form abbreviations, replace
    ``, . &`` with spaces, drop corporate suffix words matched by
    ``CORP_SUFFIX_RE``, and collapse whitespace.

    Missing values (None / NaN / pd.NA) return an empty string, so they are not
    turned into the literal text "NAN" or "<NA>", which could collide with a
    real owner name. This matches how ``normalize_role`` treats missing input.
    """
    if pd.isna(name):
        return ""
    s = str(name).upper().strip()
    s = _DOTTED_SUFFIX_RE.sub(" ", s)
    s = re.sub(r"[,\.&]", " ", s)
    s = CORP_SUFFIX_RE.sub("", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def json_list(values):
    """Serialize an iterable of scalars as a JSON array string.

    Missing scalars (None / NaN / pd.NA) are emitted as JSON ``null``;
    non-ASCII characters are written as-is rather than escaped.
    """
    cleaned = [None if pd.isna(item) else item for item in values]
    return json.dumps(cleaned, ensure_ascii=False)

def make_snapshot_id(level: str, names_norm: list[str], pcts: list[float]) -> str:
    """Return a SHA-1 hex digest identifying one ownership composition.

    The digest covers the level, the normalized owner names, and the
    percentages (missing percentages are encoded as JSON null), all in the
    order given.
    """
    clean_pcts = []
    for value in pcts:
        clean_pcts.append(None if pd.isna(value) else float(value))
    record = {"level": level, "names": names_norm, "pcts": clean_pcts}
    encoded = json.dumps(record, ensure_ascii=False).encode("utf-8")
    return hashlib.sha1(encoded).hexdigest()

# Turnover metrics
def pct_turnover(prev_map: dict, next_map: dict) -> float:
    """Return percentage-weighted ownership turnover between two snapshots.

    Parameters
    ----------
    prev_map, next_map : dict
        Owner key -> ownership percentage (points). ``None``, ``0`` and NaN
        stakes all count as 0.

    Returns
    -------
    float
        ``100 * (1 - overlap / base)`` clipped to [0, 100], where ``overlap``
        is the sum over owners of ``min(prev, next)`` and ``base`` is the
        larger of the two snapshot totals, floored at 100. The floor means a
        snapshot whose stakes sum to less than 100 is still measured against a
        full 100 points, and it guarantees a non-zero denominator.
    """
    def _stake(value) -> float:
        # A NaN stake would otherwise propagate through min()/max() and make
        # the whole result NaN; treat it like a missing (zero) stake.
        number = float(value or 0)
        return 0.0 if np.isnan(number) else number

    prev = {owner: _stake(v) for owner, v in prev_map.items()}
    nxt = {owner: _stake(v) for owner, v in next_map.items()}

    # Owners present on only one side contribute min(x, 0) == 0, so only the
    # shared owners need to be visited.
    overlap = sum(min(prev[owner], nxt[owner]) for owner in prev.keys() & nxt.keys())
    base = max(sum(prev.values()), sum(nxt.values()), 100.0)
    return float(np.clip(100.0 * (1.0 - overlap / base), 0.0, 100.0))

def names_turnover(prev_set: set, next_set: set) -> float:
    """Return name-based turnover: 100 * (1 - |intersection| / |union|).

    Two empty sets count as no turnover (0.0). The result lies in [0, 100].
    """
    total = len(prev_set | next_set)
    if total == 0:
        return 0.0
    shared = len(prev_set & next_set)
    changed_share = 100.0 * (1 - shared / total)
    return float(np.clip(changed_share, 0.0, 100.0))

# Tolerant equality between two compositions
def equivalent_compositions(prev_names, prev_pcts, next_names, next_pcts, pct_tol=PCT_TOL) -> bool:
    """Decide whether two ownership compositions count as the same.

    The owner-name sets must match exactly. If either percentage list is
    ``None`` or contains a ``None`` entry, matching names alone is sufficient.
    Otherwise every owner's percentage must differ by at most ``pct_tol``
    points between the two snapshots.
    """
    owners = set(prev_names)
    if owners != set(next_names):
        return False

    # Without complete percentages on both sides, fall back to names only.
    if prev_pcts is None or next_pcts is None:
        return True
    if any(value is None for side in (prev_pcts, next_pcts) for value in side):
        return True

    before = dict(zip(prev_names, map(float, prev_pcts)))
    after = dict(zip(next_names, map(float, next_pcts)))
    # Written as "no owner exceeds the tolerance" (rather than "all within")
    # so that a NaN difference is not treated as a mismatch.
    return not any(
        abs(before.get(owner, 0.0) - after.get(owner, 0.0)) > pct_tol
        for owner in owners
    )

# ---------------- Basic hygiene ----------------
# CCN: keep digits only, then left-pad with zeros to six characters.
ccn_digits = df["cms_certification_number"].astype("string").str.replace(r"\D", "", regex=True)
df["cms_certification_number"] = ccn_digits.str.zfill(6)

# Derived keys used by the snapshot builder.
df["role_norm"]  = df["role"].map(normalize_role)
df["owner_norm"] = df["owner_name"].map(norm_owner_name)

# association_date: fall back to processing_date where missing (or use it
# outright if the column does not exist).
if "association_date" not in df.columns:
    df["association_date"] = df["processing_date"]
else:
    df["association_date"] = df["association_date"].fillna(df["processing_date"])

# Ownership percentage: strip "%" signs, thousands separators and padding,
# then coerce to float (unparseable values become NaN).
pct_raw = df["ownership_percentage"].astype(str)
pct_raw = pct_raw.str.replace("%", "", regex=False)
pct_raw = pct_raw.str.replace(",", "", regex=False)
pct_raw = pct_raw.str.strip()
df["pct_num_raw"] = pd.to_numeric(pct_raw, errors="coerce")

# ---------------- Build per (CCN, assoc_date) snapshot with level handling ----------------
# Column order of the snapshot table; also lets an empty result be sorted.
_SNAP_COLUMNS = [
    "cms_certification_number", "association_date", "used_level",
    "owner_names", "name_norms", "owner_types", "roles", "ownership_percentages",
]


def _select_snapshot_rows(g):
    """Choose the ownership level for one (CCN, association_date) group.

    Preference is INDIRECT, then DIRECT, then PARTNERSHIP. When INDIRECT is
    used and APPEND_DIRECT_WHEN_INDIRECT is set, DIRECT rows are appended after
    the INDIRECT rows. Returns ``(used_level, rows)``, or ``(None, None)`` if
    no recognized level is present.
    """
    levels_present = set(g["role_norm"].dropna().unique().tolist())
    if "INDIRECT" in levels_present:
        used_level = "INDIRECT"
        wanted = ["INDIRECT"]
        if APPEND_DIRECT_WHEN_INDIRECT and "DIRECT" in levels_present:
            wanted.append("DIRECT")
    elif "DIRECT" in levels_present:
        used_level, wanted = "DIRECT", ["DIRECT"]
    elif "PARTNERSHIP" in levels_present:
        used_level, wanted = "PARTNERSHIP", ["PARTNERSHIP"]
    else:
        return None, None
    sel = pd.concat([g[g["role_norm"] == role] for role in wanted], ignore_index=True)
    return used_level, sel


def _collapse_owners(sel):
    """Collapse duplicate owners, round percentages and drop small/missing stakes.

    Owners are keyed by (owner_norm, owner_type); pct is the MAX over
    duplicates, and owner_name / role come from the first row. Because INDIRECT
    rows precede DIRECT rows in ``sel``, an owner present at both levels keeps
    the INDIRECT role.
    """
    agg = (sel.groupby(["owner_norm", "owner_type"], dropna=False, as_index=False)
              .agg(owner_name=("owner_name", "first"),
                   pct=("pct_num_raw", "max"),
                   role=("role_norm", "first")))
    agg["pct"] = agg["pct"].round(3)  # tidy float noise before bucket rounding
    if PCT_ROUND is not None and PCT_ROUND > 0:
        agg["pct"] = np.round(agg["pct"] / PCT_ROUND) * PCT_ROUND
    # NaN >= MIN_PCT is False, so missing percentages are dropped here too.
    return agg[agg["pct"] >= MIN_PCT].copy()


snap_rows = []
for (ccn, adate), g in df.groupby(["cms_certification_number","association_date"], sort=True):
    if pd.isna(ccn) or pd.isna(adate):
        continue

    used_level, sel = _select_snapshot_rows(g)
    if used_level is None:
        # no recognized level → skip
        continue

    agg = _collapse_owners(sel)

    if agg.empty:
        # Everything was tiny/NaN: keep the distinct names with pct=None so the
        # snapshot can still be compared by names only.
        names_norm = sorted(set(sel["owner_norm"].dropna().tolist()))
        names_raw  = names_norm  # fallback raw ≈ norm
        first_role = {}
        for owner, role in zip(sel["owner_norm"], sel["role_norm"]):
            first_role.setdefault(owner, role)
        roles_list = [first_role.get(n, used_level) for n in names_norm]
        types_list = [None] * len(names_norm)
        pcts_list  = [None] * len(names_norm)
    else:
        # Sort by pct desc, then name asc — stable
        agg = agg.sort_values(["pct","owner_norm"], ascending=[False, True], kind="mergesort").reset_index(drop=True)
        names_norm = agg["owner_norm"].tolist()
        names_raw  = agg["owner_name"].tolist()
        # Per-owner role: DIRECT owners appended to an INDIRECT snapshot keep
        # their own label instead of all being stamped with used_level.
        roles_list = agg["role"].tolist()
        types_list = agg["owner_type"].tolist()
        pcts_list  = agg["pct"].astype(float).tolist()

    snap_rows.append({
        "cms_certification_number": ccn,
        "association_date": pd.to_datetime(adate),
        "used_level": used_level,
        "owner_names": json_list(names_raw),
        "name_norms": json_list(names_norm),
        "owner_types": json_list(types_list),
        "roles": json_list(roles_list),
        "ownership_percentages": json_list([None if pd.isna(x) else float(x) for x in pcts_list]),
    })

# Passing explicit columns keeps sort_values from raising KeyError when no
# snapshot was produced.
snap = (pd.DataFrame(snap_rows, columns=_SNAP_COLUMNS)
          .sort_values(["cms_certification_number","association_date"])
          .reset_index(drop=True))
print(f"[snapshots built] {len(snap):,} rows")

# ---------------- Build stable groups per CCN with tolerant equality + turnover rule ----------------
def _parse(js):
    try:
        return json.loads(js) if isinstance(js, str) else (js if isinstance(js, list) else [])
    except Exception:
        return []

# Column order of the long group table; also lets an empty result be sorted.
_GROUP_COLUMNS = [
    "cms_certification_number", "group_n", "owners", "names",
    "pcts", "roles", "start", "snapshot_id",
]


def _snapshot_fields(row):
    """Decode (normalized names, percentages, level) from one snapshot row."""
    names = _parse(row["name_norms"])
    pcts = [None if x is None else float(x) for x in _parse(row["ownership_percentages"])]
    return names, pcts, str(row["used_level"])


def _usable_pct_map(names, pcts):
    """Return {owner: pct} if every owner has a percentage, otherwise None."""
    if not pcts or any(p is None for p in pcts):
        return None
    return dict(zip(names, pcts)) or None


def _group_record(ccn, group_n, row, names, pcts, level):
    """Build one long-format group record from the snapshot that starts it."""
    # Carry the snapshot's per-owner roles; fall back to the snapshot level if
    # the stored list is missing or does not line up with the owner list.
    roles = _parse(row["roles"])
    if len(roles) != len(names):
        roles = [level] * len(names)
    return {
        "cms_certification_number": ccn,
        "group_n": group_n,
        "owners": json_list(_parse(row["owner_names"])),
        "names": json_list(names),
        "pcts": json_list(pcts),
        "roles": json_list(roles),
        "start": pd.to_datetime(row["association_date"]),
        # Same payload the per-CCN inline hasher used, now via the shared helper.
        "snapshot_id": make_snapshot_id(level, names, pcts),
    }


groups = []
for ccn, g in snap.groupby("cms_certification_number", sort=True):
    g = g.sort_values("association_date").reset_index(drop=True)

    # Seed with first snapshot
    first = g.iloc[0]
    cur_names, cur_pcts, cur_level = _snapshot_fields(first)
    grp_n = 1
    groups.append(_group_record(ccn, grp_n, first, cur_names, cur_pcts, cur_level))

    # Walk forward
    for _, row in g.iloc[1:].iterrows():
        nxt_names, nxt_pcts, nxt_level = _snapshot_fields(row)

        # Same owners with percentages within tolerance → same group, keep the
        # current reference composition untouched.
        if equivalent_compositions(cur_names, cur_pcts, nxt_names, nxt_pcts, pct_tol=PCT_TOL):
            continue

        # Not equivalent — measure turnover by percentage when both sides have
        # complete percentages, else by owner names.
        prev_map = _usable_pct_map(cur_names, cur_pcts)
        next_map = _usable_pct_map(nxt_names, nxt_pcts)
        if prev_map is not None and next_map is not None:
            turn = pct_turnover(prev_map, next_map)
        else:
            turn = names_turnover(set(cur_names), set(nxt_names))

        if turn >= GROUP_CHANGE_THRESH:
            # Promote to a NEW group
            grp_n += 1
            groups.append(_group_record(ccn, grp_n, row, nxt_names, nxt_pcts, nxt_level))

        # Either way the latest snapshot becomes the reference. Note that for
        # below-threshold changes this means a series of small changes is
        # compared step by step, not against the group's first snapshot.
        cur_names, cur_pcts, cur_level = nxt_names, nxt_pcts, nxt_level

# Passing explicit columns keeps sort_values from raising KeyError when there
# are no groups at all.
groups_long = (pd.DataFrame(groups, columns=_GROUP_COLUMNS)
                 .sort_values(["cms_certification_number","group_n"])
                 .reset_index(drop=True))
print(f"[groups] {len(groups_long):,} group records across CCNs")

# ---------------- Pivot wide (grouped columns together) ----------------
def pivot_block(s: pd.DataFrame, value_col: str, prefix: str):
    """Pivot long group records to one row per CCN and one column per group.

    Columns are labelled ``f"{prefix}{group_n}"``; cells hold ``value_col``.
    """
    table = s.pivot(index="cms_certification_number", columns="group_n", values=value_col)
    labels = []
    for group_no in table.columns:
        labels.append(f"{prefix}{group_no}")
    table.columns = labels
    return table

# One wide table per attribute, all with columns group1, group2, ...
w_owners, w_pcts, w_roles, w_start, w_hash = (
    pivot_block(groups_long, value_col, "group")
    for value_col in ("owners", "pcts", "roles", "start", "snapshot_id")
)

def rename_block(df_block, suffix):
    """Return a copy of ``df_block`` with ``_<suffix>`` appended to every column name."""
    mapping = {}
    for col in df_block.columns:
        mapping[col] = f"{col}_{suffix}"
    return df_block.rename(columns=mapping)

# Tag each attribute table's columns with its attribute name (groupN_<attr>).
owners_block, pcts_block, roles_block, start_block, id_block = (
    rename_block(block, suffix)
    for block, suffix in (
        (w_owners, "owners"),
        (w_pcts, "pcts"),
        (w_roles, "roles"),
        (w_start, "start"),
        (w_hash, "snapshot_id"),
    )
)

# Recover the group numbers from the owners columns ("group<N>_owners").
group_numbers = {int(col.replace("group","").split("_")[0]) for col in owners_block.columns}
all_groups = sorted(group_numbers)

# Final column order: CCN first, then each group's five attributes together.
ordered_cols = ["cms_certification_number"]
for n in all_groups:
    ordered_cols.extend([
        f"group{n}_owners",
        f"group{n}_pcts",
        f"group{n}_roles",
        f"group{n}_start",
        f"group{n}_snapshot_id",
    ])

combined = pd.concat([owners_block, pcts_block, roles_block, start_block, id_block], axis=1)
wide = combined.reset_index()
present_cols = [col for col in ordered_cols if col in wide.columns]
wide = wide.reindex(columns=present_cols)

# ---------------- Save ----------------
# Write the wide table; datetime cells (the groupN_start columns) are
# formatted as YYYY-MM-DD.
wide.to_csv(OUT_SIG, index=False, date_format="%Y-%m-%d")
print(f"[save] {OUT_SIG}  (rows={len(wide):,})")

# ---------------- Quick QC ----------------
# Print the first three rows, truncating each cell to 100 characters.
print("\nQC — examples:")
with pd.option_context("display.max_colwidth", 100):
    print(wide.head(3).to_string(index=False))

# Distribution of number of groups per CCN
# (the highest group_n recorded for a CCN is used as that CCN's group count)
grp_counts = groups_long.groupby("cms_certification_number")["group_n"].max()
print("\nGroups per facility — describe():")
print(grp_counts.describe().to_string())
print("Top 10 facilities by groups:")
print(grp_counts.sort_values(ascending=False).head(10).to_string())

[load] C:\Users\wrthj\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv
[loaded] rows=5,075,773, cols=7
[snapshots built] 40,927 rows
[groups] 38,978 group records across CCNs
[save] C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_v3.jsonlists.csv  (rows=14,075)

QC — examples:
cms_certification_number                                                                         group1_owners              group1_pcts                                     group1_roles group1_start                       group1_snapshot_id        group2_owners group2_pcts group2_roles group2_start                       group2_snapshot_id group3_owners group3_pcts group3_roles group3_start group3_snapshot_id group4_owners group4_pcts group4_roles group4_start group4_snapshot_id group5_owners group5_pcts group5_roles group5_start group5_snapshot_id group6_owners group6_pcts group6_roles group6_start group6_snapshot_id group7_owners group7_pcts group7_roles group7_start group7_snaps