In [1]:
# ──────────────────────────────────────────────────────────────────────────────
# CHOW Engine for Facility Signatures (long form)
#   - Primary turnover: percent-weight overlap
#   - Fallback: names (Jaccard)
#   - Surname override: prevent family handoffs from counting as CHOWs
#   - Buckets preserved; study window filter
#
# Inputs:
#   <PROJECT>/data/interim/facility_signatures_long.csv
#
# Outputs:
#   <PROJECT>/data/interim/ccn_group_transitions.csv
#   <PROJECT>/data/interim/ccn_chow_summary.csv
# ──────────────────────────────────────────────────────────────────────────────

import os, re, json, pathlib
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

# ------------------------------- Paths ----------------------------------------
PROJECT_ROOT = pathlib.Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

IN_LONG   = INTERIM_DIR / "facility_signatures_long.csv"
OUT_TRANS = INTERIM_DIR / "ccn_group_transitions.csv"
OUT_SUM   = INTERIM_DIR / "ccn_chow_summary.csv"

print("[INPUT ]", IN_LONG)
print("[OUTPUT]", OUT_TRANS)
print("[OUTPUT]", OUT_SUM)

# ------------------------------- Config ---------------------------------------
THRESH_CHOW      = 50.0
CUTOFF_DATE      = pd.Timestamp("2017-01-01")
BUCKET_LABELS    = {
    0: "<50",
    1: "90–100",
    2: "80–90",
    3: "70–80",
    4: "60–70",
    5: "50–60",
    6: "inconclusive"
}

# Surname override controls
ORG_MARKERS_RE   = re.compile(r"\b(LLC|INC|CORP|CORPORATION|L\.L\.C\.|L\.P\.|LP|LLP|PLC|COMPANY|CO\.?|HOLDINGS?|GROUP|TRUST|FUND|CAPITAL|PARTNERS(hip)?|HEALTH|CARE|AUTHORITY|HOSPITAL|CENTER|NURSING|HOME|OPERATING|MANAGEMENT)\b", re.I)
TOKEN_RE         = re.compile(r"[^\w\s]")
SURNAME_MIN_FRACTION_KEEP = 0.80   # if ≥80% of ownership (by % weight) remains within same surname family, don't CHOW
USE_SURNAME_OVERRIDE       = True

# ------------------------------- Helpers --------------------------------------
def parse_list(j):
    """Parse JSON list from string; return [] if bad/NA."""
    try:
        if pd.isna(j): return []
        out = json.loads(j)
        return out if isinstance(out, list) else []
    except Exception:
        return []

def weight_map(names_list, pcts_list):
    """Return dict: owner -> pct (floats)."""
    wm = defaultdict(float)
    for n, p in zip(names_list, pcts_list):
        try:
            f = float(p)
        except Exception:
            continue
        if pd.isna(f):
            continue
        wm[str(n)] += f
    return dict(wm)

def pct_overlap(prev_map, curr_map):
    """Percent overlap in [0, 100]. Assumes both sides ~sum to 100 but handles drift."""
    if not prev_map and not curr_map:
        return np.nan
    owners = set(prev_map) | set(curr_map)
    overlap = sum(min(prev_map.get(o, 0.0), curr_map.get(o, 0.0)) for o in owners)
    denom   = max(sum(prev_map.values()), sum(curr_map.values()), 100.0)
    return max(0.0, min(100.0 * overlap / denom, 100.0))

def jaccard_names(prev_names, curr_names):
    a, b = set(prev_names), set(curr_names)
    if not a and not b:
        return np.nan
    inter = len(a & b)
    union = len(a | b) or 1
    return inter / union

def looks_like_person(name: str) -> bool:
    """Heuristic: treat as person if no org markers and has 1–3 word tokens."""
    if not name or ORG_MARKERS_RE.search(name):
        return False
    toks = TOKEN_RE.sub(" ", str(name)).split()
    toks = [t for t in toks if t]
    return 1 <= len(toks) <= 3

def surname_of(name: str) -> str:
    """Very simple surname getter: last token of a person-like name; else ''."""
    toks = TOKEN_RE.sub(" ", str(name)).split()
    toks = [t for t in toks if t]
    return toks[-1].upper() if toks else ""

def surname_weight_map(wm: dict) -> dict:
    """Aggregate owner weights by surname for person-like owners; others grouped as org_* buckets."""
    agg = defaultdict(float)
    for n, p in wm.items():
        if looks_like_person(n):
            s = surname_of(n)
            if s:
                agg[s] += p
            else:
                agg["_PERSON_"] += p
        else:
            agg["_ORG_"] += p
    return dict(agg)

def surname_family_overlap(prev_wm: dict, curr_wm: dict) -> float:
    """Surname-level % overlap in [0,100] using surname-weight maps."""
    ps = surname_weight_map(prev_wm)
    cs = surname_weight_map(curr_wm)
    owners = set(ps) | set(cs)
    overlap = sum(min(ps.get(k, 0.0), cs.get(k, 0.0)) for k in owners)
    denom   = max(sum(ps.values()), sum(cs.values()), 100.0)
    return max(0.0, min(100.0 * overlap / denom, 100.0))

def bucket_code(turnover_pct: float) -> int:
    if pd.isna(turnover_pct): return 6
    t = float(turnover_pct)
    if t < 50:  return 0
    if t >= 90: return 1
    if t >= 80: return 2
    if t >= 70: return 3
    if t >= 60: return 4
    return 5  # 50–60

# ------------------------------- Load -----------------------------------------
long = pd.read_csv(IN_LONG, dtype={"cms_certification_number":"string"}, low_memory=False)
# Parse dates & lists
long["start"] = pd.to_datetime(long["start"], errors="coerce")
long["end"]   = pd.to_datetime(long["end"],   errors="coerce")
long["names"] = long["names_list"].apply(parse_list)
long["pcts"]  = long["pcts_list"].apply(parse_list)

# Defensive: ensure lists align (length mismatch should be rare here)
bad_align = (long["names"].str.len() != long["pcts"].str.len())
if int(bad_align.sum()) > 0:
    print(f"[warn] rows with names/pcts length mismatch: {int(bad_align.sum())}")

# -------------------------- Build transitions per CCN -------------------------
rows = []

for ccn, g in long.groupby("cms_certification_number", sort=True):
    g = g.sort_values("start").reset_index(drop=True)
    if len(g) < 2:
        continue
    for i in range(1, len(g)):
        prev = g.loc[i-1]
        curr = g.loc[i]
        from_start = prev["start"]
        to_start   = curr["start"]

        prev_w = weight_map(prev["names"], prev["pcts"])
        curr_w = weight_map(curr["names"], curr["pcts"])

        # Primary: percent-based turnover
        ov_pct    = pct_overlap(prev_w, curr_w)                # [0,100]
        turn_pct  = None if pd.isna(ov_pct) else (100.0 - ov_pct)
        method    = 0  # 0 = percent-based, 1 = names-based

        # Fallback: names Jaccard if percent not available (should be rare)
        if pd.isna(turn_pct):
            jacc = jaccard_names(prev["names"], curr["names"])
            turn_pct = None if pd.isna(jacc) else (100.0 * (1.0 - jacc))
            method = 1

        # Inconclusive if still NaN
        inconclusive = pd.isna(turn_pct)

        # Surname override: if same family keeps majority control, don't CHOW
        surname_keep_pct = surname_family_overlap(prev_w, curr_w)  # [0,100]
        surname_override = False
        if USE_SURNAME_OVERRIDE and not pd.isna(turn_pct):
            if surname_keep_pct >= 100.0 * SURNAME_MIN_FRACTION_KEEP:
                surname_override = True

        # Bucket
        bcode = bucket_code(turn_pct)

        # CHOW decision (study window + threshold + not overridden)
        is_in_window = (pd.notna(to_start) and to_start >= CUTOFF_DATE)
        is_chow = bool(is_in_window and (not inconclusive) and (turn_pct >= THRESH_CHOW) and (not surname_override))

        rows.append({
            "cms_certification_number": ccn,
            "from_group": int(prev["group_n"]),
            "to_group":   int(curr["group_n"]),
            "from_start": from_start,
            "from_end":   prev["end"],
            "to_start":   to_start,
            "to_end":     curr["end"],
            "from_level": prev["source_level"],
            "to_level":   curr["source_level"],
            "turnover_pct": None if pd.isna(turn_pct) else round(float(turn_pct), 1),
            "overlap_pct":  None if pd.isna(ov_pct)   else round(float(ov_pct), 1),
            "method":       method,              # 0=percent, 1=names
            "bucket_code":  bcode,
            "bucket_label": BUCKET_LABELS[bcode],
            "surname_keep_pct": round(float(surname_keep_pct), 1) if not pd.isna(surname_keep_pct) else np.nan,
            "surname_override": surname_override,
            "inconclusive": inconclusive,
            "is_chow": is_chow
        })

trans = pd.DataFrame(rows).sort_values(
    ["cms_certification_number","to_start","to_group"]
).reset_index(drop=True)

# -------------------------- Save transitions ----------------------------------
trans.to_csv(OUT_TRANS, index=False)
print(f"[save] transitions: {OUT_TRANS}  rows={len(trans):,}")

# Universe of CCNs (even those with 0 transitions)
all_ccns = (
    long[["cms_certification_number","start"]]
    .groupby("cms_certification_number", as_index=False)
    .agg(first_seen=("start","min"))
)

# Helper: summarize a single CCN's transition records (may be empty)
def summarize_ccn_safe(ccn, df_ccn, max_events=12):
    out = {
        "cms_certification_number": ccn,
        "num_chows": 0,
        "first_seen_month": pd.NaT,
        "present_at_start": False,
        "entered_after_start": False,
    }
    # bucket counts 0..6
    for k in range(0,7):
        out[f"bucket_{k}_count"] = 0

    if df_ccn is None or df_ccn.empty:
        return out  # zero CHOWs, no transitions

    df_ccn = df_ccn.sort_values("to_start")
    # bucket histogram (all transitions, including <2017 for diagnostics)
    bc = df_ccn["bucket_code"].value_counts().to_dict()
    for k in range(0,7):
        out[f"bucket_{k}_count"] = int(bc.get(k, 0))

    # CHOWs in the study window
    chow = df_ccn[df_ccn["is_chow"]].copy()
    out["num_chows"] = int(chow.shape[0])

    # dates/magnitudes/methods up to max_events
    for i, (_, r) in enumerate(chow.head(max_events).iterrows(), start=1):
        out[f"chow_date_{i}"]      = r["to_start"]
        out[f"chow_magnitude_{i}"] = r.get("turnover_pct", np.nan)
        out[f"chow_method_{i}"]    = r.get("method", np.nan)       # 0=percent,1=names
        out[f"chow_inconcl_{i}"]   = bool(r.get("inconclusive", False))

    return out

# Build a dict of transitions by CCN for quick lookup (may be empty for some)
trans_by_ccn = {k: v.copy() for k, v in trans.groupby("cms_certification_number")} if not trans.empty else {}

summary_rows = []
for _, row in all_ccns.iterrows():
    ccn = row["cms_certification_number"]
    s   = summarize_ccn_safe(ccn, trans_by_ccn.get(ccn))
    # fill first-seen / present-at-start flags from long file
    first_seen = long.loc[long["cms_certification_number"]==ccn, "start"].min()
    s["first_seen_month"]  = first_seen.to_period("M").to_timestamp("M") if pd.notna(first_seen) else pd.NaT
    s["present_at_start"]  = bool(pd.notna(first_seen) and (first_seen <  CUTOFF_DATE))
    s["entered_after_start"] = bool(pd.notna(first_seen) and (first_seen >= CUTOFF_DATE))
    summary_rows.append(s)

summary = (
    pd.DataFrame(summary_rows)
      .sort_values("cms_certification_number")
      .reset_index(drop=True)
)
summary.to_csv(OUT_SUM, index=False)
print(f"[save] summary(all CCNs): {OUT_SUM}  rows={len(summary):,}")

# --- LITE EXPORT (always all CCNs) ---
chow_date_cols = [c for c in summary.columns if c.startswith("chow_date_")]
lite = summary[["cms_certification_number","num_chows"] + chow_date_cols].copy()
lite["is_chow"] = (lite["num_chows"] > 0).astype(int)
lite = lite[["cms_certification_number","num_chows","is_chow"] + chow_date_cols]
OUT_LITE = INTERIM_DIR / "ccn_chow_lite.csv"
lite.to_csv(OUT_LITE, index=False)
print(f"[save] lite (all CCNs): {OUT_LITE}  rows={len(lite):,}  cols={len(lite.columns)}")

# --- Quick sanity:
print("[diag] CCNs in long:", long['cms_certification_number'].nunique())
print("[diag] CCNs in trans:", trans['cms_certification_number'].nunique() if not trans.empty else 0)
print("[diag] CCNs in summary:", summary['cms_certification_number'].nunique())
# -------------------------- Quick prints --------------------------------------
print("\n=== Quick diagnostics ===")
if not trans.empty:
    print("Transitions total:", len(trans))
    print("CHOWs total     :", int(trans["is_chow"].sum()))
    print("Share overridden by surname rule:",
          f"{100.0*trans.loc[trans['surname_override'],'is_chow'].count()/max(1,len(trans)):.2f}% (of all transitions)")
    print("\nCHOWs by year:")
    print(trans[trans["is_chow"]].assign(year=pd.to_datetime(trans["to_start"]).dt.year)
              .groupby("year").size().rename("count"))

print("\nBucket distribution (all transitions):")
print(trans["bucket_label"].value_counts())

[INPUT ] C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_long.csv
[OUTPUT] C:\Repositories\white-bowblis-nhmc\data\interim\ccn_group_transitions.csv
[OUTPUT] C:\Repositories\white-bowblis-nhmc\data\interim\ccn_chow_summary.csv
[save] transitions: C:\Repositories\white-bowblis-nhmc\data\interim\ccn_group_transitions.csv  rows=23,955
[save] summary(all CCNs): C:\Repositories\white-bowblis-nhmc\data\interim\ccn_chow_summary.csv  rows=13,419
[save] lite (all CCNs): C:\Repositories\white-bowblis-nhmc\data\interim\ccn_chow_lite.csv  rows=13,419  cols=12
[diag] CCNs in long: 13419
[diag] CCNs in trans: 9004
[diag] CCNs in summary: 13419

=== Quick diagnostics ===
Transitions total: 23955
CHOWs total     : 12120
Share overridden by surname rule: 5.93% (of all transitions)

CHOWs by year:
year
2017    1443
2018    1624
2019    1931
2020    1424
2021    2011
2022    1400
2023    1436
2024     811
2025      40
Name: count, dtype: int64

Bucket distribution (all transitions):
b

In [2]:
import pandas as pd
import pathlib

# Paths
PROJECT_ROOT = pathlib.Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent
INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"

IN_SUM  = INTERIM_DIR / "ccn_chow_summary.csv"
OUT_LITE = INTERIM_DIR / "ccn_chow_lite.csv"

# Load
df = pd.read_csv(IN_SUM, dtype={"cms_certification_number":"string"}, low_memory=False)

# Find chow_date columns dynamically
chow_date_cols = [c for c in df.columns if c.startswith("chow_date_")]

# Build lite frame
lite = df[["cms_certification_number","num_chows"] + chow_date_cols].copy()
lite["is_chow"] = (lite["num_chows"] > 0).astype(int)

# Reorder: ccn, num_chows, is_chow, then chow dates
cols = ["cms_certification_number","num_chows","is_chow"] + chow_date_cols
lite = lite[cols]

# Save
lite.to_csv(OUT_LITE, index=False)
print(f"[save] wrote {OUT_LITE} with {len(lite)} rows and {len(lite.columns)} cols")

[save] wrote C:\Repositories\white-bowblis-nhmc\data\interim\ccn_chow_lite.csv with 13419 rows and 12 cols
