In [1]:
# ==============================================================================
# CHOWs from Facility Signatures (group-to-group) + Market Entry
#   Inputs : data/interim/facility_signatures_grouped.jsonlists.csv
#   Outputs: data/interim/ownership_ccn_chow_transitions_from_groups.csv
#            data/interim/ownership_ccn_chow_summary_from_groups.csv
# ==============================================================================

import os, json, math
import numpy as np
import pandas as pd
from pathlib import Path

# ---------- Paths ----------
# Walk upwards from the working directory until a directory containing a
# "data" folder is found; if none exists the walk stops at the filesystem root.
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

# NH_DATA_DIR, when set, overrides the default raw-data location.
RAW_DIR     = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

IN_SIG     = INTERIM_DIR / "facility_signatures_grouped.jsonlists.csv"
OUT_TRANS  = INTERIM_DIR / "ownership_ccn_chow_transitions_from_groups.csv"
OUT_SUMM   = INTERIM_DIR / "ownership_ccn_chow_summary_from_groups.csv"

STUDY_START = pd.Timestamp("2017-01-01")   # baseline date for the market-entry flags
CHOW_THRESH = 50.0                         # minimum turnover (%) for a transition to count as a CHOW

print(f"[load] {IN_SIG}")
# The CCN is an identifier, so read it as text. Left to type inference it
# becomes an integer (leading zeros lost) or, if any value is missing, a float
# whose string form ("15009.0") can no longer be repaired by zero-padding.
sig = pd.read_csv(
    IN_SIG,
    low_memory=False,
    dtype={"cms_certification_number": str},
)

# ---------- Helpers ----------
def _parse_json_list(x):
    if isinstance(x, list): return x
    try:
        return json.loads(x) if isinstance(x, str) else []
    except Exception:
        return []

def _pct_list(x):
    """Decode a JSON list of ownership percentages into floats.

    Parameters
    ----------
    x : list or str
        A list, or a JSON-encoded list, as accepted by ``_parse_json_list``.

    Returns
    -------
    list
        One entry per input element, in the same order: a ``float`` when the
        element converts to a number, otherwise ``None``. NaN values (which
        ``json.loads`` accepts as the literal ``NaN``) are also mapped to
        ``None`` so that "missing" has a single representation.
    """
    out = []
    for v in _parse_json_list(x):
        try:
            pct = float(v)
        except (TypeError, ValueError, OverflowError):
            # None, nested containers, non-numeric strings, oversized ints.
            pct = None
        else:
            if math.isnan(pct):
                pct = None
        out.append(pct)
    return out

def pct_turnover(prev_map: dict, next_map: dict) -> float:
    """Return the ownership turnover between two owner->percent maps, in percent.

    The overlap is the sum over all owner names of ``min(prev pct, next pct)``.
    Turnover is ``100 * (1 - overlap / base)``, where ``base`` is the larger of
    the two maps' totals but never less than 100, so maps whose percentages sum
    to under 100 are measured against full ownership.

    Parameters
    ----------
    prev_map, next_map : dict
        Owner name -> ownership percentage. ``None``, ``0`` and non-finite
        values (NaN, inf) count as a zero stake.

    Returns
    -------
    float
        Turnover clipped to ``[0, 100]``. Because ``base`` is floored at 100,
        two empty maps yield 100.0, so callers should only use this function
        when both sides carry percentages.
    """
    def _stake(value) -> float:
        # Without this guard a single NaN would poison the overlap and the
        # function would return NaN instead of a percentage.
        number = float(value or 0)
        return number if math.isfinite(number) else 0.0

    overlap = 0.0
    for name in set(prev_map) | set(next_map):
        overlap += min(_stake(prev_map.get(name, 0)), _stake(next_map.get(name, 0)))

    base_prev = sum(_stake(v) for v in prev_map.values())
    base_next = sum(_stake(v) for v in next_map.values())
    # The 100.0 floor guarantees base > 0, so no division-by-zero guard is needed.
    base = max(base_prev, base_next, 100.0)
    return float(np.clip(100.0 * (1.0 - overlap / base), 0.0, 100.0))

def names_turnover(prev_set: set, next_set: set) -> float:
    """Return the share of owner names that changed, in percent.

    Computed as ``100 * (1 - |prev & next| / |prev | next|)`` and clipped to
    ``[0, 100]``. Two empty sets give 0.0 (nothing changed).
    """
    all_names = prev_set | next_set
    if len(all_names) == 0:
        return 0.0
    kept_names = prev_set & next_set
    kept_share = len(kept_names) / len(all_names)
    return float(np.clip(100.0 * (1 - kept_share), 0.0, 100.0))

def bucket_code(turnover_pct: float, inconclusive: bool) -> int:
    """Map a turnover percentage to its reporting bucket.

    Codes: 0 = below 50%; 1 = 90-100%; 2 = 80-90%; 3 = 70-80%; 4 = 60-70%;
    5 = 50-60%; 6 = inconclusive (returned regardless of ``turnover_pct``).
    Each band includes its lower bound.
    """
    if inconclusive:
        return 6

    pct = float(turnover_pct)
    if pct < 50:
        return 0

    # Walk the bands from the top; the first lower bound reached wins.
    for code, lower_bound in ((1, 90), (2, 80), (3, 70), (4, 60)):
        if pct >= lower_bound:
            return code
    return 5

def pct_array_tag(pcts):
    """Classify a list of percentages by how many entries are missing.

    Returns 'all_null' when the list is empty or has no numeric entry,
    'none_null' when every entry is numeric, and 'some_null' otherwise.
    ``None`` and NaN both count as missing.
    """
    values = pd.Series(pcts, dtype="float64")
    n_present = values.count()   # number of non-missing entries
    if n_present == 0:
        return "all_null"
    return "none_null" if n_present == len(values) else "some_null"

# ---------- Gather group columns ----------
# The wide input has one set of columns per group (group<N>_owners, _pcts,
# _roles, _start, _snapshot_id). Discover which N exist from the *_owners columns.
group_nums = sorted({
    int(col.partition("_")[0].replace("group", ""))
    for col in sig.columns
    if col.startswith("group") and col.endswith("_owners")
})


def _optional_column(name, default=None):
    """Return sig[name] when that column exists, otherwise ``default``."""
    return sig[name] if name in sig.columns else default


# Reshape to long form: one row per (CCN, group).
parts = []
for n in group_nums:
    if f"group{n}_owners" not in sig.columns:
        continue

    start_col = f"group{n}_start"
    if start_col in sig.columns:
        starts = pd.to_datetime(sig[start_col], errors="coerce")
    else:
        starts = pd.NaT

    parts.append(
        sig[["cms_certification_number"]].assign(
            group_n=n,
            owners_json=sig[f"group{n}_owners"],
            pcts_json=_optional_column(f"group{n}_pcts"),
            roles_json=_optional_column(f"group{n}_roles"),
            start=starts,
            snapshot_id=_optional_column(f"group{n}_snapshot_id"),
        )
    )

groups_long = pd.concat(parts, ignore_index=True)
# A missing owners cell means the facility has no group with that index.
groups_long = groups_long.dropna(subset=["owners_json"]).copy()

# Decode the JSON-encoded list columns into Python lists.
for parsed_col, raw_col, parser in (
    ("owners", "owners_json", _parse_json_list),
    ("pcts",   "pcts_json",   _pct_list),
    ("roles",  "roles_json",  _parse_json_list),
):
    groups_long[parsed_col] = groups_long[raw_col].apply(parser)

# Normalise the CCN to a zero-padded string of at least 6 characters.
groups_long["cms_certification_number"] = (
    groups_long["cms_certification_number"].astype(str).str.zfill(6)
)

# Market-entry baseline: the earliest group start per CCN.
first_seen = (
    groups_long
    .groupby("cms_certification_number")["start"]
    .min()
    .rename("first_seen_month")
    .reset_index()
)
# NOTE(review): present_at_start is an exact-equality test, so a CCN first seen
# before STUDY_START gets False for both flags -- confirm that is intended.
first_seen["entered_after_start"] = first_seen["first_seen_month"].gt(STUDY_START)
first_seen["present_at_start"]    = first_seen["first_seen_month"].eq(STUDY_START)

# ---------- Build group-to-group transitions ----------
# Output schema of the transitions table. Handing it to the DataFrame
# constructor keeps the columns (and the sort at the end) valid even when no
# facility has more than one group.
TRANSITION_COLUMNS = [
    "cms_certification_number",
    "from_group",
    "to_group",
    "from_start",
    "to_start",              # event date: first month we see the new group
    "method",                # 0=percent, 1=names
    "inconclusive",
    "turnover_pct",
    "bucket_code",
    "is_chow",
    # market-entry context (CCN-level)
    "first_seen_month",
    "entered_after_start",
    "present_at_start",
]


def _ownership_view(owners, pcts):
    """Return ``(names, pcts, pct_by_name)`` for one ownership group.

    ``names`` holds the non-null owners as strings; ``pcts`` holds every
    percentage as ``float`` or ``None``. ``pct_by_name`` pairs each owner with
    the percentage at the *same position* and drops a pair when either side is
    null. Pairing before filtering matters: filtering the names first would
    shift every later owner onto its neighbour's percentage.
    """
    owners = owners or []
    pcts = [None if p is None else float(p) for p in (pcts or [])]
    names = [str(o) for o in owners if o is not None]
    # NOTE(review): an owner listed more than once keeps only its last
    # percentage here -- confirm the upstream signatures are de-duplicated.
    pct_by_name = {
        str(o): p
        for o, p in zip(owners, pcts)
        if o is not None and p is not None
    }
    return names, pcts, pct_by_name


def _has_usable_pcts(tag, pct_by_name):
    """True when the side has no null pct and at least one positive pct."""
    return tag == "none_null" and len(pct_by_name) > 0 and max(pct_by_name.values()) > 0


# CCN -> (first_seen_month, entered_after_start, present_at_start), built once
# so the loop below does a dict lookup instead of scanning first_seen per CCN.
entry_by_ccn = {
    row.cms_certification_number: (
        row.first_seen_month,
        bool(row.entered_after_start),
        bool(row.present_at_start),
    )
    for row in first_seen.itertuples(index=False)
}

transitions = []
for ccn, g in groups_long.groupby("cms_certification_number", sort=True):
    if len(g) <= 1:
        # no transitions for single-group facilities
        continue

    # Chronological order; group_n breaks ties so equal start dates always
    # produce the same from/to pairs. Missing starts sort last.
    g = g.sort_values(["start", "group_n"])

    # CCN market-entry flags
    ccn_first, entered_after, present_start = entry_by_ccn[ccn]

    # Parse every group once, then compare consecutive groups.
    snapshots = [
        (int(group_n), start) + _ownership_view(owners, pcts)
        for group_n, start, owners, pcts
        in zip(g["group_n"], g["start"], g["owners"], g["pcts"])
    ]

    for prev, nxt in zip(snapshots, snapshots[1:]):
        prev_group, prev_start, prev_names, prev_pcts, prev_map = prev
        next_group, next_start, next_names, next_pcts, next_map = nxt

        prev_tag = pct_array_tag(prev_pcts)
        next_tag = pct_array_tag(next_pcts)
        # Inconclusive when either side mixes null & numeric
        inconclusive = (prev_tag == "some_null") or (next_tag == "some_null")

        # Percent method (0) only if both sides have at least one positive pct
        # and no nulls; otherwise fall back to comparing owner names (1).
        if _has_usable_pcts(prev_tag, prev_map) and _has_usable_pcts(next_tag, next_map):
            method = 0
            turnover = pct_turnover(prev_map, next_map)
        else:
            method = 1
            turnover = names_turnover(set(prev_names), set(next_names))

        transitions.append({
            "cms_certification_number": ccn,
            "from_group": prev_group,
            "to_group": next_group,
            "from_start": prev_start,
            "to_start": next_start,
            "method": method,
            "inconclusive": inconclusive,
            "turnover_pct": round(float(turnover), 4),
            "bucket_code": bucket_code(turnover, inconclusive),
            # An inconclusive transition is never counted as a CHOW.
            "is_chow": bool((turnover >= CHOW_THRESH) and not inconclusive),
            "first_seen_month": ccn_first,
            "entered_after_start": entered_after,
            "present_at_start": present_start,
        })

trans_df = (
    pd.DataFrame(transitions, columns=TRANSITION_COLUMNS)
    .sort_values(["cms_certification_number", "from_group"])
    .reset_index(drop=True)
)

# ---------- Summarize per CCN ----------
def _pad(lst, n, fill=None):
    return (lst + [fill]*n)[:n]

MAX_EVENTS = 12   # number of chow_*_<k> column sets; CHOWs beyond this are not listed

# Per-event fields, written as <field>_<k> for k = 1..MAX_EVENTS.
_EVENT_FIELDS = ("chow_date", "chow_magnitude", "chow_method", "chow_inconcl")

# Full output schema. Handing it to the DataFrame constructor keeps the
# columns (and the sort below) valid even when there are no transitions.
SUMMARY_COLUMNS = [
    "cms_certification_number",
    "num_chows",
    "cnt_bucket0",
    "cnt_bucket1",
    "cnt_bucket2",
    "cnt_bucket3",
    "cnt_bucket4",
    "cnt_bucket5",
    "cnt_bucket6_inconcl",
    "first_seen_month",
    "entered_after_start",
    "present_at_start",
] + [
    f"{field}_{k}"
    for k in range(1, MAX_EVENTS + 1)
    for field in _EVENT_FIELDS
]

# NOTE(review): trans_df only contains CCNs with at least two groups, so
# single-group facilities get no summary row and are absent from any
# market-entry count taken from this table -- confirm that is intended.
summary_rows = []
for ccn, g in trans_df.groupby("cms_certification_number", sort=True):
    g = g.sort_values("to_start")
    chow = g.loc[g["is_chow"]]

    dates = pd.to_datetime(chow["to_start"], errors="coerce").dt.date.astype(str).tolist()
    buckets = g["bucket_code"].value_counts().to_dict()
    first_seen_month = g["first_seen_month"].iloc[0]

    out = {
        "cms_certification_number": ccn,
        "num_chows": len(dates),   # may exceed MAX_EVENTS; only the listing is capped

        "cnt_bucket0": buckets.get(0, 0),
        "cnt_bucket1": buckets.get(1, 0),
        "cnt_bucket2": buckets.get(2, 0),
        "cnt_bucket3": buckets.get(3, 0),
        "cnt_bucket4": buckets.get(4, 0),
        "cnt_bucket5": buckets.get(5, 0),
        "cnt_bucket6_inconcl": buckets.get(6, 0),

        "first_seen_month": pd.to_datetime(first_seen_month).date().isoformat() if pd.notna(first_seen_month) else None,
        "entered_after_start": bool(g["entered_after_start"].iloc[0]),
        "present_at_start": bool(g["present_at_start"].iloc[0]),
    }

    # Pad each event list once (not once per cell), in _EVENT_FIELDS order.
    padded = (
        _pad(dates, MAX_EVENTS, None),
        _pad(chow["turnover_pct"].tolist(), MAX_EVENTS, None),
        _pad(chow["method"].tolist(), MAX_EVENTS, None),         # 0=percent,1=names
        _pad(chow["inconclusive"].tolist(), MAX_EVENTS, None),
    )
    for k in range(1, MAX_EVENTS + 1):
        for field, values in zip(_EVENT_FIELDS, padded):
            out[f"{field}_{k}"] = values[k - 1]

    summary_rows.append(out)

summary_ccn = (
    pd.DataFrame(summary_rows, columns=SUMMARY_COLUMNS)
    .sort_values("cms_certification_number")
    .reset_index(drop=True)
)

# ---------- Save ----------
# Write the transitions table first, then the per-CCN summary.
for frame, target in ((trans_df, OUT_TRANS), (summary_ccn, OUT_SUMM)):
    frame.to_csv(target, index=False)
print(f"[save] transitions → {OUT_TRANS}    (rows={len(trans_df):,})")
print(f"[save] CCN summary → {OUT_SUMM} (rows={len(summary_ccn):,})")

# ---------- Quick peek + stats ----------
# First rows of each output, for a visual sanity check.
transitions_preview = trans_df.head(8).to_string(index=False)
print("\n=== Transitions head ===")
print(transitions_preview)

summary_preview = summary_ccn.head(8).to_string(index=False)
print("\n=== Summary head ===")
print(summary_preview)

print("\nCounts:")
n_transitions = len(trans_df)
n_chows = int(trans_df["is_chow"].sum())
print("Transitions:", n_transitions, "| CHOWs:", n_chows)

print("\nMethod counts (0=percent,1=names):")
method_counts = trans_df["method"].value_counts().sort_index()
print(method_counts)

print("\nBucket distribution (0..6):")
bucket_counts = trans_df["bucket_code"].value_counts().sort_index()
print(bucket_counts)

# Both counts below come from summary_ccn, i.e. one row per summarised CCN.
n_entrants = int(summary_ccn["entered_after_start"].sum())
print("\nMarket entry — facilities entering after start:",
      n_entrants)
n_incumbents = int(summary_ccn["present_at_start"].sum())
print("Present at start (baseline incumbents):",
      n_incumbents)

[load] C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_grouped.jsonlists.csv
[save] transitions → C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_chow_transitions_from_groups.csv    (rows=25,526)
[save] CCN summary → C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_chow_summary_from_groups.csv (rows=9,540)

=== Transitions head ===
cms_certification_number  from_group  to_group from_start   to_start  method  inconclusive  turnover_pct  bucket_code  is_chow first_seen_month  entered_after_start  present_at_start
                  015009           1         2 1969-09-01 2012-01-25       0         False         100.0            1     True       1969-09-01                False             False
                  015012           1         2 1967-01-01 2021-10-01       1         False         100.0            1     True       1967-01-01                False             False
                  015019           1         2 2015-02-02 2016-01-01  