In [1]:
# ==============================================================================
# CHOWs from Facility Signatures v3 (group-to-group) + Market Entry (2017+)
#   Input : data/interim/facility_signatures_v3.jsonlists.csv
#   Output: data/interim/ownership_ccn_chow_transitions_v3.csv
#           data/interim/ownership_ccn_chow_summary_v3.csv
# ==============================================================================

import os, json
import numpy as np
import pandas as pd
from pathlib import Path

# ---------------- Config ----------------
STUDY_START = pd.Timestamp("2017-01-01")
CHOW_THRESH = 50.0
MAX_EVENTS  = 12
SAVE_FILES  = True  # set False to skip writing CSVs

# ---------------- Paths ----------------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

IN_SIG   = INTERIM_DIR / "facility_signatures_v3.jsonlists.csv"
OUT_TRNS = INTERIM_DIR / "ownership_ccn_chow_transitions_v3.csv"
OUT_SUMM = INTERIM_DIR / "ownership_ccn_chow_summary_v3.csv"

print(f"[load] {IN_SIG}")
sig = pd.read_csv(IN_SIG, low_memory=False, dtype={"cms_certification_number":"string"})

# ---------------- Helpers ----------------
def _parse_list(x):
    if isinstance(x, list): return x
    try:
        return json.loads(x) if isinstance(x, str) else []
    except Exception:
        return []

def _parse_pcts(x):
    arr = _parse_list(x)
    out = []
    for v in arr:
        if v is None: out.append(None)
        else:
            try: out.append(float(v))
            except: out.append(None)
    return out

def pct_array_tag(pcts):
    s = pd.Series(pcts, dtype="float64")
    if s.empty or s.notna().sum()==0: return "all_null"
    if s.notna().sum()==len(s):        return "none_null"
    return "some_null"

def pct_turnover(prev_map: dict, next_map: dict) -> float:
    names = set(prev_map) | set(next_map)
    overlap = 0.0
    for n in names:
        p = float(prev_map.get(n, 0) or 0)
        q = float(next_map.get(n, 0) or 0)
        overlap += min(p, q)
    base_prev = sum(float(v or 0) for v in prev_map.values())
    base_next = sum(float(v or 0) for v in next_map.values())
    base = max(base_prev, base_next, 100.0)
    if base <= 0: return 0.0
    return float(np.clip(100.0 * (1 - overlap/base), 0.0, 100.0))

def names_turnover(prev_set: set, next_set: set) -> float:
    union = prev_set | next_set
    if not union: return 0.0
    return float(np.clip(100.0 * (1 - len(prev_set & next_set)/len(union)), 0.0, 100.0))

def bucket_code(turnover_pct: float, inconclusive: bool) -> int:
    # 0 <50%; 1 90–100; 2 80–90; 3 70–80; 4 60–70; 5 50–60; 6 inconclusive
    if inconclusive: return 6
    x = float(turnover_pct)
    if x < 50: return 0
    if x >= 90: return 1
    if x >= 80: return 2
    if x >= 70: return 3
    if x >= 60: return 4
    return 5

# ---------------- Melt groups to long ----------------
# detect group indices
group_nums = sorted({
    int(c.split("_")[0].replace("group",""))
    for c in sig.columns if c.startswith("group") and c.endswith("_owners")
})

parts = []
for n in group_nums:
    base = f"group{n}_"
    chunk = sig[["cms_certification_number"]].copy()
    chunk["group_n"] = n
    chunk["owners"]  = sig.get(base+"owners")
    chunk["pcts"]    = sig.get(base+"pcts")
    chunk["roles"]   = sig.get(base+"roles")
    chunk["start"]   = pd.to_datetime(sig.get(base+"start"), errors="coerce")
    parts.append(chunk)

gl = pd.concat(parts, ignore_index=True)
gl = gl[gl["owners"].notna()].copy()

# parse JSON lists
gl["names"] = gl["owners"].apply(_parse_list)     # owners are already normalized in v3
gl["pcts"]  = gl["pcts"].apply(_parse_pcts)

# CCN-level first_seen (market entry baseline)
first_seen = (gl.groupby("cms_certification_number", as_index=False)["start"]
                .min().rename(columns={"start":"first_seen_month"}))
first_seen["entered_after_start"] = first_seen["first_seen_month"] >  STUDY_START
first_seen["present_at_start"]    = first_seen["first_seen_month"] <= STUDY_START  # <= (fix)

# ---------------- Build transitions ----------------
transitions = []
for ccn, g in gl.groupby("cms_certification_number", sort=True):
    g = g.sort_values("start").reset_index(drop=True)
    if len(g) <= 1: 
        continue

    fs = first_seen.loc[first_seen["cms_certification_number"] == ccn].iloc[0]
    ccn_first  = fs["first_seen_month"]
    ccn_after  = bool(fs["entered_after_start"])
    ccn_start  = bool(fs["present_at_start"])

    for i in range(len(g)-1):
        a, b = g.iloc[i], g.iloc[i+1]

        prev_names = [str(x) for x in (a["names"] or []) if x is not None]
        next_names = [str(x) for x in (b["names"] or []) if x is not None]
        prev_pcts  = [None if x is None else float(x) for x in (a["pcts"] or [])]
        next_pcts  = [None if x is None else float(x) for x in (b["pcts"] or [])]

        prev_tag = pct_array_tag(prev_pcts)
        next_tag = pct_array_tag(next_pcts)
        inconcl  = (prev_tag == "some_null") or (next_tag == "some_null")

        prev_map = {n:p for n,p in zip(prev_names, prev_pcts) if p is not None}
        next_map = {n:p for n,p in zip(next_names, next_pcts) if p is not None}

        prev_ok = (prev_tag == "none_null") and len(prev_map)>0
        next_ok = (next_tag == "none_null") and len(next_map)>0
        if prev_ok and next_ok:
            method = 0
            turnover = pct_turnover(prev_map, next_map)
        else:
            method = 1
            turnover = names_turnover(set(prev_names), set(next_names))

        transitions.append({
            "cms_certification_number": ccn,
            "from_group": int(a["group_n"]),
            "to_group": int(b["group_n"]),
            "from_start": a["start"],
            "to_start": b["start"],                 # event date
            "method": method,                       # 0=percent, 1=names
            "inconclusive": bool(inconcl),
            "turnover_pct": round(float(turnover), 4),
            "bucket_code": bucket_code(turnover, inconcl),
            "is_chow": bool((turnover >= CHOW_THRESH) and not inconcl),
            "first_seen_month": ccn_first,
            "entered_after_start": ccn_after,
            "present_at_start": ccn_start,
        })

trans_df = pd.DataFrame(transitions)
if trans_df.empty:
    print("No transitions constructed."); raise SystemExit

# ---------------- Filter to study window ----------------
trans_df = trans_df[ trans_df["to_start"] >= STUDY_START ].copy()
trans_df = trans_df.sort_values(["cms_certification_number","to_start","to_group"]).reset_index(drop=True)

# ---------------- Summarize per CCN ----------------
def _pad(lst, n, fill=None): return (lst + [fill]*n)[:n]

summary_rows = []
for ccn, g in trans_df.groupby("cms_certification_number", sort=True):
    g = g.sort_values("to_start")
    chow = g.loc[g["is_chow"]]

    dates = pd.to_datetime(chow["to_start"]).dt.date.astype(str).tolist()
    mags  = chow["turnover_pct"].tolist()
    meths = chow["method"].tolist()
    incon = chow["inconclusive"].tolist()

    buckets = g["bucket_code"].value_counts().to_dict()

    first_seen_month = g["first_seen_month"].iloc[0]
    entered_after    = bool(g["entered_after_start"].iloc[0])
    present_start    = bool(g["present_at_start"].iloc[0])

    out = {
        "cms_certification_number": ccn,
        "num_chows": len(dates),
        "cnt_bucket0": buckets.get(0, 0),
        "cnt_bucket1": buckets.get(1, 0),
        "cnt_bucket2": buckets.get(2, 0),
        "cnt_bucket3": buckets.get(3, 0),
        "cnt_bucket4": buckets.get(4, 0),
        "cnt_bucket5": buckets.get(5, 0),
        "cnt_bucket6_inconcl": buckets.get(6, 0),
        "first_seen_month": pd.to_datetime(first_seen_month).date().isoformat() if pd.notna(first_seen_month) else None,
        "entered_after_start": entered_after,
        "present_at_start": present_start,
    }
    for k in range(1, MAX_EVENTS+1):
        out[f"chow_date_{k}"]      = _pad(dates, MAX_EVENTS, None)[k-1]
        out[f"chow_magnitude_{k}"] = _pad(mags,  MAX_EVENTS, None)[k-1]
        out[f"chow_method_{k}"]    = _pad(meths, MAX_EVENTS, None)[k-1]
        out[f"chow_inconcl_{k}"]   = _pad(incon, MAX_EVENTS, None)[k-1]
    summary_rows.append(out)

summary = pd.DataFrame(summary_rows).sort_values("cms_certification_number").reset_index(drop=True)

# ---------------- Save ----------------
if SAVE_FILES:
    trans_df.to_csv(OUT_TRNS, index=False, date_format="%Y-%m-%d")
    summary.to_csv(OUT_SUMM, index=False)
    print(f"[save] transitions → {OUT_TRNS}    (rows={len(trans_df):,})")
    print(f"[save] CCN summary → {OUT_SUMM} (rows={len(summary):,})")

# ---------------- Diagnostics ----------------
print("\n=== Transitions head ===")
print(trans_df.head(8).to_string(index=False))

print("\n=== Summary head ===")
print(summary.head(8).to_string(index=False))

print("\nCounts:")
print("Transitions:", len(trans_df), "| CHOWs:", int(trans_df["is_chow"].sum()))
print("Method counts (0=percent,1=names):")
print(trans_df["method"].value_counts().sort_index())

print("\nBucket distribution (0..6):")
print(trans_df["bucket_code"].value_counts().sort_index())

# Per-year CHOWs (only ≥50% buckets, excluding inconclusive)
yr = pd.to_datetime(trans_df["to_start"]).dt.year.rename("year")
mask_chow = (trans_df["is_chow"]) & (trans_df["bucket_code"] <= 5)
per_year = (trans_df.loc[mask_chow]
            .assign(year=yr[mask_chow])
            .groupby("year")["cms_certification_number"]
            .count())
print("\nCHOWs per year (≥50%):")
print(per_year.to_string())

# Distribution: number of CHOWs per facility
chows_per_ccn = (trans_df.loc[mask_chow]
                 .groupby("cms_certification_number")
                 .size().rename("num_chows"))
print("\nNum CHOWs per facility — describe():")
print(chows_per_ccn.describe().to_string())
print("Top 15 facilities by CHOWs (2017+):")
print(chows_per_ccn.sort_values(ascending=False).head(15).to_string())

[load] C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_v3.jsonlists.csv
[save] transitions → C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_chow_transitions_v3.csv    (rows=13,431)
[save] CCN summary → C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_chow_summary_v3.csv (rows=7,284)

=== Transitions head ===
cms_certification_number  from_group  to_group from_start   to_start  method  inconclusive  turnover_pct  bucket_code  is_chow first_seen_month  entered_after_start  present_at_start
                  015012           1         2 1967-01-01 2021-10-01       1         False         100.0            1     True       1967-01-01                False              True
                  015019           2         3 2016-01-01 2021-10-01       1         False         100.0            1     True       2015-02-02                False              True
                  015019           3         4 2021-10-01 2024-03-01       1         False   

In [2]:
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent
INTERIM = PROJECT_ROOT / "data" / "interim"

trans = pd.read_csv(INTERIM / "ownership_ccn_chow_transitions_v3.csv", parse_dates=["to_start"])
print("Share 100% flips:", (trans["turnover_pct"]>=99.5).mean().round(3))
print(trans["bucket_code"].value_counts().sort_index().to_string())

Share 100% flips: 0.92
bucket_code
0        3
1    12501
2      231
3      207
4      139
5      350


In [3]:
import json
sig = pd.read_csv(INTERIM / "facility_signatures_v3.jsonlists.csv", dtype={"cms_certification_number":"string"})

def listify(x):
    try: return json.loads(x) if isinstance(x,str) else (x if isinstance(x,list) else [])
    except: return []

# pick 5 random 100% transitions
sample_idxs = trans.loc[trans["turnover_pct"]>=99.5].sample(5, random_state=42).index
for i in sample_idxs:
    r = trans.loc[i]
    ccn = r["cms_certification_number"]; g1 = r["from_group"]; g2 = r["to_group"]
    row = sig.loc[sig["cms_certification_number"]==ccn].iloc[0]
    A_names = listify(row[f"group{int(g1)}_owners"])
    A_pcts  = listify(row[f"group{int(g1)}_pcts"])
    B_names = listify(row[f"group{int(g2)}_owners"])
    B_pcts  = listify(row[f"group{int(g2)}_pcts"])
    print(f"\nCCN {ccn}  {r['from_start'].date()} → {r['to_start'].date()}  (turnover={r['turnover_pct']})")
    print("A:", list(zip(A_names, A_pcts)))
    print("B:", list(zip(B_names, B_pcts)))

  sig = pd.read_csv(INTERIM / "facility_signatures_v3.jsonlists.csv", dtype={"cms_certification_number":"string"})


IndexError: single positional indexer is out-of-bounds

In [4]:
many = (trans.loc[trans["turnover_pct"]>=50]
        .groupby("cms_certification_number").size().sort_values(ascending=False).head(10))
print(many)

cms_certification_number
395286    9
395785    9
395521    9
395295    9
395711    9
395259    9
395494    8
395912    8
395509    8
395226    8
dtype: int64
