In [1]:
# ────────────────────────────────────────────────────────────────────────────────
# 0.  Portable path setup  (no src import required)
# ────────────────────────────────────────────────────────────────────────────────
import os, re, pathlib, pandas as pd, numpy as np

# locate repo root (folder that contains "data")
PROJECT_ROOT = pathlib.Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR      = pathlib.Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

SIG_PATH  = INTERIM_DIR / "facility_signatures.csv"
OUT_PATH  = INTERIM_DIR / "facility_signatures_groupflags.csv"
print("INPUT :", SIG_PATH)
print("OUTPUT:", OUT_PATH)

# ────────────────────────────────────────────────────────────────────────────────
# config  (add one switch)
# ────────────────────────────────────────────────────────────────────────────────
THRESH              = 50.0                      # % equity turnover
CUTOFF_DATE         = pd.Timestamp("2017-01-01")
USE_SURNAME_OVERRIDE= True
LOOK_THROUGH_LLC    = True   

INPUT : C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures.csv
OUTPUT: C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_groupflags.csv


In [2]:
# ────────────────────────────────────────────────────────────────────────────────
# 1. Load signature-wide table & detect group numbers
# ────────────────────────────────────────────────────────────────────────────────
sig = pd.read_csv(SIG_PATH, dtype={"cms_certification_number":"string"}, low_memory=False)
print(f"[load] {len(sig):,} rows, {sig.shape[1]} cols")

# drop any legacy chow columns
sig = sig.loc[:, ~sig.columns.str.startswith("chow")]
group_nums = sorted(
    int(m.group(1))
    for c in sig.columns
    if (m := re.search(r"group(\d+)_owners$", c))
)
print("[groups] detected:", group_nums)

[load] 14,075 rows, 66 cols
[groups] detected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [3]:
# ────────────────────────────────────────────────────────────────────────────────
# 2. WIDE → LONG helper  (now also grabs groupN_roles)
# ────────────────────────────────────────────────────────────────────────────────
def sig_wide_to_long(sig_df, group_nums):
    recs = []
    for _, row in sig_df.iterrows():
        ccn  = row["cms_certification_number"]
        prov = row["provider_name"]
        for n in group_nums:
            owners = row.get(f"group{n}_owners")
            pcts   = row.get(f"group{n}_pcts")
            roles  = row.get(f"group{n}_roles")          # ← NEW
            date   = row.get(f"group{n}_date")
            if (pd.isna(owners) or owners == "") and (pd.isna(date) or date == ""):
                continue
            recs.append((ccn, prov, n, owners, pcts, roles, date))
    long = pd.DataFrame(
        recs,
        columns=["cms_certification_number","provider_name","grp_n",
                 "owners","pcts","roles","date_str"]
    )
    long["date"] = pd.to_datetime(long["date_str"], errors="coerce")
    return long

long = sig_wide_to_long(sig, group_nums)          # ← create the variable
print(f"[long] {len(long):,} group rows")

[long] 39,789 group rows


In [4]:
# ────────────────────────────────────────────────────────────────────────────────
# 3. Parse owner, pct *and* role strings
# ────────────────────────────────────────────────────────────────────────────────
split_pipe = lambda s: [] if pd.isna(s) or s=="" else [x.strip() for x in str(s).split("|")]
pct_re     = re.compile(r"(\d+(?:\.\d+)?)")
pct_float  = lambda s: float(pct_re.search(str(s)).group(1)) if pct_re.search(str(s)) else np.nan

long["owners_list"] = long["owners"].apply(split_pipe)
long["pcts_list"]   = long["pcts"].apply(split_pipe)
long["roles_list"]  = long["roles"].apply(split_pipe)       # ← NEW
long["pcts_num"]    = long["pcts_list"].apply(lambda lst: [pct_float(x) for x in lst])

In [5]:
# DEBUG: what percent of pairs got a real turnover? what % have all percents missing?
import numpy as np, pandas as pd

# 1) ensure turnover exists
print("Has 'turnover' column? ->", "turnover" in long.columns)

# 2) compute pair diagnostics without changing your data
def pair_stats(group):
    group = group.sort_values("date")
    rows = []
    for prev, curr in zip(group.iloc[:-1].itertuples(index=False), group.iloc[1:].itertuples(index=False)):
        # usable % counts
        prev_p = sum(0 if (v is None or (isinstance(v,float) and np.isnan(v))) else 1 for v in getattr(prev,"pcts_num"))
        curr_p = sum(0 if (v is None or (isinstance(v,float) and np.isnan(v))) else 1 for v in getattr(curr,"pcts_num"))
        rows.append({
            "ccn": getattr(curr,"cms_certification_number"),
            "grp": getattr(curr,"grp_n"),
            "date": getattr(curr,"date"),
            "prev_has_pct": prev_p>0,
            "curr_has_pct": curr_p>0,
        })
    return pd.DataFrame(rows)

pairs = pd.concat([pair_stats(g) for _, g in long.groupby("cms_certification_number")], ignore_index=True)
pct_pairs_with_any_pct = (pairs["prev_has_pct"] & pairs["curr_has_pct"]).mean()
print("Pairs with pct on BOTH sides:", round(100*pct_pairs_with_any_pct,2), "%")
print("Total pairs:", len(pairs))

# If turnover exists, show how many are non-NA and above threshold
if "turnover" in long.columns:
    non_na = long["turnover"].notna().sum()
    print("turnover non-NA rows:", non_na)
    if non_na:
        print(long.loc[long["turnover"].notna(), "turnover"].describe())


Has 'turnover' column? -> False
Pairs with pct on BOTH sides: 35.53 %
Total pairs: 25714


In [6]:
# ────────────────────────────────────────────────────────────────────────────────
# 4. %-equity turnover that can look through LLC shells
# ────────────────────────────────────────────────────────────────────────────────
LLC_RE = re.compile(r"\b(LLC|INC|CORP|L\.L\.C\.|L\.P\.|CO\.?)\b", re.I)

def build_dict(row, look_through=True):
    owners = row["owners_list"]
    pcts   = row["pcts_num"]
    roles  = row.get("roles_list", ["DIRECT"] * len(owners))  # safe default
    any_indirect = any((r or "").strip().upper() == "INDIRECT" for r in roles)

    recs = []
    for o, p, r in zip(owners, pcts, roles):
        if np.isnan(p):
            continue
        # only treat DIRECT corporate shells as pass-through if we *have* indirect detail
        if look_through and any_indirect and (r or "").upper() == "DIRECT" and LLC_RE.search(o):
            continue
        recs.append((o, float(p)))

    d = {}
    for o, p in recs:
        d[o] = d.get(o, 0.0) + p
    return d

def pct_turnover(prev, curr, look_through=True):
    prev_d = build_dict(prev, look_through=look_through)
    curr_d = build_dict(curr, look_through=look_through)
    if not prev_d and not curr_d:
        return np.nan
    owners = set(prev_d) | set(curr_d)
    diff   = sum(abs(curr_d.get(o, 0.0) - prev_d.get(o, 0.0)) for o in owners)
    return diff / 2.0

if "turnover" not in long.columns:
    long = long.sort_values(
        ["cms_certification_number", "date", "grp_n"]
    ).reset_index(drop=True)
    long["turnover"] = np.nan

    for ccn, idxs in long.groupby("cms_certification_number").groups.items():
        idxs = sorted(idxs)
        for i in range(1, len(idxs)):  # compare each group to its previous group
            long.loc[idxs[i], "turnover"] = pct_turnover(
                long.loc[idxs[i - 1]],
                long.loc[idxs[i]],
                look_through=LOOK_THROUGH_LLC
            )

print("[turnover] non-NA rows:", long["turnover"].notna().sum())

[turnover] non-NA rows: 14557


In [7]:
# ────────────────────────────────────────────────────────────────────────────────
# 5. Surname continuity override
# ────────────────────────────────────────────────────────────────────────────────
surname_set = lambda lst: { (n.split(",")[0] if "," in n else n.split()[0]).strip().upper()
                            for n in lst if n.strip() }

long["surnames_set"] = long["owners_list"].apply(surname_set)
long["surname_overlap_prev"] = False
if USE_SURNAME_OVERRIDE:
    for ccn, idxs in long.groupby("cms_certification_number").groups.items():
        idxs = sorted(idxs)
        for i in range(1, len(idxs)):
            overlap = bool(long.loc[idxs[i-1],"surnames_set"] & long.loc[idxs[i], "surnames_set"])
            long.loc[idxs[i], "surname_overlap_prev"] = overlap

# after you compute surname_overlap_prev in §5:
long["eff_turnover"] = long["turnover"]
mask_na = long["eff_turnover"].isna()
long.loc[mask_na & ~long["surname_overlap_prev"], "eff_turnover"] = 100.0

In [8]:
# How many snapshots even mention INDIRECT owners?
long["has_indirect"] = long["roles_list"].apply(
    lambda rs: any((r or "").upper() == "INDIRECT" for r in (rs or []))
)
print("snapshots with any INDIRECT:", long["has_indirect"].sum())

# Are roles aligned with owners? (lengths should match)
def len_mismatch(row):
    return len(row["owners_list"]) != len(row.get("roles_list", []))
print("rows with owners/roles length mismatch:",
      long.apply(len_mismatch, axis=1).sum())

snapshots with any INDIRECT: 22009
rows with owners/roles length mismatch: 0


In [9]:
# ────────────────────────────────────────────────────────────────────────────────
# 6. Apply ChOW rules
# ────────────────────────────────────────────────────────────────────────────────
long["is_chow"]  = False
long["chow_date"] = pd.NaT

for ccn, idxs in long.groupby("cms_certification_number").groups.items():
    idxs = sorted(idxs)
    # baseline group (group1)
    first = idxs[0]
    if pd.notna(long.loc[first,"date"]) and long.loc[first,"date"] >= CUTOFF_DATE:
        long.loc[first,"is_chow"] = True
        long.loc[first,"chow_date"] = long.loc[first,"date"]
    # subsequent groups
    # subsequent groups
    for i in range(1, len(idxs)):
        condition = (
            (long.loc[idxs[i], "eff_turnover"] >= THRESH) &
            (long.loc[idxs[i], "date"] >= CUTOFF_DATE)
    )
    # (optional) drop the extra surname veto since eff_turnover already uses it
    # if USE_SURNAME_OVERRIDE:
    #     condition &= ~long.loc[idxs[i], "surname_overlap_prev"]

        if condition:
            long.loc[idxs[i], "is_chow"] = True

print(f"[chow] groups flagged: {long['is_chow'].sum():,}")

[chow] groups flagged: 11,229


In [10]:
# ────────────────────────────────────────────────────────────────────────────────
# 7. LONG → WIDE  (is_chow, chow_date flags)
# ────────────────────────────────────────────────────────────────────────────────
w_is  = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow").fillna(False)
w_cd  = long.pivot(index="cms_certification_number", columns="grp_n", values="chow_date")

w_is.columns = [f"group{n}_is_chow"  for n in w_is.columns]
w_cd.columns = [f"group{n}_chow_date" for n in w_cd.columns]
w_is = w_is.replace({True:"Yes", False:""})

# combine with original signature-wide table
sig_chow = (
    sig.set_index("cms_certification_number")
       .join(w_is, how="left")
       .join(w_cd, how="left")
       .reset_index()
)

# reorder columns
def order_cols(df):
    grp_nums = sorted(int(re.search(r"group(\d+)_owners$", c).group(1))
                      for c in df.columns if re.search(r"group(\d+)_owners$", c))
    cols = ["cms_certification_number", "provider_name"]
    for n in grp_nums:
        cols += [f"group{n}_owners", f"group{n}_pcts", f"group{n}_roles", f"group{n}_date",
                 f"group{n}_is_chow", f"group{n}_chow_date"]
    return cols

sig_chow = sig_chow[order_cols(sig_chow)]
print(f"[final] facility_signatures_groupflags shape: {sig_chow.shape}")

[final] facility_signatures_groupflags shape: (14075, 98)


  w_is  = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow").fillna(False)


In [11]:
# ────────────────────────────────────────────────────────────────────────────────
# 8. SAVE
# ────────────────────────────────────────────────────────────────────────────────
sig_chow.to_csv(OUT_PATH, index=False)
print(f"[save] wrote {OUT_PATH}")

[save] wrote C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_groupflags.csv


In [12]:
# ---- CONFIG ----
THRESH = 50.0
CUTOFF_DATE = pd.Timestamp("2017-01-01")
LOOK_THROUGH_LLC = True
SIG_PATH = INTERIM_DIR / "facility_signatures.csv"
OUT_PATH = INTERIM_DIR / "facility_signatures_groupflags.csv"

import re, numpy as np, pandas as pd

# ---- LOAD & WIDE->LONG ----
sig = pd.read_csv(SIG_PATH, dtype={"cms_certification_number":"string"}, low_memory=False)
group_nums = sorted(int(re.search(r"group(\d+)_owners$", c).group(1)) for c in sig.columns if re.search(r"group(\d+)_owners$", c))

def split_pipe(s): 
    return [] if (pd.isna(s) or str(s).strip()=="") else [x.strip() for x in str(s).split("|")]
pct_re = re.compile(r"(\d+(?:\.\d+)?)")
pct_float = lambda s: float(pct_re.search(str(s)).group(1)) if pct_re.search(str(s)) else np.nan

recs=[]
for _, row in sig.iterrows():
    ccn=row["cms_certification_number"]; prov=row.get("provider_name","")
    for n in group_nums:
        owners=row.get(f"group{n}_owners"); pcts=row.get(f"group{n}_pcts")
        roles=row.get(f"group{n}_roles"); date=row.get(f"group{n}_date")
        if (pd.isna(owners) or owners=="") and (pd.isna(date) or date==""): 
            continue
        recs.append((ccn,prov,n,owners,pcts,roles,date))
long = pd.DataFrame(recs, columns=["cms_certification_number","provider_name","grp_n","owners","pcts","roles","date_str"])
long["date"] = pd.to_datetime(long["date_str"], errors="coerce")
long["owners_list"] = long["owners"].apply(split_pipe)
long["pcts_list"]   = long["pcts"].apply(split_pipe)
long["pcts_num"]    = long["pcts_list"].apply(lambda lst: [pct_float(x) for x in lst])
long["roles_list"]  = long["roles"].apply(split_pipe)

# quick alignment check
misalign = (long["owners_list"].str.len() != long["pcts_num"].str.len()) | (long["owners_list"].str.len() != long["roles_list"].str.len())
assert not misalign.any(), f"Owner/pct/role length mismatch in {misalign.sum()} rows"

# ---- TURNOVER (with guarded look-through) ----
LLC_RE = re.compile(r"\b(LLC|INC|CORP|L\.L\.C\.|L\.P\.|CO\.?)\b", re.I)
def build_dict(row, look_through=True):
    owners, pcts, roles = row["owners_list"], row["pcts_num"], row["roles_list"]
    any_indirect = any((r or "").strip().upper()=="INDIRECT" for r in roles)
    recs=[]
    for o,p,r in zip(owners,pcts,roles):
        if np.isnan(p): 
            continue
        if look_through and any_indirect and (r or "").upper()=="DIRECT" and LLC_RE.search(o):
            continue
        recs.append((o, float(p)))
    d={}
    for o,p in recs:
        d[o]=d.get(o,0.0)+p
    return d

def pct_turnover(prev, curr, look_through=True):
    prev_d = build_dict(prev, look_through)
    curr_d = build_dict(curr, look_through)
    if not prev_d and not curr_d: 
        return np.nan
    owners = set(prev_d) | set(curr_d)
    diff = sum(abs(curr_d.get(o,0.0)-prev_d.get(o,0.0)) for o in owners)
    return diff/2.0

long = long.sort_values(["cms_certification_number","date","grp_n"]).reset_index(drop=True)
long["turnover"] = np.nan
pairs = 0
for ccn, idxs in long.groupby("cms_certification_number").groups.items():
    idxs = sorted(idxs)
    for i in range(1, len(idxs)):
        pairs += 1
        long.loc[idxs[i], "turnover"] = pct_turnover(long.loc[idxs[i-1]], long.loc[idxs[i]], look_through=LOOK_THROUGH_LLC)
print("[turnover] non-NA:", long["turnover"].notna().sum(), "of", pairs, "pairs")

# ---- FALLBACK (names) ----
def surname_set(lst):
    out=set()
    for n in lst:
        n=str(n).strip()
        if not n: continue
        out.add((n.split(",")[0] if "," in n else n.split()[0]).upper())
    return out
long["surnames_set"] = long["owners_list"].apply(surname_set)
long["surname_overlap_prev"] = False
for ccn, idxs in long.groupby("cms_certification_number").groups.items():
    idxs = sorted(idxs)
    for i in range(1, len(idxs)):
        long.loc[idxs[i], "surname_overlap_prev"] = bool(long.loc[idxs[i-1],"surnames_set"] & long.loc[idxs[i],"surnames_set"])

long["eff_turnover"] = long["turnover"]
mask_na = long["eff_turnover"].isna()
long.loc[mask_na & ~long["surname_overlap_prev"], "eff_turnover"] = 100.0

# ---- FLAG ----
long["is_chow"] = False
long["chow_date"] = pd.NaT
for ccn, idxs in long.groupby("cms_certification_number").groups.items():
    idxs = sorted(idxs)
    # baseline
    first = idxs[0]
    if pd.notna(long.loc[first,"date"]) and long.loc[first,"date"] >= CUTOFF_DATE:
        long.loc[first, ["is_chow","chow_date"]] = [True, long.loc[first,"date"]]
    # groups 2+
    for i in range(1, len(idxs)):
        if (pd.notna(long.loc[idxs[i], "date"]) and
            long.loc[idxs[i], "date"] >= CUTOFF_DATE and
            pd.notna(long.loc[idxs[i], "eff_turnover"]) and
            long.loc[idxs[i], "eff_turnover"] >= THRESH):
            long.loc[idxs[i], ["is_chow","chow_date"]] = [True, long.loc[idxs[i], "date"]]

g1 = (long.query("grp_n==1")["is_chow"]==True).sum()
g2p = (long.query("grp_n>1")["is_chow"]==True).sum()
print(f"[flags] group1: {g1}  groups≥2: {g2p}")

# ---- SAVE BACK TO WIDE ----
w_is = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow").fillna(False)
w_cd = long.pivot(index="cms_certification_number", columns="grp_n", values="chow_date")
w_is.columns = [f"group{n}_is_chow" for n in w_is.columns]
w_cd.columns = [f"group{n}_chow_date" for n in w_cd.columns]
w_is = w_is.replace({True:"Yes", False:""})

sig_chow = (
    sig.set_index("cms_certification_number")
       .join(w_is, how="left")
       .join(w_cd, how="left")
       .reset_index()
)
sig_chow.to_csv(OUT_PATH, index=False)
print(f"[save] wrote {OUT_PATH}")


[turnover] non-NA: 14557 of 25714 pairs
[flags] group1: 701  groups≥2: 10528


  w_is = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow").fillna(False)


[save] wrote C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_groupflags.csv
