In [1]:
# ────────────────────────────────────────────────────────────────────────────────
# 0.  Portable path setup  (no src import required)
# ────────────────────────────────────────────────────────────────────────────────
import os, re, pathlib, pandas as pd, numpy as np

# locate repo root (folder that contains "data")
PROJECT_ROOT = pathlib.Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR      = pathlib.Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

SIG_PATH  = INTERIM_DIR / "facility_signatures.csv"
OUT_PATH  = INTERIM_DIR / "facility_signatures_groupflags.csv"
print("INPUT :", SIG_PATH)
print("OUTPUT:", OUT_PATH)

# config
THRESH        = 50.0                       # percent-equity turnover to flag
CUTOFF_DATE   = pd.Timestamp("2017-01-01") # ignore changes before this
USE_SURNAME_OVERRIDE = True                # keep flag off if surnames overlap

INPUT : C:\Repositories\white_bowblis_nhmc\data\interim\facility_signatures.csv
OUTPUT: C:\Repositories\white_bowblis_nhmc\data\interim\facility_signatures_groupflags.csv


In [2]:
# ────────────────────────────────────────────────────────────────────────────────
# 1. Load signature-wide table & detect group numbers
# ────────────────────────────────────────────────────────────────────────────────
sig = pd.read_csv(SIG_PATH, dtype={"cms_certification_number":"string"}, low_memory=False)
print(f"[load] {len(sig):,} rows, {sig.shape[1]} cols")

# drop any legacy chow columns
sig = sig.loc[:, ~sig.columns.str.startswith("chow")]
group_nums = sorted(
    int(m.group(1))
    for c in sig.columns
    if (m := re.search(r"group(\d+)_owners$", c))
)
print("[groups] detected:", group_nums)

[load] 13,893 rows, 50 cols
[groups] detected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [3]:
# ────────────────────────────────────────────────────────────────────────────────
# 2. WIDE → LONG helper
# ────────────────────────────────────────────────────────────────────────────────
def sig_wide_to_long(sig_df, group_nums):
    recs = []
    for _, row in sig_df.iterrows():
        ccn  = row["cms_certification_number"]
        prov = row["provider_name"]
        for n in group_nums:
            owners = row.get(f"group{n}_owners")
            pcts   = row.get(f"group{n}_pcts")
            date   = row.get(f"group{n}_date")
            if (pd.isna(owners) or owners == "") and (pd.isna(date) or date == ""):
                continue
            recs.append((ccn, prov, n, owners, pcts, date))
    long = pd.DataFrame(
        recs,
        columns=["cms_certification_number","provider_name","grp_n",
                 "owners","pcts","date_str"]
    )
    long["date"] = pd.to_datetime(long["date_str"], errors="coerce")
    return long

long = sig_wide_to_long(sig, group_nums)
print(f"[long] {len(long):,} group rows")

[long] 38,124 group rows


In [4]:
# ────────────────────────────────────────────────────────────────────────────────
# 3. Parse owner strings & percentage lists
# ────────────────────────────────────────────────────────────────────────────────
split_pipe = lambda s: [] if pd.isna(s) or s=="" else [x.strip() for x in str(s).split("|")]
pct_re  = re.compile(r"(\d+(?:\.\d+)?)")
pct_float = lambda s: float(pct_re.search(str(s)).group(1)) if pct_re.search(str(s)) else np.nan

long["owners_list"] = long["owners"].apply(split_pipe)
long["pcts_list"]   = long["pcts"].apply(split_pipe)
long["pcts_num"]    = long["pcts_list"].apply(lambda lst: [pct_float(x) for x in lst])

In [5]:
# ────────────────────────────────────────────────────────────────────────────────
# 4. Compute percent-equity turnover vs previous group
# ────────────────────────────────────────────────────────────────────────────────
def pct_turnover(prev, curr):
    prev_d = {o:p for o,p in zip(prev["owners_list"], prev["pcts_num"]) if not np.isnan(p)}
    curr_d = {o:p for o,p in zip(curr["owners_list"], curr["pcts_num"]) if not np.isnan(p)}
    if not prev_d and not curr_d:
        return np.nan
    all_owners = set(prev_d) | set(curr_d)
    diff = sum(abs(curr_d.get(o,0.0) - prev_d.get(o,0.0)) for o in all_owners)
    return diff / 2.0

long = long.sort_values(["cms_certification_number","date","grp_n"]).reset_index(drop=True)
long["turnover"] = np.nan
for ccn, idxs in long.groupby("cms_certification_number").groups.items():
    idxs = sorted(idxs)
    for i in range(1, len(idxs)):
        long.loc[idxs[i], "turnover"] = pct_turnover(long.loc[idxs[i-1]], long.loc[idxs[i]])

In [6]:
# ────────────────────────────────────────────────────────────────────────────────
# 5. Surname continuity override
# ────────────────────────────────────────────────────────────────────────────────
surname_set = lambda lst: { (n.split(",")[0] if "," in n else n.split()[0]).strip().upper()
                            for n in lst if n.strip() }

long["surnames_set"] = long["owners_list"].apply(surname_set)
long["surname_overlap_prev"] = False
if USE_SURNAME_OVERRIDE:
    for ccn, idxs in long.groupby("cms_certification_number").groups.items():
        idxs = sorted(idxs)
        for i in range(1, len(idxs)):
            overlap = bool(long.loc[idxs[i-1],"surnames_set"] & long.loc[idxs[i], "surnames_set"])
            long.loc[idxs[i], "surname_overlap_prev"] = overlap

In [7]:
# ────────────────────────────────────────────────────────────────────────────────
# 6. Apply ChOW rules
# ────────────────────────────────────────────────────────────────────────────────
long["is_chow"]  = False
long["chow_date"] = pd.NaT

for ccn, idxs in long.groupby("cms_certification_number").groups.items():
    idxs = sorted(idxs)
    # baseline group (group1)
    first = idxs[0]
    if pd.notna(long.loc[first,"date"]) and long.loc[first,"date"] >= CUTOFF_DATE:
        long.loc[first,"is_chow"] = True
        long.loc[first,"chow_date"] = long.loc[first,"date"]
    # subsequent groups
    for i in range(1, len(idxs)):
        condition = (
            (long.loc[idxs[i],"turnover"] >= THRESH) &
            (long.loc[idxs[i],"date"] >= CUTOFF_DATE)
        )
        if USE_SURNAME_OVERRIDE:
            condition &= ~long.loc[idxs[i],"surname_overlap_prev"]
        if condition:
            long.loc[idxs[i],"is_chow"] = True
            long.loc[idxs[i],"chow_date"] = long.loc[idxs[i],"date"]

print(f"[chow] groups flagged: {long['is_chow'].sum():,}")

[chow] groups flagged: 7,111


In [8]:
# ────────────────────────────────────────────────────────────────────────────────
# 7. LONG → WIDE  (is_chow, chow_date flags)
# ────────────────────────────────────────────────────────────────────────────────
w_is  = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow").fillna(False)
w_cd  = long.pivot(index="cms_certification_number", columns="grp_n", values="chow_date")

w_is.columns = [f"group{n}_is_chow"  for n in w_is.columns]
w_cd.columns = [f"group{n}_chow_date" for n in w_cd.columns]
w_is = w_is.replace({True:"Yes", False:""})

# combine with original signature-wide table
sig_chow = (
    sig.set_index("cms_certification_number")
       .join(w_is, how="left")
       .join(w_cd, how="left")
       .reset_index()
)

# reorder columns
def order_cols(df):
    grp_nums = sorted(int(re.search(r"group(\d+)_owners$", c).group(1))
                      for c in df.columns if re.search(r"group(\d+)_owners$", c))
    cols = ["cms_certification_number", "provider_name"]
    for n in grp_nums:
        cols += [f"group{n}_owners", f"group{n}_pcts", f"group{n}_date",
                 f"group{n}_is_chow", f"group{n}_chow_date"]
    return cols

sig_chow = sig_chow[order_cols(sig_chow)]
print(f"[final] facility_signatures_groupflags shape: {sig_chow.shape}")

[final] facility_signatures_groupflags shape: (13893, 82)


  w_is  = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow").fillna(False)


In [9]:
# ────────────────────────────────────────────────────────────────────────────────
# 8. SAVE
# ────────────────────────────────────────────────────────────────────────────────
sig_chow.to_csv(OUT_PATH, index=False)
print(f"[save] wrote {OUT_PATH}")

[save] wrote C:\Repositories\white_bowblis_nhmc\data\interim\facility_signatures_groupflags.csv
