In [1]:
# ────────────────────────────────────────────────────────────────────────────────
# CHOW Flagging — tightened logic with A–E fixes
#   A) Replace 100%-fallback with cautious, names-only heuristic
#   B) Normalize corporate owner names before turnover
#   C) Collapse same‑day multi‑group hops per facility
#   D) Stricter new‑home (group1) rule: require at least one owner %
#   E) CHOW condition unchanged but now uses safer eff_turnover
# ────────────────────────────────────────────────────────────────────────────────
import os, re, pathlib, numpy as np, pandas as pd

# ────────────────────────────────────────────────────────────────────────────────
# 0) Paths / config
# ────────────────────────────────────────────────────────────────────────────────
# Locate the project root: the working directory or its nearest ancestor that
# contains a "data" folder. If no such folder exists, the search ends on the
# top-most ancestor.
PROJECT_ROOT = pathlib.Path.cwd()
for _candidate in (PROJECT_ROOT, *PROJECT_ROOT.parents):
    PROJECT_ROOT = _candidate
    if (_candidate / "data").is_dir():
        break

# Raw-data location can be overridden through the NH_DATA_DIR environment variable.
RAW_DIR = pathlib.Path(os.environ.get("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))

# Interim folder holds both the input signatures and the flagged output;
# it is created on demand.
INTERIM_DIR = PROJECT_ROOT.joinpath("data", "interim")
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

SIG_PATH = INTERIM_DIR.joinpath("facility_signatures.csv")
OUT_PATH = INTERIM_DIR.joinpath("facility_signatures_groupflags.csv")

# Tunables
THRESH = 50.0                              # minimum effective turnover for a CHOW flag
CUTOFF_DATE = pd.Timestamp("2017-01-01")   # snapshots dated before this are never flagged
LOOK_THROUGH_LLC = True                    # passed to pct_turnover as look_through

print("INPUT :", SIG_PATH)
print("OUTPUT:", OUT_PATH)

# ────────────────────────────────────────────────────────────────────────────────
# 1) Load & detect groups
# ────────────────────────────────────────────────────────────────────────────────
# Read the wide signatures table; the CCN column is forced to string dtype.
sig = pd.read_csv(
    SIG_PATH,
    low_memory=False,
    dtype={"cms_certification_number": "string"},
)
print(f"[load] {sig.shape[0]:,} rows, {sig.shape[1]} cols")

# Discard columns whose name starts with "chow" (left over from earlier runs).
sig = sig.loc[:, [not col.startswith("chow") for col in sig.columns]]

# Group numbers are discovered from the "group<N>_owners" column names.
group_nums = sorted(
    int(hit.group(1))
    for hit in map(re.compile(r"group(\d+)_owners$").search, sig.columns)
    if hit
)
print("[groups] detected:", group_nums)

# ────────────────────────────────────────────────────────────────────────────────
# 2) WIDE → LONG
# ────────────────────────────────────────────────────────────────────────────────
def split_pipe(s):
    """Split a pipe-delimited cell into a list of trimmed strings.

    Missing values (as judged by ``pd.isna``) and blank/whitespace-only text
    yield an empty list. Any other value is converted with ``str`` and split
    on ``"|"``; empty pieces between delimiters are kept as ``""``.
    """
    if pd.isna(s):
        return []
    text = str(s)
    if not text.strip():
        return []
    return [piece.strip() for piece in text.split("|")]

# First run of digits, optionally followed by a decimal part.
_pct_re = re.compile(r"(\d+(?:\.\d+)?)")


def pct_float(s):
    """Return the first number found in ``str(s)`` as a float, else NaN.

    Only the first unsigned numeric token is used, so surrounding text such
    as a percent sign or a descriptive label is ignored.
    """
    found = _pct_re.search(str(s))
    if found is None:
        return np.nan
    return float(found.group(1))

def _is_blank(value):
    """True when a cell is missing (per ``pd.isna``) or an empty string."""
    return pd.isna(value) or value == ""


# One record per (facility, group) slot; slots with neither owners nor a date
# are skipped.
recs = []
for _, facility in sig.iterrows():
    head = (facility["cms_certification_number"], facility.get("provider_name", ""))
    for n in group_nums:
        owners = facility.get(f"group{n}_owners")
        pcts = facility.get(f"group{n}_pcts")
        roles = facility.get(f"group{n}_roles")
        date = facility.get(f"group{n}_date")
        if _is_blank(owners) and _is_blank(date):
            continue
        recs.append((*head, n, owners, pcts, roles, date))

long = pd.DataFrame(
    recs,
    columns=[
        "cms_certification_number", "provider_name", "grp_n",
        "owners", "pcts", "roles", "date_str",
    ],
)

# Parsed companions of the raw text columns; unparseable dates become NaT.
long["date"] = pd.to_datetime(long["date_str"], errors="coerce")
long["owners_list"] = long["owners"].map(split_pipe)
long["pcts_list"] = long["pcts"].map(split_pipe)
long["pcts_num"] = long["pcts_list"].map(lambda items: list(map(pct_float, items)))
long["roles_list"] = long["roles"].map(split_pipe)

# Alignment report only: counts rows whose owner, percent and role lists
# differ in length. Nothing is dropped or repaired here.
n_owners = long["owners_list"].str.len()
misalign = (n_owners != long["pcts_num"].str.len()) | (
    n_owners != long["roles_list"].str.len()
)
mis_ct = int(misalign.sum())
print(f"[align] owner/pct/role length mismatches: {mis_ct}")

# ────────────────────────────────────────────────────────────────────────────────
# 3) Normalization (B) + turnover engine
# ────────────────────────────────────────────────────────────────────────────────
# Dotted corporate abbreviations ("L.L.C.", "L.P."). A trailing ``\b`` can never
# match after the final dot when the abbreviation is followed by whitespace or
# the end of the string, so these forms are bounded with look-arounds instead.
# The look-behind also rejects a leading dot so that longer dotted initials
# (e.g. "S.L.P.") are not partially consumed.
_DOTTED_SUFFIX = r"(?<![\w.])(?:L\.L\.C\.|L\.P\.)(?!\w)"

# Detects a corporate marker anywhere in an owner name (used for look-through).
LLC_RE          = re.compile(_DOTTED_SUFFIX + r"|\b(LLC|INC|CORP|CO\.?)\b", re.I)
# Corporate suffix words removed during name normalization.
CORP_SUFFIX_RE  = re.compile(
    _DOTTED_SUFFIX
    + r"|\b(CORP(ORATION)?|INCORPORATED|INC|LLC|LP|CO|COMPANY|HOLDINGS?)\b\.?",
    re.I,
)
# Punctuation that is replaced by a space during normalization.
PUNCT_RE        = re.compile(r"[,\.&]")

def norm_owner_name(name: str) -> str:
    """Return a canonical, upper-cased owner name for turnover matching.

    Steps: upper-case and trim, remove corporate suffix words, replace
    ``, . &`` with spaces, then collapse runs of whitespace.

    Suffixes are removed *before* punctuation so that dotted forms such as
    "L.L.C." and "L.P." are still intact when matched; this makes
    "ACME L.L.C." and "ACME LLC" normalize to the same key.

    Args:
        name: Raw owner name; non-string values are converted with ``str``.

    Returns:
        The normalized name (possibly an empty string).
    """
    s = str(name).upper().strip()
    s = CORP_SUFFIX_RE.sub("", s)
    s = PUNCT_RE.sub(" ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_dict(row, look_through=True):
    """Aggregate one snapshot into ``{normalized owner name: total percent}``.

    Args:
        row: Mapping-like record providing ``owners_list``, ``pcts_num`` and
            ``roles_list``; the three lists are walked in lockstep (``zip``),
            so extra items in a longer list are ignored.
        look_through: When true, and the snapshot contains at least one role
            equal to "INDIRECT", entries whose role is "DIRECT" and whose
            name matches ``LLC_RE`` are left out.

    Returns:
        dict mapping normalized names to summed percentages. Entries with a
        NaN percentage never contribute.
    """
    role_tags = row["roles_list"]
    has_indirect = any((tag or "").strip().upper() == "INDIRECT" for tag in role_tags)
    skip_shells = look_through and has_indirect

    totals = {}
    for owner, pct, tag in zip(row["owners_list"], row["pcts_num"], role_tags):
        if np.isnan(pct):
            continue
        # Look through DIRECT corporate shells only when INDIRECT detail exists.
        is_direct = (tag or "").upper() == "DIRECT"
        if skip_shells and is_direct and LLC_RE.search(owner):
            continue
        key = norm_owner_name(owner)
        totals[key] = totals.get(key, 0.0) + float(pct)
    return totals

def pct_turnover(prev, curr, look_through=True):
    """Ownership turnover between two snapshots, in percentage points.

    Both snapshots are reduced with ``build_dict``; the result is half of the
    summed absolute change in each owner's percentage, taken over every owner
    present on either side.

    Returns NaN when neither snapshot yields any usable percentage.
    NOTE(review): when only one side has percentages, the other side counts
    as 0 for every owner, so the result is half of that side's total —
    confirm this is the intended treatment.
    """
    before = build_dict(prev, look_through)
    after = build_dict(curr, look_through)
    if not (before or after):
        return np.nan

    owners = set(before) | set(after)
    moved = sum(abs(after.get(name, 0.0) - before.get(name, 0.0)) for name in owners)
    return moved / 2.0

# ────────────────────────────────────────────────────────────────────────────────
# 4) Collapse same‑day hops (C) and compute turnover
# ────────────────────────────────────────────────────────────────────────────────
# Keep only the last snapshot per (ccn, date) to avoid double-counting on identical dates
# Order snapshots chronologically within each facility (group number breaks
# ties), then keep the final row of every (facility, date) pair and renumber
# the index from 0.
# NOTE(review): rows with a missing date have an NA group key; whether they
# survive .tail() depends on how the installed pandas handles NA keys — confirm.
long = (
    long.sort_values(["cms_certification_number", "date", "grp_n"])
        .groupby(["cms_certification_number", "date"], as_index=False)
        .tail(1)
        .reset_index(drop=True)
)

# surnames for heuristic
def surname_set(lst):
    """Return the set of upper-cased leading name tokens for a list of owners.

    For "LAST, FIRST" style names the text before the first comma is used;
    otherwise the first whitespace-delimited word is used.

    The token is trimmed before being stored, so "SMITH , JOHN" and
    "SMITH, JOHN" produce the same key, and names that yield an empty token
    (e.g. ", JOHN") are skipped instead of contributing an empty string that
    would make unrelated snapshots appear to overlap.

    Args:
        lst: Iterable of owner names; items are converted with ``str``.

    Returns:
        set of non-empty, upper-cased tokens.
    """
    out = set()
    for n in lst:
        n = str(n).strip()
        if not n:
            continue
        head = n.split(",", 1)[0] if "," in n else n.split(None, 1)[0]
        key = head.strip().upper()
        if key:
            out.add(key)
    return out

long["surnames_set"] = long["owners_list"].map(surname_set)

# Raw turnover starts as NaN for every row; has_pct records whether the
# snapshot carries at least one non-missing percentage.
long["turnover"] = np.nan
long["has_pct"] = long["pcts_num"].map(lambda vals: any(pd.notna(v) for v in vals))

# Compare each snapshot with the one immediately before it for the same
# facility; ascending index order is used as the sequence.
pairs = 0
for idxs in long.groupby("cms_certification_number").groups.values():
    ordered = sorted(idxs)
    for prev_idx, curr_idx in zip(ordered, ordered[1:]):
        pairs += 1
        long.loc[curr_idx, "turnover"] = pct_turnover(
            long.loc[prev_idx],
            long.loc[curr_idx],
            look_through=LOOK_THROUGH_LLC,
        )
print("[turnover] computed for pairs:", pairs, " non-NA:", int(long["turnover"].notna().sum()))

# ────────────────────────────────────────────────────────────────────────────────
# 5) A — cautious names-only heuristic when BOTH sides lack %s
# ────────────────────────────────────────────────────────────────────────────────
def surname_jaccard(a, b):
    """Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.

    Returns NaN when both sets are empty (the ratio is undefined); otherwise
    a float between 0.0 and 1.0.
    """
    if not (a or b):
        return np.nan
    shared = a & b
    combined = a | b
    return len(shared) / len(combined)

long["surname_overlap_prev"] = False
long["surname_overlap_ratio"] = np.nan
# Positional flag per row: True when neither this snapshot nor the previous
# one has any percentage. Index labels are used as positions here.
neither_has_pct = np.zeros(len(long), dtype=bool)

for idxs in long.groupby("cms_certification_number").groups.values():
    ordered = sorted(idxs)
    for prev_idx, curr_idx in zip(ordered, ordered[1:]):
        prev_has = bool(long.loc[prev_idx, "has_pct"])
        curr_has = bool(long.loc[curr_idx, "has_pct"])
        neither_has_pct[curr_idx] = not (prev_has or curr_has)

        prev_names = long.loc[prev_idx, "surnames_set"]
        curr_names = long.loc[curr_idx, "surnames_set"]
        long.loc[curr_idx, "surname_overlap_prev"] = bool(prev_names & curr_names)
        long.loc[curr_idx, "surname_overlap_ratio"] = surname_jaccard(prev_names, curr_names)

# Effective turnover:
#   * the numeric turnover is used whenever it exists;
#   * when it is NaN and neither side has percentages, a surname overlap
#     ratio of at most 0.25 is replaced by a proxy value of 75
#     (which still has to clear THRESH downstream);
#   * every other NaN stays NaN, so such rows are never auto-flagged.
long["eff_turnover"] = long["turnover"]
mask_nan_t = long["eff_turnover"].isna()
use_names = mask_nan_t & neither_has_pct
low_overlap = long["surname_overlap_ratio"] <= 0.25
long.loc[use_names & low_overlap, "eff_turnover"] = 75.0
# leave other NaNs as NaN (we do NOT auto-flag)

# ────────────────────────────────────────────────────────────────────────────────
# 6) Flag CHOWs (D + E)
# ────────────────────────────────────────────────────────────────────────────────
long["is_chow"] = False
long["chow_date"] = pd.NaT


def _dated_on_or_after_cutoff(idx):
    """True when row ``idx`` of ``long`` has a date that is not before CUTOFF_DATE."""
    when = long.loc[idx, "date"]
    return pd.notna(when) and when >= CUTOFF_DATE


def _mark_chow(idx):
    """Flag row ``idx`` of ``long`` as a CHOW dated at the row's own date."""
    long.loc[idx, ["is_chow", "chow_date"]] = [True, long.loc[idx, "date"]]


for idxs in long.groupby("cms_certification_number").groups.values():
    first, *later = sorted(idxs)

    # First snapshot counts as a "new home" CHOW only when it is dated on or
    # after the cutoff AND carries at least one owner percentage (this keeps
    # placeholder entries from being flagged).
    if _dated_on_or_after_cutoff(first) and sum(pd.notna(long.loc[first, "pcts_num"])) > 0:
        _mark_chow(first)

    # Later snapshots: dated on or after the cutoff and effective turnover
    # (numeric or names-only proxy) at or above THRESH.
    for idx in later:
        if not _dated_on_or_after_cutoff(idx):
            continue
        eff = long.loc[idx, "eff_turnover"]
        if pd.notna(eff) and eff >= THRESH:
            _mark_chow(idx)

print(f"[flags] total CHOW group rows: {int(long['is_chow'].sum()):,}")

# ────────────────────────────────────────────────────────────────────────────────
# 7) Diagnostics (optional but helpful)
# ────────────────────────────────────────────────────────────────────────────────
# Work on a copy of the flagged rows so the helper columns below do not
# leak into `long`.
flagged = long.loc[long["is_chow"]].copy()
flagged["year"] = pd.to_datetime(flagged["chow_date"]).dt.year
by_year = flagged.groupby("year").size().to_frame("flags")

# A flag came from the names-only heuristic when the raw turnover is missing
# but an effective turnover was still assigned.
raw_missing = flagged["turnover"].isna()
eff_present = flagged["eff_turnover"].notna()
flagged["via_name_heuristic"] = raw_missing & eff_present

# Per-year share of heuristic-driven flags, as a percentage with one decimal.
heuristic_share = flagged.groupby("year")["via_name_heuristic"].mean() * 100
diag = heuristic_share.round(1).to_frame("% via name heuristic")

print("\n[CHOW by year — after A–E fixes]")
print(by_year.join(diag, how="left").fillna(0))

# ────────────────────────────────────────────────────────────────────────────────
# 8) LONG → WIDE (is_chow, chow_date) and SAVE
# ────────────────────────────────────────────────────────────────────────────────
# Pivot the per-snapshot flags back to one row per facility and one column
# per group number. Group slots that have no row in `long` come back as
# missing values.
chow_mask = long.pivot(
    index="cms_certification_number", columns="grp_n", values="is_chow"
).eq(True)  # missing slots compare unequal to True, i.e. "not a CHOW"

# Build the "Yes"/"" text directly from the boolean mask. This avoids calling
# .fillna(False) on an object-dtype frame, which depends on pandas' deprecated
# silent downcasting and emits a FutureWarning.
w_is = pd.DataFrame(
    np.where(chow_mask, "Yes", ""),
    index=chow_mask.index,
    columns=[f"group{n}_is_chow" for n in chow_mask.columns],
)

w_cd = long.pivot(index="cms_certification_number", columns="grp_n", values="chow_date")
w_cd.columns = [f"group{n}_chow_date" for n in w_cd.columns]

# Left joins keep every facility from `sig`, including those with no rows in `long`.
sig_chow = (
    sig.set_index("cms_certification_number")
       .join(w_is, how="left")
       .join(w_cd, how="left")
       .reset_index()
)

# Reorder to keep your familiar sequence
def order_cols(df):
    """Return the output column order for a wide signatures frame.

    The list starts with the two identifier columns, followed — for each
    group number found in a ``group<N>_owners`` column, ascending — by that
    group's owners, pcts, roles, date, is_chow and chow_date columns.
    Names that are not present in ``df`` are omitted, and any column of
    ``df`` outside this scheme is not part of the result.
    """
    owner_pat = re.compile(r"group(\d+)_owners$")
    grp_nums = sorted(
        int(hit.group(1)) for hit in map(owner_pat.search, df.columns) if hit
    )

    per_group = ("owners", "pcts", "roles", "date", "is_chow", "chow_date")
    wanted = ["cms_certification_number", "provider_name"]
    wanted.extend(f"group{n}_{field}" for n in grp_nums for field in per_group)

    present = set(df.columns)
    return [name for name in wanted if name in present]

# Keep only the columns selected by order_cols, in that order, and write the
# result without the index.
ordered_columns = order_cols(sig_chow)
sig_chow = sig_chow[ordered_columns]
sig_chow.to_csv(OUT_PATH, index=False)
print(f"[save] wrote {OUT_PATH}")

INPUT : C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures.csv
OUTPUT: C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_groupflags.csv
[load] 14,075 rows, 66 cols
[groups] detected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
[align] owner/pct/role length mismatches: 0
[turnover] computed for pairs: 25693  non-NA: 14540
[flags] total CHOW group rows: 9,809

[CHOW by year — after A–E fixes]
      flags  % via name heuristic
year                             
2017   1314                  15.4
2018   1326                  27.1
2019   1685                  28.3
2020   1161                  35.3
2021   1565                  39.3
2022   1173                  10.4
2023   1014                  16.2
2024    543                  26.5
2025     28                  10.7


  w_is = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow").fillna(False)


[save] wrote C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_groupflags.csv
