In [10]:
import pandas as pd, numpy as np, re
from pathlib import Path

In [16]:
# ------------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------------
# Input: the facility "signature" table in wide layout (read with pd.read_csv below).
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
SIG_PATH = Path(r"C:\Users\wrthj\OneDrive\Documents\Honors_Thesis\data\data\facility_signatures.csv")
# Output: written next to the input file, same directory, different file name.
OUT_PATH = SIG_PATH.with_name("facility_signatures_groupflags.csv")

# A group is flagged when its turnover versus the previous group is >= THRESH.
THRESH      = 50.0                         # % equity turnover to mark ChOW
# Groups dated before this timestamp are never flagged.
CUTOFF_DATE = pd.Timestamp("2017-01-01")   # ignore earlier changes
# When True, a group sharing a surname token with the previous group is not
# flagged by the turnover rule.
USE_SURNAME_OVERRIDE = True                # turn off ChOW if family continuity

In [17]:
# ------------------------------------------------------------------
# 1. LOAD signature-wide table
#    (drop any old chow columns if present)
# ------------------------------------------------------------------
# Read the wide table; the CCN is kept as a string column rather than parsed
# as a number.
sig = pd.read_csv(SIG_PATH, dtype={"cms_certification_number":"string"}, low_memory=False)
print(f"[load] {sig.shape[0]:,} rows, {sig.shape[1]} cols.")

# Defensive clean-up: discard every column whose name begins with "chow".
is_legacy_chow = sig.columns.str.startswith("chow")
sig = sig.loc[:, ~is_legacy_chow]
print(f"[clean] after dropping legacy chow cols: {sig.shape[1]} columns.")

# Group numbers are inferred from the columns named "...group<N>_owners".
group_pat = re.compile(r"group(\d+)_owners$")
owner_col_matches = (group_pat.search(col) for col in sig.columns)
group_nums = sorted(int(m.group(1)) for m in owner_col_matches if m)
print(f"[groups] detected: {group_nums}")

[load] 13,893 rows, 50 cols.
[clean] after dropping legacy chow cols: 50 columns.
[groups] detected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [27]:

# =============================================================================
# WIDE → LONG
# =============================================================================
def sig_wide_to_long(sig_df, group_nums):
    """Reshape the wide signature table into one row per (facility, group).

    Parameters
    ----------
    sig_df : DataFrame
        Must contain ``cms_certification_number`` and ``provider_name``; the
        ``group<N>_owners`` / ``group<N>_pcts`` / ``group<N>_date`` columns are
        read with ``.get`` so an absent column is treated as missing.
    group_nums : iterable of int
        Group numbers ``N`` to extract for every facility.

    Returns
    -------
    DataFrame
        Columns ``cms_certification_number, provider_name, grp_n, owners, pcts,
        date_str, date``. ``date`` is ``date_str`` parsed with
        ``pd.to_datetime(errors="coerce")`` (unparseable values become NaT).
        A group is omitted only when both its owners and its date are
        missing or the empty string.
    """
    def _blank(value):
        # Missing (NaN/None/NA) or an empty string.
        return pd.isna(value) or value == ""

    rows = []
    for _, facility in sig_df.iterrows():
        facility_id = facility["cms_certification_number"]
        facility_name = facility["provider_name"]
        for grp in group_nums:
            owners, pcts, date = (
                facility.get(f"group{grp}_{field}") for field in ("owners", "pcts", "date")
            )
            if _blank(owners) and _blank(date):
                continue
            rows.append((facility_id, facility_name, grp, owners, pcts, date))

    long_df = pd.DataFrame(
        rows,
        columns=["cms_certification_number","provider_name","grp_n","owners","pcts","date_str"],
    )
    long_df["date"] = pd.to_datetime(long_df["date_str"], errors="coerce")
    return long_df

# Build the long table: one row per (facility, group) for every detected group
# number; later cells add columns to this frame in place.
long = sig_wide_to_long(sig, group_nums)
print(f"[long] {len(long):,} group rows.")

[long] 38,092 group rows.


In [28]:
# =============================================================================
# PARSE OWNERS & %s
# =============================================================================
def split_pipe(s):
    """Split a pipe-delimited cell into a list of whitespace-trimmed tokens.

    Missing values and the empty string yield ``[]``. Anything else is
    converted with ``str()`` and split on ``"|"``; empty tokens between
    consecutive pipes are kept as ``""``.
    """
    is_blank = pd.isna(s) or s == ""
    return [] if is_blank else list(map(str.strip, str(s).split("|")))

# First number in the text: either digits with an optional fractional part,
# or a bare fraction such as ".5". The second alternative is required because
# "\d+(\.\d+)?" alone would skip the leading dot and read ".5" as 5.
pct_re = re.compile(r"(\d+(?:\.\d+)?|\.\d+)")
def pct_to_float(s):
    """Extract the first number found in ``s`` as a float.

    ``s`` is converted with ``str()`` first, so any value is accepted.
    Returns ``np.nan`` when the text contains no number (for example a
    textual placeholder instead of a percentage).
    """
    m = pct_re.search(str(s))
    return float(m.group(1)) if m else np.nan

# Tokenise the pipe-delimited owner and percentage cells, then convert each
# percentage token to a float (NaN where no number is present).
owner_tokens = long["owners"].apply(split_pipe)
pct_tokens = long["pcts"].apply(split_pipe)

long["owners_list"] = owner_tokens
long["pcts_list"]   = pct_tokens
long["pcts_num"]    = long["pcts_list"].apply(lambda tokens: [pct_to_float(tok) for tok in tokens])

In [29]:
# =============================================================================
# TURNOVER vs PREVIOUS GROUP (numeric % only)
# =============================================================================
def pct_turnover(prev, curr):
    """Return the ownership turnover between two group rows, in percent points.

    ``prev`` and ``curr`` are indexable by ``"owners_list"`` and ``"pcts_num"``
    (parallel sequences). Owners whose percentage is NaN are ignored. The
    result is half the sum of absolute per-owner percentage changes, treating
    an owner absent from one side as holding 0 there. If neither side has any
    numeric percentage the result is NaN; if only one side does, the result
    is half of that side's total.
    """
    def _numeric_shares(group):
        # owner -> percentage, keeping only owners with a numeric percentage
        shares = {}
        for owner, pct in zip(group["owners_list"], group["pcts_num"]):
            if np.isnan(pct):
                continue
            shares[owner] = pct
        return shares

    before = _numeric_shares(prev)
    after = _numeric_shares(curr)
    if not (before or after):
        return np.nan

    owners = set(before) | set(after)
    total_shift = sum(abs(after.get(o, 0.0) - before.get(o, 0.0)) for o in owners)
    return total_shift / 2.0

# Order each facility's groups by date (then group number) and renumber the
# index so that index order equals that sort order.
long = long.sort_values(["cms_certification_number","date","grp_n"]).reset_index(drop=True)

# Turnover of every group relative to the one immediately before it within the
# same facility; the first group of a facility keeps NaN.
long["turnover"] = np.nan
for facility_rows in long.groupby("cms_certification_number").groups.values():
    ordered = sorted(facility_rows)
    for prev_idx, curr_idx in zip(ordered, ordered[1:]):
        long.loc[curr_idx, "turnover"] = pct_turnover(long.loc[prev_idx], long.loc[curr_idx])

In [31]:
# =============================================================================
# SURNAME CONTINUITY
# =============================================================================
def surname_set(lst):
    """Return the set of upper-cased "surname" tokens for a list of owner names.

    For a name containing a comma the token is the text before the first
    comma; otherwise it is the first whitespace-separated word. Blank names
    are skipped, and so are names whose token is empty (e.g. ``", JOHN"``) —
    an empty token would otherwise match any other empty token and report a
    spurious overlap between unrelated groups.

    NOTE(review): for comma-less names written "First Last" the token is the
    given name, not the family name — confirm the input naming convention.
    """
    surnames = set()
    for name in lst:
        name = name.strip()
        if not name:
            continue
        if "," in name:
            token = name.split(",")[0].strip()
        else:
            token = name.split()[0]
        if token:
            surnames.add(token.upper())
    return surnames

# Surname tokens per group, and whether a group shares at least one token with
# the group immediately before it in the same facility. The overlap column
# stays all-False when the override is switched off.
long["surnames_set"] = long["owners_list"].apply(surname_set)
long["surname_overlap_prev"] = False
if USE_SURNAME_OVERRIDE:
    for facility_rows in long.groupby("cms_certification_number").groups.values():
        ordered = sorted(facility_rows)
        for prev_idx, curr_idx in zip(ordered, ordered[1:]):
            shared = long.loc[prev_idx, "surnames_set"] & long.loc[curr_idx, "surnames_set"]
            long.loc[curr_idx, "surname_overlap_prev"] = bool(shared)

In [33]:
# =============================================================================
# APPLY CHOW RULES
# =============================================================================
long["is_chow"] = False

# Baseline rule: the lowest-index row of each facility (its first group in the
# current row order) is flagged when it has a date on or after the cutoff.
for facility_rows in long.groupby("cms_certification_number").groups.values():
    first_i = min(facility_rows)
    start = long.loc[first_i, "date"]
    if pd.isna(start) or start < CUTOFF_DATE:
        continue
    long.loc[first_i, "is_chow"] = True

# Turnover rule for the remaining groups: turnover at or above the threshold,
# dated on or after the cutoff, and (when the override is on) no surname
# shared with the previous group.
meets_threshold = long["turnover"] >= THRESH
on_or_after_cutoff = long["date"] >= CUTOFF_DATE
mask = meets_threshold & on_or_after_cutoff
if USE_SURNAME_OVERRIDE:
    mask = mask & ~long["surname_overlap_prev"]
long.loc[mask, "is_chow"] = True

# A flagged group's ChOW date is its own group date; unflagged rows stay NaT.
long["chow_date"] = pd.NaT
flagged = long["is_chow"]
long.loc[flagged, "chow_date"] = long.loc[flagged, "date"]

print(f"[chow] groups flagged: {long['is_chow'].sum():,}")

[chow] groups flagged: 7,089


In [35]:
# =============================================================================
# LONG → WIDE FLAGS
# =============================================================================
# Pivot the per-group flag back to one row per facility, one column per group.
# Groups a facility never had come out of the pivot as NaN. `.eq(True)` turns
# those into False directly, which avoids calling `.fillna(False)` on an
# object-dtype frame (the call the notebook's warning output points at).
flag_wide = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow")
# Guarantee a column for every detected group, even one with no rows in `long`,
# so the later column selection cannot fail on a missing flag column.
flag_wide = flag_wide.reindex(columns=group_nums)
w_is = pd.DataFrame(
    np.where(flag_wide.eq(True), "Yes", ""),   # "Yes" for flagged groups, "" otherwise
    index=flag_wide.index,
    columns=[f"group{n}_is_chow" for n in flag_wide.columns],
)

# Same reshape for the ChOW dates; unflagged or absent groups remain missing.
w_cd = long.pivot(index="cms_certification_number", columns="grp_n", values="chow_date")
w_cd = w_cd.reindex(columns=group_nums)
w_cd.columns = [f"group{n}_chow_date" for n in w_cd.columns]

# combine
# Attach the flag and date columns to the original wide table by CCN, keeping
# every row of `sig` (left joins), then restore the CCN as a regular column.
sig_by_ccn = sig.set_index("cms_certification_number")
with_flags = sig_by_ccn.join(w_is, how="left")
with_flags_and_dates = with_flags.join(w_cd, how="left")
sig_chow = with_flags_and_dates.reset_index()

# order columns: CCN, provider_name, then group blocks
def ordered_group_cols(sig_df, group_nums):
    """Return the output column order: the two ID columns, then one block per group.

    Each block is owners, pcts, date, is_chow, chow_date for that group number,
    in the order the numbers appear in ``group_nums``. ``sig_df`` is accepted
    but not consulted.
    """
    ordered = ["cms_certification_number", "provider_name"]
    for n in group_nums:
        ordered.extend((
            f"group{n}_owners",
            f"group{n}_pcts",
            f"group{n}_date",
            f"group{n}_is_chow",
            f"group{n}_chow_date",
        ))
    return ordered

# Keep only the ID columns and the per-group blocks, in their final order.
final_cols = ordered_group_cols(sig_chow, group_nums)
sig_chow = sig_chow.loc[:, final_cols]
print(f"[final] {sig_chow.shape[0]:,} rows × {sig_chow.shape[1]} cols.")

[final] 13,893 rows × 82 cols.


  w_is = long.pivot(index="cms_certification_number", columns="grp_n", values="is_chow").fillna(False)


In [24]:
# ------------------------------------------------------------------
# 8. Merge onto original signature-wide (sig)
# ------------------------------------------------------------------
# Left-join the wide flag and date tables onto the signature table by CCN,
# then turn the CCN index back into a column.
sig_indexed = sig.set_index("cms_certification_number")
merged = sig_indexed.join(w_is, how="left")
merged = merged.join(w_cd, how="left")
sig_chow = merged.reset_index()

# reorder: CCN, provider_name, then for each group: owners/pcts/date/is_chow/chow_date
def ordered_group_cols(sig_df, group_nums=None):
    """Return the output column order: the two ID columns, then one block per group.

    Parameters
    ----------
    sig_df : DataFrame
        Frame whose columns are inspected.
    group_nums : iterable of int, optional
        Group numbers to emit. When omitted they are detected from the columns
        of ``sig_df`` matching ``group<N>_owners`` and sorted ascending. The
        parameter is optional so both one- and two-argument calls work.

    Returns
    -------
    list of str
        ``cms_certification_number``, ``provider_name``, then for each group
        its owners / pcts / date / is_chow / chow_date columns. Per-group
        columns that are not present in ``sig_df`` (e.g. flag columns that
        never came through the join) are left out so that selecting the
        returned list does not raise; the two ID columns are always returned.
    """
    if group_nums is None:
        owners_pat = re.compile(r"group(\d+)_owners$")
        # search each column once and keep only the hits
        hits = (owners_pat.search(c) for c in sig_df.columns)
        group_nums = sorted(int(m.group(1)) for m in hits if m)

    present = set(sig_df.columns)
    cols = ["cms_certification_number", "provider_name"]
    for n in group_nums:
        block = [f"group{n}_owners", f"group{n}_pcts", f"group{n}_date",
                 f"group{n}_is_chow", f"group{n}_chow_date"]
        cols += [c for c in block if c in present]
    return cols

# Apply the final column order and report the resulting shape.
cols = ordered_group_cols(sig_chow)
sig_chow = sig_chow.loc[:, cols]

print(f"[final] facility_signatures_groupflags shape: {sig_chow.shape}")

[final] facility_signatures_groupflags shape: (13893, 82)


In [36]:
# ------------------------------------------------------------------
# 9. SAVE
# ------------------------------------------------------------------
# Write the flagged wide table to OUT_PATH as CSV, without the row index.
# An existing file at that path is overwritten.
sig_chow.to_csv(OUT_PATH, index=False)
print(f"[save] wrote {OUT_PATH}")

[save] wrote C:\Users\wrthj\OneDrive\Documents\Honors_Thesis\data\data\facility_signatures_groupflags.csv
