In [1]:
# ────────────────────────────────────────────────────────────────────────────────
# 0.  Portable path setup  (NO src import required)
# ────────────────────────────────────────────────────────────────────────────────
import os, re, pathlib, pandas as pd, numpy as np

# Walk upward from the current working directory until we reach the repo root,
# i.e. the first folder that contains a "data" directory.
PROJECT_ROOT = pathlib.Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

# The loop also stops when it runs out of parents. Without this guard the
# search would silently settle on the filesystem root and the mkdir below
# would create <root>/data/interim.
if not (PROJECT_ROOT / "data").is_dir():
    raise FileNotFoundError(
        f"No 'data' directory found in {pathlib.Path.cwd()} or any of its parents; "
        "run this notebook from inside the repository."
    )

# Shared raw-data dir (OneDrive) comes from the NH_DATA_DIR env var;
# when it is unset, fall back to <repo>/data/raw.
RAW_DIR      = pathlib.Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
OWN_DIR      = RAW_DIR / "ownership-files"              # extraction outputs
INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"        # all cleaned / interim
INTERIM_DIR.mkdir(parents=True, exist_ok=True)          # create on first run

print("RAW_DIR :", RAW_DIR)
print("INTERIM :", INTERIM_DIR)

RAW_DIR : C:\Users\Owner\OneDrive\NursingHomeData
INTERIM : C:\Repositories\white_bowblis_nhmc\data\interim


In [2]:
# ────────────────────────────────────────────────────────────────────────────────
# 1. Load the clean ownership table from step 02
# ────────────────────────────────────────────────────────────────────────────────
CLEAN_PATH = INTERIM_DIR / "ownership_file_clean.csv"

# Fail fast, reporting the offending path, when the step-02 output is absent.
if not CLEAN_PATH.exists():
    raise FileNotFoundError(CLEAN_PATH)

# Read options: keep the CCN column as text (pandas "string" dtype) and parse
# both date columns to datetimes while loading.
_read_options = dict(
    dtype={"cms_certification_number": "string"},
    parse_dates=["processing_date", "association_date"],
    low_memory=False,
)
df = pd.read_csv(CLEAN_PATH, **_read_options)
print(f"[load] {len(df):,} rows, {df.shape[1]} columns")

[load] 162,904 rows, 12 columns


In [3]:
# ────────────────────────────────────────────────────────────────────────────────
# 2. Canonical provider name (latest non-null per CCN)
# ────────────────────────────────────────────────────────────────────────────────
def _latest_non_null(names):
    """Return the last non-null entry of `names`, or pd.NA if every entry is null."""
    present = names.dropna()
    return present.iat[-1] if len(present) else pd.NA


# Rows are ordered by processing_date first, so "last" within each CCN group
# means the value from the latest processing date.
canon_name = (
    df.sort_values("processing_date")
      .groupby("cms_certification_number", observed=True)["provider_name"]
      .agg(_latest_non_null)
      .to_frame("provider_name")
)

In [4]:
# ────────────────────────────────────────────────────────────────────────────────
# 3. Collapse each (CCN, association_date) → owners / pcts strings
# ────────────────────────────────────────────────────────────────────────────────
def collect(group):
    """Collapse one group of ownership rows into owner / percentage strings.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain the ``owner_name`` and
        ``ownership_percentage`` columns.

    Returns
    -------
    pandas.Series
        Two entries, ``owners`` and ``pcts``: the group's owner names and
        ownership percentages, each cast to ``str`` and joined with ``" | "``,
        in matching row order.
    """
    # Sort by owner name and break ties on the percentage's string form.
    # Sorting on the name alone leaves tied rows in incoming order, so the same
    # set of (owner, pct) rows could yield different strings on different dates.
    ordered = (
        group.assign(_pct_key=group["ownership_percentage"].astype(str))
             .sort_values(["owner_name", "_pct_key"])
    )
    owners = " | ".join(ordered["owner_name"].astype(str))
    pcts   = " | ".join(ordered["ownership_percentage"].astype(str))
    return pd.Series({"owners": owners, "pcts": pcts})

# One output row per (CCN, association_date). dropna=False keeps groups whose
# key is missing. Only the two columns collect() reads are selected before
# .apply, so the function no longer operates on the grouping columns
# (deprecated in recent pandas); the result is unchanged.
fac_date = (
    df.groupby(["cms_certification_number", "association_date"],
               observed=True, dropna=False)[["owner_name", "ownership_percentage"]]
      .apply(collect)
      .reset_index()
)
print(f"[fac_date] grouped rows: {len(fac_date):,}")

[fac_date] grouped rows: 39,742


  .apply(collect)


In [6]:
# ────────────────────────────────────────────────────────────────────────────────
# 4. Remove duplicate owner/pct combos within facility
# ────────────────────────────────────────────────────────────────────────────────
# A facility's ownership "signature" is its owners string plus its pcts string.
_dedupe_cols = ["cms_certification_number", "combo_key"]
fac_date["combo_key"] = fac_date["owners"] + "||" + fac_date["pcts"]

# Order by association_date so that, for each (CCN, signature) pair, the row
# kept by keep="first" is the one with the earliest date.
_chronological = fac_date.sort_values("association_date")
fac_unique = _chronological.drop_duplicates(subset=_dedupe_cols, keep="first")
fac_unique = fac_unique.reset_index(drop=True)
print(f"[fac_unique] unique combos: {len(fac_unique):,}")

[fac_unique] unique combos: 39,289


In [8]:
# ────────────────────────────────────────────────────────────────────────────────
# 5. Split rows with / without CCN and assign group numbers
# ────────────────────────────────────────────────────────────────────────────────
# Partition on whether the CCN is present.
_has_ccn = fac_unique["cms_certification_number"].notna()
fac_missing = fac_unique[~_has_ccn].copy()

# Within each CCN, number the rows 1, 2, 3, ... in association_date order.
fac_valid = (
    fac_unique[_has_ccn]
    .copy()
    .sort_values(["cms_certification_number", "association_date"])
)
fac_valid["grp_n"] = fac_valid.groupby("cms_certification_number").cumcount().add(1)

# Each (CCN, grp_n) pair must be unique before the data is pivoted.
_pair_repeated = fac_valid.duplicated(["cms_certification_number", "grp_n"])
assert not _pair_repeated.any()

In [9]:
# ────────────────────────────────────────────────────────────────────────────────
# 6. Pivot wide: one row per CCN
# ────────────────────────────────────────────────────────────────────────────────
def _pivot_by_group(value_col, suffix):
    """Pivot one fac_valid column to a frame indexed by CCN.

    The result has one column per ``grp_n`` value, named ``group<N>_<suffix>``.
    """
    wide = fac_valid.pivot(index="cms_certification_number", columns="grp_n", values=value_col)
    wide.columns = [f"group{n}_{suffix}" for n in wide.columns]
    return wide


w_owners = _pivot_by_group("owners", "owners")
w_pcts   = _pivot_by_group("pcts", "pcts")
w_dates  = _pivot_by_group("association_date", "date")

# Side-by-side join of the three wide frames (all indexed by CCN), then attach
# the canonical provider name; how="left" keeps every CCN present in the pivot.
facility_signatures = (
    pd.concat([w_owners, w_pcts, w_dates], axis=1)
      .merge(canon_name, left_index=True, right_index=True, how="left")
      .reset_index()
)

# Tidy column order: CCN, provider name, then owners / pcts / date per group.
# The group numbers come straight from grp_n, so there is no need to parse them
# back out of the generated column names.
group_nums = sorted(int(n) for n in fac_valid["grp_n"].unique())
cols = ["cms_certification_number", "provider_name"]
for n in group_nums:
    cols += [f"group{n}_owners", f"group{n}_pcts", f"group{n}_date"]
facility_signatures = facility_signatures[cols]

print(f"[final] facility_signatures shape: {facility_signatures.shape}")

[final] facility_signatures shape: (13893, 50)


In [10]:
# ────────────────────────────────────────────────────────────────────────────────
# 7. Save outputs
# ────────────────────────────────────────────────────────────────────────────────
out_sig = INTERIM_DIR / "facility_signatures.csv"
out_miss = INTERIM_DIR / "facility_signatures_missing_ccn.csv"

# Write the wide per-CCN table first, then the rows that had no CCN.
for _frame, _target in ((facility_signatures, out_sig), (fac_missing, out_miss)):
    _frame.to_csv(_target, index=False)

print(f"[save] wrote:\n  {out_sig}\n  {out_miss}")

# Quick sanity example: report whether two sample CCNs made it into the output.
for test_ccn in ["015009", "366123"]:
    row = facility_signatures.query("cms_certification_number == @test_ccn")
    status = "not found" if row.empty else "found"
    print(f"\n[check] CCN {test_ccn} {status}")

[save] wrote:
  C:\Repositories\white_bowblis_nhmc\data\interim\facility_signatures.csv
  C:\Repositories\white_bowblis_nhmc\data\interim\facility_signatures_missing_ccn.csv

[check] CCN 015009 found

[check] CCN 366123 found
