In [18]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

In [20]:
# ------------------------------------------------------------------
# 1. LOAD
# ------------------------------------------------------------------
# Input file; OUT_DIR is simply the directory that contains it.
CLEAN_PATH = Path(r"C:\Users\wrthj\OneDrive\Documents\Honors_Thesis\data\data\ownership_file_clean.csv")
OUT_DIR = CLEAN_PATH.parent
OUT_DIR.mkdir(exist_ok=True, parents=True)

# The CCN is read as a string so that leading zeros are preserved.
df = pd.read_csv(CLEAN_PATH, low_memory=False, dtype={"cms_certification_number": "string"})
print(f"[load] {len(df):,} rows, {df.shape[1]} columns.")

[load] 162,904 rows, 13 columns.


In [21]:
# ------------------------------------------------------------------
# 2. PARSE DATES (idempotent safe re-parse)
# ------------------------------------------------------------------
def _as_datetime(values):
    """Convert `values` with pd.to_datetime, coercing unparseable entries."""
    return pd.to_datetime(values, errors="coerce")


# `association_date` must exist (KeyError otherwise); `processing_date` is
# looked up with .get, so a missing column is passed on as None.
df["association_date"] = _as_datetime(df["association_date"])
df["processing_date"] = _as_datetime(df.get("processing_date"))

In [22]:
# ------------------------------------------------------------------
# 3. CANONICAL PROVIDER NAME PER CCN (latest non-null name)
# ------------------------------------------------------------------
def _latest_non_null(names: pd.Series):
    """Return the last non-null entry of `names`, or pd.NA if there is none."""
    present = names.dropna()
    return present.iat[-1] if len(present) else pd.NA


# Rows are ordered oldest -> newest before grouping, so the last non-null
# name inside each CCN group is the most recently processed one.
#   * na_position="first": rows whose processing_date is NaT must not be
#     treated as the "latest" record (the default puts NaT last).
#   * kind="stable": rows sharing a processing_date keep their file order,
#     so the chosen name does not depend on the sort algorithm.
canon_name = (
    df.sort_values("processing_date", na_position="first", kind="stable")
      .groupby("cms_certification_number")["provider_name"]
      .agg(_latest_non_null)
      .to_frame("provider_name")
)

In [23]:
# ------------------------------------------------------------------
# 4. FACILITY × ASSOCIATION_DATE GROUPS
#     collect owner & pct strings per (CCN, assoc_date)
# ------------------------------------------------------------------
def collect_group(sub: pd.DataFrame) -> pd.Series:
    """Summarise one group of ownership rows as two aligned strings.

    Parameters
    ----------
    sub : pd.DataFrame
        Rows of a single group; must contain the columns ``owner_name``
        and ``ownership_percentage``.

    Returns
    -------
    pd.Series
        ``owners``: owner names joined with " | ".
        ``pcts``:   ownership percentages joined with " | ", in the same
        row order as ``owners``.

    Notes
    -----
    Rows are ordered by owner name and then by the percentage text. The
    second key makes the result independent of the input row order when
    the same owner appears more than once, so equal ownership sets always
    yield equal strings. Values are rendered with ``astype(str)``.
    """
    pairs = pd.DataFrame(
        {
            "owner": sub["owner_name"],
            "pct": sub["ownership_percentage"].astype(str),
        }
    )
    # Primary key: owner name (missing names sort last);
    # tie-break: percentage text.
    pairs = pairs.sort_values(["owner", "pct"])

    owners = " | ".join(pairs["owner"].astype(str))
    pcts = " | ".join(pairs["pct"])
    return pd.Series({"owners": owners, "pcts": pcts})

# One row per (CCN, association_date) with the joined owner / pct strings.
# Only the two columns collect_group reads are handed to apply; running
# apply over the grouping columns as well is deprecated in recent pandas.
# dropna=False keeps groups whose CCN or association_date is missing.
fac_date = (
    df.groupby(["cms_certification_number", "association_date"], dropna=False, observed=True)
      [["owner_name", "ownership_percentage"]]
      .apply(collect_group)
      .reset_index()
      .sort_values(["cms_certification_number", "association_date"])
)
print(f"[fac_date] grouped rows: {len(fac_date):,}")

[fac_date] grouped rows: 39,778


  .apply(collect_group)


In [24]:
# ------------------------------------------------------------------
# 5. COLLAPSE DUPLICATE OWNER/PCT COMBOS WITHIN FACILITY
#    (keep earliest association_date for that combo)
# ------------------------------------------------------------------
# A combo is identified by the owners string and the pcts string together.
fac_date["combo_key"] = fac_date["owners"] + "||" + fac_date["pcts"]

# Order by date so that "first" means the earliest occurrence of a combo.
fac_unique = fac_date.sort_values("association_date")
fac_unique = fac_unique.drop_duplicates(
    subset=["cms_certification_number", "combo_key"],
    keep="first",
)
fac_unique = fac_unique.reset_index(drop=True)
fac_unique = fac_unique.sort_values(["cms_certification_number", "association_date"])

print(f"[fac_unique] unique combos: {len(fac_unique):,}")

[fac_unique] unique combos: 39,321


In [25]:
# ------------------------------------------------------------------
# 6. SPLIT VALID vs MISSING CCN
# ------------------------------------------------------------------
has_ccn = fac_unique["cms_certification_number"].notna()
fac_missing = fac_unique[~has_ccn].copy()
fac_valid = fac_unique[has_ccn].copy()
print(f"[split] valid rows: {len(fac_valid):,}  |  missing CCN rows: {len(fac_missing):,}")

# ------------------------------------------------------------------
# 7. ASSIGN GROUP NUMBERS (chronological within each CCN)
# ------------------------------------------------------------------
# After sorting, the running count inside each CCN (starting at 1) is the
# group number of that ownership combo.
fac_valid = fac_valid.sort_values(["cms_certification_number", "association_date"])
fac_valid["grp_n"] = (
    fac_valid.groupby("cms_certification_number").cumcount() + 1
).astype(int)

# Sanity check: no (CCN, grp_n) pair should occur more than once.
dups = fac_valid.duplicated(subset=["cms_certification_number", "grp_n"], keep=False)
print(f"[check] duplicate (CCN, grp_n) pairs: {dups.sum()}")

[split] valid rows: 38,092  |  missing CCN rows: 1,229
[check] duplicate (CCN, grp_n) pairs: 0


In [26]:
# ------------------------------------------------------------------
# 8. PIVOT WIDE
# ------------------------------------------------------------------
def _pivot_groups(long_df: pd.DataFrame, value_col: str, suffix: str) -> pd.DataFrame:
    """Spread `value_col` into one column per group number.

    Parameters
    ----------
    long_df : pd.DataFrame
        Long table with `cms_certification_number`, `grp_n` and `value_col`.
        Each (CCN, grp_n) pair must be unique; DataFrame.pivot raises
        ValueError otherwise.
    value_col : str
        Column whose values fill the wide table.
    suffix : str
        Output columns are named ``group{grp_n}_{suffix}``.

    Returns
    -------
    pd.DataFrame
        Indexed by CCN, one column per group number present in `long_df`.
    """
    # pivot (unlike pivot_table with its default dropna=True) keeps a group
    # column even when every value in it is missing, so the owners / pcts /
    # date tables always expose the same set of group numbers.
    wide = long_df.pivot(index="cms_certification_number", columns="grp_n", values=value_col)
    wide.columns = [f"group{i}_{suffix}" for i in wide.columns]
    return wide


w_owners = _pivot_groups(fac_valid, "owners", "owners")
w_pcts = _pivot_groups(fac_valid, "pcts", "pcts")
w_dates = _pivot_groups(fac_valid, "association_date", "date")

In [27]:
# ------------------------------------------------------------------
# 9. COMBINE + ADD PROVIDER NAME
# ------------------------------------------------------------------
# The three wide tables share the CCN index; the canonical provider name is
# attached by index as well, then the CCN becomes a regular column.
facility_signatures = (
    pd.concat([w_owners, w_pcts, w_dates], axis=1)
      .merge(canon_name, left_index=True, right_index=True, how="left")
      .reset_index()
)

# ORDER COLUMNS
# Group numbers are read back from the column names (group<N>_...), using the
# `re` module imported at the top of the notebook.
group_pattern = re.compile(r"group(\d+)_")
group_nums = sorted(
    {
        int(found.group(1))
        for found in map(group_pattern.search, facility_signatures.columns)
        if found
    }
)

cols = ["cms_certification_number", "provider_name"]
for n in group_nums:
    cols += [f"group{n}_owners", f"group{n}_pcts", f"group{n}_date"]

# A group can lack one of its three columns (e.g. a date column dropped
# upstream because it was entirely missing). Create it empty instead of
# failing with KeyError in the selection below. The two identifier columns
# are deliberately NOT back-filled: their absence is a real error.
for name in cols[2:]:
    if name not in facility_signatures.columns:
        facility_signatures[name] = pd.NaT if name.endswith("_date") else pd.NA

facility_signatures = facility_signatures[cols]
print(f"[final] facility_signatures shape: {facility_signatures.shape}")

[final] facility_signatures shape: (13893, 50)


In [29]:
# ------------------------------------------------------------------
# 10. SAVE OUTPUTS
# ------------------------------------------------------------------
out_sig = OUT_DIR.joinpath("facility_signatures.csv")
out_miss = OUT_DIR.joinpath("facility_signatures_missing_ccn.csv")

# Both tables are written without the DataFrame index.
for frame, target in ((facility_signatures, out_sig), (fac_missing, out_miss)):
    frame.to_csv(target, index=False)

print(f"[save] wrote:\n  {out_sig}\n  {out_miss}")

[save] wrote:
  C:\Users\wrthj\OneDrive\Documents\Honors_Thesis\data\data\facility_signatures.csv
  C:\Users\wrthj\OneDrive\Documents\Honors_Thesis\data\data\facility_signatures_missing_ccn.csv


In [28]:
# Spot check: print the wide record of one facility, transposed so each
# column shows up on its own line.
ccn_test = "015009"
is_target = facility_signatures["cms_certification_number"] == ccn_test
row = facility_signatures[is_target]
if row.empty:
    print(f"\n[check] CCN {ccn_test} not found in facility_signatures.")
else:
    print(f"\n[check] CCN {ccn_test}:\n", row.T)


[check] CCN 015009:
                                                          0
cms_certification_number                            015009
provider_name                     BURNS NURSING HOME, INC.
group1_owners             DEARMAN, LARRY | DEARMAN, MARTHA
group1_pcts                                      10% | 81%
group1_date                            1969-09-01 00:00:00
group2_owners                             DEARMAN, CAMERON
group2_pcts                                             5%
group2_date                            2012-01-25 00:00:00
group3_owners                                          NaN
group3_pcts                                            NaN
group3_date                                            NaT
group4_owners                                          NaN
group4_pcts                                            NaN
group4_date                                            NaT
group5_owners                                          NaN
group5_pcts                       