In [8]:
# ==============================================================================
# CMS Ownership — Build CHOW-ready monthly panel (highest-level snapshot per CCN-month)
# Priority: INDIRECT > DIRECT > PARTNERSHIP
# Output: one row per (cms_certification_number, processing month)
# Lists are ordered and aligned (name[i], pct[i], assoc_date[i], type[i], role[i])
# ==============================================================================

import os, re, json, hashlib, math
import numpy as np
import pandas as pd
from pathlib import Path

# ------------------------------------------------------------------------------
# 1) Paths
# ------------------------------------------------------------------------------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR     = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
OWN_DIR     = RAW_DIR / "ownership-files"
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
OWN_DIR.mkdir(parents=True, exist_ok=True)
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

IN_COMBINED = OWN_DIR / "ownership_combined.csv"                 # your big standardized file
OUT_PANEL   = INTERIM_DIR / "ownership_ccn_month_chow_ready_v2.csv"

print(f"[paths] OWN_DIR={OWN_DIR}")
print(f"[paths] INTERIM_DIR={INTERIM_DIR}")
print(f"[load]  {IN_COMBINED}")

# ------------------------------------------------------------------------------
# 2) Load
# ------------------------------------------------------------------------------
usecols = None  # or narrow if you want faster loads
df = pd.read_csv(
    IN_COMBINED,
    dtype={
        "cms_certification_number": "string",
        "role": "string",
        "owner_type": "string",
        "owner_name": "string",
        "ownership_percentage": "string",   # we'll coerce to numeric later
    },
    parse_dates=["association_date","processing_date"],
    low_memory=False,
    usecols=usecols,
)
print(f"[loaded] rows={len(df):,}, cols={df.shape[1]}")

# Basic hygiene
df["cms_certification_number"] = (
    df["cms_certification_number"].astype("string").str.replace(r"\D","",regex=True).str.zfill(6)
)

# Keep only the three role levels of interest
ROLE_PRIORITY = {"INDIRECT": 0, "DIRECT": 1, "PARTNERSHIP": 2}
df["role"] = df["role"].str.upper().str.strip()
df = df[df["role"].isin(ROLE_PRIORITY.keys())].copy()

# Structure by processing month (snapshot month)
df["month_ts"] = df["processing_date"].values.astype("datetime64[M]")

# ------------------------------------------------------------------------------
# 3) Owner name normalization & helpers
# ------------------------------------------------------------------------------
CORP_SUFFIX_RE = re.compile(
    r"\b(CORP(ORATION)?|INCORPORATED|INC|LLC|L\.L\.C\.|LP|L\.P\.|LLP|CO|COMPANY|HOLDINGS?|INVESTMENTS?)\b\.?",
    re.I
)
PUNCT_RE = re.compile(r"[,\.&]")

def norm_owner_name(name: str) -> str:
    s = str(name).upper().strip()
    s = PUNCT_RE.sub(" ", s)
    s = CORP_SUFFIX_RE.sub("", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def choose_top_level(roles: pd.Series) -> str | None:
    present = [r for r in roles.dropna().astype(str).str.upper().unique() if r in ROLE_PRIORITY]
    if not present:
        return None
    return sorted(present, key=lambda r: ROLE_PRIORITY[r])[0]

def json_list(values, as_date=False):
    # Produce JSON array with None for NaNs; if as_date, cast to ISO date
    out = []
    if as_date:
        vals = pd.to_datetime(values, errors="coerce")
        for v in vals:
            out.append(None if pd.isna(v) else v.date().isoformat())
    else:
        for v in values:
            if isinstance(v, float) and np.isnan(v):
                out.append(None)
            elif pd.isna(v):
                out.append(None)
            else:
                out.append(v)
    return json.dumps(out, ensure_ascii=False)

def pct_fill_tag(pcts) -> str:
    s = pd.Series(pcts, dtype="float64")
    if s.empty or s.notna().sum()==0:
        return "all_null"
    if s.notna().sum()==len(s):
        return "none_null"
    return "some_null"

def make_snapshot_id(ccn: str, month: pd.Timestamp, used_level: str, names: list, pcts: list) -> str:
    """Deterministic fingerprint of the snapshot for change tracking."""
    payload = json.dumps({
        "ccn": str(ccn),
        "month": None if pd.isna(month) else pd.to_datetime(month).date().isoformat(),
        "level": used_level,
        "owners": names,
        "pcts": [None if (x is None or (isinstance(x,float) and math.isnan(x))) else float(x) for x in pcts],
    }, ensure_ascii=False)
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()

# ------------------------------------------------------------------------------
# 4) Build a single highest-level snapshot per (CCN, month)
#    - Pick highest role available that month
#    - Normalize names; dedupe by normalized name (sum %s, earliest assoc_date)
#    - Stable sort: by % desc, then name_norm asc (ensures aligned lists)
# ------------------------------------------------------------------------------
rows = []
grouped = df.groupby(["cms_certification_number","month_ts"], sort=True, as_index=False)

for (ccn, month), g in grouped:
    used = choose_top_level(g["role"])
    if used is None:
        continue

    gm = g[g["role"]==used].copy()

    # Normalize names; coerce percentage to numeric
    gm["name_norm"] = gm["owner_name"].map(norm_owner_name)
    gm["pct_num"]   = pd.to_numeric(gm["ownership_percentage"], errors="coerce")

    # Association date: if missing, fall back to processing_date
    if "association_date" in gm.columns:
        gm["association_date"] = gm["association_date"].fillna(gm["processing_date"])
    else:
        gm["association_date"] = gm["processing_date"]

    # Aggregate to normalized owner: sum %; earliest assoc date; modal type/role
    agg = (gm.groupby("name_norm", dropna=False, as_index=False)
             .agg(
                 owner_name=("owner_name","first"),
                 owner_type=("owner_type", lambda s: s.mode().iloc[0] if not s.mode().empty else s.iloc[0]),
                 role=("role", lambda s: s.mode().iloc[0] if not s.mode().empty else s.iloc[0]),
                 pct_sum=("pct_num","sum"),
                 assoc_min=("association_date","min"),
             ))

    # Stable order inside the month: percentage desc, then normalized name asc
    agg = agg.sort_values(["pct_sum","name_norm"], ascending=[False, True], kind="mergesort").reset_index(drop=True)

    # Prepare aligned lists
    names_raw   = agg["owner_name"].tolist()
    names_norm  = agg["name_norm"].tolist()
    types_list  = agg["owner_type"].tolist()
    roles_list  = agg["role"].tolist()
    pcts_list   = agg["pct_sum"].astype(float).tolist()
    assoc_list  = agg["assoc_min"].tolist()

    # Derived fields
    tag = pct_fill_tag(pcts_list)
    total_pct = float(np.nansum(pcts_list)) if len(pcts_list) else np.nan
    has_pct = any(pd.notna(pcts_list))

    # Optional human-readable summary ("A: 40%; B: 30%; ...")
    def fmt_pct(x):
        return "NA" if pd.isna(x) else f"{float(x):.2f}%"
    owner_summary = " | ".join([f"{n}: {fmt_pct(p)}" for n,p in zip(names_norm, pcts_list)])

    snapshot_id = make_snapshot_id(ccn, month, used, names_norm, pcts_list)

    rows.append({
        "cms_certification_number": str(ccn).zfill(6),
        "month_ts": pd.to_datetime(month),
        "processing_date": pd.to_datetime(month).date().isoformat(),   # month anchor
        "used_level": used,
        "n_owners": int(len(agg)),

        # JSON lists (aligned order)
        "owner_names": json_list(names_raw),
        "name_norms": json_list(names_norm),
        "owner_types": json_list(types_list),
        "roles": json_list(roles_list),
        "ownership_percentages": json.dumps(
            [None if pd.isna(x) else float(x) for x in pcts_list], ensure_ascii=False
        ),
        "association_dates": json_list(assoc_list, as_date=True),

        # Diagnostics
        "pct_fill_tag": tag,
        "total_pct": total_pct,
        "has_percent": has_pct,
        "owner_summary": owner_summary,

        # Fingerprint for change detection
        "snapshot_id": snapshot_id,
    })

panel = pd.DataFrame(rows).sort_values(["cms_certification_number","month_ts"]).reset_index(drop=True)

# Strong dtypes
panel["cms_certification_number"] = panel["cms_certification_number"].astype(str).str.zfill(6)
panel["month_ts"] = pd.to_datetime(panel["month_ts"], errors="coerce")

# ------------------------------------------------------------------------------
# 5) Save + quick QC
# ------------------------------------------------------------------------------
panel.to_csv(OUT_PANEL, index=False)
print(f"[save] CHOW-ready panel → {OUT_PANEL}  (rows={len(panel):,})")

print("\n=== QC snapshot ===")
print("Distinct CCNs:", panel["cms_certification_number"].nunique())
years = sorted(panel["month_ts"].dropna().dt.year.unique().tolist())
print("Years covered:", (years[:5] + ["..."] + years[-5:]) if len(years) > 10 else years)

with pd.option_context("display.max_colwidth", 120):
    print("\nHead:")
    print(panel.head(3).to_string(index=False))

[paths] OWN_DIR=C:\Users\Owner\OneDrive\NursingHomeData\ownership-files
[paths] INTERIM_DIR=C:\Repositories\white-bowblis-nhmc\data\interim
[load]  C:\Users\Owner\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv
[loaded] rows=5,075,773, cols=7
[save] CHOW-ready panel → C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_month_chow_ready_v2.csv  (rows=1,173,176)

=== QC snapshot ===
Distinct CCNs: 14075
Years covered: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Head:
cms_certification_number   month_ts processing_date used_level  n_owners  owner_names  name_norms    owner_types      roles ownership_percentages association_dates pct_fill_tag  total_pct  has_percent    owner_summary                              snapshot_id
                  005125 2018-10-01      2018-10-01     DIRECT         1 ["VO, TIEN"] ["VO TIEN"] ["Individual"] ["DIRECT"]               [100.0]    ["2016-10-24"]    none_null      100.0         True VO TIEN: 100.00% 2537ebced62eaf4

In [9]:
df2 = pd.read_csv(INTERIM_DIR / "ownership_ccn_month_chow_ready_v2.csv")

In [11]:
print("Rows:", len(df2))
print("Columns:", len(df2.columns))
print("Column names:", df2.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1173176 entries, 0 to 1173175
Data columns (total 16 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   cms_certification_number  1173176 non-null  int64  
 1   month_ts                  1173176 non-null  object 
 2   processing_date           1173176 non-null  object 
 3   used_level                1173176 non-null  object 
 4   n_owners                  1173176 non-null  int64  
 5   owner_names               1173176 non-null  object 
 6   name_norms                1173176 non-null  object 
 7   owner_types               1173176 non-null  object 
 8   roles                     1173176 non-null  object 
 9   ownership_percentages     1173176 non-null  object 
 10  association_dates         1173176 non-null  object 
 11  pct_fill_tag              1173176 non-null  object 
 12  total_pct                 1173176 non-null  float64
 13  has_percent               1

In [12]:
# first 3 rows
print(df2.head(3).to_string(index=False))
# last 3 rows
print(df2.tail(3).to_string(index=False))

 cms_certification_number   month_ts processing_date used_level  n_owners  owner_names  name_norms    owner_types      roles ownership_percentages association_dates pct_fill_tag  total_pct  has_percent    owner_summary                              snapshot_id
                     5125 2018-10-01      2018-10-01     DIRECT         1 ["VO, TIEN"] ["VO TIEN"] ["Individual"] ["DIRECT"]               [100.0]    ["2016-10-24"]    none_null      100.0         True VO TIEN: 100.00% 2537ebced62eaf4a57584e08b510ce51842a7f40
                     5125 2018-11-01      2018-11-01     DIRECT         1 ["VO, TIEN"] ["VO TIEN"] ["Individual"] ["DIRECT"]               [100.0]    ["2016-10-24"]    none_null      100.0         True VO TIEN: 100.00% d8326acf851f149d5a47185218e00e982db2217e
                     5125 2018-12-01      2018-12-01     DIRECT         1 ["VO, TIEN"] ["VO TIEN"] ["Individual"] ["DIRECT"]               [100.0]    ["2016-10-24"]    none_null      100.0         True VO TIEN: 100.00% 7

In [13]:
print("Distinct CCNs:", df2["cms_certification_number"].nunique())
print("Date range:", df2["month_ts"].min(), "→", df2["month_ts"].max())
print("Levels used:", df2["used_level"].value_counts(dropna=False))

Distinct CCNs: 14075
Date range: 2017-01-01 → 2025-07-01
Levels used: used_level
DIRECT         608720
INDIRECT       558079
PARTNERSHIP      6377
Name: count, dtype: int64


In [14]:
print("pct_fill_tag counts:")
print(df2["pct_fill_tag"].value_counts(dropna=False))

print("\nTotal % (sample 5):")
print(df2[["cms_certification_number","month_ts","total_pct"]].sample(5))


pct_fill_tag counts:
pct_fill_tag
none_null    1173176
Name: count, dtype: int64

Total % (sample 5):
         cms_certification_number    month_ts  total_pct
511082                     235442  2019-03-01       96.0
3151                        15084  2023-01-01        0.0
232539                     125047  2022-06-01        0.0
1063757                    535059  2025-05-01      100.0
753923                     365259  2023-11-01      100.0


In [15]:
sample = df2.sample(1, random_state=42).iloc[0]
print("CCN:", sample["cms_certification_number"], "Month:", sample["month_ts"])
print("Owners:", sample["owner_names"])
print("Percents:", sample["ownership_percentages"])
print("Assoc dates:", sample["association_dates"])


CCN: 396001 Month: 2017-11-01
Owners: ["MAIN LINE HEALTH, INC."]
Percents: [100.0]
Assoc dates: ["1985-07-01"]


In [16]:
zero_totals = (df2["total_pct"] == 0).sum()
print("Zero-total CCN-months:", zero_totals, "of", len(df2))

Zero-total CCN-months: 458427 of 1173176


In [17]:
print(df2["n_owners"].describe())
print("Top 5 by owner count:")
print(df2.nlargest(5, "n_owners")[["cms_certification_number","month_ts","n_owners","total_pct"]])

count    1.173176e+06
mean     3.487939e+00
std      3.511477e+00
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      4.000000e+00
max      3.400000e+01
Name: n_owners, dtype: float64
Top 5 by owner count:
        cms_certification_number    month_ts  n_owners  total_pct
271360                    145786  2022-10-01        34        0.0
271361                    145786  2022-11-01        34        0.0
271362                    145786  2023-01-01        34        0.0
271363                    145786  2023-02-01        34        0.0
115560                     65305  2022-09-01        33        0.0


In [18]:
print(df2.groupby(df2["month_ts"].dt.year).size())

AttributeError: Can only use .dt accessor with datetimelike values

In [19]:
bad = df2[df2.apply(lambda r: len(eval(r["owner_names"])) != len(eval(r["ownership_percentages"])), axis=1)]
print("Rows with misaligned lists:", len(bad))

Rows with misaligned lists: 0
