In [3]:
# ==============================================================================
# CMS Ownership — Compact to one row per (CCN, month) with aligned owner lists
# ==============================================================================

import os
import re
import json
import pandas as pd
import numpy as np
from pathlib import Path

# ------------------------------------------------------------------------------
# 1) Project-wide paths (portable)
# ------------------------------------------------------------------------------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR      = Path(os.getenv("NH_DATA_DIR",  PROJECT_ROOT / "data" / "raw"))
OWN_DIR      = RAW_DIR / "ownership-files"
INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"

OWN_DIR.mkdir(parents=True, exist_ok=True)
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

COMBINED_FP  = OWN_DIR / "ownership_combined.csv"
OUT_COMPACT  = INTERIM_DIR / "ownership_ccn_month_compact.csv"

print(f"[paths] OWN_DIR={OWN_DIR}")
print(f"[paths] INTERIM_DIR={INTERIM_DIR}")
print(f"[load]  {COMBINED_FP}")

# ------------------------------------------------------------------------------
# 2) Load combined ownership data
# ------------------------------------------------------------------------------
# Note: we read dates as strings first, then parse with to_datetime for control.
df = pd.read_csv(COMBINED_FP, dtype={
    "cms_certification_number": "string",
    "role": "string",
    "owner_type": "string",
    "owner_name": "string",
    "ownership_percentage": "float64",
    "association_date": "string",
    "processing_date": "string",
}, low_memory=False)

# ------------------------------------------------------------------------------
# 3) Normalize types & fields
# ------------------------------------------------------------------------------
# CCN → 6-digit, left-padded string
df["cms_certification_number"] = (
    df["cms_certification_number"]
      .astype("string")
      .str.replace(r"\D", "", regex=True)
      .str.zfill(6)
)

# Dates → datetime; association_date already non-null by your pipeline
df["processing_date"]  = pd.to_datetime(df["processing_date"], errors="coerce")
df["association_date"] = pd.to_datetime(df["association_date"], errors="coerce")

# Keep only standardized roles (should already be the case)
df = df[df["role"].isin(["DIRECT", "INDIRECT", "PARTNERSHIP"])].copy()

# Month key (floor to first of month)
df["month"] = df["processing_date"].values.astype("datetime64[M]")  # month-start timestamps

# ------------------------------------------------------------------------------
# 4) Deterministic ordering of owners within each CCN-month
#    DIRECT → INDIRECT → PARTNERSHIP, then ownership % desc, then owner name
# ------------------------------------------------------------------------------
role_order = {"DIRECT": 0, "INDIRECT": 1, "PARTNERSHIP": 2}
df["_role_ord"] = df["role"].map(role_order).fillna(9).astype(int)

# for sorting only; do not overwrite the stored ownership_percentage values
_pct_sort = pd.to_numeric(df["ownership_percentage"], errors="coerce")
df["_pct_sort"] = _pct_sort.fillna(-1e9)  # NaNs sorted to the end within role

df = df.sort_values(
    ["cms_certification_number", "month", "_role_ord", "_pct_sort", "owner_name"],
    ascending=[True, True, True, False, True],
    kind="mergesort",
)

# ------------------------------------------------------------------------------
# 5) Pack aligned lists per (CCN, month) into JSON arrays
# ------------------------------------------------------------------------------
def _json_list(series, as_date=False):
    """Convert a Series to a JSON array string; NaN→null; datetimes→YYYY-MM-DD."""
    out = []
    if as_date:
        # make sure we coerce to datetime and then to ISO date string
        s = pd.to_datetime(series, errors="coerce")
        for v in s:
            out.append(None if pd.isna(v) else v.date().isoformat())
    else:
        for v in series:
            if isinstance(v, float) and np.isnan(v):
                out.append(None)
            elif pd.isna(v):
                out.append(None)
            else:
                out.append(v)
    return json.dumps(out, ensure_ascii=False)

# group and build compact rows
grp = df.groupby(["cms_certification_number", "month"], sort=True, as_index=False)

compact = grp.apply(lambda g: pd.Series({
    "processing_date": g["month"].iloc[0].date().isoformat(),  # YYYY-MM-01
    "n_owners": int(len(g)),
    "roles": _json_list(g["role"]),
    "owner_types": _json_list(g["owner_type"]),
    "owner_names": _json_list(g["owner_name"]),
    "ownership_percentages": json.dumps(
        [None if pd.isna(x) else float(x) for x in pd.to_numeric(g["ownership_percentage"], errors="coerce").tolist()],
        ensure_ascii=False
    ),
    "association_dates": _json_list(g["association_date"], as_date=True),
})).reset_index(drop=False)  # keep the group keys

# The group keys come back as columns named like the original index names
# Ensure final column order
compact = compact.rename(columns={"month": "month_ts"})
compact = compact[[
    "cms_certification_number",
    "processing_date",   # human-readable month key (YYYY-MM-01)
    "n_owners",
    "roles",
    "owner_types",
    "owner_names",
    "ownership_percentages",
    "association_dates",
    "month_ts"           # optional raw month timestamp for joins (can drop if you like)
]]

# ------------------------------------------------------------------------------
# 6) Save & quick QC
# ------------------------------------------------------------------------------
compact.to_csv(OUT_COMPACT, index=False)
print(f"[save] compact panel → {OUT_COMPACT} (rows={len(compact):,})")

print("\n=== QC: compact snapshot ===")
print("Distinct CCNs:", compact["cms_certification_number"].nunique())
years = sorted(compact["month_ts"].dt.year.dropna().unique().tolist())
print("Years covered:", years[:10], "..." if len(years) > 10 else "")
print("\nExample row:")
with pd.option_context("display.max_colwidth", 120):
    print(compact.head(1).to_string(index=False))

[paths] OWN_DIR=C:\Users\Owner\OneDrive\NursingHomeData\ownership-files
[paths] INTERIM_DIR=C:\Repositories\white-bowblis-nhmc\data\interim
[load]  C:\Users\Owner\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv


  compact = grp.apply(lambda g: pd.Series({


[save] compact panel → C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_month_compact.csv (rows=1,173,176)

=== QC: compact snapshot ===
Distinct CCNs: 14075
Years covered: [2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025] 

Example row:
cms_certification_number processing_date  n_owners      roles    owner_types  owner_names ownership_percentages association_dates   month_ts
                  005125      2018-10-01         1 ["DIRECT"] ["Individual"] ["VO, TIEN"]               [100.0]    ["2016-10-24"] 2018-10-01


In [9]:
# ------------------------------------------------------------------------------
# Inspect compact ownership panel
# ------------------------------------------------------------------------------

import pandas as pd
from pathlib import Path
import json

# Point to interim directory and file
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"
COMPACT_FP   = INTERIM_DIR / "ownership_ccn_month_compact.csv"

# Load compact file
compact = pd.read_csv(COMPACT_FP, dtype=str, low_memory=False)

# Ensure month_ts is datetime
compact["month_ts"] = pd.to_datetime(compact["month_ts"], errors="coerce")

print(f"[load] {COMPACT_FP}  rows={len(compact):,}")

# ---------------- Basic structure ----------------
print("\nColumns & dtypes:")
print(compact.dtypes)

print("\nHead (first 3 rows):")
with pd.option_context("display.max_colwidth", 100):
    print(compact.head(3).to_string(index=False))

# ---------------- Summary checks -----------------
print("\nDistinct CCNs:", compact["cms_certification_number"].nunique())
print("Date range:", compact["month_ts"].min().date(), "→", compact["month_ts"].max().date())
print("Mean owners per CCN-month:", compact["n_owners"].astype(int).mean().round(2))

# Distribution of owners-per-month (to catch outliers)
print("\nOwners per CCN-month (5-number summary):")
print(compact["n_owners"].astype(int).describe())

# Spot-check: how many CCN-months only have DIRECT owners?
has_direct = compact["roles"].str.contains("DIRECT", na=False)
has_indir  = compact["roles"].str.contains("INDIRECT", na=False)
has_part   = compact["roles"].str.contains("PARTNERSHIP", na=False)

only_direct = ((has_direct) & (~has_indir) & (~has_part)).sum()
print(f"\nRows with only DIRECT: {only_direct:,}")

# ---------------- Peek at JSON fields ----------------
example = compact.iloc[0]
print("\nExample expanded JSON lists:")
print("roles:", json.loads(example["roles"]))
print("owner_types:", json.loads(example["owner_types"]))
print("owner_names:", json.loads(example["owner_names"])[:5], "...")
print("ownership_percentages:", json.loads(example["ownership_percentages"]))
print("association_dates:", json.loads(example["association_dates"]))

[load] C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_month_compact.csv  rows=1,173,176

Columns & dtypes:
cms_certification_number            object
processing_date                     object
n_owners                            object
roles                               object
owner_types                         object
owner_names                         object
ownership_percentages               object
association_dates                   object
month_ts                    datetime64[ns]
dtype: object

Head (first 3 rows):
cms_certification_number processing_date n_owners      roles    owner_types  owner_names ownership_percentages association_dates   month_ts
                  005125      2018-10-01        1 ["DIRECT"] ["Individual"] ["VO, TIEN"]               [100.0]    ["2016-10-24"] 2018-10-01
                  005125      2018-11-01        1 ["DIRECT"] ["Individual"] ["VO, TIEN"]               [100.0]    ["2016-10-24"] 2018-11-01
                  005125      2018-