In [None]:
# ==============================================================================
# Fast CHOW-ready monthly snapshot builder (INDIRECT > DIRECT > PARTNERSHIP)
#   - Reads the big NursingHomeData/ownership_combined.csv directly
#   - Vectorized cleaning and filtering; per-group/per-row Python is limited to aggregation, list packing and JSON encoding
#   - Produces aligned lists per (CCN, month) with stable order
# ==============================================================================

import os, re, json, math, hashlib
import numpy as np
import pandas as pd
from pathlib import Path

# -------------------- Paths --------------------
# Location of the NursingHomeData folder. The NH_DATA_DIR environment variable,
# when set, overrides the default below so the script can run on another
# machine without editing the source.
NH_DATA_DIR = Path(os.environ.get("NH_DATA_DIR", r"C:\Users\Owner\OneDrive\NursingHomeData"))
OWN_DIR     = NH_DATA_DIR / "ownership-files"   # adjust if your combined lives here
IN_COMBINED = OWN_DIR / "ownership_combined.csv"

# Where to save outputs in this repo: the nearest directory at or above the
# working directory that contains a "data" folder. If no such directory exists,
# fall back to the working directory itself rather than ending up at the
# filesystem root and creating "data/interim" there.
_CWD = Path.cwd()
PROJECT_ROOT = next(
    (candidate for candidate in (_CWD, *_CWD.parents) if (candidate / "data").is_dir()),
    _CWD,
)
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)
OUT_PANEL = INTERIM_DIR / "ownership_ccn_month_chow_ready_v2.csv"

print(f"[load] {IN_COMBINED}")

# -------------------- Read narrow, typed --------------------
# Only the seven columns needed downstream are loaded. The two date columns are
# parsed by pandas; everything else is read as a string (ownership_percentage
# is converted to a number later).
_DATE_COLS = ["association_date", "processing_date"]
_TEXT_COLS = [
    "cms_certification_number",
    "role",
    "owner_type",
    "owner_name",
    "ownership_percentage",  # parse later
]
USECOLS = _TEXT_COLS + _DATE_COLS
DTYPES = {col: "string" for col in _TEXT_COLS}

_read_options = {
    "usecols": USECOLS,
    "dtype": DTYPES,
    "parse_dates": _DATE_COLS,
    "low_memory": False,
    "memory_map": True,
}
df = pd.read_csv(IN_COMBINED, **_read_options)
print(f"[loaded] rows={len(df):,}, cols={df.shape[1]}")

# -------------------- Hygiene & vector transforms --------------------
# CCN normalize: upper-case, drop every character that is not a letter or a
# digit, then left-pad with zeros to six characters. Letters are kept on
# purpose: a digits-only filter would rewrite an alphanumeric CCN such as
# "14E001" to "014001" and could merge it with a different facility.
df["cms_certification_number"] = (
    df["cms_certification_number"]
      .str.upper()
      .str.replace(r"[^0-9A-Z]", "", regex=True)
      .str.zfill(6)
)

# Role priority (lower number = higher level) and keep only the three levels
ROLE_PRIORITY = {"INDIRECT": 0, "DIRECT": 1, "PARTNERSHIP": 2}
df["role"] = df["role"].str.upper().str.strip()
df = df[df["role"].isin(ROLE_PRIORITY)].copy()  # rows with any other (or missing) role are dropped
df["role_num"] = df["role"].map(ROLE_PRIORITY).astype("int8")

# Month key from processing_date: truncating to month precision yields the
# first day of the month once pandas stores it back as a timestamp column.
df["month_ts"] = df["processing_date"].values.astype("datetime64[M]")

# Owner name normalization — fully vectorized
CORP_SUFFIX_RE = re.compile(
    r"\b(CORP(ORATION)?|INCORPORATED|INC|LLC|L\.L\.C\.|LP|L\.P\.|LLP|CO|COMPANY|HOLDINGS?|INVESTMENTS?)\b\.?",
    re.I
)
# Dotted spellings of the L-suffixes ("L.L.C.", "L.P.", "L.L.P.", "L.C.").
# They must be removed while the periods are still present: once punctuation is
# replaced by spaces they become isolated letters ("L L C") that the suffix
# pattern above can no longer recognise. The lookbehind keeps the pattern from
# firing in the middle of a longer run of initials such as "J.L.C.".
_DOTTED_SUFFIX_RE = re.compile(r"(?<!\.)\bL\.\s?(?:L\.\s?)?[CP]\b\.?", re.I)


def norm_series(s: pd.Series) -> pd.Series:
    """Return a normalized copy of a Series of owner names.

    Steps, in order: upper-case and trim; remove dotted corporate suffixes
    (e.g. "L.L.C."); replace commas, periods and ampersands with spaces;
    remove the undotted corporate suffixes matched by ``CORP_SUFFIX_RE``;
    collapse runs of whitespace and trim again.

    Missing values stay missing. A name made up solely of suffix words
    normalizes to the empty string.
    """
    s2 = s.str.upper().str.strip()
    # Dotted suffixes first, so "ACME L.L.C." and "ACME LLC" both become "ACME".
    s2 = s2.str.replace(_DOTTED_SUFFIX_RE, " ", regex=True)
    s2 = s2.str.replace(r"[,\.&]", " ", regex=True)
    s2 = s2.str.replace(CORP_SUFFIX_RE, "", regex=True)
    s2 = s2.str.replace(r"\s+", " ", regex=True).str.strip()
    return s2

df["name_norm"] = norm_series(df["owner_name"])

# Percent numeric & assoc dates
# Strip percent signs and whitespace before the numeric conversion so that
# values written like "25%" or " 25 " parse as 25 instead of being coerced to
# NaN. Text that is still non-numeric afterwards becomes NaN.
# NOTE(review): confirm the actual spellings used in ownership_percentage.
_pct_text = df["ownership_percentage"].str.replace(r"[%\s]", "", regex=True)
df["pct_num"] = pd.to_numeric(_pct_text, errors="coerce")
# A missing association date falls back to the row's processing date.
df["association_date"] = df["association_date"].fillna(df["processing_date"])

# -------------------- Choose highest level per (CCN, month) --------------------
# For every row, look up the smallest role_num seen in its (ccn, month) group,
# then keep only the rows sitting at that level.
_level_keys = ["cms_certification_number", "month_ts"]
min_role = df.groupby(_level_keys)["role_num"].transform("min")
_at_top_level = df["role_num"] == min_role
df = df.loc[_at_top_level].copy()

# -------------------- Aggregate by normalized owner --------------------
# One row per (CCN, month, normalized owner name):
#   owner_name  first non-null raw spelling (an example of the raw name)
#   owner_type  most frequent non-null value (first value if all are null)
#   role        first value; after the top-level filter every row of a
#               (CCN, month) carries the same role, so this equals the mode
#   pct_sum     sum of the percentages, NaN when none of them is known
#   assoc_min   earliest association date
# NOTE(review): groupby drops rows whose key is missing by default, so rows
# with a null name_norm are not represented in the result.
def _modal_or_first(values: pd.Series):
    """Return the most frequent non-null value, or the first value if there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else values.iloc[0]


_owner_groups = df.groupby(
    ["cms_certification_number", "month_ts", "name_norm"], sort=False
)
agg = _owner_groups.agg(
    owner_name=("owner_name", "first"),  # example raw
    owner_type=("owner_type", _modal_or_first),
    role=("role", "first"),
    assoc_min=("association_date", "min"),
)
# min_count=1 keeps an all-missing group as NaN. A plain "sum" would report
# 0.0, making "percentage unknown" indistinguishable from "owns 0%" and
# leaving the downstream null tagging with nothing to detect.
agg["pct_sum"] = _owner_groups["pct_num"].sum(min_count=1)
agg = agg[["owner_name", "owner_type", "role", "pct_sum", "assoc_min"]].reset_index()

# -------------------- Stable order within month: pct desc, name asc --------------------
# Sort column -> ascending flag. Within each (CCN, month) the largest summed
# percentage comes first and the normalized name breaks ties.
_sort_directions = {
    "cms_certification_number": True,
    "month_ts": True,
    "pct_sum": False,
    "name_norm": True,
}
agg = agg.sort_values(
    by=list(_sort_directions),
    ascending=list(_sort_directions.values()),
)

# -------------------- Pack aligned lists per (CCN, month) --------------------
def to_json_list(vals):
    """Serialize an iterable of scalars to a JSON array string.

    Missing values (None, NaN, NA, NaT) and infinite floats are written as
    JSON ``null``; every other item is passed to ``json.dumps`` unchanged.
    Non-ASCII characters are kept as-is rather than escaped.
    """
    def _clean(item):
        # Infinite floats are not "missing" to pandas, so test them separately.
        if isinstance(item, float) and np.isinf(item):
            return None
        return None if pd.isna(item) else item

    return json.dumps([_clean(item) for item in vals], ensure_ascii=False)

def to_json_dates(vals):
    """Serialize an iterable of date-like values to a JSON array string.

    Each non-missing value is converted with ``pd.to_datetime`` and written as
    its ISO calendar date (YYYY-MM-DD, time of day discarded); missing values
    are written as JSON ``null``.
    """
    iso_days = [
        None if pd.isna(stamp) else pd.to_datetime(stamp).date().isoformat()
        for stamp in vals
    ]
    return json.dumps(iso_days, ensure_ascii=False)

# One row per (CCN, month). The per-owner columns are collected into lists that
# share the row order produced by the preceding sort, so position i in every
# list refers to the same owner.
_snapshot_keys = ["cms_certification_number", "month_ts"]
_list_sources = {
    "owner_names": "owner_name",
    "name_norms": "name_norm",
    "owner_types": "owner_type",
    "roles": "role",
    "ownership_percentages": "pct_sum",
    "association_dates": "assoc_min",
}
_pack_spec = {
    "n_owners": ("name_norm", "size"),
    "used_level": ("role", "first"),  # all same within group
}
_pack_spec.update({target: (source, list) for target, source in _list_sources.items()})
_pack_spec["total_pct"] = ("pct_sum", "sum")

packed = agg.groupby(_snapshot_keys, sort=False).agg(**_pack_spec).reset_index()

# Compute pct_fill_tag
def pct_fill_tag(lst):
    """Classify how completely a list of percentages is populated.

    Returns "all_null" when the list is empty or holds no non-missing value,
    "none_null" when every entry is present, and "some_null" otherwise.
    """
    values = pd.Series(lst, dtype="float64")
    n_present = int(values.notna().sum())
    if n_present == 0:
        return "all_null"
    return "none_null" if n_present == len(values) else "some_null"

# Tag each snapshot by how many of its percentages are known. This has to run
# while ownership_percentages still holds Python lists, i.e. before the JSON
# encoding below.
_pct_lists = packed["ownership_percentages"]
packed["pct_fill_tag"] = _pct_lists.apply(pct_fill_tag)
packed["has_percent"]  = _pct_lists.apply(lambda pcts: any(pd.notna(pcts)))

# JSON-encode aligned lists
for _text_col in ("owner_names", "name_norms", "owner_types", "roles"):
    packed[_text_col] = packed[_text_col].apply(to_json_list)


def _encode_pcts(pcts):
    """Encode a list of percentages as a JSON array of floats, with null for missing."""
    as_floats = [None if pd.isna(pct) else float(pct) for pct in pcts]
    return json.dumps(as_floats, ensure_ascii=False)


packed["ownership_percentages"] = _pct_lists.apply(_encode_pcts)
packed["association_dates"]     = packed["association_dates"].apply(to_json_dates)

# Processing date string for the month anchor
packed["processing_date"] = packed["month_ts"].dt.date.astype(str)

# Deterministic snapshot_id (optional, cheap)
def snapshot_id_row(row):
    """Return a deterministic SHA-1 hex digest identifying one snapshot row.

    The digest covers the CCN, the month as an ISO date (null when missing),
    the level used, and the decoded ``name_norms`` and
    ``ownership_percentages`` JSON lists. ``row`` may be any mapping-like
    object that supports lookup by those column names.
    """
    month = row["month_ts"]
    month_iso = None
    if pd.notna(month):
        month_iso = pd.to_datetime(month).date().isoformat()

    # Key order is part of the hashed text, so keep it fixed.
    fields = {
        "ccn": row["cms_certification_number"],
        "month": month_iso,
        "level": row["used_level"],
        "owners": json.loads(row["name_norms"]),
        "pcts": json.loads(row["ownership_percentages"]),
    }
    digest = hashlib.sha1()
    digest.update(json.dumps(fields, ensure_ascii=False).encode("utf-8"))
    return digest.hexdigest()

# One content hash per snapshot row
packed["snapshot_id"] = packed.apply(snapshot_id_row, axis=1)

# Final column order: identifiers, then the aligned per-owner lists, then the
# summary fields.
_id_cols = ["cms_certification_number", "month_ts", "processing_date", "used_level", "n_owners"]
_list_cols = [
    "roles", "owner_types", "owner_names", "name_norms",
    "ownership_percentages", "association_dates",
]
_summary_cols = ["pct_fill_tag", "total_pct", "has_percent", "snapshot_id"]
cols = _id_cols + _list_cols + _summary_cols

panel = packed.loc[:, cols].sort_values(by=["cms_certification_number", "month_ts"])
panel = panel.reset_index(drop=True)

# Save
panel.to_csv(OUT_PANEL, index=False)
print(f"[save] CHOW-ready panel → {OUT_PANEL}  (rows={len(panel):,})")

# Peek at the first three rows with wide columns so the JSON lists stay readable
with pd.option_context("display.max_colwidth", 120):
    _preview = panel.head(3)
    print(_preview.to_string(index=False))

[load] C:\Users\Owner\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv
[loaded] rows=5,330,314, cols=7
