In [3]:
# ────────────────────────────────────────────────────────────────────────────────
# CMS Ownership Ingest → Clean monthlies → Combine (normalized) → Equity-only
# ────────────────────────────────────────────────────────────────────────────────
import os, re, zipfile
from io import BytesIO
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm

# ----------------------------- Config / Paths ---------------------------------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR    = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
NH_ZIP_DIR = RAW_DIR / "nh-compare"
OWN_DIR    = RAW_DIR / "ownership-files"
OWN_DIR.mkdir(parents=True, exist_ok=True)

OUT_COMBINED_CSV = OWN_DIR / "ownership_combined.csv"
OUT_EQUITY_CSV   = OWN_DIR / "ownership_equity_combined.csv"

# Optional knobs
SKIP_EXISTING_MONTHS = True
SPEED_MODE            = False
MIN_YEAR, MIN_MONTH   = 2017, 1

print(f"[paths] RAW_DIR={RAW_DIR}")
print(f"[paths] NH_ZIP_DIR={NH_ZIP_DIR}")
print(f"[paths] OWN_DIR={OWN_DIR}")

# ----------------------------- Header Map --------------------------------------
COLUMN_MAP = {
    # CCN & Provider
    "cms certification number (ccn)": "cms_certification_number",
    "federal provider number":        "cms_certification_number",
    "provnum":                        "cms_certification_number",
    "provider name":                  "provider_name",
    "provname":                       "provider_name",

    # Role
    "role played by owner or manager in facility": "role",
    "role played by owner in facility":            "role",
    "role of owner or manager":                    "role",
    "owner role":                                  "role",
    "role_desc":                                   "role",
    "role desc":                                   "role",

    # Owner attrs
    "owner type":                 "owner_type",
    "owner name":                 "owner_name",

    # Ownership %
    "ownership percentage":       "ownership_percentage",
    "owner percentage":           "ownership_percentage",

    # Association date (expanded variants)
    "association date":                 "association_date",
    "association effective date":       "association_date",
    "assoc effective date":             "association_date",
    "assoc date":                       "association_date",
    "owner association date":           "association_date",
    "ownership association date":       "association_date",

    # Processing date (expanded variants)
    "processing date":            "processing_date",
    "processingdate":             "processing_date",
    "process date":               "processing_date",
    "processdate":                "processing_date",
    "filedate":                   "processing_date",
}
EXPECTED = [
    "cms_certification_number","provider_name","role","owner_type","owner_name",
    "ownership_percentage","association_date","processing_date"
]
CANON_ROLES  = {"DIRECT","INDIRECT","PARTNERSHIP","MANAGER","OTHER"}
EQUITY_ROLES = {"DIRECT","INDIRECT","PARTNERSHIP"}

# ----------------------------- Helpers -----------------------------------------
def safe_read_csv(raw: bytes, nrows=None) -> pd.DataFrame:
    for enc in ("utf-8", "latin-1"):
        try:
            return pd.read_csv(BytesIO(raw), dtype=str, encoding=enc,
                               low_memory=False, engine="c", nrows=nrows)
        except Exception:
            pass
    return pd.read_csv(BytesIO(raw), dtype=str, encoding="utf-8",
                       encoding_errors="replace", low_memory=False,
                       engine="c", nrows=nrows)

def norm_header(h: str) -> str:
    s = str(h or "").strip().lower().replace("_", " ")
    s = re.sub(r"\s+", " ", s)
    return s

def map_headers(cols):
    return [COLUMN_MAP.get(norm_header(c), norm_header(c)) for c in cols]

def parse_month_year_from_zipname(name: str):
    s = Path(name).stem
    toks = [t for t in re.split(r"[^0-9]+", s) if t]
    if len(toks) >= 2:
        a, b = toks[-2], toks[-1]
        if len(a) == 4 and 1 <= int(b) <= 12:   # a=YYYY, b=MM
            return int(b), int(a)
        if len(b) == 4 and 1 <= int(a) <= 12:   # b=YYYY, a=MM
            return int(a), int(b)
        # fallback: prefer 4-digit as year
        aa, bb = int(a), int(b)
        if len(a) == 4:  # a=year
            return max(1, min(bb, 12)), aa
        if len(b) == 4:  # b=year
            return max(1, min(aa, 12)), bb
    return None, None

def parse_year_from_archive(outer_name: str):
    m = re.search(r"(20\d{2})", outer_name)
    return int(m.group(1)) if m else None

def is_ownership_file(filename: str) -> bool:
    f = filename.lower()
    if not f.endswith(".csv"):
        return False
    return (
        ("ownership" in f and "download" in f)
        or ("ownership" in f and "display" in f)
        or f.startswith("nh_ownership_")
        or f == "nh_ownership.csv"
    )

def looks_like_ownership_csv_bytes(b: bytes) -> bool:
    try:
        probe = safe_read_csv(b, nrows=5)
    except Exception:
        return False
    cols = [norm_header(c) for c in probe.columns]
    keys = set(cols)
    needed_any = [
        {"cms certification number (ccn)", "federal provider number", "provnum"},
        {"provider name", "provname"},
        {"owner name"},
        {"ownership percentage", "owner percentage"},
        {"role", "role desc", "role_desc",
         "role played by owner in facility", "role of owner or manager",
         "role played by owner or manager in facility"},
    ]
    hits = sum(1 for group in needed_any if keys & group)
    return hits >= 3

# --- association_date parsing (robust) ---
DATE_WORD_PAT = re.compile(r"(?:since|as of|effective|eff\.)\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})", re.I)
CLEAN_NULLS = {"", "na", "n/a", "null", "no date provided", "no date", "unknown", "not available"}

def parse_association_date_series(s: pd.Series) -> pd.Series:
    """
    Normalize association_date per-month:
    - Treats placeholders as null
    - Extracts dates from phrases like 'since 12/31/2024'
    - Tries US (%m/%d/%Y), ISO (%Y-%m-%d), then general parse
    """
    raw = s.astype(str).str.strip()
    norm = raw.str.lower().str.strip()
    raw = raw.mask(norm.isin(CLEAN_NULLS))

    # extract explicit trailing date inside phrases
    extracted = raw.str.extract(DATE_WORD_PAT, expand=False)

    out = pd.to_datetime(extracted, errors="coerce", format="%m/%d/%Y")

    # US format
    mask = out.isna() & raw.notna()
    out.loc[mask] = pd.to_datetime(raw[mask], errors="coerce", format="%m/%d/%Y")

    # ISO format
    mask = out.isna() & raw.notna()
    out.loc[mask] = pd.to_datetime(raw[mask], errors="coerce", format="%Y-%m-%d")

    # last resort parse
    mask = out.isna() & raw.notna()
    out.loc[mask] = pd.to_datetime(raw[mask], errors="coerce")

    return out

def infer_month_year_from_inner_csvs(zipfile_obj: zipfile.ZipFile, default_year=None):
    candidates = []
    for entry in zipfile_obj.namelist():
        if not entry.lower().endswith(".csv"):
            continue
        try:
            df = safe_read_csv(zipfile_obj.read(entry), nrows=1000)
        except Exception:
            continue
        df.columns = map_headers(df.columns)
        if "processing_date" in df.columns:
            s = pd.to_datetime(df["processing_date"], errors="coerce").dropna()
            if not s.empty:
                dt = s.iloc[0]
                candidates.append((int(dt.month), int(dt.year)))
    if candidates:
        sr = pd.Series(candidates).value_counts()
        (m, y) = sr.index[0]
        return m, y
    return (None, int(default_year)) if default_year else (None, None)

def normalize_role_series(role_series: pd.Series) -> pd.Series:
    r = role_series.fillna("").str.upper()
    out = pd.Series("OTHER", index=r.index, dtype="object")

    # Ownership
    out[r.str.contains("INDIRECT OWNERSHIP")] = "INDIRECT"
    out[r.str.contains("DIRECT OWNERSHIP")]   = "DIRECT"

    # Partnership
    part_mask = (
        r.str.contains("PARTNERSHIP INTEREST") |
        r.str.contains("LIMITED PARTNERSHIP") |
        r.str.contains("GENERAL PARTNERSHIP")
    )
    out[part_mask] = "PARTNERSHIP"

    # Management (non-equity)
    man_mask = (
        r.str.contains("OPERATIONAL/MANAGERIAL CONTROL") |
        r.str.contains("MANAGING EMPLOYEE")
    )
    out[man_mask] = "MANAGER"

    return out

def read_csv_bytes_month(b: bytes, source_name: str, month: int, year: int) -> pd.DataFrame:
    df = safe_read_csv(b)
    df.columns = map_headers(df.columns)
    for col in EXPECTED:
        if col not in df.columns:
            df[col] = pd.NA

    # CCN
    df["cms_certification_number"] = (
        df["cms_certification_number"].astype(str)
          .str.replace(r"\D", "", regex=True)
          .str.zfill(6)
    )

    # Role canonicalization
    df["role"] = normalize_role_series(df["role"])
    assert set(df["role"].dropna().unique()) <= CANON_ROLES, f"Role map failed for {source_name}"

    # Ownership % → float
    pct = df["ownership_percentage"].fillna("")
    pct = pct.str.replace("%","", regex=False).str.replace(",","", regex=False).str.strip()
    pct = pct.mask(pct.eq("") | pct.str.contains("NO PERCENTAGE", case=False))
    df["ownership_percentage"] = pd.to_numeric(pct, errors="coerce")

    # association_date per-month parsing
    if "association_date" in df.columns:
        df["association_date"] = parse_association_date_series(df["association_date"])

    # processing_date (prefer column, else synthesize first-of-month)
    df["processing_date"] = pd.to_datetime(df.get("processing_date"), errors="coerce")
    synth = pd.Timestamp(year=int(year), month=int(month), day=1)
    df.loc[df["processing_date"].isna(), "processing_date"] = synth

    # month/year/date/source
    df["month"] = int(month)
    df["year"]  = int(year)
    df["date"]  = synth
    df["source_file"] = source_name

    # De-dup within month
    dedup_keys = [
        "cms_certification_number","provider_name","role","owner_type","owner_name",
        "ownership_percentage","association_date","year","month"
    ]
    df = df.sort_values(["cms_certification_number","owner_name"]).drop_duplicates(dedup_keys)

    keep = [
        "cms_certification_number","provider_name","role","owner_type","owner_name",
        "ownership_percentage","association_date","processing_date",
        "source_file","month","year","date"
    ]
    return df[[c for c in keep if c in df.columns]]

# ----------------------------- Ingest & Write Monthlies ------------------------
summary_rows = []
written_monthlies = set()
archives = sorted(NH_ZIP_DIR.glob("nh_archive_*.zip"))
print(f"[scan] Found {len(archives)} yearly nh_archive_*.zip files")

already = {p.name for p in OWN_DIR.glob("ownership_download_*.csv")} if SKIP_EXISTING_MONTHS else set()

for yearly_zip in tqdm(archives, desc="Yearly archives"):
    outer_year = parse_year_from_archive(yearly_zip.name)
    with zipfile.ZipFile(yearly_zip, "r") as yz:
        inner_zips = sorted(n for n in yz.namelist() if n.lower().endswith(".zip"))
        for mzip in inner_zips:
            # Step 1: parse from inner zip name
            month, year = parse_month_year_from_zipname(mzip)

            # Step 2: fallback to inferring from CSV contents
            if not month or not year:
                try:
                    with zipfile.ZipFile(BytesIO(yz.read(mzip)), "r") as mz:
                        month, year = infer_month_year_from_inner_csvs(mz, default_year=outer_year)
                        if not month or not year:
                            print(f"[warn] Could not infer month/year from {mzip}; skipping.")
                            continue
                except KeyError:
                    print(f"[warn] Missing {mzip} inside {yearly_zip.name}; skipping.")
                    continue

            if SPEED_MODE:
                if (year < MIN_YEAR) or (year == MIN_YEAR and month < MIN_MONTH):
                    continue

            out_name = f"ownership_download_{year:04d}_{month:02d}.csv"
            if SKIP_EXISTING_MONTHS and out_name in already:
                written_monthlies.add((year, month))
                continue

            print(f"[month] {year:04d}-{month:02d} → scanning ownership CSVs...")
            try:
                with zipfile.ZipFile(BytesIO(yz.read(mzip)), "r") as mz:
                    rows_accum, num_named, num_sniffed = [], 0, 0

                    # Pass 1: filename filter
                    for entry in mz.namelist():
                        fname = Path(entry).name
                        if not entry.lower().endswith(".csv"):
                            continue
                        if is_ownership_file(fname):
                            num_named += 1
                            try:
                                df = read_csv_bytes_month(
                                    mz.read(entry),
                                    source_name=f"{year:04d}-{month:02d}:{fname}",
                                    month=month, year=year
                                )
                                rows_accum.append(df)
                            except Exception as e:
                                print(f"[warn] Failed member {fname} in {mzip}: {e}")

                    # Pass 2: header-sniff fallback if nothing matched names
                    if num_named == 0 and not rows_accum:
                        for entry in mz.namelist():
                            if not entry.lower().endswith(".csv"):
                                continue
                            blob = mz.read(entry)
                            if not looks_like_ownership_csv_bytes(blob):
                                continue
                            num_sniffed += 1
                            try:
                                df = read_csv_bytes_month(
                                    blob,
                                    source_name=f"{year:04d}-{month:02d}:{Path(entry).name}",
                                    month=month, year=year
                                )
                                rows_accum.append(df)
                            except Exception as e:
                                print(f"[warn] Sniffed member failed {entry} in {mzip}: {e}")
                        print(f"[month] {year:04d}-{month:02d} → sniffed {num_sniffed} ownership file(s)")

                    print(f"[month] {year:04d}-{month:02d} → named={num_named}, total_rowsets={len(rows_accum)}")
                    if rows_accum:
                        out_df = pd.concat(rows_accum, ignore_index=True)
                        out_path = OWN_DIR / out_name
                        out_df.to_csv(out_path, index=False)
                        written_monthlies.add((year, month))
                        summary_rows.append({
                            "file": out_path.name,
                            "rows": len(out_df),
                            "pct_null_role": out_df["role"].isna().mean() * 100.0,
                            "pct_null_ownership_pct": out_df["ownership_percentage"].isna().mean() * 100.0,
                        })
            except KeyError:
                print(f"[warn] Missing {mzip} inside {yearly_zip.name}; continuing")

# ----------------------------- Coverage (last 18 months) -----------------------
from pandas import PeriodIndex
if written_monthlies:
    cov = (pd.DataFrame(list(written_monthlies), columns=["year","month"])
             .sort_values(["year","month"])
             .assign(ym=lambda d: PeriodIndex.from_fields(
                 year=d["year"].to_numpy(), month=d["month"].to_numpy(), freq="M"
             ).astype(str)))
    print("\n[coverage written] last 18 months:")
    print(cov.tail(18).to_string(index=False))
else:
    print("\n[coverage written] No monthly outputs written. Check patterns and zip contents.")

# ----------------------------- Combine (post-concat normalize) -----------------
all_monthlies = sorted(OWN_DIR.glob("ownership_download_*.csv"))
if not all_monthlies:
    raise RuntimeError("No monthly ownership_download_YYYY_MM.csv files found to combine.")

# read only columns we actually keep (speed)
usecols = [
    "cms_certification_number","provider_name","role","owner_type","owner_name",
    "ownership_percentage","association_date","processing_date",
    "source_file","month","year","date"
]

frames = []
for p in tqdm(all_monthlies, desc="Combining monthly CSVs"):
    header_cols = list(pd.read_csv(p, nrows=0).columns)
    frames.append(pd.read_csv(
        p,
        usecols=[c for c in usecols if c in header_cols],
        dtype={
            "cms_certification_number":"string",
            "provider_name":"string",
            "role":"string",
            "owner_type":"string",
            "owner_name":"string",
            "ownership_percentage":"string",
            "source_file":"string",
        },
        low_memory=False
    ))

combined = pd.concat(frames, ignore_index=True)

# Post-concat normalization (roles, CCN, %, dates)
combined["cms_certification_number"] = (
    combined["cms_certification_number"].astype(str)
      .str.replace(r"\D","", regex=True)
      .str.zfill(6)
)

role_raw = combined["role"].fillna("").str.upper()
combined["role"] = role_raw.where(role_raw.isin(CANON_ROLES), None)
mask = combined["role"].isna()
combined.loc[mask, "role"] = normalize_role_series(role_raw[mask])

pct = combined["ownership_percentage"].fillna("")
pct = pct.str.replace("%","", regex=False).str.replace(",","", regex=False).str.strip()
pct = pct.mask(pct.eq("") | pct.str.contains("NO PERCENTAGE", case=False))
combined["ownership_percentage"] = pd.to_numeric(pct, errors="coerce")

# Dates: 'date' & 'processing_date' are consistent; 'association_date' already parsed monthly
if "date" in combined.columns:
    combined["date"] = pd.to_datetime(combined["date"], errors="coerce", format="%Y-%m-%d")
if "processing_date" in combined.columns:
    combined["processing_date"] = pd.to_datetime(combined["processing_date"], errors="coerce", format="%Y-%m-%d")
# DO NOT re-parse 'association_date' (already handled in monthlies)

combined = combined.sort_values(["cms_certification_number","date","owner_name"])

# ----------------------------- Equity-only filter ------------------------------
before_rows = len(combined)
equity = combined[combined["role"].isin(EQUITY_ROLES)].copy()
after_rows  = len(equity)
dropped     = before_rows - after_rows

# ----------------------------- Write CSV outputs -------------------------------
combined.to_csv(OUT_COMBINED_CSV, index=False)
print(f"\n[save] wrote combined CSV: {OUT_COMBINED_CSV} ({before_rows:,} rows)")

equity.to_csv(OUT_EQUITY_CSV, index=False)
print(f"[save] wrote equity-only CSV: {OUT_EQUITY_CSV} ({after_rows:,} rows; dropped {dropped:,})")

# ----------------------------- QC Summaries ------------------------------------
null_df = pd.DataFrame(summary_rows)
if not null_df.empty:
    print("\n--- ROLE NULL % (by file) ---")
    print(null_df["pct_null_role"].describe().round(2))
    print("\n--- OWNERSHIP % NULL % (by file) ---")
    print(null_df["pct_null_ownership_pct"].describe().round(2))

# Role distribution (full vs equity)
print("\n--- ROLE COUNTS (full) ---")
print(combined["role"].value_counts(dropna=False).to_string())

print("\n--- ROLE COUNTS (equity only) ---")
print(equity["role"].value_counts(dropna=False).to_string())

# Nulls by role (equity only)
role_nulls = (
    equity.assign(is_null_pct = equity["ownership_percentage"].isna())
          .groupby("role")["is_null_pct"]
          .agg(total="size", nulls="sum")
)
role_nulls["pct_null"] = (role_nulls["nulls"] / role_nulls["total"] * 100.0).round(2)
print("\n--- OWNERSHIP % NULL (by role, equity only) ---")
print(role_nulls[["total","nulls","pct_null"]].sort_values("pct_null", ascending=False))

# Recent coverage: last 18 months (equity only)
recent_cov = (
    equity.groupby(["year","month","role"])
          .size().rename("rows")
          .reset_index()
          .sort_values(["year","month","role"])
)
if not recent_cov.empty:
    tail_months = (
        recent_cov.assign(ym=recent_cov["year"].astype(int).astype(str) + "-" +
                               recent_cov["month"].astype(int).astype(str).str.zfill(2))
                   .groupby("ym")["rows"].sum()
                   .sort_index()
                   .tail(18)
    )
    print("\n[recent equity coverage] last 18 months (total rows by month):")
    print(tail_months.to_string())

# Unique CCNs per month (equity only)
fac_month = (
    equity.groupby(["year","month"])["cms_certification_number"]
          .nunique()
          .rename("unique_ccn")
          .sort_index()
          .tail(18)
)
print("\n[recent equity coverage] last 18 months (unique CCNs by month):")
print(fac_month.to_string())

# Owner type distribution (equity only) — top 10
if "owner_type" in equity.columns:
    ot = (equity["owner_type"].fillna("").str.strip().str.title())
    print("\n--- OWNER TYPE (equity only, top 10) ---")
    print(ot.value_counts().head(10).to_string())

# Date sanity — compute min/max as datetimes regardless of stray strings
print("\n--- DATE RANGES ---")
for col in ["association_date","processing_date","date"]:
    if col in combined.columns:
        s = combined[col]
        if pd.api.types.is_datetime64_any_dtype(s):
            s2 = s.dropna()
        else:
            s2 = pd.to_datetime(s, errors="coerce").dropna()
        mn = s2.min() if not s2.empty else "NaT"
        mx = s2.max() if not s2.empty else "NaT"
        print(f"{col:>18}: min={mn}  max={mx}")

[paths] RAW_DIR=C:\Users\wrthj\OneDrive\NursingHomeData
[paths] NH_ZIP_DIR=C:\Users\wrthj\OneDrive\NursingHomeData\nh-compare
[paths] OWN_DIR=C:\Users\wrthj\OneDrive\NursingHomeData\ownership-files
[scan] Found 9 yearly nh_archive_*.zip files


Yearly archives:   0%|          | 0/9 [00:00<?, ?it/s]


[coverage written] last 18 months:
 year  month      ym
 2024      1 2024-01
 2024      2 2024-02
 2024      3 2024-03
 2024      4 2024-04
 2024      5 2024-05
 2024      6 2024-06
 2024      7 2024-07
 2024      8 2024-08
 2024      9 2024-09
 2024     10 2024-10
 2024     11 2024-11
 2024     12 2024-12
 2025      2 2025-02
 2025      3 2025-03
 2025      4 2025-04
 2025      5 2025-05
 2025      6 2025-06
 2025      7 2025-07


Combining monthly CSVs:   0%|          | 0/200 [00:00<?, ?it/s]


[save] wrote combined CSV: C:\Users\wrthj\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv (31,284,982 rows)
[save] wrote equity-only CSV: C:\Users\wrthj\OneDrive\NursingHomeData\ownership-files\ownership_equity_combined.csv (10,394,650 rows; dropped 20,890,332)

--- ROLE COUNTS (full) ---
role
OTHER          13003143
MANAGER         7887189
DIRECT          7300154
INDIRECT        2942638
PARTNERSHIP      151858

--- ROLE COUNTS (equity only) ---
role
DIRECT         7300154
INDIRECT       2942638
PARTNERSHIP     151858

--- OWNERSHIP % NULL (by role, equity only) ---
               total    nulls  pct_null
role                                   
PARTNERSHIP   151858   151858    100.00
INDIRECT     2942638  2383910     81.01
DIRECT       7300154  3684661     50.47

[recent equity coverage] last 18 months (total rows by month):
ym
2024-01    97183
2024-02    97371
2024-03    97544
2024-04    97806
2024-05    97763
2024-06    98119
2024-07    98535
2024-08    98654
2024-09

  s2 = pd.to_datetime(s, errors="coerce").dropna()


  association_date: min=NaT  max=NaT
   processing_date: min=2017-01-01 00:00:00  max=2025-07-01 00:00:00
              date: min=2017-01-01 00:00:00  max=2025-07-01 00:00:00
