In [1]:
import os, re, json
from pathlib import Path
from zipfile import ZipFile

# ---------- Config ----------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR    = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
NH_ZIP_DIR = RAW_DIR / "nh-compare"   # folder with nh_archive_YYYY.zip
OUT_FP     = PROJECT_ROOT / "nh_archive_manifest.json"  # change if you like

print(f"RAW_DIR: {RAW_DIR}")
print(f"NH_ZIP_DIR: {NH_ZIP_DIR}")

yearlies = sorted(p for p in NH_ZIP_DIR.glob("nh_archive_*.zip") if p.is_file())
if not yearlies:
    raise FileNotFoundError(f"No nh_archive_*.zip files found in {NH_ZIP_DIR}")

manifest = []  # list of dict rows

def safe_list_zip(zip_path: Path):
    with ZipFile(zip_path) as z:
        return z.namelist()

for yzip in yearlies:
    try:
        lvl1 = safe_list_zip(yzip)
    except Exception as e:
        print(f"[ERROR] Could not open {yzip}: {e}")
        continue

    # record top-level contents
    manifest.append({
        "level": 1,
        "yearly_zip": yzip.name,
        "inner_zip": None,
        "path_in_zip": None,
        "filename": None,
    })

    # nested monthly zips typically look like nh_archive_MM_YYYY.zip
    inner_zips = [n for n in lvl1 if n.lower().endswith(".zip")]
    for inner in sorted(inner_zips):
        manifest.append({
            "level": 2,
            "yearly_zip": yzip.name,
            "inner_zip": Path(inner).name,
            "path_in_zip": inner,
            "filename": None,
        })

        # open the nested zip in-memory
        try:
            with ZipFile(yzip) as outer:
                with outer.open(inner) as inner_bytes:
                    from io import BytesIO
                    with ZipFile(BytesIO(inner_bytes.read())) as inner_zip:
                        for member in sorted(inner_zip.namelist()):
                            manifest.append({
                                "level": 3,
                                "yearly_zip": yzip.name,
                                "inner_zip": Path(inner).name,
                                "path_in_zip": member,
                                "filename": Path(member).name,
                            })
        except Exception as e:
            print(f"[WARN] Could not read nested {inner} in {yzip.name}: {e}")

# Write JSON manifest (easy to share)
with open(OUT_FP, "w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2)

print(f"\nWrote manifest with {len(manifest)} rows to: {OUT_FP}")

# Optional: print a small preview
for row in manifest[:40]:
    print(row)

RAW_DIR: C:\Users\wrthj\OneDrive\NursingHomeData
NH_ZIP_DIR: C:\Users\wrthj\OneDrive\NursingHomeData\nh-compare

Wrote manifest with 2043 rows to: C:\Repositories\white-bowblis-nhmc\nh_archive_manifest.json
{'level': 1, 'yearly_zip': 'nh_archive_2017.zip', 'inner_zip': None, 'path_in_zip': None, 'filename': None}
{'level': 2, 'yearly_zip': 'nh_archive_2017.zip', 'inner_zip': 'nh_archive_01_2017.zip', 'path_in_zip': 'nh_archive_01_2017.zip', 'filename': None}
{'level': 3, 'yearly_zip': 'nh_archive_2017.zip', 'inner_zip': 'nh_archive_01_2017.zip', 'path_in_zip': 'Deficiencies_Download.csv', 'filename': 'Deficiencies_Download.csv'}
{'level': 3, 'yearly_zip': 'nh_archive_2017.zip', 'inner_zip': 'nh_archive_01_2017.zip', 'path_in_zip': 'NH_HlthInspecCutpointsState_Jan2017.csv', 'filename': 'NH_HlthInspecCutpointsState_Jan2017.csv'}
{'level': 3, 'yearly_zip': 'nh_archive_2017.zip', 'inner_zip': 'nh_archive_01_2017.zip', 'path_in_zip': 'Ownership_Download.csv', 'filename': 'Ownership_Download

In [6]:
import json, re
from pathlib import Path
import pandas as pd

# Point to the manifest you just created
MANIFEST_FP = Path(r"C:\Repositories\white-bowblis-nhmc") / "nh_archive_manifest.json"


with open(MANIFEST_FP, "r", encoding="utf-8") as f:
    rows = json.load(f)

m = pd.DataFrame(rows)

# Extract year/month from the inner zip name when present
def parse_mm_yyyy(inner):
    if not isinstance(inner, str): 
        return (None, None)
    # Match either nh_archive_MM_YYYY.zip OR nh_archive_YYYY_MM.zip
    m1 = re.search(r"nh_archive_(\d{2})_(\d{4})\.zip", inner)
    m2 = re.search(r"nh_archive_(\d{4})_(\d{2})\.zip", inner)
    if m1:
        return (int(m1.group(1)), int(m1.group(2)))
    if m2:
        return (int(m2.group(2)), int(m2.group(1)))
    return (None, None)

m[["month","year"]] = m["inner_zip"].apply(lambda s: pd.Series(parse_mm_yyyy(s)))

# Filter to level 3 (files inside monthly zips)
files = m[m["level"] == 3].copy()
files["basename"] = files["filename"].astype(str)

# Ownership file variants
own_mask = files["basename"].str.lower().str.contains(r"ownership_.*\.(csv|txt|zip)$", na=False)
ownership_files = files[own_mask].copy()
ownership_files["variant"] = ownership_files["basename"].str.extract(r"(Ownership_[A-Za-z]+)\.csv", expand=False)

print("Coverage by year → distinct months present:")
print(ownership_files.groupby("year")["month"].nunique().sort_index())

print("\nOwnership filename variants by year/month (sample):")
print(ownership_files.loc[:, ["year","month","basename"]].sort_values(["year","month"]).head(24))

print("\nCounts of variants:")
print(ownership_files["variant"].value_counts(dropna=False))

Coverage by year → distinct months present:
year
2017.0    12
2018.0    12
2019.0    11
2020.0     7
Name: month, dtype: int64

Ownership filename variants by year/month (sample):
       year  month                basename
4    2017.0    1.0  Ownership_Download.csv
15   2017.0    2.0  Ownership_Download.csv
26   2017.0    3.0  Ownership_Download.csv
37   2017.0    4.0  Ownership_Download.csv
48   2017.0    5.0  Ownership_Download.csv
59   2017.0    6.0  Ownership_Download.csv
70   2017.0    7.0  Ownership_Download.csv
81   2017.0    8.0  Ownership_Download.csv
92   2017.0    9.0  Ownership_Download.csv
103  2017.0   10.0  Ownership_Download.csv
114  2017.0   11.0  Ownership_Download.csv
125  2017.0   12.0  Ownership_Download.csv
137  2018.0    1.0  Ownership_Download.csv
149  2018.0    2.0   Ownership_Display.csv
161  2018.0    3.0  Ownership_Download.csv
173  2018.0    4.0  Ownership_Download.csv
185  2018.0    5.0  Ownership_Download.csv
197  2018.0    6.0  Ownership_Download.csv
209

  own_mask = files["basename"].str.lower().str.contains(r"ownership_.*\.(csv|txt|zip)$", na=False)


In [None]:
import os, re
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
import numpy as np
import pandas as pd

# ---------- Config ----------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR    = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
NH_ZIP_DIR = RAW_DIR / "nh-compare"

# Ownership filename patterns to try (CMS changes names across months/years)
OWN_PATTERNS = [
    re.compile(r"^Ownership_Download\.csv$", re.I),
    re.compile(r"^Ownership_Display\.csv$", re.I),
]

# Column name candidates (CMS has varied naming)
CCN_CANDS  = ["CCN","cms_certification_number","PROVNUM","provnum","prov_num"]
ROLE_CANDS = ["ROLE_DESC","ROLE","role_desc","role"]
TYPE_CANDS = ["OWNER_TYPE","OWN_TYPE","owner_type","own_type"]
PCT_CANDS  = ["OWNER_PERCENTAGE","OWNERSHIP_PERCENTAGE","OWN_PCT","owner_percentage","percent"]

def find_col(cols, cands):
    cl = {c.lower(): c for c in cols}
    for cand in cands:
        if cand.lower() in cl: return cl[cand.lower()]
    return None

def normalize_role(s):
    if pd.isna(s): return np.nan
    t = str(s).lower()
    return "Direct" if "direct" in t else ("Indirect" if "indirect" in t else "Other")

def clean_pct(x):
    if pd.isna(x): return np.nan
    s = str(x).replace(",","").strip()
    if s.endswith("%"):
        try: return float(s[:-1])
        except: return np.nan
    try:
        v = float(s)
        return v*100.0 if 0 <= v <= 1 else v
    except:
        return np.nan

def load_ownership_from_inner(outer_zip: Path, inner_name: str):
    """Return a DataFrame for the first matching Ownership_* file inside the inner monthly zip."""
    with ZipFile(outer_zip) as outer:
        with outer.open(inner_name) as inner_bytes:
            with ZipFile(BytesIO(inner_bytes.read())) as inner_zip:
                names = inner_zip.namelist()
                # find first ownership file by priority of OWN_PATTERNS
                target = None
                for pat in OWN_PATTERNS:
                    matches = [n for n in names if pat.search(Path(n).name)]
                    if matches:
                        target = matches[0]
                        break
                if target is None:
                    return None, None  # no ownership file in this monthly archive
                df = pd.read_csv(inner_zip.open(target), dtype=str)
                return df, Path(target).name

def profile_df(df: pd.DataFrame):
    ccn  = find_col(df.columns, CCN_CANDS)
    role = find_col(df.columns, ROLE_CANDS)
    typ  = find_col(df.columns, TYPE_CANDS)
    pct  = find_col(df.columns, PCT_CANDS)
    if ccn is None:
        return {"ccn_missing": True}

    work = pd.DataFrame({
        "CCN": df[ccn].str.strip(),
        "ROLE": df[role].map(normalize_role) if role in df else np.nan,
        "OWNER_TYPE": df[typ].replace("", np.nan) if typ in df else np.nan,
        "OWNER_PERCENTAGE": df[pct].map(clean_pct) if pct in df else np.nan,
    })

    # presence classes per CCN
    has_direct = (work["ROLE"]=="Direct").groupby(work["CCN"]).any()
    has_indir  = (work["ROLE"]=="Indirect").groupby(work["CCN"]).any()
    presence_class = pd.Series(index=has_direct.index, dtype=object)
    presence_class[(has_direct)&(has_indir)] = "both"
    presence_class[(has_direct)&(~has_indir)] = "only_direct"
    presence_class[(~has_direct)&(has_indir)] = "only_indirect"
    presence_class.fillna("neither", inplace=True)

    # % sum-to-100 checks per CCN
    def approx100(s, tol=2.0):
        if s.isna().all(): return False
        S = s.fillna(0).sum()
        return (100 - tol) <= S <= (100 + tol)

    g = work.groupby("CCN", dropna=False)
    pct_ok_all  = g["OWNER_PERCENTAGE"].apply(approx100)
    pct_ok_dir  = g.apply(lambda d: approx100(d.loc[d["ROLE"]=="Direct","OWNER_PERCENTAGE"]))
    pct_ok_ind  = g.apply(lambda d: approx100(d.loc[d["ROLE"]=="Indirect","OWNER_PERCENTAGE"]))

    return {
        "ccn_missing": False,
        "n_rows": len(work),
        "role_counts": work["ROLE"].value_counts(dropna=False).to_dict(),
        "type_top": work["OWNER_TYPE"].value_counts(dropna=False).head(10).to_dict(),
        "presence_class_counts": presence_class.value_counts(dropna=False).to_dict(),
        "pct_ok_share": {
            "ok_all": float(pct_ok_all.mean()) if len(pct_ok_all) else np.nan,
            "ok_direct": float(pct_ok_dir.mean()) if len(pct_ok_dir) else np.nan,
            "ok_indirect": float(pct_ok_ind.mean()) if len(pct_ok_ind) else np.nan,
        }
    }

# ---------- Walk all yearlies & monthlies ----------
yearlies = sorted(p for p in NH_ZIP_DIR.glob("nh_archive_*.zip") if p.is_file())
if not yearlies:
    raise FileNotFoundError(f"No nh_archive_*.zip files found in {NH_ZIP_DIR}")

summary_rows = []
for yzip in yearlies:
    with ZipFile(yzip) as outer:
        for inner in sorted([n for n in outer.namelist() if n.lower().endswith(".zip")]):
            # Parse month/year from the inner zip name if possible
            mm_yyyy = re.search(r"nh_archive_(\d{2})_(\d{4})\.zip", Path(inner).name)
            mm, yyyy = (int(mm_yyyy.group(1)), int(mm_yyyy.group(2))) if mm_yyyy else (None, None)

            df, picked_name = load_ownership_from_inner(yzip, inner)
            if df is None:
                summary_rows.append({
                    "year": yyyy, "month": mm, "yearly_zip": yzip.name,
                    "inner_zip": Path(inner).name, "ownership_file": None, "note": "No ownership file found"
                })
                continue

            prof = profile_df(df)
            summary_rows.append({
                "year": yyyy, "month": mm, "yearly_zip": yzip.name,
                "inner_zip": Path(inner).name, "ownership_file": picked_name,
                "ccn_missing": prof["ccn_missing"],
                "n_rows": prof.get("n_rows"),
                "role_counts": prof.get("role_counts"),
                "type_top": prof.get("type_top"),
                "presence_class_counts": prof.get("presence_class_counts"),
                "pct_ok_share": prof.get("pct_ok_share"),
            })

# Pretty print a compact summary
sum_df = pd.DataFrame(summary_rows).sort_values(["year","month"])
print(sum_df.loc[:, ["year","month","ownership_file","n_rows","pct_ok_share"]].head(24))
print("\nOwnership variants encountered:")
print(sum_df["ownership_file"].value_counts(dropna=False))