In [37]:
import os
import zipfile
from io import BytesIO
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm

In [38]:
# ------------------------------------------------------------------------------
# 1. Project‐wide paths (portable)
# ------------------------------------------------------------------------------
# Walk upward from the working directory to the first ancestor that contains a
# `src/` directory. If no ancestor has one, stay in the working directory
# rather than silently ending at the filesystem root (which would place the
# data directories under `/data/raw`).
_START_DIR = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_START_DIR, *_START_DIR.parents)
        if (candidate / "src").is_dir()
    ),
    _START_DIR,
)

# The NH_DATA_DIR environment variable, when set, overrides the default
# <PROJECT_ROOT>/data/raw location.
RAW_DIR     = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
NH_ZIP_DIR  = RAW_DIR / "nh-compare"
OWN_DIR     = RAW_DIR / "ownership-files"       # flat directory for cleaned CSVs
OWN_DIR.mkdir(parents=True, exist_ok=True)

In [39]:
# ------------------------------------------------------------------------------
# 2. Column mapping (normalized lowercase keys)
# ------------------------------------------------------------------------------
# Built by inverting a canonical-name -> raw-header-aliases table, so each
# standardized column lists its known spellings in one place. Aliases are
# lowercase; lookups are expected to be done on stripped, lowercased headers.
COLUMN_MAP = {
    alias: canonical
    for canonical, aliases in {
        "cms_certification_number": (
            "cms certification number (ccn)",
            "federal provider number",
            "provnum",
        ),
        "provider_name": (
            "provider name",
            "provname",
        ),
        "role": (
            "role played by owner or manager in facility",
            "role played by owner in facility",
            "role of owner or manager",
            "owner role",
            "role_desc",
            "role desc",
        ),
        "owner_type": ("owner type",),
        "owner_name": ("owner name",),
        "ownership_percentage": (
            "ownership percentage",
            "owner percentage",
        ),
        "association_date": ("association date",),
        "processing_date": (
            "processing date",
            "processingdate",
            "process date",
            "processdate",
            "filedate",
        ),
    }.items()
    for alias in aliases
}

In [40]:
# ------------------------------------------------------------------------------
# 3. Safe CSV reader (UTF-8 → Latin-1 → fallback replace)
# ------------------------------------------------------------------------------
def safe_read_csv(raw_bytes: bytes) -> pd.DataFrame:
    """Parse CSV bytes into an all-string DataFrame, trying UTF-8 then Latin-1.

    Parameters
    ----------
    raw_bytes : bytes
        Raw contents of a CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed table, with every column read as ``dtype=str``.

    Raises
    ------
    Exception
        Any non-decoding error raised by ``pd.read_csv`` (for example on empty
        or malformed input) propagates from the first attempt instead of being
        swallowed and retried under the other encodings.
    """
    for enc in ("utf-8", "latin-1"):
        try:
            return pd.read_csv(BytesIO(raw_bytes), dtype=str, encoding=enc)
        except UnicodeDecodeError:
            # Only a wrong encoding guess justifies trying the next candidate.
            continue
    # Last-resort safety net: decode as UTF-8, substituting undecodable bytes.
    return pd.read_csv(BytesIO(raw_bytes), dtype=str,
                       encoding="utf-8", encoding_errors="replace")

In [41]:
# ------------------------------------------------------------------------------
# 4. Extract, Clean, Rename, and Write Individual CSVs (flat OWN_DIR)
# ------------------------------------------------------------------------------
# File-name prefixes that identify an ownership CSV inside a monthly archive
# (an exact match on the first entry is also a prefix match).
OWN_PATTERNS = ("Ownership_Download.csv", "NH_Ownership_")
null_summary = []   # one record per processed file: missing-CCN count and row count

for yearly_zip in tqdm(sorted(NH_ZIP_DIR.glob("nh_archive_*.zip")), desc="yearly archives"):
    with zipfile.ZipFile(yearly_zip, "r") as yz:
        monthly_zips = [n for n in yz.namelist() if n.lower().endswith(".zip")]
        for mzip_name in sorted(monthly_zips):
            # Month and year are the last two "_"-separated tokens of the
            # nested archive's stem.
            parts = Path(mzip_name).stem.split("_")
            month, year = parts[-2], parts[-1]
            out_name = f"ownership_download_{month}_{year}.csv"

            # The nested zip is read fully into memory and opened from there.
            with zipfile.ZipFile(BytesIO(yz.read(mzip_name)), "r") as mz:
                for entry in mz.namelist():
                    fname = Path(entry).name
                    if not fname.startswith(OWN_PATTERNS):
                        continue

                    # Read and standardize headers: strip, lowercase, then map
                    # known aliases to canonical names (unknown headers are
                    # kept in their stripped/lowercased form).
                    df = safe_read_csv(mz.read(entry))
                    df.columns = [
                        COLUMN_MAP.get(c.strip().lower(), c.strip().lower())
                        for c in df.columns
                    ]

                    # Fail with context instead of a bare KeyError when this
                    # file uses a CCN header that COLUMN_MAP does not know.
                    if "cms_certification_number" not in df.columns:
                        raise KeyError(
                            f"No CCN column recognised in {yearly_zip.name}:"
                            f"{mzip_name}:{entry}; columns after mapping: "
                            f"{list(df.columns)}. Add the header to COLUMN_MAP."
                        )

                    # Annotate source, month, year, date (first day of month)
                    df["source_file"] = out_name
                    df["month"]       = int(month)
                    df["year"]        = int(year)
                    df["date"]        = pd.to_datetime({
                                            "year": df["year"],
                                            "month": df["month"],
                                            "day": 1
                                        })

                    # Record null-count
                    missing = int(df["cms_certification_number"].isna().sum())
                    total   = len(df)
                    null_summary.append({
                        "file": f"{year}/{month}/{fname}",
                        "missing_ccn": missing,
                        "total_rows": total
                    })

                    # Write cleaned CSV with unified naming. If several entries
                    # in one monthly archive match, the last one written wins.
                    df.to_csv(OWN_DIR / out_name, index=False)

  0%|          | 0/9 [00:00<?, ?it/s]

In [42]:
# ------------------------------------------------------------------------------
# 5. Combine All Cleaned CSVs into One (in OWN_DIR)
# ------------------------------------------------------------------------------
# Sorted by file name, i.e. lexicographically rather than chronologically.
csv_paths = sorted(OWN_DIR.glob("ownership_download_*.csv"))
if not csv_paths:
    # pd.concat([]) would raise an opaque "No objects to concatenate";
    # raise the same exception type with an actionable message instead.
    raise ValueError(
        f"No ownership_download_*.csv files found in {OWN_DIR}; "
        "run the extraction cell first."
    )

# Everything is read as strings, so no value is reinterpreted on the round trip.
all_dfs = [pd.read_csv(csv_path, dtype=str) for csv_path in csv_paths]

combined = pd.concat(all_dfs, ignore_index=True)
combined.to_csv(OWN_DIR / "ownership_combined.csv", index=False)

In [43]:
# ------------------------------------------------------------------------------
# 6. Print Null Summary
# ------------------------------------------------------------------------------
# Render the per-file missing-CCN report as plain text, then state where the
# cleaned per-month files and the combined file were written.
null_report = pd.DataFrame(null_summary)
print(null_report.to_string(index=False))

combined_path = OWN_DIR / "ownership_combined.csv"
print(f"\n✅ Cleaned files written to {OWN_DIR} and combined file at {combined_path}")

                            file  missing_ccn  total_rows
  2017/01/Ownership_Download.csv            0      178084
  2017/02/Ownership_Download.csv            0      175735
  2017/03/Ownership_Download.csv            0      178036
  2017/04/Ownership_Download.csv            0      177337
  2017/05/Ownership_Download.csv            0      176566
  2017/06/Ownership_Download.csv            0      176473
  2017/07/Ownership_Download.csv            0      175720
  2017/08/Ownership_Download.csv            0      175567
  2017/09/Ownership_Download.csv            0      175067
  2017/10/Ownership_Download.csv            0      174624
  2017/11/Ownership_Download.csv            0      174990
  2017/12/Ownership_Download.csv            0      174846
  2018/01/Ownership_Download.csv            0      172388
  2018/03/Ownership_Download.csv            0      171019
  2018/04/Ownership_Download.csv            0      173462
  2018/05/Ownership_Download.csv            0      173474
  2018/06/Owne