In [18]:
import zipfile
import tempfile
import os
import glob
from io import BytesIO
import pandas as pd

In [15]:
# ── CONFIG: set this to your folder containing nh_archive_*.zip ──
ARCHIVE_DIR = Path(r"C:\Users\Owner\OneDrive\NursingHomeData\nh-compare")

# find all matching zips
zip_files = sorted(ARCHIVE_DIR.glob("nh_archive_*.zip"))
if not zip_files:
    print(f"No files found matching nh_archive_*.zip in {ARCHIVE_DIR}")
else:
    for zip_path in zip_files:
        print(f"\n=== Archive: {zip_path.name} ===")
        with zipfile.ZipFile(zip_path, 'r') as z:
            for entry in sorted(z.namelist()):
                if entry.endswith('/'):
                    print(f"Folder: {entry}")
                else:
                    print(f"  File:   {entry}")


=== Archive: nh_archive_2017.zip ===
  File:   nh_archive_01_2017.zip
  File:   nh_archive_02_2017.zip
  File:   nh_archive_03_2017.zip
  File:   nh_archive_04_2017.zip
  File:   nh_archive_05_2017.zip
  File:   nh_archive_06_2017.zip
  File:   nh_archive_07_2017.zip
  File:   nh_archive_08_2017.zip
  File:   nh_archive_09_2017.zip
  File:   nh_archive_10_2017.zip
  File:   nh_archive_11_2017.zip
  File:   nh_archive_12_2017.zip

=== Archive: nh_archive_2018.zip ===
  File:   nh_archive_01_2018.zip
  File:   nh_archive_02_2018.zip
  File:   nh_archive_03_2018.zip
  File:   nh_archive_04_2018.zip
  File:   nh_archive_05_2018.zip
  File:   nh_archive_06_2018.zip
  File:   nh_archive_07_2018.zip
  File:   nh_archive_08_2018.zip
  File:   nh_archive_09_2018.zip
  File:   nh_archive_10_2018.zip
  File:   nh_archive_11_2018.zip
  File:   nh_archive_12_2018.zip

=== Archive: nh_archive_2019.zip ===
  File:   nh_archive_01_2019.zip
  File:   nh_archive_02_2019.zip
  File:   nh_archive_03_2019

In [16]:
# iterate over each top‐level archive
for outer_zip_path in sorted(ARCHIVE_DIR.glob("nh_archive_*.zip")):
    print(f"\n=== Outer Archive: {outer_zip_path.name} ===")
    with zipfile.ZipFile(outer_zip_path, 'r') as outer_z:
        # find each nested .zip inside
        nested_names = [name for name in outer_z.namelist() if name.lower().endswith('.zip')]
        if not nested_names:
            print("  (no nested ZIPs found)")
            continue

        for nested_name in sorted(nested_names):
            print(f"\n  -- Nested ZIP: {nested_name}")
            # read the nested ZIP into memory
            nested_bytes = outer_z.read(nested_name)
            with zipfile.ZipFile(BytesIO(nested_bytes), 'r') as nested_z:
                # list every file/folder inside the nested ZIP
                for entry in sorted(nested_z.namelist()):
                    if entry.endswith('/'):
                        print(f"     Folder: {entry}")
                    else:
                        print(f"     File:   {entry}")


=== Outer Archive: nh_archive_2017.zip ===

  -- Nested ZIP: nh_archive_01_2017.zip
     File:   Deficiencies_Download.csv
     File:   NH_HlthInspecCutpointsState_Jan2017.csv
     File:   Ownership_Download.csv
     File:   Penalties_Download.csv
     File:   ProviderInfo_Download.csv
     File:   QualityMsrClaims_Download.csv
     File:   QualityMsrMDS_Download.csv
     File:   StateAverages_Download.csv
     File:   SurveySummary_Download.csv

  -- Nested ZIP: nh_archive_02_2017.zip
     File:   DataMedicareGov_MetadataAllTabs_v12.xls
     File:   Deficiencies_Download.csv
     File:   NH_HlthInspecCutpointsState_Feb2017.csv
     File:   Ownership_Download.csv
     File:   Penalties_Download.csv
     File:   ProviderInfo_Download.csv
     File:   QualityMsrClaims_Download.csv
     File:   QualityMsrMDS_Download.csv
     File:   StateAverages_Download.csv
     File:   SurveySummary_Download.csv

  -- Nested ZIP: nh_archive_03_2017.zip
     File:   DataMedicareGov_MetadataAllTabs_v12

In [17]:
# loop over each yearly archive
for outer_zip in sorted(ARCHIVE_DIR.glob("nh_archive_*.zip")):
    print(f"\n=== {outer_zip.name} ===")
    with zipfile.ZipFile(outer_zip, 'r') as oz:
        # find each nested monthly ZIP
        nested_zips = [n for n in oz.namelist() if n.lower().endswith('.zip')]
        for nested_name in sorted(nested_zips):
            # load nested ZIP into memory
            data = oz.read(nested_name)
            with zipfile.ZipFile(BytesIO(data), 'r') as nz:
                # filter for just the ownership files
                matches = []
                for entry in nz.namelist():
                    # skip folders
                    if entry.endswith('/'):
                        continue
                    fn = Path(entry).name
                    if fn == "Ownership_Download.csv" or fn.startswith("NH_Ownership_"):
                        matches.append(entry)
                
                # if we found any, print them
                if matches:
                    print(f"  -> {nested_name}:")
                    for e in matches:
                        print(f"       • {e}")


=== nh_archive_2017.zip ===
  -> nh_archive_01_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_02_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_03_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_04_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_05_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_06_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_07_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_08_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_09_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_10_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_11_2017.zip:
       • Ownership_Download.csv
  -> nh_archive_12_2017.zip:
       • Ownership_Download.csv

=== nh_archive_2018.zip ===
  -> nh_archive_01_2018.zip:
       • Ownership_Download.csv
  -> nh_archive_03_2018.zip:
       • Ownership_Download.csv
  -> nh_archive_04_2018.zip:
       • Ownership_Download.csv
  -> nh_archive_05_2018.zip

In [19]:
# We'll collect schema info in a list of dicts
schema_info = []

for outer_zip_path in sorted(ARCHIVE_DIR.glob("nh_archive_*.zip")):
    with zipfile.ZipFile(outer_zip_path, 'r') as outer_z:
        # find each nested monthly ZIP
        nested_zips = [n for n in outer_z.namelist() if n.lower().endswith('.zip')]
        for nested_name in sorted(nested_zips):
            # read nested ZIP into memory
            nested_bytes = outer_z.read(nested_name)
            with zipfile.ZipFile(BytesIO(nested_bytes), 'r') as nested_z:
                # scan for the ownership CSV variants
                for entry in nested_z.namelist():
                    fn = Path(entry).name
                    if fn == "Ownership_Download.csv" or fn.startswith("NH_Ownership_"):
                        # read a sample to infer dtypes
                        with nested_z.open(entry) as f:
                            df = pd.read_csv(f, nrows=100)
                        schema_info.append({
                            "year_archive": outer_zip_path.name,
                            "month_zip": nested_name,
                            "file": fn,
                            "columns": list(df.columns),
                            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}
                        })

# print out the collected schema
for info in schema_info:
    print(f"\nArchive: {info['year_archive']} → {info['month_zip']} → {info['file']}")
    print(" Columns:")
    for col in info["columns"]:
        print(f"   • {col} ({info['dtypes'][col]})")


Archive: nh_archive_2017.zip → nh_archive_01_2017.zip → Ownership_Download.csv
 Columns:
   • PROVNUM (int64)
   • PROVNAME (object)
   • ADDRESS (object)
   • CITY (object)
   • STATE (object)
   • ZIP (int64)
   • ROLE_DESC (object)
   • OWNER_TYPE (object)
   • OWNER_NAME (object)
   • OWNER_PERCENTAGE (object)
   • ASSOCIATION_DATE (object)
   • filedate (object)

Archive: nh_archive_2017.zip → nh_archive_02_2017.zip → Ownership_Download.csv
 Columns:
   • PROVNUM (int64)
   • PROVNAME (object)
   • ADDRESS (object)
   • CITY (object)
   • STATE (object)
   • ZIP (int64)
   • ROLE_DESC (object)
   • OWNER_TYPE (object)
   • OWNER_NAME (object)
   • OWNER_PERCENTAGE (object)
   • ASSOCIATION_DATE (object)
   • filedate (object)

Archive: nh_archive_2017.zip → nh_archive_03_2017.zip → Ownership_Download.csv
 Columns:
   • PROVNUM (int64)
   • PROVNAME (object)
   • ADDRESS (object)
   • CITY (object)
   • STATE (object)
   • ZIP (int64)
   • ROLE_DESC (object)
   • OWNER_TYPE (objec

In [23]:
# the original provider‐ID columns we want to check
ID_COLS = [
    "PROVNUM",
    "Federal Provider Number",
    "CMS Certification Number (CCN)"
]

def safe_read_csv(mz: zipfile.ZipFile, entry: str) -> pd.DataFrame:
    """
    Try reading with UTF-8, then Latin-1, then fallback with replace errors.
    """
    raw = mz.read(entry)
    for enc in ("utf-8", "latin-1"):
        try:
            return pd.read_csv(BytesIO(raw), dtype=str, encoding=enc)
        except UnicodeDecodeError:
            continue
    # final fallback: replace invalid chars
    return pd.read_csv(BytesIO(raw), dtype=str, encoding="utf-8", encoding_errors="replace")

records = []

for outer_zip in sorted(ARCHIVE_DIR.glob("nh_archive_*.zip")):
    with zipfile.ZipFile(outer_zip, "r") as oz:
        for monthly_name in sorted(n for n in oz.namelist() if n.lower().endswith(".zip")):
            buf = oz.read(monthly_name)
            with zipfile.ZipFile(BytesIO(buf), "r") as mz:
                for entry in mz.namelist():
                    fn = Path(entry).name
                    if fn == "Ownership_Download.csv" or fn.startswith("NH_Ownership_"):
                        # read safely
                        df = safe_read_csv(mz, entry)
                        total = len(df)
                        rec = {
                            "source": f"{outer_zip.name}/{monthly_name}/{fn}",
                            "total_rows": total
                        }
                        # count nulls for each original ID column
                        for col in ID_COLS:
                            if col in df.columns:
                                rec[f"null_{col}"] = int(df[col].isna().sum())
                            else:
                                rec[f"null_{col}"] = None
                        records.append(rec)

# assemble into a DataFrame and print
report = pd.DataFrame(records)
print(report.to_string(index=False))

                                                                                                 source  total_rows  null_PROVNUM  null_Federal Provider Number  null_CMS Certification Number (CCN)
                                      nh_archive_2017.zip/nh_archive_01_2017.zip/Ownership_Download.csv      178084           0.0                           NaN                                  NaN
                                      nh_archive_2017.zip/nh_archive_02_2017.zip/Ownership_Download.csv      175735           0.0                           NaN                                  NaN
                                      nh_archive_2017.zip/nh_archive_03_2017.zip/Ownership_Download.csv      178036           0.0                           NaN                                  NaN
                                      nh_archive_2017.zip/nh_archive_04_2017.zip/Ownership_Download.csv      177337           0.0                           NaN                                  NaN
               