In [6]:
import os, io, zipfile
from pathlib import Path
import pandas as pd

# ── 0. Paths (reuse your portable logic) ─────────────────────────────────────
RAW_DIR    = Path(os.getenv("NH_DATA_DIR"))
NH_ZIP_DIR = RAW_DIR / "nh-compare"
OWN_DIR    = RAW_DIR / "ownership-files"

# ── 1. Iterator over every raw CSV ────────────────────────────────────────────
def iter_raw_sources():
    # loose CSVs
    for fp in sorted(OWN_DIR.glob("ownership_download_*.csv")):
        yield fp.name, fp.open("rb")
    # inside each zip
    for zipf in sorted(NH_ZIP_DIR.glob("nh_archive_*.zip")):
        with zipfile.ZipFile(zipf) as z:
            for m in sorted(z.infolist(), key=lambda x: x.filename):
                if m.is_dir() or not m.filename.lower().endswith(".csv"):
                    continue
                with z.open(m) as fh:
                    txt = io.TextIOWrapper(fh, encoding="utf-8", errors="ignore")
                    yield f"{zipf.name} :: {m.filename}", txt

# ── 2. Build a uniform summary list ───────────────────────────────────────────
summary = []
for source, handle in iter_raw_sources():
    row = {
        "source":    source,
        "ccn_col":   pd.NA,
        "nulls":     pd.NA,
        "mixed_cnt": pd.NA,
        "sample_bad":pd.NA,
        "error":     ""
    }
    try:
        df = pd.read_csv(handle, dtype=str, low_memory=False)
    except Exception as e:
        row["error"] = f"read error: {e}"
        summary.append(row)
        continue

    # find CCN column (case-insensitive exact match)
    ccn_cols = [c for c in df.columns if c.strip().lower()=="ccn"]
    if not ccn_cols:
        row["error"] = "no CCN col"
        summary.append(row)
        continue

    # compute stats
    col = ccn_cols[0]
    s = df[col]
    total = len(s)
    nulls = int(s.isna().sum())
    numeric = int(s.dropna().str.fullmatch(r"\d+").sum())
    mixed = total - nulls - numeric
    bad_samples = s[~s.dropna().str.fullmatch(r"\d+")].dropna().unique()[:5]

    # fill in row
    row.update({
        "ccn_col":    col,
        "nulls":      nulls,
        "mixed_cnt":  mixed,
        "sample_bad": "; ".join(bad_samples.astype(str)),
    })
    summary.append(row)

# ── 3. Create DataFrame & sort safely ────────────────────────────────────────
audit = pd.DataFrame(summary)

# ensure columns exist
for col in ["nulls","mixed_cnt"]:
    if col not in audit.columns:
        audit[col] = pd.NA

audit = audit.sort_values(
    by=["nulls","mixed_cnt"],
    ascending=False,
    na_position="last"
)

# ── 4. Display the top offenders ─────────────────────────────────────────────
display(
    audit[[
        "source","ccn_col","error","nulls","mixed_cnt","sample_bad"
    ]].head(30)
)


Unnamed: 0,source,ccn_col,error,nulls,mixed_cnt,sample_bad
0,ownership_download_01_2017.csv,,no CCN col,,,
1,ownership_download_01_2018.csv,,read error: 'utf-8' codec can't decode byte 0x...,,,
2,ownership_download_01_2019.csv,,read error: 'utf-8' codec can't decode byte 0x...,,,
3,ownership_download_01_2020.csv,,read error: 'utf-8' codec can't decode byte 0x...,,,
4,ownership_download_01_2021.csv,,read error: 'utf-8' codec can't decode byte 0x...,,,
5,ownership_download_01_2022.csv,,read error: 'utf-8' codec can't decode byte 0x...,,,
6,ownership_download_01_2023.csv,,read error: 'utf-8' codec can't decode byte 0x...,,,
7,ownership_download_01_2024.csv,,no CCN col,,,
8,ownership_download_02_2017.csv,,read error: 'utf-8' codec can't decode byte 0x...,,,
9,ownership_download_02_2019.csv,,read error: 'utf-8' codec can't decode byte 0x...,,,


In [7]:
import zipfile
from pathlib import Path

# assuming you’ve already set RAW_DIR and NH_ZIP_DIR correctly
zip_files = sorted((Path(os.getenv("NH_DATA_DIR")) / "nh-compare").glob("nh_archive_*.zip"))

for zippath in zip_files[:3]:          # inspect first three archives
    print(f"\n=== {zippath.name} ===")
    with zipfile.ZipFile(zippath) as z:
        for fn in z.namelist():
            if fn.lower().endswith(".csv"):
                print("  ", fn)


=== nh_archive_2017.zip ===

=== nh_archive_2018.zip ===

=== nh_archive_2019.zip ===


In [8]:
for zippath in zip_files[:3]:   # just inspect the first three
    print(f"\n=== Contents of {zippath.name} ===")
    with zipfile.ZipFile(zippath) as z:
        for fn in z.namelist():
            print(" ", fn)


=== Contents of nh_archive_2017.zip ===
  nh_archive_01_2017.zip
  nh_archive_02_2017.zip
  nh_archive_03_2017.zip
  nh_archive_04_2017.zip
  nh_archive_05_2017.zip
  nh_archive_06_2017.zip
  nh_archive_08_2017.zip
  nh_archive_07_2017.zip
  nh_archive_09_2017.zip
  nh_archive_10_2017.zip
  nh_archive_11_2017.zip
  nh_archive_12_2017.zip

=== Contents of nh_archive_2018.zip ===
  nh_archive_01_2018.zip
  nh_archive_02_2018.zip
  nh_archive_03_2018.zip
  nh_archive_04_2018.zip
  nh_archive_05_2018.zip
  nh_archive_06_2018.zip
  nh_archive_07_2018.zip
  nh_archive_08_2018.zip
  nh_archive_09_2018.zip
  nh_archive_10_2018.zip
  nh_archive_11_2018.zip
  nh_archive_12_2018.zip

=== Contents of nh_archive_2019.zip ===
  nh_archive_01_2019.zip
  nh_archive_02_2019.zip
  nh_archive_03_2019.zip
  nh_archive_04_2019.zip
  nh_archive_05_2019.zip
  nh_archive_06_2019.zip
  nh_archive_07_2019.zip
  nh_archive_08_2019.zip
  nh_archive_09_2019.zip
  nh_archive_10_2019.zip
  nh_archive_12_2019.zip


In [None]:
# ── 1. Iterate nested zip → zip → CSV ─────────────────────────────────────────
def iter_monthly_csvs():
    for yearly in sorted(NH_ZIP_DIR.glob("nh_archive_*.zip")):
        with zipfile.ZipFile(yearly) as z_year:
            for mon_member in sorted(z_year.namelist()):
                if not mon_member.lower().endswith(".zip"):
                    continue
                # read the monthly zip into memory
                with z_year.open(mon_member) as by:
                    buf = io.BytesIO(by.read())
                with zipfile.ZipFile(buf) as z_mon:
                    for csv_name in sorted(z_mon.namelist()):
                        if not csv_name.lower().endswith(".csv"):
                            continue
                        with z_mon.open(csv_name) as fh:
                            # wrap so pandas can read text
                            txt = io.TextIOWrapper(fh, encoding="utf-8", errors="ignore")
                            source = f"{yearly.name}::{mon_member}::{csv_name}"
                            yield source, txt

# ── 2. Audit each CSV’s CCN column ─────────────────────────────────────────────
summary = []
for source, handle in iter_monthly_csvs():
    row = {"source": source, "ccn_col": pd.NA, "nulls": pd.NA, "mixed_cnt": pd.NA, "sample_bad": pd.NA, "error": ""}
    try:
        df = pd.read_csv(handle, dtype=str, low_memory=False)
    except Exception as e:
        row["error"] = f"read error: {e}"
        summary.append(row)
        continue

    # detect CCN column
    ccn_cols = [c for c in df.columns if c.strip().lower()=="ccn"]
    if not ccn_cols:
        row["error"] = "no CCN col"
        summary.append(row)
        continue

    s = df[ccn_cols[0]]
    total   = len(s)
    nulls   = int(s.isna().sum())
    numeric = int(s.dropna().str.fullmatch(r"\d+").sum())
    mixed   = total - nulls - numeric
    bad     = s[~s.dropna().str.fullmatch(r"\d+")].dropna().unique()[:5]

    row.update({
        "ccn_col":   ccn_cols[0],
        "nulls":     nulls,
        "mixed_cnt": mixed,
        "sample_bad":"; ".join(bad.astype(str))
    })
    summary.append(row)

# ── 3. Build & display audit table ────────────────────────────────────────────
audit = pd.DataFrame(summary)
audit = audit.sort_values(["nulls","mixed_cnt"], ascending=False, na_position="last")
display(audit.head(30))