In [1]:
import os
import zipfile
from io import BytesIO
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm

# ------------------------------------------------------------------------------
# 1. Project‐wide paths (portable)
# ------------------------------------------------------------------------------
# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a "src" folder.
# If no ancestor has one, fall back to the working directory itself instead of
# silently ending up at the filesystem root (which would put data under "/data").
_start_dir = Path.cwd()
for _candidate in (_start_dir, *_start_dir.parents):
    if (_candidate / "src").is_dir():
        PROJECT_ROOT = _candidate
        break
else:
    PROJECT_ROOT = _start_dir

# NH_DATA_DIR (environment variable) overrides the default raw-data location.
RAW_DIR      = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
NH_ZIP_DIR   = RAW_DIR / "nh-compare"            # input: yearly archive zips
PROV_OUT_DIR = RAW_DIR / "provider-info-files"   # output: cleaned monthly CSVs
PROV_OUT_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# ------------------------------------------------------------------------------
# 2. Which files to pick (in priority order)
# ------------------------------------------------------------------------------
# Lower-case substrings looked for in each archive entry's file name.
# Order matters: an earlier pattern wins over a later one.
PRIORITY = [
    "providerinfo_download.csv",   # first choice
    "providerinfo_display.csv",    # second choice
    "nh_providerinfo",             # last resort: any name containing this stem
]

In [3]:
# ------------------------------------------------------------------------------
# 3. Helper that reads CSV bytes, trying UTF-8 first and then Latin-1
# ------------------------------------------------------------------------------
def safe_read_csv(raw: bytes) -> pd.DataFrame:
    """Parse CSV bytes into a DataFrame whose columns are all read as strings.

    The bytes are decoded as UTF-8 first ("utf-8-sig", so a leading byte-order
    mark is dropped rather than ending up in the first column name). If they
    are not valid UTF-8, the data is re-read as Latin-1, which can decode any
    byte sequence, so no further encoding fallback is needed.

    Only decoding failures trigger the fallback. Genuine parsing problems
    (empty input, malformed rows, ...) propagate to the caller immediately
    instead of being swallowed and retried.

    Parameters
    ----------
    raw : bytes
        Full contents of a CSV file.

    Returns
    -------
    pd.DataFrame
        Parsed table with ``dtype=str`` applied to every column.
    """
    try:
        return pd.read_csv(BytesIO(raw), dtype=str, encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Not UTF-8: Latin-1 maps every byte to a character, so this cannot
        # fail on decoding.
        return pd.read_csv(BytesIO(raw), dtype=str, encoding="latin-1")

In [4]:
# ------------------------------------------------------------------------------
# 4. Synonyms mapping normalized column names to canonical join-key names
# ------------------------------------------------------------------------------
# Maps a normalized (lower_snake_case) column name to its canonical name.
# Identity entries (e.g. "address" -> "address") are no-ops for rename() and
# are kept only to list the canonical names alongside their variants.
COLUMN_SYNONYMS = {
    # CMS Certification Number
    "provnum":                      "cms_certification_number",
    "federal_provider_number":      "cms_certification_number",
    "cms_certification_number_ccn": "cms_certification_number",
    # Provider Name
    "provname":                     "provider_name",
    # Address / City / State / ZIP
    "provider_address":             "address",
    "address":                      "address",
    "provider_city":                "city",
    "city":                         "city",
    "citytown":                     "city",
    "provider_state":               "state",
    "state":                        "state",
    "zip":                          "zip_code",
    "zip_code":                     "zip_code",
    "provider_zip_code":            "zip_code",
    # Phone
    "phone":                        "phone",
    "provider_phone_number":        "phone",
}

In [5]:
# ------------------------------------------------------------------------------
# 5. Scan → Clean → Write
# ------------------------------------------------------------------------------
summary = []        # one record per written monthly file: row count + null %
seen_columns = {}   # raw column name -> set of dtype strings seen across files

# Summary field -> canonical join-key column whose null percentage it reports.
_JOIN_KEY_FIELDS = {
    "pct_null_ccn":  "cms_certification_number",
    "pct_null_name": "provider_name",
    "pct_null_addr": "address",
    "pct_null_city": "city",
    "pct_null_st":   "state",
    "pct_null_zip":  "zip_code",
}


def _parse_month_year(archive_name):
    """Return the (month, year) strings ending the archive's stem, or None.

    The stem is expected to end in "_<month>_<year>", both parts numeric and
    the month in 1..12. Anything else yields None so the caller can skip it.
    """
    parts = Path(archive_name).stem.split("_")
    if len(parts) < 2:
        return None
    month, year = parts[-2], parts[-1]
    if not (month.isdecimal() and year.isdecimal()):
        return None
    if not 1 <= int(month) <= 12:
        return None
    return month, year


def _pick_provider_csv(entries):
    """Return the first entry matching PRIORITY (earlier patterns win), else None."""
    return next(
        (e for pat in PRIORITY for e in entries if pat in Path(e).name.lower()),
        None,
    )


def _pct_null(df, column):
    """Percentage of null values in `column`; NaN when the column is absent."""
    if column not in df.columns:
        return float("nan")
    return df[column].isna().mean() * 100


for yearly_zip in tqdm(sorted(NH_ZIP_DIR.glob("nh_archive_*.zip")), desc="Years"):
    with zipfile.ZipFile(yearly_zip, "r") as yz:
        # find every monthly .zip inside
        for mzip in sorted(n for n in yz.namelist() if n.lower().endswith(".zip")):
            period = _parse_month_year(mzip)
            if period is None:
                # One oddly named archive should not abort the whole run.
                tqdm.write(
                    f"Skipping {yearly_zip.name}:{mzip} - "
                    "cannot parse month/year from its name"
                )
                continue
            month, year = period

            with zipfile.ZipFile(BytesIO(yz.read(mzip)), "r") as mz:
                # pick the best CSV; skip archives that contain none
                chosen = _pick_provider_csv(mz.namelist())
                if not chosen:
                    continue

                # read raw
                df = safe_read_csv(mz.read(chosen))

                # record raw columns + dtypes (before any renaming)
                for col in df.columns:
                    seen_columns.setdefault(col, set()).add(str(df[col].dtype))

                # 1) normalize every column to lower_snake_case
                df.columns = (
                    pd.Index(df.columns)
                      .str.strip()
                      .str.lower()
                      .str.replace(r"\s+", "_", regex=True)
                      .str.replace(r"[^0-9a-z_]", "", regex=True)
                )

                # 2) apply the join-key synonyms
                df = df.rename(columns=COLUMN_SYNONYMS)

                # 3) annotate with source/month/year/date (date = first of month)
                out_name = f"provider_info_{month}_{year}.csv"
                df["source_file"] = out_name
                df["month"]       = int(month)
                df["year"]        = int(year)
                df["date"]        = pd.to_datetime({
                                        "year": df["year"],
                                        "month": df["month"],
                                        "day": 1
                                    })

                # 4) record null % on the join keys; a key column missing from
                #    this file is reported as NaN instead of raising KeyError
                record = {"file": out_name, "rows": len(df)}
                for field, column in _JOIN_KEY_FIELDS.items():
                    record[field] = _pct_null(df, column)
                summary.append(record)

                # 5) write the cleaned monthly CSV
                df.to_csv(PROV_OUT_DIR / out_name, index=False)

Years:   0%|          | 0/9 [00:00<?, ?it/s]

In [6]:
# ------------------------------------------------------------------------------
# 6. Report back
# ------------------------------------------------------------------------------
# List every raw column name seen across all files with the dtypes observed.
print("\n=== UNIQUE RAW COLUMNS & DTYPEs ===")
for col, dtypes in sorted(seen_columns.items()):
    print(f"  {col!r:50} → {', '.join(sorted(dtypes))}")

# One row per cleaned monthly file: row count and null % for each join key.
null_summary = pd.DataFrame(summary)
print("\n--- SAMPLE NULL % SUMMARY ---")
print(null_summary.head())

# Nothing earlier creates data/interim, so make sure it exists before writing.
interim_dir = PROJECT_ROOT / "data" / "interim"
interim_dir.mkdir(parents=True, exist_ok=True)
null_summary.to_csv(
    interim_dir / "providerinfo_null_summary.csv",
    index=False
)

print(f"\n✅ Cleaned files written to {PROV_OUT_DIR}")


=== UNIQUE RAW COLUMNS & DTYPEs ===
  'ABUSE_ICON'                                       → object
  'ADDRESS'                                          → object
  'ADJ_AIDE'                                         → object
  'ADJ_LPN'                                          → object
  'ADJ_RN'                                           → object
  'ADJ_TOTAL'                                        → object
  'AIDHRD'                                           → object
  'Abuse Icon'                                       → object
  'Adjusted CNA Staffing Hours per Resident per Day' → object
  'Adjusted LPN Staffing Hours per Resident per Day' → object
  'Adjusted Nurse Aide Staffing Hours per Resident per Day' → object
  'Adjusted RN Staffing Hours per Resident per Day'  → object
  'Adjusted Total Nurse Staffing Hours per Resident per Day' → object
  'Adjusted Weekend Total Nurse Staffing Hours per Resident per Day' → object
  'Administrator turnover footnote'                  → object
  

In [None]:
# ------------------------------------------------------------------------------
# 7. Combine all monthly provider_info files into one
# ------------------------------------------------------------------------------
combined_name = "provider_info_combined.csv"

# The glob pattern also matches this cell's own output file. Exclude it so a
# re-run does not feed the previous combined file back in and duplicate rows.
monthly_paths = sorted(
    p for p in PROV_OUT_DIR.glob("provider_info_*.csv")
    if p.name != combined_name
)
if not monthly_paths:
    raise FileNotFoundError(
        f"No monthly provider_info_*.csv files found in {PROV_OUT_DIR}"
    )

# read each month as strings; parse the 'date' column as datetime
combined_dfs = [
    pd.read_csv(path, dtype=str, parse_dates=["date"])
    for path in monthly_paths
]

combined = pd.concat(combined_dfs, ignore_index=True)

# write out
combined.to_csv(PROV_OUT_DIR / combined_name, index=False)

print(f"✅ Combined {len(monthly_paths)} files into provider_info_combined.csv "
      f"→ {combined.shape[0]:,} rows × {combined.shape[1]} cols")

  pd.read_csv(path, dtype=str, parse_dates=["date"])
