In [5]:
from pathlib import Path
import os, re
import pandas as pd
import numpy as np

# --- Paths ---
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent
RAW_DIR = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw")).resolve()
MCR_DIR = RAW_DIR / "medicare-cost-reports"
MCR_GLOB = "mcr_flatfile_20??.csv"

print(f"[paths] RAW_DIR={RAW_DIR}")
print(f"[paths] MCR_DIR={MCR_DIR}")

_TRY_ENCODINGS = ["utf-8","utf-8-sig","cp1252","latin1"]
_TRY_SEPS = [",", "|", "\t", ";", "~"]  # try comma first since your files look CSV
TARGET_COLS = ["PRVDR_NUM","S2_2_chow","S2_2_chowdate"]

def _sniff(fp: Path):
    """Return (sep, enc, cols) using header-only reads."""
    last_err = None
    for enc in _TRY_ENCODINGS:
        for sep in _TRY_SEPS:
            try:
                hdr = pd.read_csv(fp, sep=sep, nrows=0, engine="python", encoding=enc)
                if hdr.shape[1] > 0:
                    return sep, enc, list(hdr.columns)
            except Exception as e:
                last_err = e
    raise last_err or RuntimeError(f"Could not sniff {fp}")

def _read_three_cols(fp: Path) -> pd.DataFrame:
    sep, enc, cols = _sniff(fp)
    # Case-insensitive match to our targets
    colmap = {c.upper(): c for c in cols}
    usecols = [colmap[t] for t in TARGET_COLS if t in colmap]

    # Choose engine: use fast C engine for commas; python otherwise (without low_memory!)
    engine = None if sep == "," else "python"

    if usecols:
        df = pd.read_csv(fp, sep=sep, encoding=enc, engine=engine, usecols=usecols)
    else:
        # fallback: read full and then subset (rare)
        df = pd.read_csv(fp, sep=sep, encoding=enc, engine=engine)

    # Standardize exact names
    rename = {colmap.get("PRVDR_NUM","PRVDR_NUM"): "PRVDR_NUM"}
    if "S2_2_CHOW" in colmap:      rename[colmap["S2_2_CHOW"]] = "S2_2_chow"
    if "S2_2_CHOWDATE" in colmap:  rename[colmap["S2_2_CHOWDATE"]] = "S2_2_chowdate"
    df = df.rename(columns=rename)

    # Ensure all 3 exist even if missing in a year
    for c in TARGET_COLS:
        if c not in df.columns:
            df[c] = pd.NA

    out = df[TARGET_COLS].copy()
    # Normalize provider number -> zero-padded 6-char string
    out["PRVDR_NUM"] = (
        out["PRVDR_NUM"].astype("string").str.strip()
        .str.replace(r"\D", "", regex=True)
        .str.zfill(6)
    )
    out["S2_2_chow"] = out["S2_2_chow"].astype("string").str.strip()
    out["S2_2_chowdate"] = pd.to_datetime(out["S2_2_chowdate"], errors="coerce")
    print(f"[read] {fp.name} sep='{sep}' enc={enc} -> {len(out):,} rows")
    return out

# --- Load, stack, dedupe, sort ---
files = sorted(MCR_DIR.glob(MCR_GLOB))
if not files:
    raise FileNotFoundError(f"No files matched {MCR_DIR / MCR_GLOB}")

frames = []
for fp in files:
    try:
        frames.append(_read_three_cols(fp))
    except Exception as e:
        print(f"[warn] {fp.name}: {e}")

mcr_chow_df = (
    pd.concat(frames, ignore_index=True)
      .drop_duplicates()
      .sort_values(["PRVDR_NUM", "S2_2_chowdate"], kind="mergesort")
      .reset_index(drop=True)
)

print(f"[done] rows={len(mcr_chow_df):,}  unique providers={mcr_chow_df['PRVDR_NUM'].nunique():,}")
mcr_chow_df.head()

[paths] RAW_DIR=C:\Users\wrthj\OneDrive\NursingHomeData
[paths] MCR_DIR=C:\Users\wrthj\OneDrive\NursingHomeData\medicare-cost-reports
[read] mcr_flatfile_2016.csv sep=',' enc=utf-8 -> 15,103 rows
[read] mcr_flatfile_2017.csv sep=',' enc=utf-8 -> 15,429 rows
[read] mcr_flatfile_2018.csv sep=',' enc=utf-8 -> 15,129 rows
[read] mcr_flatfile_2019.csv sep=',' enc=utf-8 -> 15,183 rows
[read] mcr_flatfile_2020.csv sep=',' enc=utf-8 -> 14,949 rows
[read] mcr_flatfile_2021.csv sep=',' enc=utf-8 -> 15,071 rows
[read] mcr_flatfile_2022.csv sep=',' enc=utf-8 -> 14,966 rows
[read] mcr_flatfile_2023.csv sep=',' enc=utf-8 -> 12,816 rows
[read] mcr_flatfile_2024.csv sep=',' enc=utf-8 -> 164 rows
[done] rows=15,124  unique providers=15,124


Unnamed: 0,PRVDR_NUM,S2_2_chow,S2_2_chowdate
0,15009,,NaT
1,15010,,NaT
2,15014,,NaT
3,15015,,NaT
4,15016,,NaT


In [7]:
mcr_chow_df.isnull().sum()

PRVDR_NUM            0
S2_2_chow        15124
S2_2_chowdate    15124
dtype: int64