In [10]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# ---------------- Paths ----------------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw")).resolve()
MCR_DIR = RAW_DIR / "medicare-cost-reports"
MCR_GLOB = "mcr_flatfile_20??.csv"

print(f"[paths] RAW_DIR={RAW_DIR}")
print(f"[paths] MCR_DIR={MCR_DIR}")

# ------------- Reader (robust & simple) -------------
_TRY_SEPS = [",", "|", "\t", ";", "~"]
_TRY_ENCODINGS = ["utf-8","utf-8-sig","cp1252","latin1"]
TARGET_UP = {"PRVDR_NUM","S2_2_CHOW","S2_2_CHOWDATE"}

def _sniff_sep_enc(fp: Path):
    last_err = None
    for enc in _TRY_ENCODINGS:
        for sep in _TRY_SEPS:
            try:
                hdr = pd.read_csv(fp, sep=sep, nrows=0, engine="python", encoding=enc)
                if hdr.shape[1] > 0:
                    return sep, enc
            except Exception as e:
                last_err = e
    raise last_err or RuntimeError(f"Could not sniff {fp}")

def _usecols_ci(colname: str) -> bool:
    return str(colname).upper().strip() in TARGET_UP

def _read_three_raw(fp: Path) -> pd.DataFrame:
    sep, enc = _sniff_sep_enc(fp)
    engine = None if sep == "," else "python"  # C engine for comma csv
    # Read ONLY the 3 columns (case-insensitive)
    df = pd.read_csv(
        fp, sep=sep, encoding=enc, engine=engine,
        usecols=_usecols_ci, dtype=str
    )
    print(f"[read] {fp.name} sep='{sep}' enc={enc} -> cols={list(df.columns)} rows={len(df):,}")
    return df

# ------------- Load & Stack (raw) -------------
files = sorted(MCR_DIR.glob(MCR_GLOB))
if not files:
    raise FileNotFoundError(f"No files matched {MCR_DIR / MCR_GLOB}")

frames = []
for fp in files:
    try:
        frames.append(_read_three_raw(fp))
    except Exception as e:
        print(f"[warn] {fp.name}: {e}")

mcr_raw = pd.concat(frames, ignore_index=True)
print(f"[stack] combined rows={len(mcr_raw):,}")
print("[stack] non-null counts pre-normalize:\n", mcr_raw.notna().sum())

# ------------- Normalize once (after stack) -------------
# Standardize exact column names
ren = {c: c.upper().strip() for c in mcr_raw.columns}
mcr_raw = mcr_raw.rename(columns=ren)

# Provider number -> zero-padded 6 digits
mcr_raw["PRVDR_NUM"] = (
    mcr_raw["PRVDR_NUM"]
    .astype("string").str.strip()
    .str.replace(r"\D", "", regex=True)
    .str.zfill(6)
)

# Parse CHOW date (keep NaT if blank)
mcr_raw["S2_2_CHOWDATE"] = pd.to_datetime(
    mcr_raw["S2_2_CHOWDATE"].astype("string").str.strip(),
    errors="coerce"
)

# Normalize flag (string -> bool/NA)
true_set  = {"Y","YES","TRUE","T","1"}
false_set = {"N","NO","FALSE","F","0","",None,"NA","N/A"}

flag_str = mcr_raw["S2_2_CHOW"].astype("string").str.strip().str.upper()
flag_bool = flag_str.map(lambda v: True if v in true_set else (False if v in false_set else pd.NA))

has_flag_yes = flag_bool.fillna(False)
has_date     = mcr_raw["S2_2_CHOWDATE"].notna()

print(f"[diag] rows with Y-like flag: {int(has_flag_yes.sum()):,}")
print(f"[diag] rows with non-null date: {int(has_date.sum()):,}")

# Keep rows with a Y-like flag OR a valid date
keep_mask = has_flag_yes | has_date
m = mcr_raw.loc[keep_mask].copy()

# Drop exact dupes but keep first; then sort by provider/date
m = (
    m.drop_duplicates(subset=["PRVDR_NUM","S2_2_CHOW","S2_2_CHOWDATE"], keep="first")
     .sort_values(["PRVDR_NUM","S2_2_CHOWDATE"], kind="mergesort")
     .reset_index(drop=True)
)

# Final in-memory table (requested 3 columns; flag is kept as raw)
mcr_chow_clean = m[["PRVDR_NUM","S2_2_CHOW","S2_2_CHOWDATE"]].rename(
    columns={"S2_2_CHOW":"S2_2_chow","S2_2_CHOWDATE":"S2_2_chowdate"}
)

print(f"[result] kept {len(mcr_chow_clean):,} rows across {mcr_chow_clean['PRVDR_NUM'].nunique():,} providers")
print(mcr_chow_clean.head(10))

[paths] RAW_DIR=C:\Users\Owner\OneDrive\NursingHomeData
[paths] MCR_DIR=C:\Users\Owner\OneDrive\NursingHomeData\medicare-cost-reports
[read] mcr_flatfile_2016.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,103
[read] mcr_flatfile_2017.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,429
[read] mcr_flatfile_2018.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,129
[read] mcr_flatfile_2019.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,183
[read] mcr_flatfile_2020.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=14,949
[read] mcr_flatfile_2021.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,071
[read] mcr_flatfile_2022.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=14,966
[read] mcr_flatfile_2023.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_c

  has_flag_yes = flag_bool.fillna(False)
