In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

# ---------------- Paths ----------------
# Project root = nearest directory at or above the current working directory
# that contains a "src" folder.
_START_DIR = Path.cwd()
PROJECT_ROOT = next(
    (p for p in (_START_DIR, *_START_DIR.parents) if (p / "src").is_dir()),
    None,
)
if PROJECT_ROOT is None:
    # No "src" marker was found. Walking all the way up would leave PROJECT_ROOT
    # at the filesystem root and create data/interim there, so stay in the
    # starting directory instead and say so.
    PROJECT_ROOT = _START_DIR
    print(f"[paths] no 'src' directory found at or above {_START_DIR}; using it as PROJECT_ROOT")

# Raw data location: NH_DATA_DIR overrides the in-repo default. An empty value
# is treated as unset (Path("") would otherwise resolve to the working directory).
RAW_DIR = Path(os.getenv("NH_DATA_DIR") or PROJECT_ROOT / "data" / "raw").resolve()
MCR_DIR = RAW_DIR / "medicare-cost-reports"
MCR_GLOB = "mcr_flatfile_20??.csv"

# Outputs of this notebook are written under data/interim (created if missing).
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

OUT_EVENTS_LONG = INTERIM_DIR / "mcr_chow_events_long.csv"
OUT_PROVIDER_ALL = INTERIM_DIR / "mcr_chow_provider_events_all.csv"

print(f"[paths] RAW_DIR={RAW_DIR}")
print(f"[paths] MCR_DIR={MCR_DIR}")
print(f"[paths] INTERIM_DIR={INTERIM_DIR}")

# ------------- Reader (robust & simple) -------------
_TRY_SEPS = [",", "|", "\t", ";", "~"]
_TRY_ENCODINGS = ["utf-8","utf-8-sig","cp1252","latin1"]
TARGET_UP = {"PRVDR_NUM","S2_2_CHOW","S2_2_CHOWDATE"}

def _sniff_sep_enc(fp: Path):
    last_err = None
    for enc in _TRY_ENCODINGS:
        for sep in _TRY_SEPS:
            try:
                hdr = pd.read_csv(fp, sep=sep, nrows=0, engine="python", encoding=enc)
                if hdr.shape[1] > 0:
                    return sep, enc
            except Exception as e:
                last_err = e
    raise last_err or RuntimeError(f"Could not sniff {fp}")

def _usecols_ci(colname: str) -> bool:
    """Column filter for ``pd.read_csv(usecols=...)``.

    Returns True when ``colname``, upper-cased and stripped of surrounding
    whitespace, is one of the names in ``TARGET_UP``.
    """
    normalized = str(colname).upper().strip()
    return normalized in TARGET_UP

def _read_three_raw(fp: Path) -> pd.DataFrame:
    """Read only the target columns of one flat file, as strings.

    The delimiter and encoding come from ``_sniff_sep_enc``; columns are
    selected case-insensitively through ``_usecols_ci``. A one-line summary of
    what was read (with the column names exactly as they appear in the file)
    is printed.

    Parameters
    ----------
    fp : Path
        File to read.

    Returns
    -------
    pd.DataFrame
        All values as ``str``. Column names are upper-cased and stripped so
        that frames from files with different header casing line up on the
        same columns when concatenated, instead of producing parallel columns
        that collide once names are upper-cased afterwards.
    """
    sep, enc = _sniff_sep_enc(fp)
    engine = None if sep == "," else "python"  # C engine for comma csv
    df = pd.read_csv(fp, sep=sep, encoding=enc, engine=engine,
                     usecols=_usecols_ci, dtype=str)
    # Log the header as found in the file, before normalising it.
    print(f"[read] {fp.name} sep='{sep}' enc={enc} -> cols={list(df.columns)} rows={len(df):,}")
    return df.rename(columns=lambda c: str(c).upper().strip())

# ------------- Load & Stack (raw) -------------
# Read every yearly flat file matching MCR_GLOB and stack them into one frame.
files = sorted(MCR_DIR.glob(MCR_GLOB))
if not files:
    raise FileNotFoundError(f"No files matched {MCR_DIR / MCR_GLOB}")

frames = []
_skipped = []
for fp in files:
    # Best effort: a file that cannot be read is reported and skipped.
    try:
        frames.append(_read_three_raw(fp))
    except Exception as e:
        print(f"[warn] {fp.name}: {e}")
        _skipped.append(fp.name)

if not frames:
    # pd.concat([]) would raise an opaque "No objects to concatenate";
    # same exception type, but say what actually went wrong.
    raise ValueError(
        f"None of the {len(files)} files matching {MCR_DIR / MCR_GLOB} could be read; "
        "see the [warn] lines above"
    )
if _skipped:
    print(f"[warn] skipped {len(_skipped)} of {len(files)} files: {_skipped}")

mcr_raw = pd.concat(frames, ignore_index=True)
print(f"[stack] combined rows={len(mcr_raw):,}")
print("[stack] non-null counts:\n", mcr_raw.notna().sum())

# ------------- Normalize -------------
# Standardize column names to uppercase
mcr_raw = mcr_raw.rename(columns={c: c.upper().strip() for c in mcr_raw.columns})

# Provider number -> zero-padded 6 digits (every non-digit character is removed first)
# NOTE(review): a value containing no digits at all becomes "000000" here — confirm
# whether such values occur in the raw files.
mcr_raw["PRVDR_NUM"] = (
    mcr_raw["PRVDR_NUM"].astype("string").str.replace(r"\D", "", regex=True).str.zfill(6)
)

# Parse CHOW date (keep NaT if blank)
_chow_date_text = mcr_raw["S2_2_CHOWDATE"].astype("string").str.strip()
mcr_raw["S2_2_CHOWDATE"] = pd.to_datetime(_chow_date_text, errors="coerce")

# errors="coerce" also turns non-blank but unparseable text into NaT, which
# silently removes those rows from the events table. Report how many there are.
_unparsed = mcr_raw["S2_2_CHOWDATE"].isna() & _chow_date_text.fillna("").ne("")
_n_unparsed = int(_unparsed.sum())
if _n_unparsed:
    _examples = _chow_date_text[_unparsed].drop_duplicates().head(5).tolist()
    print(f"[warn] {_n_unparsed:,} non-blank S2_2_CHOWDATE values could not be parsed, e.g. {_examples}")

# ---------- Build provider universe (all providers, incl. those without CHOWs) ----------
# One row per distinct, non-missing provider number found in the stacked data.
_provider_ids = mcr_raw["PRVDR_NUM"].dropna().astype("string").str.zfill(6)
all_providers = _provider_ids.drop_duplicates().to_frame(name="cms_certification_number")

# ---------- Long events table (dated CHOWs only) ----------
# Keep rows that have both a parsed CHOW date and a provider number. A dated row
# with a missing provider cannot be attributed to anyone and would otherwise get
# a NaN order, which cannot be cast to an integer.
_has_event = mcr_raw["S2_2_CHOWDATE"].notna() & mcr_raw["PRVDR_NUM"].notna()
events = (
    mcr_raw.loc[_has_event, ["PRVDR_NUM", "S2_2_CHOWDATE"]]
          .drop_duplicates()
          .sort_values(["PRVDR_NUM", "S2_2_CHOWDATE"], kind="mergesort")
          .reset_index(drop=True)
          .rename(columns={"PRVDR_NUM": "cms_certification_number"})
)
# Rows are unique per (provider, date) and sorted by date within provider, so a
# running count per provider is the chronological order: 1, 2, 3, ...
events["chow_order"] = events.groupby("cms_certification_number").cumcount() + 1

# Save long events (optional but handy)
events.to_csv(OUT_EVENTS_LONG, index=False)
print(f"[saved] events-long -> {OUT_EVENTS_LONG}  rows={len(events):,} providers={events['cms_certification_number'].nunique():,}")

# ---------- Wide per-provider table ----------
# One row per provider, one column per chow_order value holding that event's date.
_pivoted = events.pivot(
    index="cms_certification_number",
    columns="chow_order",
    values="S2_2_CHOWDATE",
)
wide = _pivoted.sort_index().reset_index()

# Rename pivoted columns chow_1_date, chow_2_date, ...
# (with no event columns this leaves just the provider id column)
_order_labels = list(wide.columns[1:])
wide.columns = ["cms_certification_number"] + [f"chow_{k}_date" for k in _order_labels]

# Counts and flags
# Number of dated CHOW events per provider.
counts = (
    events.groupby("cms_certification_number", as_index=False)
          .size()
          .rename(columns={"size":"n_chow"})
)

# Left-join onto the full provider list so providers without events are kept.
provider_wide = (
    all_providers
    .merge(wide, on="cms_certification_number", how="left")
    .merge(counts, on="cms_certification_number", how="left")
)

# Providers with no events have a missing count after the join -> 0.
provider_wide["n_chow"] = provider_wide["n_chow"].fillna(0).astype("Int16")
provider_wide["is_chow"] = (provider_wide["n_chow"] > 0).astype("Int8")

# Convert date columns to ISO strings for CSV readability
date_cols = [
    col for col in provider_wide.columns
    if col.startswith("chow_") and col.endswith("_date")
]
for col in date_cols:
    _as_dates = pd.to_datetime(provider_wide[col], errors="coerce")
    provider_wide[col] = _as_dates.dt.strftime("%Y-%m-%d")

# Order columns nicely
ordered = ["cms_certification_number", "n_chow", "is_chow"] + date_cols
provider_wide = (
    provider_wide[ordered]
    .sort_values("cms_certification_number")
    .reset_index(drop=True)
)

_n_total = len(provider_wide)
_n_with_chow = int((provider_wide["n_chow"] > 0).sum())
_max_n_chow = int(provider_wide["n_chow"].max())
print(f"[result] providers total={_n_total:,}  with CHOWs={_n_with_chow:,}  max_n_chow={_max_n_chow}")
print(provider_wide.head(10))

# ---------- Save provider-wide (all providers) ----------
provider_wide.to_csv(OUT_PROVIDER_ALL, index=False)
print(f"[saved] provider-wide (ALL providers) -> {OUT_PROVIDER_ALL}")

[paths] RAW_DIR=C:\Users\Owner\OneDrive\NursingHomeData
[paths] MCR_DIR=C:\Users\Owner\OneDrive\NursingHomeData\medicare-cost-reports
[paths] INTERIM_DIR=C:\Repositories\white-bowblis-nhmc\data\interim
[read] mcr_flatfile_2016.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,103
[read] mcr_flatfile_2017.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,429
[read] mcr_flatfile_2018.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,129
[read] mcr_flatfile_2019.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,183
[read] mcr_flatfile_2020.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=14,949
[read] mcr_flatfile_2021.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=15,071
[read] mcr_flatfile_2022.csv sep=',' enc=utf-8 -> cols=['PRVDR_NUM', 'S2_2_chow', 'S2_2_chowdate'] rows=14,966
[read] mcr_flatfile_2

In [2]:
import pandas as pd
from pathlib import Path

# Sanity check: re-load the provider-wide CSV written by the first cell.
# Use the path defined there instead of repeating a machine-specific absolute
# path, so the check follows PROJECT_ROOT on any machine.
fp = OUT_PROVIDER_ALL
# Read the id as text so leading zeros survive the round trip.
df = pd.read_csv(fp, dtype={"cms_certification_number":"string"})

print(df.head(10))  # preview first rows
df.info()           # check column types (info() prints itself and returns None)
print(df["n_chow"].value_counts().head(10))  # distribution of counts
print(df["is_chow"].value_counts())          # should be 0/1
print(df.loc[df["n_chow"] > 3].head())       # example of multi-CHOW providers

  cms_certification_number  n_chow  is_chow chow_1_date chow_2_date  \
0                   015009       0        0         NaN         NaN   
1                   015010       0        0         NaN         NaN   
2                   015014       0        0         NaN         NaN   
3                   015015       0        0         NaN         NaN   
4                   015016       0        0         NaN         NaN   
5                   015019       0        0         NaN         NaN   
6                   015023       0        0         NaN         NaN   
7                   015024       1        1  2022-01-01         NaN   
8                   015027       0        0         NaN         NaN   
9                   015028       0        0         NaN         NaN   

  chow_3_date chow_4_date chow_5_date  
0         NaN         NaN         NaN  
1         NaN         NaN         NaN  
2         NaN         NaN         NaN  
3         NaN         NaN         NaN  
4         NaN     