In [1]:
import os, re, warnings
from pathlib import Path
import numpy as np
import pandas as pd

# --- Paths (from your previous config) ---
# Walk from the working directory towards the filesystem root and stop at the
# first directory that contains a "src" folder. If none does, the last
# directory visited (the root) is kept.
PROJECT_ROOT = Path.cwd()
for _candidate in (PROJECT_ROOT, *PROJECT_ROOT.parents):
    PROJECT_ROOT = _candidate
    if (_candidate / "src").is_dir():
        break

# NH_DATA_DIR, when set, overrides the default <project>/data/raw location.
_default_raw = PROJECT_ROOT.joinpath("data", "raw")
RAW_DIR = Path(os.environ.get("NH_DATA_DIR", _default_raw)).resolve()
PBJ_DIR = RAW_DIR.joinpath("pbj-nurse")
PBJ_GLOB = "pbj_nurse_????_Q[1-4].csv"

# interim output (directory is created on the spot if it does not exist)
INTERIM_DIR = PROJECT_ROOT.joinpath("data", "interim")
INTERIM_DIR.mkdir(parents=True, exist_ok=True)
OUT_FP = INTERIM_DIR.joinpath("pbj_monthly_panel.csv")

for _label, _value in (("RAW_DIR", RAW_DIR), ("PBJ_DIR", PBJ_DIR), ("OUT_FP", OUT_FP)):
    print(f"[paths] {_label}={_value}")

# --- Robust CSV reader (mixed encodings happen) ---
_TRY_ENCODINGS = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
def read_csv_robust(fp: Path) -> pd.DataFrame:
    """
    Read a CSV file, trying several encodings before giving up.

    Pass 1 tries every encoding in ``_TRY_ENCODINGS`` with strict decoding.
    Pass 2 retries with ``cp1252`` and then ``latin1``, replacing undecodable
    bytes and skipping malformed lines. The encoding that succeeded is printed.

    Only decoding and parsing failures trigger the next attempt. Any other
    error (missing file, empty file, ...) cannot be fixed by switching
    encodings, so it propagates immediately instead of re-reading the file
    for every remaining encoding.

    Parameters
    ----------
    fp : Path
        CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed file.

    Raises
    ------
    UnicodeError, pandas.errors.ParserError
        The error from the last attempt when every attempt failed.
    """
    retryable = (UnicodeError, pd.errors.ParserError)
    last_err = None

    # Pass 1: strict decoding, no rows dropped.
    for enc in _TRY_ENCODINGS:
        try:
            df = pd.read_csv(fp, low_memory=False, encoding=enc, encoding_errors="strict")
        except retryable as e:
            last_err = e
            continue
        print(f"[read] {fp.name} (encoding={enc})")
        return df

    # Pass 2: lossy fallback - replace bad bytes and skip malformed lines.
    for enc in ["cp1252","latin1"]:
        try:
            df = pd.read_csv(fp, low_memory=False, encoding=enc, encoding_errors="replace", on_bad_lines="skip")
        except retryable as e:
            last_err = e
            continue
        print(f"[read] {fp.name} (encoding={enc}, replace+skip)")
        return df

    raise last_err

# --- Minimal per-file cleaner for columns we need ---
def zero_pad_ccn(s: pd.Series) -> pd.Series:
    """
    Normalise CMS certification numbers to left-zero-padded strings of width 6.

    Values are cast to pandas' nullable string dtype and stripped of
    surrounding whitespace. Missing values, empty strings and the literal
    text "<NA>" come back as missing rather than being padded.

    Parameters
    ----------
    s : pd.Series
        Raw identifier column (strings, integers or floats).

    Returns
    -------
    pd.Series
        String-dtype series, each non-missing value padded with zeros to at
        least six characters.
    """
    s = s.astype("string").str.strip()
    # A column that was parsed as float renders 15009 as "15009.0"; drop the
    # artificial fractional part so zfill pads the real identifier.
    s = s.str.replace(r"\.0+$", "", regex=True)
    # Blank cells must stay missing instead of turning into "000000".
    s = s.mask(s.isin(["", "<NA>"]))
    return s.str.zfill(6)

def to_date_from_int_yyyymmdd(s: pd.Series) -> pd.Series:
    """
    Convert YYYYMMDD values (integers, integral floats or digit strings) to
    datetimes.

    Anything that is not an eight-digit whole number, or that does not form a
    valid calendar date, becomes NaT instead of raising.

    Parameters
    ----------
    s : pd.Series
        Column holding dates encoded as YYYYMMDD.

    Returns
    -------
    pd.Series
        datetime64 series with NaT where the input could not be interpreted.
    """
    # Coerce first: a direct cast to Int64 raises on non-numeric text, which
    # would defeat the errors="coerce" request below.
    num = pd.to_numeric(s, errors="coerce")
    # Keep only whole numbers in the eight-digit range so the Int64 cast is safe.
    valid = (num % 1 == 0) & num.between(10_000_101, 99_991_231)
    num = num.where(valid)
    return pd.to_datetime(num.astype("Int64"), format="%Y%m%d", errors="coerce")

def normalize_needed_columns(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce a raw PBJ frame to the six columns used downstream, with
    normalised names and dtypes. ``df_raw`` itself is not modified.

    Column names are matched after stripping whitespace and lower-casing.
    ``provnum`` and ``mdscensus`` are renamed to ``cms_certification_number``
    and ``mds_census`` when the canonical name is absent.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Frame as read from a PBJ CSV.

    Returns
    -------
    pd.DataFrame
        Columns, in order: cms_certification_number (zero-padded string),
        workdate (datetime), hrs_rn / hrs_lpn / hrs_cna (float32, missing or
        unparsable values set to 0), mds_census (float32, NaN when absent).

    Raises
    ------
    ValueError
        If the frame has no workdate column, or no
        cms_certification_number/provnum column.
    """
    aliases = {"provnum": "cms_certification_number", "mdscensus": "mds_census"}
    hour_cols = ["hrs_rn", "hrs_lpn", "hrs_cna"]
    keep = ["cms_certification_number", "workdate", *hour_cols, "mds_census"]

    # Copy only the columns that can matter; the raw frame is wide and large,
    # so duplicating all of it just to select six columns wastes memory.
    lowered = [str(c).strip().lower() for c in df_raw.columns]
    wanted = set(keep) | set(aliases)
    positions = [i for i, name in enumerate(lowered) if name in wanted]
    df = df_raw.iloc[:, positions].copy()
    df.columns = [lowered[i] for i in positions]

    # rename essentials
    for legacy, canonical in aliases.items():
        if legacy in df.columns and canonical not in df.columns:
            df = df.rename(columns={legacy: canonical})

    # Both key columns are mandatory: fail with a clear message up front
    # rather than with a KeyError at the final column selection.
    if "workdate" not in df.columns:
        raise ValueError("Missing workdate")
    if "cms_certification_number" not in df.columns:
        raise ValueError("Missing cms_certification_number/provnum")

    # IDs
    df["cms_certification_number"] = zero_pad_ccn(df["cms_certification_number"])

    # workdate: every numeric dtype is read as YYYYMMDD. A float column
    # (e.g. 20170101.0) handed to a plain pd.to_datetime would be interpreted
    # as nanoseconds since the epoch and silently yield 1970 dates.
    wd = df["workdate"]
    if pd.api.types.is_numeric_dtype(wd) or pd.api.types.is_string_dtype(wd):
        df["workdate"] = to_date_from_int_yyyymmdd(wd)
    else:
        df["workdate"] = pd.to_datetime(wd, errors="coerce")

    # hours numeric, float32; an absent hour column counts as zero hours
    for col in hour_cols:
        if col not in df.columns:
            df[col] = 0.0
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float32").fillna(0.0)

    # census optional -> float32
    if "mds_census" not in df.columns:
        df["mds_census"] = np.nan
    df["mds_census"] = pd.to_numeric(df["mds_census"], errors="coerce").astype("float32")

    return df[keep]

# --- Coverage + IQR outlier removal within facility-month, then monthly agg ---
def process_file_monthly(fp: Path,
                         coverage_threshold: float = 0.5,
                         iqr_mult: float = 1.5) -> pd.DataFrame:
    """
    Build monthly totals per CCN from a single PBJ CSV.

    Steps:
      1. Read and normalise the file, then collapse it to one row per
         facility and workdate (hours summed, mds_census averaged).
      2. Keep only facility-months where
         days reported / days in the month >= ``coverage_threshold``.
      3. Within each kept facility-month, drop days on which any of
         hrs_rn, hrs_lpn, hrs_cna or total_hours falls outside
         [Q1 - iqr_mult * IQR, Q3 + iqr_mult * IQR].
      4. Sum hours and average mds_census over the remaining days, and
         divide the monthly hour totals by that average census for the
         per-patient columns (a census of 0 is treated as missing).

    Parameters
    ----------
    fp : Path
        PBJ CSV to process.
    coverage_threshold : float
        Minimum share of calendar days a facility must report in a month.
    iqr_mult : float
        Multiplier applied to the IQR when building the outlier fences.

    Returns
    -------
    pd.DataFrame
        One row per facility-month with cms_certification_number, month
        ("MM/YYYY"), hour totals, mds_census, per-patient metrics (float32)
        and n (Int16) = number of reported days in that month before the
        outlier drop. An empty frame with the same columns is returned when
        no facility-month meets the coverage threshold.
    """
    KEYS = ["cms_certification_number","year_month"]
    hour_cols = ["hrs_rn","hrs_lpn","hrs_cna"]
    out_cols = [
        "cms_certification_number","month",
        "hrs_rn","hrs_lpn","hrs_cna","total_hours","mds_census",
        "hrs_rn_per_patient","hrs_lpn_per_patient","hrs_cna_per_patient","total_hours_per_patient","n"
    ]
    # prefix used in the bound column names -> daily measure it fences
    measures = {"rn": "hrs_rn", "lpn": "hrs_lpn", "cna": "hrs_cna", "tot": "total_hours"}

    df = normalize_needed_columns(read_csv_robust(fp))

    # daily facility-level totals
    daily = (df
             .groupby(["cms_certification_number","workdate"], as_index=False)
             .agg(hrs_rn=("hrs_rn","sum"),
                  hrs_lpn=("hrs_lpn","sum"),
                  hrs_cna=("hrs_cna","sum"),
                  mds_census=("mds_census","mean"))
            )
    daily["total_hours"] = daily[hour_cols].sum(axis=1).astype("float32")
    daily["year_month"]  = daily["workdate"].dt.to_period("M")
    daily["days_in_month"] = daily["workdate"].dt.days_in_month

    # coverage
    cov = (daily
           .groupby(KEYS, as_index=False)
           .agg(days_reported=("workdate","nunique"),
                days_in_month=("days_in_month","max"))
          )
    cov["coverage_ratio"] = cov["days_reported"] / cov["days_in_month"]
    cov_ok = cov.loc[cov["coverage_ratio"] >= coverage_threshold,
                     KEYS + ["days_reported"]]
    if cov_ok.empty:
        return pd.DataFrame(columns=out_cols)

    good = daily.merge(cov_ok, on=KEYS, how="inner")

    # IQR bounds: two vectorised group quantile passes instead of eight
    # Python-level lambdas evaluated once per facility-month.
    grouped = good.groupby(KEYS)[list(measures.values())]
    q1 = grouped.quantile(0.25)
    q3 = grouped.quantile(0.75)
    iqr = q3 - q1
    flat = iqr == 0
    # With a zero IQR the fences collapse onto the quartiles themselves.
    lo = (q1 - iqr_mult * iqr).where(~flat, q1)
    hi = (q3 + iqr_mult * iqr).where(~flat, q3)
    bounds = pd.concat(
        [lo.rename(columns={col: f"{p}_lo" for p, col in measures.items()}),
         hi.rename(columns={col: f"{p}_hi" for p, col in measures.items()})],
        axis=1,
    ).reset_index()
    good = good.merge(bounds, on=KEYS, how="left")

    # a day is dropped when ANY measure lies outside its fences
    is_outlier = pd.Series(False, index=good.index)
    for p, col in measures.items():
        is_outlier |= (good[col] < good[f"{p}_lo"]) | (good[col] > good[f"{p}_hi"])
    kept = good.loc[~is_outlier,
                    KEYS + hour_cols + ["total_hours","mds_census","days_reported"]]

    # monthly totals and avg census (across reported days kept)
    monthly = (kept
               .groupby(KEYS, as_index=False)
               .agg(hrs_rn=("hrs_rn","sum"),
                    hrs_lpn=("hrs_lpn","sum"),
                    hrs_cna=("hrs_cna","sum"),
                    total_hours=("total_hours","sum"),
                    mds_census=("mds_census","mean"),
                    n=("days_reported","max"))
              )

    # per-patient metrics; a zero census becomes NaN to avoid division by zero
    denom = monthly["mds_census"].replace({0: np.nan})
    for col in hour_cols + ["total_hours"]:
        monthly[f"{col}_per_patient"] = monthly[col] / denom

    # Month label like 01/2017
    monthly["month"] = monthly["year_month"].dt.strftime("%m/%Y")

    # Final column order + dtypes. astype returns a new frame, so nothing is
    # assigned into a slice of `monthly`.
    dtypes = {c: "float32" for c in out_cols
              if c not in ("cms_certification_number", "month", "n")}
    dtypes["n"] = "Int16"
    return monthly[out_cols].astype(dtypes)

# --- Run file-by-file and concatenate ---
# Collect every file under PBJ_DIR that matches PBJ_GLOB, in sorted order.
all_files = sorted(PBJ_DIR.glob(PBJ_GLOB))
print(f"[scan] {len(all_files)} files to process")

# Process each file independently; a failure is reported and the loop moves on.
monthly_frames = []
for fp in all_files:
    try:
        m = process_file_monthly(fp, coverage_threshold=0.5, iqr_mult=1.5)
        print(f"[ok] {fp.name}: {len(m):,} rows")
        if m.empty:
            continue
        monthly_frames.append(m)
    except Exception as e:
        print(f"[fail] {fp.name}: {e}")

# Stack the per-file results, or fall back to an empty frame with the same columns.
if monthly_frames:
    monthly_panel = pd.concat(monthly_frames, ignore_index=True)
else:
    monthly_panel = pd.DataFrame(columns=[
        "cms_certification_number","month",
        "hrs_rn","hrs_lpn","hrs_cna","total_hours","mds_census",
        "hrs_rn_per_patient","hrs_lpn_per_patient","hrs_cna_per_patient","total_hours_per_patient","n"
    ])

print(f"[done] monthly_panel rows = {len(monthly_panel):,}")

# --- Save to interim ---
monthly_panel.to_csv(OUT_FP, index=False)
print(f"[saved] {OUT_FP}")

[paths] RAW_DIR=C:\Users\wrthj\OneDrive\NursingHomeData
[paths] PBJ_DIR=C:\Users\wrthj\OneDrive\NursingHomeData\pbj-nurse
[paths] OUT_FP=C:\Repositories\white-bowblis-nhmc\data\interim\pbj_monthly_panel.csv
[scan] 33 files to process
[read] pbj_nurse_2017_Q1.csv (encoding=utf-8)


KeyboardInterrupt: 