In [10]:
import os, re, warnings
from pathlib import Path
import numpy as np
import pandas as pd

# --- Paths (from your previous config) ---
# Use the nearest ancestor of the working directory (the directory itself
# included) that contains a "src" folder as the project root. When no ancestor
# has one, fall back to the working directory rather than silently ending up
# at the filesystem root, which would make the default data dir
# "<filesystem root>/data/raw".
_START_DIR = Path.cwd()
PROJECT_ROOT = next(
    (p for p in (_START_DIR, *_START_DIR.parents) if (p / "src").is_dir()),
    _START_DIR,
)
# NH_DATA_DIR overrides the default location; an empty value counts as unset.
RAW_DIR  = Path(os.getenv("NH_DATA_DIR") or PROJECT_ROOT / "data" / "raw").resolve()
PBJ_DIR  = RAW_DIR / "pbj-nurse"
PBJ_GLOB = "pbj_nurse_????_Q[1-4].csv"

print(f"[paths] RAW_DIR={RAW_DIR}")
print(f"[paths] PBJ_DIR={PBJ_DIR}")

# --- Robust CSV reader (mixed encodings happen) ---
_TRY_ENCODINGS = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
def read_csv_robust(fp: Path) -> pd.DataFrame:
    """
    Read a CSV whose encoding is not known in advance.

    Pass 1 tries each encoding in _TRY_ENCODINGS with strict decoding and
    returns the first successful read. Pass 2 retries with cp1252 and latin1,
    replacing undecodable bytes and skipping malformed rows.

    Only decode errors and parser errors trigger a retry. Any other failure
    (for example a missing file) propagates immediately instead of re-reading
    the file once per encoding.

    Raises the last decode/parser error if both passes fail.
    """
    last_err = None

    # Pass 1: strict decoding, no row skipping.
    for enc in _TRY_ENCODINGS:
        try:
            df = pd.read_csv(fp, low_memory=False, encoding=enc, encoding_errors="strict")
        except UnicodeError as e:
            # wrong encoding guess -> try the next one
            last_err = e
            continue
        except pd.errors.ParserError as e:
            # malformed rows are not an encoding problem; another strict
            # attempt cannot help, so go straight to the lenient pass
            last_err = e
            break
        print(f"[read] {fp.name} (encoding={enc})")
        return df

    # Pass 2: final fallback with replacements and bad-line skipping.
    for enc in ["cp1252", "latin1"]:
        try:
            df = pd.read_csv(fp, low_memory=False, encoding=enc,
                             encoding_errors="replace", on_bad_lines="skip")
        except (UnicodeError, pd.errors.ParserError) as e:
            last_err = e
            continue
        print(f"[read] {fp.name} (encoding={enc}, replace+skip)")
        return df

    raise last_err

# --- Minimal per-file cleaner for columns we need ---
def zero_pad_ccn(s: pd.Series) -> pd.Series:
    """
    Return the IDs in `s` as left-zero-padded strings of at least 6 characters.

    Values are converted to pandas "string" dtype and stripped of surrounding
    whitespace. Missing values, blank strings and the literal text "<NA>"
    come back as missing rather than being padded.
    """
    s = s.astype("string").str.strip()
    # A numeric ID column containing blanks is read as float, so 15009 turns
    # into "15009.0"; drop that artefact, otherwise zfill leaves it unpadded.
    s = s.str.replace(r"^(\d+)\.0+$", r"\1", regex=True)
    # Blank / "<NA>" text is not an ID; without this "" would become "000000".
    s = s.mask(s.isin(["", "<NA>"]))
    return s.str.zfill(6)

def to_date_from_int_yyyymmdd(s: pd.Series) -> pd.Series:
    """
    Parse YYYYMMDD values (integers, integral floats or digit strings) into
    datetimes.

    Anything that is not a whole number, or is not a valid calendar date,
    becomes NaT instead of raising.
    """
    # Coerce first: a direct astype("Int64") raises on non-numeric text, which
    # would bypass the errors="coerce" handling below.
    nums = pd.to_numeric(s, errors="coerce")
    # Fractional values cannot be dates and would make the Int64 cast raise.
    nums = nums.where(nums.mod(1).eq(0))
    return pd.to_datetime(nums.astype("Int64"), format="%Y%m%d", errors="coerce")

def normalize_needed_columns(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce a raw PBJ frame to the five columns used downstream.

    Column names are stripped and lower-cased before matching. `provnum` is
    accepted as an alias for `cms_certification_number`.

    Returns a frame with exactly these columns:
      cms_certification_number  zero-padded string ID
      workdate                  datetime (unparseable values become NaT)
      hrs_rn, hrs_lpn, hrs_cna  float32, missing/non-numeric values as 0.0

    Raises ValueError if the facility ID column or `workdate` is absent.
    """
    df = df_raw.copy()
    df.columns = [c.strip().lower() for c in df.columns]

    # rename essentials
    if "provnum" in df.columns and "cms_certification_number" not in df.columns:
        df.rename(columns={"provnum":"cms_certification_number"}, inplace=True)
    if "mdscensus" in df.columns and "mds_census" not in df.columns:
        df.rename(columns={"mdscensus":"mds_census"}, inplace=True)

    # Fail fast on required columns. Previously a missing ID column only
    # warned and then surfaced as a KeyError at the final column selection.
    if "cms_certification_number" not in df.columns:
        raise ValueError("Missing cms_certification_number/provnum")
    if "workdate" not in df.columns:
        raise ValueError("Missing workdate")

    # Only RN/LPN/CNA hours are kept (not admin/don). No legacy-name mapping
    # is applied here: an hour column that is absent is created as all zeros.
    hour_cols = ["hrs_rn","hrs_lpn","hrs_cna"]
    for col in hour_cols:
        if col not in df.columns:
            df[col] = 0.0

    # IDs
    df["cms_certification_number"] = zero_pad_ccn(df["cms_certification_number"])

    # workdate
    workdate = df["workdate"]
    if pd.api.types.is_numeric_dtype(workdate):
        # Includes float columns (ints with blanks): generic to_datetime would
        # read such numbers as epoch offsets rather than YYYYMMDD.
        df["workdate"] = to_date_from_int_yyyymmdd(workdate)
    elif pd.api.types.is_string_dtype(workdate):
        text = workdate.astype("string").str.strip()
        if text.dropna().str.fullmatch(r"\d{8}").all():
            df["workdate"] = to_date_from_int_yyyymmdd(text)
        else:
            # e.g. "2017-01-31" style text: let pandas parse the date strings
            df["workdate"] = pd.to_datetime(workdate, errors="coerce")
    else:
        df["workdate"] = pd.to_datetime(workdate, errors="coerce")

    # hours numeric, float32
    for c in hour_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").astype("float32").fillna(0.0)

    # keep only what we need for speed
    keep = ["cms_certification_number","workdate","hrs_rn","hrs_lpn","hrs_cna"]
    return df[keep]

# --- Coverage + IQR outlier removal within facility-month, then monthly agg ---
def process_file_monthly(fp: Path,
                         coverage_threshold: float = 0.5,
                         iqr_mult: float = 1.5) -> pd.DataFrame:
    """
    Collapse one PBJ CSV into facility-month nurse-hour totals.

    Steps: read and normalize the file -> sum to one row per facility-day ->
    keep facility-months whose share of reported days is at least
    `coverage_threshold` -> drop days lying outside the facility-month IQR
    fences (q1 - iqr_mult*IQR, q3 + iqr_mult*IQR) on RN, LPN, CNA or total
    hours -> sum the surviving days per month.

    Returns one row per facility-month with columns
    cms_certification_number, month ("MM/YYYY" string), hrs_rn, hrs_lpn,
    hrs_cna, total_hours (float32) and n (Int16). n is the number of distinct
    reported days in the month counted BEFORE the outlier drop, whereas the
    hour columns sum only the days that survived it. When no facility-month
    reaches the coverage threshold an empty frame with those columns is
    returned.
    """
    group_keys = ["cms_certification_number", "year_month"]
    hour_cols = ["hrs_rn", "hrs_lpn", "hrs_cna"]
    summed_cols = [*hour_cols, "total_hours"]
    # (fence-name prefix, daily column) pairs that are screened for outliers
    screened = [("rn", "hrs_rn"), ("lpn", "hrs_lpn"),
                ("cna", "hrs_cna"), ("tot", "total_hours")]
    out_cols = ["cms_certification_number", "month", *summed_cols, "n"]

    records = normalize_needed_columns(read_csv_robust(fp))

    # One row per facility-day (RN/LPN/CNA summed over the day's input rows).
    daily = (records
             .groupby(["cms_certification_number", "workdate"], as_index=False)
             .agg(**{col: (col, "sum") for col in hour_cols}))
    workdays = daily["workdate"]
    daily["total_hours"] = daily[hour_cols].sum(axis=1).astype("float32")
    daily["year_month"] = workdays.dt.to_period("M")
    daily["days_in_month"] = workdays.dt.days_in_month

    # Coverage = distinct reported days / calendar days in that month.
    coverage = (daily
                .groupby(group_keys, as_index=False)
                .agg(days_reported=("workdate", "nunique"),
                     days_in_month=("days_in_month", "max")))
    reported_share = coverage["days_reported"] / coverage["days_in_month"]
    well_covered = coverage.loc[reported_share >= coverage_threshold,
                                [*group_keys, "days_reported"]]

    if well_covered.empty:
        # no facility-month in this file is reported well enough
        return pd.DataFrame(columns=out_cols)

    kept_days = daily.merge(well_covered, on=group_keys, how="inner")

    # Quartiles of every screened column within each facility-month.
    quartile_spec = {}
    for prefix, column in screened:
        quartile_spec[f"{prefix}_q1"] = (column, lambda s: s.quantile(0.25))
        quartile_spec[f"{prefix}_q3"] = (column, lambda s: s.quantile(0.75))
    fences = kept_days.groupby(group_keys).agg(**quartile_spec).reset_index()

    for prefix, _ in screened:
        q1 = fences[f"{prefix}_q1"]
        q3 = fences[f"{prefix}_q3"]
        spread = q3 - q1
        # zero IQR: the fences are pinned to the quartiles themselves
        degenerate = spread == 0
        fences[f"{prefix}_lo"] = (q1 - iqr_mult * spread).mask(degenerate, q1)
        fences[f"{prefix}_hi"] = (q3 + iqr_mult * spread).mask(degenerate, q3)

    fence_cols = [f"{prefix}_{side}" for prefix, _ in screened for side in ("lo", "hi")]
    kept_days = kept_days.merge(fences[group_keys + fence_cols], on=group_keys, how="left")

    # A day is dropped when ANY screened column lies outside its fences.
    flagged = pd.Series(False, index=kept_days.index)
    for prefix, column in screened:
        values = kept_days[column]
        flagged = (flagged
                   | (values < kept_days[f"{prefix}_lo"])
                   | (values > kept_days[f"{prefix}_hi"]))

    inliers = kept_days.loc[~flagged, [*group_keys, *summed_cols, "days_reported"]]

    # Monthly sums over surviving days; days_reported is constant within a
    # group, so max simply carries the pre-drop day count through as n.
    monthly = (inliers
               .groupby(group_keys, as_index=False)
               .agg(**{col: (col, "sum") for col in summed_cols},
                    n=("days_reported", "max")))

    # Friendly month label like 01/2017
    monthly["month"] = monthly["year_month"].dt.strftime("%m/%Y")

    # Final column order and dtypes
    dtype_map = {col: "float32" for col in summed_cols}
    dtype_map["n"] = "Int16"
    return monthly[out_cols].astype(dtype_map)

# --- Process each file on its own, then stack the per-file results ---
all_files = sorted(PBJ_DIR.glob(PBJ_GLOB))
print(f"[scan] {len(all_files)} files to process")

monthly_frames = []
for fp in all_files:
    try:
        m = process_file_monthly(fp, coverage_threshold=0.5, iqr_mult=1.5)
        print(f"[ok] {fp.name}: {len(m):,} monthly rows")
        if m.empty:
            continue  # file contributed no rows
        monthly_frames.append(m)
    except Exception as e:
        # best effort: report the failing file and carry on with the rest
        print(f"[fail] {fp.name}: {e}")

if monthly_frames:
    monthly_panel = pd.concat(monthly_frames, ignore_index=True)
else:
    # keep the expected columns even when no file produced any rows
    monthly_panel = pd.DataFrame(
        columns=["cms_certification_number","month","hrs_rn","hrs_lpn","hrs_cna","total_hours","n"]
    )

print(f"[done] monthly_panel rows = {len(monthly_panel):,}")
monthly_panel.head()

[paths] RAW_DIR=C:\Users\Owner\OneDrive\NursingHomeData
[paths] PBJ_DIR=C:\Users\Owner\OneDrive\NursingHomeData\pbj-nurse
[scan] 33 files to process
[read] pbj_nurse_2017_Q1.csv (encoding=utf-8)
[ok] pbj_nurse_2017_Q1.csv: 41,787 monthly rows
[read] pbj_nurse_2017_Q2.csv (encoding=utf-8)
[ok] pbj_nurse_2017_Q2.csv: 41,952 monthly rows
[read] pbj_nurse_2017_Q3.csv (encoding=utf-8)
[ok] pbj_nurse_2017_Q3.csv: 43,431 monthly rows
[read] pbj_nurse_2017_Q4.csv (encoding=utf-8)
[ok] pbj_nurse_2017_Q4.csv: 42,426 monthly rows
[read] pbj_nurse_2018_Q1.csv (encoding=cp1252)
[ok] pbj_nurse_2018_Q1.csv: 44,031 monthly rows
[read] pbj_nurse_2018_Q2.csv (encoding=cp1252)
[ok] pbj_nurse_2018_Q2.csv: 43,986 monthly rows
[read] pbj_nurse_2018_Q3.csv (encoding=cp1252)
[ok] pbj_nurse_2018_Q3.csv: 44,613 monthly rows
[read] pbj_nurse_2018_Q4.csv (encoding=utf-8)
[ok] pbj_nurse_2018_Q4.csv: 44,748 monthly rows
[read] pbj_nurse_2019_Q1.csv (encoding=utf-8)
[ok] pbj_nurse_2019_Q1.csv: 45,174 monthly rows
[r

Unnamed: 0,cms_certification_number,month,hrs_rn,hrs_lpn,hrs_cna,total_hours,n
0,15009,01/2017,398.410004,949.25,3039.120117,4386.779785,31
1,15009,02/2017,399.679993,897.179993,2938.429932,4235.290039,28
2,15009,03/2017,554.640015,1638.25,4624.609863,6817.5,31
3,15012,01/2017,398.519989,1621.75,4618.0,6638.27002,31
4,15012,02/2017,443.919983,1295.900024,3619.610107,5359.430176,28


In [13]:
# Persist the combined monthly panel. to_csv runs with its defaults here, so
# the frame's index is written as well (as an unnamed leading column).
monthly_panel.to_csv(
    path_or_buf=r"C:\Repositories\white-bowblis-nhmc\data\interim\pbj.csv",
)