In [3]:
# ────────────────────────────────────────────────────────────────────────────────
# Build CHOW event-study dummies and merge into PBJ provider-month panel
# Inputs you have:
#   - C:\Repositories\white-bowblis-nhmc\data\interim\ccn_chow_lite.csv
#   - C:\Repositories\white-bowblis-nhmc\data\interim\mcr_chow_provider_events_all.csv
#   - C:\Users\Owner\OneDrive\NursingHomeData\provider-info-files\provider_resides_in_hospital_by_ccn.csv
#   - C:\Users\wrthj\OneDrive\NursingHomeData\pbj-nurse\pbj_monthly_panel.csv
#
# Output:
#   - C:\Repositories\white-bowblis-nhmc\data\clean\pbj_panel_with_chow_dummies.csv
# ────────────────────────────────────────────────────────────────────────────────

import pandas as pd
import numpy as np
import re
from pathlib import Path

# ----- Paths -----
REPO = Path(r"C:\Repositories\white-bowblis-nhmc")
INTERIM = REPO / "data" / "interim"
CLEAN = REPO / "data" / "clean"; CLEAN.mkdir(parents=True, exist_ok=True)

lite_fp = INTERIM / "ccn_chow_lite.csv"
mcr_fp  = INTERIM / "mcr_chow_provider_events_all.csv"
hosp_fp = Path(r"C:\Users\wrthj\OneDrive\NursingHomeData\provider-info-files\provider_resides_in_hospital_by_ccn.csv")

pbj_fp  = Path(r"C:\Users\wrthj\OneDrive\NursingHomeData\pbj-nurse\pbj_monthly_panel.csv")
out_fp  = CLEAN / "pbj_panel_with_chow_dummies.csv"

# ----- Helpers -----
def std_ccn(df, col="cms_certification_number"):
    df[col] = (
        df[col].astype(str)
        .str.extract(r"(\d+)", expand=False)
        .fillna("")
        .str.zfill(6)
    )
    return df

def to_boolish(s):
    s = s.astype(str).str.strip().str.lower()
    return s.map({"1": True, "y": True, "yes": True, "true": True, "t": True,
                  "0": False, "n": False, "no": False, "false": False, "f": False}).astype("boolean")

def to_monthstart(series):
    s = pd.to_datetime(series, errors="coerce")
    return s.dt.to_period("M").dt.to_timestamp()

def month_diff(a, b):
    if pd.isna(a) or pd.isna(b):
        return np.nan
    return (pd.Period(a, "M") - pd.Period(b, "M")).n

# ----- Load -----
lite = pd.read_csv(lite_fp, dtype={"cms_certification_number": str})
mcr  = pd.read_csv(mcr_fp,  dtype={"cms_certification_number": str})
hosp = pd.read_csv(hosp_fp, dtype={"cms_certification_number": str})

for df in (lite, mcr, hosp):
    std_ccn(df)

# ----- Hospital filter (drop in-hospital = True) -----
hosp_col = "provider_resides_in_hospital"
if hosp_col not in hosp.columns:
    alt = [c for c in hosp.columns if re.search(r"inhosp|resides", c, re.I)]
    if not alt:
        raise ValueError("Couldn't find hospital flag column in the hospital file.")
    hosp_col = alt[0]

hosp["in_hosp"] = to_boolish(hosp[hosp_col])

lite = lite.merge(hosp[["cms_certification_number","in_hosp"]], on="cms_certification_number", how="left")
mcr  = mcr.merge(hosp[["cms_certification_number","in_hosp"]],  on="cms_certification_number", how="left")

lite = lite[lite["in_hosp"] != True].copy()
mcr  = mcr[mcr["in_hosp"]  != True].copy()

# ----- Derive counts + FIRST EVENT MONTH from LITE -----
def derive_lite_counts_and_firstmonth(df):
    out = df.copy()

    # Count CHOWs from either num_chows or chow_date_* columns
    if "num_chows" in out.columns:
        num = pd.to_numeric(out["num_chows"], errors="coerce").fillna(0).astype(int)
    else:
        chow_cols = [c for c in out.columns if re.match(r"chow_date_\d+$", c)]
        num = out[chow_cols].notna().sum(axis=1).astype(int) if chow_cols else pd.Series(0, index=out.index)

    # First event month: prefer the minimum across chow_date_* if present
    first_month = pd.NaT
    chow_cols = [c for c in out.columns if re.match(r"chow_date_\d+$", c)]
    if chow_cols:
        # Coerce each to datetime, then take row-wise min
        temp = out[chow_cols].apply(pd.to_datetime, errors="coerce").apply(lambda r: r.min(), axis=1)
        first_month = to_monthstart(temp)

    # If still NaT but num>0 and a single date-like column exists, try to infer
    if isinstance(first_month, pd._libs.tslibs.timestamps.Timestamp) or first_month is pd.NaT:
        # normalize to a series
        first_month = pd.Series(first_month, index=out.index)

    # Package
    res = pd.DataFrame({
        "cms_certification_number": out["cms_certification_number"],
        "num_chows_lite": num,
        "first_event_month_lite": first_month
    }).drop_duplicates("cms_certification_number")

    res["is_chow_lite"] = res["num_chows_lite"] > 0
    return res

lite_counts = derive_lite_counts_and_firstmonth(lite)

# ----- Derive counts + FIRST EVENT MONTH from MCR (schema-agnostic) -----
def derive_mcr_counts_and_firstmonth(df):
    d = df.copy()

    # Make reasonable strings where possible
    for c in d.columns:
        if d[c].dtype == "bool":
            d[c] = d[c].astype("boolean")
        else:
            try:
                d[c] = d[c].astype(str)
            except Exception:
                pass

    # Candidate columns that might identify CHOW rows
    flag_cols = [c for c in d.columns if c.lower() == "is_chow"]
    text_cols = [c for c in d.columns if re.search(r"(event|type|name|desc)", c, re.I)]

    # Candidate date cols
    date_cols = [c for c in d.columns if re.search(r"date", c, re.I)]

    # Determine a row-level chow flag
    chow_mask = pd.Series(False, index=d.index)
    if flag_cols:
        c = flag_cols[0]
        chow_mask = d[c].str.strip().str.lower().map(
            {"1":True,"y":True,"yes":True,"true":True,"t":True,"0":False,"n":False,"no":False,"false":False,"f":False}
        ).fillna(False)
    elif text_cols:
        patt = re.compile(r"\bchange of ownership\b|\bchow\b", re.I)
        for c in text_cols:
            try:
                chow_mask = chow_mask | d[c].str.contains(patt, na=False)
            except Exception:
                pass
    # else: we’ll fall back to counting unique dates (coarse)

    # Choose a date column to timestamp
    date_col = None
    if date_cols:
        # Prefer something like 'effective_date' or 'event_date'
        pri = [c for c in date_cols if re.search(r"(effective|event).*date", c, re.I)]
        date_col = pri[0] if pri else date_cols[0]

    # Build per-CCN aggregation
    if date_col is not None:
        d["_event_month"] = to_monthstart(pd.to_datetime(d[date_col], errors="coerce"))
        # If we had a chow_mask, filter to those; else count all dated rows as events
        if chow_mask.any():
            ev = d.loc[chow_mask & d["_event_month"].notna(), ["cms_certification_number","_event_month"]]
        else:
            ev = d.loc[d["_event_month"].notna(), ["cms_certification_number","_event_month"]]
    else:
        # No date column found: fall back to CCN-level counts (zero info on months)
        ev = d[["cms_certification_number"]].copy()
        ev["_event_month"] = pd.NaT

    ev = ev.dropna(subset=["cms_certification_number"]).drop_duplicates()

    g = (ev.groupby("cms_certification_number")["_event_month"]
            .agg(n_events_mcr="count", first_event_month_mcr="min"))
    g = g.reset_index()

    g["is_chow_mcr"] = g["n_events_mcr"] > 0
    return g

mcr_counts = derive_mcr_counts_and_firstmonth(mcr)

# ----- Merge + agreement logic (0/0 or 1/1 with SAME month) -----
merged = (lite_counts
          .merge(mcr_counts, on="cms_certification_number", how="outer")
          .fillna({"num_chows_lite":0, "n_events_mcr":0}))

# Booleans
merged["is_chow_lite"] = merged["num_chows_lite"] > 0
merged["is_chow_mcr"]  = merged["n_events_mcr"]   > 0

# Agreement label + chosen change_month
def label_and_pick_month(r):
    # default
    label = "mismatch"
    change_month = pd.NaT

    if (r["num_chows_lite"]==0) and (r["n_events_mcr"]==0):
        label = "match_0"
        change_month = pd.NaT
    elif (r["num_chows_lite"]==1) and (r["n_events_mcr"]==1):
        # Require same month (both present and equal)
        a, b = r["first_event_month_lite"], r["first_event_month_mcr"]
        if pd.notna(a) and pd.notna(b) and (a==b):
            label = "match_1_same_month"
            change_month = a  # == b
        elif pd.notna(a) and pd.notna(b):
            label = "match_1_diff_month"
        else:
            label = "match_1_unknown_month"
    return pd.Series({"agreement": label, "change_month": change_month})

lab = merged.apply(label_and_pick_month, axis=1)
merged = pd.concat([merged, lab], axis=1)

# Keep only match_0 and match_1_same_month for clean identification
agree = merged.loc[merged["agreement"].isin(["match_0","match_1_same_month"])].copy()

# ----- Load PBJ panel and merge in -----
pbj = pd.read_csv(pbj_fp, low_memory=False)

# Find CCN + month columns (PBJ schema agnostic)
def find_col(cols, candidates):
    lower = {c.lower(): c for c in cols}
    for cand in candidates:
        if cand.lower() in lower:
            return lower[cand.lower()]
    return None

pbj_ccn_col   = find_col(pbj.columns, ["ccn","provider_number","provider_id","cms_certification_number"])
pbj_month_col = find_col(pbj.columns, ["month","pbj_month","date","period_month"])

if pbj_ccn_col is None or pbj_month_col is None:
    raise ValueError(f"PBJ panel must have a CCN and a month column. Columns={list(pbj.columns)}")

pbj = pbj.rename(columns={pbj_ccn_col:"cms_certification_number", pbj_month_col:"month"})
std_ccn(pbj, "cms_certification_number")
pbj["month"] = to_monthstart(pbj["month"])

# Merge agreed set onto PBJ
panel = pbj.merge(
    agree[["cms_certification_number","agreement","change_month"]],
    on="cms_certification_number",
    how="inner"  # keep only CCNs in the agreed universe
)

# ----- Build dummies -----
# treat_post: 0 for match_0; for match_1_same_month: 1 starting IN the change month and onward
panel["treat_post"] = 0
is_match1 = panel["agreement"].eq("match_1_same_month") & panel["change_month"].notna()
panel.loc[is_match1, "treat_post"] = (panel.loc[is_match1, "month"] >= panel.loc[is_match1, "change_month"]).astype(int)

# If you prefer to start the month AFTER the change, use '>' instead of '>=':
# panel.loc[is_match1, "treat_post"] = (panel.loc[is_match1, "month"] > panel.loc[is_match1, "change_month"]).astype(int)

# event_time: NaN for match_0; integer months relative to change month for match_1
panel["event_time"] = np.nan
panel.loc[is_match1, "event_time"] = (
    (panel.loc[is_match1, "month"].values.astype("datetime64[M]") -
     panel.loc[is_match1, "change_month"].values.astype("datetime64[M]")).astype(int)
)

# Optional: clip window for tidy event-study plots
# panel.loc[panel["event_time"].notna(), "event_time"] = panel.loc[panel["event_time"].notna(), "event_time"].clip(-24, 24)

# ever treated (diagnostic)
ever = (panel.groupby("cms_certification_number", as_index=False)["treat_post"].max()
             .rename(columns={"treat_post":"ever_treated"}))
panel = panel.merge(ever, on="cms_certification_number", how="left")

# Save
panel = panel.sort_values(["cms_certification_number","month"]).reset_index(drop=True)
panel.to_csv(out_fp, index=False)

print(f"[save] {out_fp} rows={len(panel):,}, cols={panel.shape[1]}")
print(panel[["cms_certification_number","month","agreement","change_month","treat_post","event_time"]].head(12))

  s = pd.to_datetime(series, errors="coerce")


[save] C:\Repositories\white-bowblis-nhmc\data\clean\pbj_panel_with_chow_dummies.csv rows=627,967, cols=17
   cms_certification_number      month agreement change_month  treat_post  \
0                    015009 2017-01-01   match_0          NaT           0   
1                    015009 2017-02-01   match_0          NaT           0   
2                    015009 2017-03-01   match_0          NaT           0   
3                    015009 2017-10-01   match_0          NaT           0   
4                    015009 2017-11-01   match_0          NaT           0   
5                    015009 2017-12-01   match_0          NaT           0   
6                    015009 2018-01-01   match_0          NaT           0   
7                    015009 2018-02-01   match_0          NaT           0   
8                    015009 2018-03-01   match_0          NaT           0   
9                    015009 2018-04-01   match_0          NaT           0   
10                   015009 2018-05-01   match