In [5]:
# ==============================================================================
# CHOW on the existing compact panel (ownership_ccn_month_compact.csv)
#   - No rebuild; compute-time highest-level filter + de-dupe + turnover
# ==============================================================================

import os, re, json
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# --------------------------- Config / Paths -----------------------------------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

IN_COMPACT  = INTERIM_DIR / "ownership_ccn_month_compact.csv"
OUT_EVENTS  = INTERIM_DIR / "ccn_month_chow_events_v4.csv"

# CHOW knobs (tune these):
THRESH                = 75.0         # percent turnover threshold
COOLDOWN_MONTHS       = 3            # do not flag again within N months after a flag
REQUIRE_PCT_BOTH      = True         # only use % turnover when both months have ≥1 % present
NAMES_FALLBACK        = True         # allow names-only when both sides all-null
NAME_JACCARD_CUTOFF   = 0.25         # low overlap ⇒ treat as change in names-only fallback
CUTOFF_DATE           = pd.Timestamp("2017-01-01")  # ignore before this

ROLE_PRIORITY = {"INDIRECT": 0, "DIRECT": 1, "PARTNERSHIP": 2}
SHOW_PROGRESS = True

print(f"[paths] INTERIM_DIR={INTERIM_DIR}")
print(f"[load]  {IN_COMPACT}")

# ------------------------------- Load -----------------------------------------
usecols = [
    "cms_certification_number", "month_ts", "processing_date",
    "roles", "owner_types", "owner_names", "ownership_percentages", "association_dates"
]
df = pd.read_csv(IN_COMPACT, usecols=usecols, dtype={"cms_certification_number":"string"}, low_memory=False)
df["cms_certification_number"] = df["cms_certification_number"].astype(str).str.zfill(6)

# Parse month as datetime (robust)
df["month_ts"] = pd.to_datetime(df["month_ts"], errors="coerce")
df = df[df["month_ts"].notna()].sort_values(["cms_certification_number","month_ts"]).reset_index(drop=True)

print(f"[loaded] rows={len(df):,}, cols={df.shape[1]}")

# --------------------------- Helpers & caches ---------------------------------
CORP_SUFFIX_RE = re.compile(
    r"\b(CORP(ORATION)?|INCORPORATED|INC|LLC|L\.L\.C\.|LP|L\.P\.|LLP|CO|COMPANY|HOLDINGS?)\b\.?",
    re.I
)
PUNCT_RE = re.compile(r"[,\.&]")

def months_between(a: pd.Timestamp, b: pd.Timestamp) -> int:
    """Whole months from b → a (assumes a,b are month-start Timestamps)."""
    return (a.year - b.year) * 12 + (a.month - b.month)

def norm_owner_name(name: str) -> str:
    s = str(name).upper().strip()
    s = PUNCT_RE.sub(" ", s)
    s = CORP_SUFFIX_RE.sub("", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def surname_token(n: str) -> str:
    n = str(n).strip()
    if not n:
        return ""
    return (n.split(",")[0] if "," in n else n.split()[0]).upper()

# JSON parse with caching (strings repeat a lot → huge speed win)
_json_cache: dict[str, list] = {}
def loads_cached(s: str) -> list:
    if not isinstance(s, str) or not s or s[0] != "[":
        return []
    v = _json_cache.get(s)
    if v is None:
        try:
            v = json.loads(s)
        except Exception:
            v = []
        _json_cache[s] = v
    return v

# Which role to use this month?
_role_pick_cache: dict[str, tuple[str, list[int]]] = {}
def pick_highest_and_indices(roles_s: str) -> tuple[str|None, list[int]]:
    roles = loads_cached(roles_s)
    if not roles:
        return None, []
    key = roles_s
    hit = _role_pick_cache.get(key)
    if hit is not None:
        return hit
    # pick highest present
    uniq = sorted({str(r).upper() for r in roles if r is not None}, key=lambda r: ROLE_PRIORITY.get(r, 99))
    chosen = uniq[0] if uniq else None
    idxs = [i for i, r in enumerate(roles) if str(r).upper() == chosen] if chosen else []
    _role_pick_cache[key] = (chosen, idxs)
    return chosen, idxs

# Build a per-month snapshot dict {name_norm: pct} plus surname set & flags
def month_snapshot(row) -> dict:
    chosen, idxs = pick_highest_and_indices(row.roles)
    names = loads_cached(row.owner_names)
    pcts  = loads_cached(row.ownership_percentages)

    # project to chosen indices
    if idxs:
        names = [names[i] if i < len(names) else None for i in idxs]
        pcts  = [pcts[i]  if i  < len(pcts)  else None for i in idxs]
    else:
        names, pcts = [], []

    # normalize & de-duplicate; sum % across same normalized name
    snap = {}
    any_pct = False
    for n, p in zip(names, pcts):
        nn = norm_owner_name(n)
        # coerce pct
        val = np.nan
        if p is not None and p != "":
            try:
                val = float(p)
                any_pct = True or any_pct
            except Exception:
                val = np.nan
        snap[nn] = snap.get(nn, 0.0) + (0.0 if np.isnan(val) else val)

    # surname set for names-only fallback
    surnames = {surname_token(n) for n in snap.keys() if n}

    if not snap:  # keep empty dict but annotate flags
        return {"used_level": chosen, "pct_map": {}, "has_pct": False, "all_null": True, "surnames": surnames}

    has_pct  = any_pct
    all_null = not any_pct
    return {"used_level": chosen, "pct_map": snap, "has_pct": has_pct, "all_null": all_null, "surnames": surnames}

def pct_turnover(prev_map: dict, curr_map: dict) -> float:
    owners = set(prev_map) | set(curr_map)
    diff = 0.0
    for o in owners:
        a = prev_map.get(o, 0.0)
        b = curr_map.get(o, 0.0)
        diff += abs(float(b) - float(a))
    return diff / 2.0

def jaccard(a: set, b: set) -> float:
    if not a and not b:
        return np.nan
    inter = len(a & b)
    union = len(a | b) or 1
    return inter / union

# -------------------------- Sweep per CCN -------------------------------------
events = []
grouped = df.groupby("cms_certification_number", sort=True, group_keys=False)

iter_groups = grouped
if SHOW_PROGRESS:
    iter_groups = tqdm(grouped, total=df["cms_certification_number"].nunique(), desc="CHOW")

for ccn, g in iter_groups:
    g = g.sort_values("month_ts")
    last_flag_month = pd.Timestamp.min  # cooldown tracker

    prev = None
    for row in g.itertuples(index=False):
        month = row.month_ts
        if month < CUTOFF_DATE:
            continue

        curr = month_snapshot(row)

        # Decide event
        reason = None
        turnover = np.nan

        if prev is not None:
            # Respect cooldown
            cooldown_active = (
                (last_flag_month is not None)
                and isinstance(last_flag_month, pd.Timestamp)
                and not pd.isna(last_flag_month)
                and months_between(month, last_flag_month) < COOLDOWN_MONTHS
            )

            if cooldown_active:
                # Still advance the baseline to allow detection after cooldown
                prev = curr
                continue

            # Use % turnover if allowed
            if (not REQUIRE_PCT_BOTH) or (prev["has_pct"] and curr["has_pct"]):
                turnover = pct_turnover(prev["pct_map"], curr["pct_map"])
                if np.isfinite(turnover) and turnover >= THRESH:
                    reason = "percent"

            # Names-only fallback when both sides all-null and feature On
            if reason is None and NAMES_FALLBACK and prev["all_null"] and curr["all_null"]:
                ov = jaccard(prev["surnames"], curr["surnames"])
                if not np.isnan(ov) and ov <= NAME_JACCARD_CUTOFF:
                    reason = "names"

        if reason is not None:
            events.append({
                "cms_certification_number": ccn,
                "month_ts": month,
                "processing_date": row.processing_date,
                "used_level_prev": prev["used_level"] if prev else None,
                "used_level_curr": curr["used_level"],
                "turnover": turnover,
                "chow_reason": reason,
            })
            last_flag_month = month

        prev = curr

# ------------------------------ Save & summary --------------------------------
# Ensure dtypes
ev["month_ts"] = pd.to_datetime(ev["month_ts"], errors="coerce")
ev["processing_date"] = pd.to_datetime(ev["processing_date"], errors="coerce")

# Save
ev.to_csv(OUT_EVENTS, index=False)
print(f"[save] {OUT_EVENTS}  rows={len(ev):,}")

# Yearly summaries
if not ev.empty:
    ev["year"] = ev["month_ts"].dt.year

    by_year = ev.groupby("year").size().rename("events").to_frame()
    print("\n=== CHOW events by year ===")
    print(by_year)

    by_year_reason = (
        ev.pivot_table(index="year", columns="chow_reason", values="cms_certification_number",
                       aggfunc="count", fill_value=0)
        .sort_index()
    )
    if not by_year_reason.empty:
        print("\n=== CHOW events by year — by reason ===")
        print(by_year_reason)

    mix = ev["chow_reason"].value_counts(normalize=True).mul(100).round(1).astype(str) + "%"
    print("\n=== Overall reason mix ===")
    print(mix)

    # Peek a few rows
    print("\n=== Example flagged rows ===")
    with pd.option_context("display.max_rows", 10, "display.max_colwidth", 120):
        print(ev.head(10).to_string(index=False))
else:
    print("\n[info] No CHOW events detected with current parameters.")

[paths] INTERIM_DIR=C:\Repositories\white-bowblis-nhmc\data\interim
[load]  C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_month_compact.csv
[loaded] rows=1,173,176, cols=8


CHOW:   0%|          | 0/14075 [00:00<?, ?it/s]

[save] C:\Repositories\white-bowblis-nhmc\data\interim\ccn_month_chow_events_v4.csv  rows=124

=== CHOW events by year ===
      events
year        
2017       5
2018      66
2019      27
2020      13
2021       6
2022       4
2024       2
2025       1

=== CHOW events by year — by reason ===
chow_reason  percent
year                
2017               5
2018              66
2019              27
2020              13
2021               6
2022               4
2024               2
2025               1

=== Overall reason mix ===
chow_reason
percent    100.0%
Name: proportion, dtype: object

=== Example flagged rows ===
cms_certification_number   month_ts processing_date used_level pct_fill_tag  turnover  name_overlap chow_reason  year
                  015083 2024-10-01      2024-10-01   INDIRECT    none_null     100.0           NaN     percent  2024
                  055067 2018-09-01      2018-09-01     DIRECT    some_null     100.0           NaN     percent  2018
                  0563