In [2]:
# =============================================================================
# CHOW Calculation on Compact Dataset (Highest-Level Ownership + Fallback)
# =============================================================================
import os, re, json
from pathlib import Path

import numpy as np
import pandas as pd

# -----------------------------------------------------------------------------
# 1) Paths / Config
# -----------------------------------------------------------------------------
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

COMPACT_PATH = INTERIM_DIR / "ownership_ccn_month_compact.csv"
OUT_PATH     = INTERIM_DIR / "ccn_month_chow_flags.csv"

# --- CHOW parameters (tune for robustness checks) ---
THRESH = 50.0                         # main threshold (try 50 / 75 / 100)
CUTOFF_DATE = pd.Timestamp("2017-01-01")

# Names-only fallback when both sides have all-null %s:
NAME_OVERLAP_MAX     = 0.25           # surname Jaccard ≤ this → proxy turnover
NAME_HEURISTIC_PROXY = 75.0           # proxy turnover used in fallback

# Mark the very first observed month (post-cutoff) as “new snapshot” (optional)
FLAG_FIRST_OBS_AFTER_CUTOFF = True

print(f"[paths] INTERIM_DIR={INTERIM_DIR}")
print(f"[load] {COMPACT_PATH}")

# -----------------------------------------------------------------------------
# 2) Load + Parse
#   IMPORTANT: don't force dtype=str globally, or parse_dates gets clobbered.
# -----------------------------------------------------------------------------
df = pd.read_csv(COMPACT_PATH, low_memory=False)
print(f"[loaded] rows={len(df):,}, cols={df.shape[1]}")

# Ensure month_ts is datetime (this fixes your TypeError)
df["month_ts"] = pd.to_datetime(df["month_ts"], errors="coerce")

# Safe JSON -> list parser
def parse_json_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        s = x.strip()
        if s.startswith("[") and s.endswith("]"):
            try:
                return json.loads(s)
            except Exception:
                return []
    return []

# Ensure list columns are lists and normalize element types
list_cols = ["roles","owner_types","owner_names","ownership_percentages","association_dates"]
for c in list_cols:
    if c in df.columns:
        df[c] = df[c].apply(parse_json_list)
    else:
        df[c] = [[] for _ in range(len(df))]

# Percentages to numeric (keep NaN for blanks)
def to_nums(lst):
    out = []
    for v in lst:
        if v in (None, "", "null"):
            out.append(np.nan)
        else:
            out.append(pd.to_numeric(v, errors="coerce"))
    return out
df["ownership_percentages"] = df["ownership_percentages"].apply(to_nums)

# CCN normalized as zero-padded 6-char string (keeps sorting consistent)
df["cms_certification_number"] = (
    df["cms_certification_number"]
      .astype(str)
      .str.replace(r"\D", "", regex=True)
      .str.zfill(6)
)

# Processing date: keep as string for reporting; no math on it
if "processing_date" in df.columns:
    # If it already looks like dates, you can pretty-format; else leave as-is
    pd_proc = pd.to_datetime(df["processing_date"], errors="coerce")
    df["processing_date"] = pd_proc.dt.date.astype(str).where(pd_proc.notna(), df["processing_date"].astype(str))

# Sanity: drop rows with missing month_ts
bef = len(df)
df = df[df["month_ts"].notna()].copy()
if len(df) != bef:
    print(f"[note] dropped {bef - len(df):,} rows with non-parsable month_ts")

# -----------------------------------------------------------------------------
# 3) Highest-level selection per CCN-month
#     Prefer INDIRECT if present, else DIRECT, else (fallback) all (e.g., PARTNERSHIP only)
# -----------------------------------------------------------------------------
LEVEL_ORDER = ["INDIRECT", "DIRECT"]  # if neither present, we'll keep whatever exists

def pick_highest_level(roles, *parallel_lists):
    """Return mask of selected indices and the selected level label."""
    roles_up = [str(r or "").upper() for r in roles]
    for lvl in LEVEL_ORDER:
        idxs = [i for i, r in enumerate(roles_up) if r == lvl]
        if idxs:
            mask = np.array([i in set(idxs) for i in range(len(roles_up))], dtype=bool)
            return mask, lvl
    # fallback: keep everything (e.g., only PARTNERSHIP rows exist)
    return np.ones(len(roles_up), dtype=bool), "FALLBACK"

# -----------------------------------------------------------------------------
# 4) Owner name normalization + surname extraction for name-based fallback
# -----------------------------------------------------------------------------
CORP_SUFFIX_RE  = re.compile(r"\b(CORP(ORATION)?|INC(ORPORATED)?|LLC|L\.L\.C\.|LP|L\.P\.|CO|COMPANY|HOLDINGS?|TRUST|PARTNERS(HIP)?)\b\.?", re.I)
PUNCT_RE        = re.compile(r"[\.&,]")

def norm_owner_name(name: str) -> str:
    s = str(name or "").upper().strip()
    s = PUNCT_RE.sub(" ", s)
    s = CORP_SUFFIX_RE.sub("", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def surname_token(n: str) -> str:
    """Get a robust surname-ish token (before comma if present; else last word)."""
    s = str(n or "").strip()
    if not s:
        return ""
    if "," in s:
        tok = s.split(",")[0]
    else:
        parts = s.split()
        tok = parts[-1] if parts else ""
    return tok.upper()

def surname_set(owner_names):
    out = set()
    for n in owner_names:
        tok = surname_token(n)
        if tok:
            out.add(tok)
    return out

def jaccard(a: set, b: set) -> float:
    if not a and not b:
        return np.nan
    union = a | b
    if not union:
        return np.nan
    return len(a & b) / len(union)

# -----------------------------------------------------------------------------
# 5) Turnover engine between adjacent months for each CCN
# -----------------------------------------------------------------------------
def build_pct_dict(roles, names, pcts):
    """
    Build (normalized owner name -> percentage) for selected indices only.
    """
    mask, used_level = pick_highest_level(roles, names, pcts)
    if len(roles) == 0:
        return {}, used_level
    names_sel = [norm_owner_name(n) for i, n in enumerate(names) if mask[i]]
    pcts_sel  = [pcts[i] if mask[i] else np.nan for i in range(len(pcts))]
    pcts_sel  = [p for i, p in enumerate(pcts_sel) if mask[i]]
    d = {}
    for o, p in zip(names_sel, pcts_sel):
        if pd.notna(p):
            d[o] = d.get(o, 0.0) + float(p)
    return d, used_level

def pct_turnover(prev_row, curr_row):
    """
    Percentage-based turnover:
    - choose highest level in each row
    - union-of-owners, missing -> 0
    """
    prev_d, lvl_prev = build_pct_dict(prev_row.roles, prev_row.owner_names, prev_row.ownership_percentages)
    curr_d, lvl_curr = build_pct_dict(curr_row.roles, curr_row.owner_names, curr_row.ownership_percentages)
    used_level = (lvl_prev if lvl_prev != "FALLBACK" else lvl_curr)

    has_prev_pct = len(prev_d) > 0
    has_curr_pct = len(curr_d) > 0

    if not has_prev_pct and not has_curr_pct:
        return np.nan, used_level  # will need names-only fallback

    names_union = set(prev_d) | set(curr_d)
    diff = sum(abs(curr_d.get(o, 0.0) - prev_d.get(o, 0.0)) for o in names_union)
    return diff / 2.0, used_level

def names_only_proxy(prev_row, curr_row):
    """
    Names-only fallback when BOTH sides have all-null percentages.
    We compare surname sets (using highest-level selection mask to pick rows).
    """
    prev_mask, _ = pick_highest_level(prev_row.roles, prev_row.owner_names, prev_row.ownership_percentages)
    curr_mask, _ = pick_highest_level(curr_row.roles, curr_row.owner_names, curr_row.ownership_percentages)

    prev_names = [prev_row.owner_names[i] for i in range(len(prev_row.owner_names)) if prev_mask[i]]
    curr_names = [curr_row.owner_names[i] for i in range(len(curr_row.owner_names)) if curr_mask[i]]

    prev_surn = surname_set(prev_names)
    curr_surn = surname_set(curr_names)
    over = jaccard(prev_surn, curr_surn)

    if pd.isna(over):
        return np.nan, over, "names_fallback_no_tokens"

    if over <= NAME_OVERLAP_MAX:
        return NAME_HEURISTIC_PROXY, over, "names_fallback_low_overlap"
    else:
        return np.nan, over, "names_fallback_high_overlap"

# Compute turnover per CCN across months
df = df.sort_values(["cms_certification_number", "month_ts"]).reset_index(drop=True)

# Prepare output records
records = []

for ccn, grp in df.groupby("cms_certification_number", sort=False):
    grp = grp.sort_values("month_ts").reset_index(drop=True)
    # Optionally flag the first observed month post-cutoff
    first_flagged = False

    for i in range(len(grp)):
        row = grp.loc[i]
        # make absolutely sure month is a Timestamp
        month = pd.to_datetime(row["month_ts"], errors="coerce")

        rec = {
            "cms_certification_number": ccn,
            "month_ts": month,
            "processing_date": row.get("processing_date", ""),
            "used_level": None,
            "pct_fill_tag": None,        # 'none_null'|'some_null'|'all_null'
            "turnover": np.nan,          # numeric % turnover (if computed)
            "surname_overlap": np.nan,   # for names fallback
            "eff_turnover": np.nan,      # final turnover used for CHOW rule
            "chow_reason": "",           # 'percent' | 'names' | 'first_seen'
            "is_chow": False
        }

        # Tag percent availability for the current month (for diagnostics)
        pcts = row["ownership_percentages"]
        if len(pcts) == 0:
            rec["pct_fill_tag"] = "all_null"
        else:
            valid = [pd.notna(x) for x in pcts]
            if all(valid):
                rec["pct_fill_tag"] = "none_null"
            elif any(valid):
                rec["pct_fill_tag"] = "some_null"
            else:
                rec["pct_fill_tag"] = "all_null"

        # First observed month (post-cutoff): optional "new snapshot" CHOW
        if FLAG_FIRST_OBS_AFTER_CUTOFF and pd.notna(month) and (month >= CUTOFF_DATE) and not first_flagged:
            rec.update({"is_chow": True, "chow_reason": "first_seen", "eff_turnover": np.nan})
            first_flagged = True

        # For i == 0, we can't compute turnover to a previous month; record and continue
        if i == 0:
            records.append(rec)
            continue

        prev = grp.loc[i-1]

        # Compute percentage-based turnover if any percentages exist on either side
        t_pct, used_lvl = pct_turnover(prev, row)
        rec["used_level"] = used_lvl
        rec["turnover"] = t_pct

        # If no pct turnover (both all-null), attempt names-only fallback
        if pd.isna(t_pct):
            proxy, overlap, reason = names_only_proxy(prev, row)
            rec["surname_overlap"] = overlap
            if pd.notna(proxy):
                rec["eff_turnover"] = proxy
                rec["chow_reason"] = "names"
            else:
                rec["eff_turnover"] = np.nan
                rec["chow_reason"] = reason
        else:
            rec["eff_turnover"] = t_pct
            rec["chow_reason"] = "percent"

        # Final CHOW rule: turnover threshold + date cutoff
        if pd.notna(month) and (month >= CUTOFF_DATE) and pd.notna(rec["eff_turnover"]) and (rec["eff_turnover"] >= THRESH):
            rec["is_chow"] = True

        records.append(rec)

# -----------------------------------------------------------------------------
# 6) Save + Diagnostics
# -----------------------------------------------------------------------------
out = pd.DataFrame.from_records(records)

# Pretty sort: by CCN, then date
out = out.sort_values(["cms_certification_number", "month_ts"]).reset_index(drop=True)

out.to_csv(OUT_PATH, index=False)
print(f"[save] {OUT_PATH}  rows={len(out):,}")

# --- quick diagnostics ---
print("\n=== CHOW summary ===")
total_flags = int(out["is_chow"].sum())
by_reason = out.loc[out["is_chow"], "chow_reason"].value_counts()
print(f"Total CHOW month-rows: {total_flags:,}")
print("By reason:")
print(by_reason.to_string())

print("\n=== Turnover availability (share %) ===")
print( (out["chow_reason"].replace("", "no_chow")
            .value_counts(normalize=True).mul(100).round(1).astype(str) + "%")
      )

print("\n=== Example flagged rows ===")
print(out.loc[out["is_chow"]].head(10).to_string(index=False))

[paths] INTERIM_DIR=C:\Repositories\white-bowblis-nhmc\data\interim
[load] C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_month_compact.csv
[loaded] rows=1,173,176, cols=9
[save] C:\Repositories\white-bowblis-nhmc\data\interim\ccn_month_chow_flags.csv  rows=1,173,176

=== CHOW summary ===
Total CHOW month-rows: 79,220
By reason:
chow_reason
percent       62902
first_seen    14075
names          2243

=== Turnover availability (share %) ===
chow_reason
percent                        60.4%
names_fallback_high_overlap    38.2%
first_seen                      1.2%
names                           0.2%
Name: proportion, dtype: object

=== Example flagged rows ===
cms_certification_number   month_ts processing_date used_level pct_fill_tag  turnover  surname_overlap  eff_turnover chow_reason  is_chow
                  005125 2018-10-01      2018-10-01       None    none_null       NaN              NaN           NaN  first_seen     True
                  005130 2020-04-01      20