In [2]:
# ==============================================================================
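# Prototype CHOW detection (no data-file writes)
#   - Loads: data/interim/ownership_ccn_month_chow_ready_v2.csv
#   - Works entirely in-memory on `df`; the only filesystem side effect is
#     creating the raw/interim data directories when they are missing
#   - Outputs: prints transition sample + CCN summary sample + count tables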
# ==============================================================================

import json, math
import os
import numpy as np
import pandas as pd
from pathlib import Path

# ---------- Paths & load ----------
# Walk upward from the working directory until a folder containing `data/`
# is found (so the cell works from the repo root or any sub-folder of it).
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

# The loop also stops at the filesystem root. Fail fast in that case instead
# of letting the mkdir calls below try to create `data/...` at the root.
if not (PROJECT_ROOT / "data").is_dir():
    raise FileNotFoundError(
        f"Could not locate a 'data' directory in {Path.cwd()} or any of its parents"
    )

# NH_DATA_DIR, when set, overrides the default raw-data location.
RAW_DIR     = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
OWN_DIR     = RAW_DIR / "ownership-files"
INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
OWN_DIR.mkdir(parents=True, exist_ok=True)
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

PANEL_FP = INTERIM_DIR / "ownership_ccn_month_chow_ready_v2.csv"
print(f"[load] {PANEL_FP}")
# low_memory=False: read the file in one pass rather than in dtype-inferring chunks.
df = pd.read_csv(PANEL_FP, low_memory=False)

# ---------- Prep ----------
# Coerce `month_ts` to datetime; values that cannot be parsed become NaT.
df["month_ts"] = df["month_ts"].pipe(pd.to_datetime, errors="coerce")

# Parse JSON list columns (in-memory only)
def _parse(x):
    if isinstance(x, list):
        return x
    try:
        return json.loads(x) if isinstance(x, str) else []
    except Exception:
        return []

# Decode every JSON-list column that is actually present in the panel.
list_cols = [
    name
    for name in ["name_norms","ownership_percentages","association_dates","owner_names","owner_types","roles"]
    if name in df.columns
]
for col in list_cols:
    df[col] = df[col].apply(_parse)

# ---------- Turnover helpers ----------
def pct_turnover(prev_map: dict, next_map: dict, min_base: float = 100.0) -> float:
    """Percent-of-ownership turnover between two owner -> share maps.

    Computed as ``100 * (1 - overlap / base)`` where ``overlap`` is the sum,
    over every owner name in either map, of the smaller of its two shares
    (a name missing from one map counts as 0 there), and ``base`` is the
    largest of: total previous share, total next share, `min_base`.

    Parameters
    ----------
    prev_map, next_map : dict
        Owner name -> ownership percentage. ``None`` and NaN shares are
        treated as 0 so a single missing value cannot turn the whole
        result into NaN.
    min_base : float, default 100.0
        Lower bound on the normalizing total. With the default, maps whose
        totals are below 100 are still normalized against 100, so two
        identical maps that sum to less than 100 yield a non-zero turnover.
        Pass ``0.0`` to normalize by the reported totals only.

    Returns
    -------
    float
        Turnover in [0, 100]; 0.0 when the normalizing base is not positive.
    """
    def _share(value) -> float:
        # None / 0 / NaN all mean "no usable share".
        share = float(value or 0)
        return 0.0 if math.isnan(share) else share

    prev = {name: _share(value) for name, value in prev_map.items()}
    nxt = {name: _share(value) for name, value in next_map.items()}

    overlap = sum(
        min(prev.get(name, 0.0), nxt.get(name, 0.0))
        for name in set(prev) | set(nxt)
    )
    base = max(sum(prev.values()), sum(nxt.values()), min_base)
    if base <= 0:
        return 0.0
    return float(np.clip(100.0 * (1.0 - overlap / base), 0.0, 100.0))

def names_turnover(prev_set: set, next_set: set) -> float:
    """Name-based turnover: 100 * (1 - |intersection| / |union|).

    Returns 0.0 when both sets are empty; otherwise a value in [0, 100]
    where 0 means identical owner names and 100 means no name in common.
    """
    combined = prev_set | next_set
    if len(combined) == 0:
        return 0.0
    shared = prev_set & next_set
    similarity = len(shared) / len(combined)
    return float(np.clip(100.0 * (1 - similarity), 0.0, 100.0))

def bucket_code(turnover_pct: float, inconclusive: bool) -> int:
    """Map a turnover percentage to a coarse bucket code.

    0 = no change (<50%)
    1 = 90–100%
    2 = 80–90%
    3 = 70–80%
    4 = 60–70%
    5 = 50–60%
    6 = inconclusive

    Parameters
    ----------
    turnover_pct : float
        Turnover in percent.
    inconclusive : bool
        When true the transition is bucketed as 6 regardless of the value.

    Returns
    -------
    int
        Bucket code 0..6. A NaN `turnover_pct` is also reported as 6:
        NaN fails every comparison, so it would otherwise fall through to
        bucket 5 and be counted as a 50–60% change.
    """
    if inconclusive or math.isnan(turnover_pct):
        return 6
    if turnover_pct < 50:
        return 0
    # Highest band first; the first lower bound met decides the bucket.
    for code, lower_bound in ((1, 90), (2, 80), (3, 70), (4, 60)):
        if turnover_pct >= lower_bound:
            return code
    return 5

# ---------- Build transitions (no writes) ----------
THRESH = 50.0  # CHOW threshold (turnover percent) for the prototype

# Output schema, declared up front so the frame keeps every column (and the
# sort below still works) even when no month-to-month transition exists.
TRANSITION_COLS = [
    "cms_certification_number",
    "from_month",
    "to_month",
    "used_level_from",
    "used_level_to",
    "method",
    "inconclusive",
    "turnover_pct",
    "bucket_code",
    "is_chow",
]


def _owner_shares(names, pcts):
    """Return ``(share_map, name_set)`` for one CCN-month.

    `names` and `pcts` are treated as position-aligned lists. Entries whose
    name is None are dropped as (name, share) PAIRS, so the remaining names
    keep their own percentages. Filtering the names alone before zipping
    would shift every later share onto the wrong owner.
    A None share is kept as None; other shares are converted to float.
    """
    name_set = {str(name) for name in names if name is not None}
    share_map = {
        str(name): (None if pct is None else float(pct))
        for name, pct in zip(names, pcts)
        if name is not None
    }
    return share_map, name_set


transitions = []
for ccn, g in df.groupby("cms_certification_number", sort=True):
    # One plain dict per month, in chronological order; far cheaper than
    # building a Series with `iloc` for every row.
    months = g.sort_values("month_ts").to_dict("records")

    # Compare each month with the next available one for the same CCN.
    for a, b in zip(months, months[1:]):
        prev_map, prev_set = _owner_shares(a["name_norms"], a["ownership_percentages"])
        next_map, next_set = _owner_shares(b["name_norms"], b["ownership_percentages"])

        # Missing tag/total columns default to "complete" / 0 respectively.
        prev_tag = str(a.get("pct_fill_tag", "none_null"))
        next_tag = str(b.get("pct_fill_tag", "none_null"))

        total_prev = float(a.get("total_pct", 0) or 0)
        total_next = float(b.get("total_pct", 0) or 0)

        # Inconclusive: either month is tagged "some_null"
        # (kept as its own flag for transparency).
        inconclusive = (prev_tag == "some_null") or (next_tag == "some_null")

        # Method selection:
        # - Use PERCENT if both months are tagged "none_null" AND totals > 0
        # - Else fall back to NAMES
        use_percent = (
            prev_tag == "none_null"
            and next_tag == "none_null"
            and total_prev > 0
            and total_next > 0
        )
        if use_percent:
            method = 0
            turnover = pct_turnover(prev_map, next_map)
        else:
            method = 1
            turnover = names_turnover(prev_set, next_set)

        transitions.append({
            "cms_certification_number": ccn,
            "from_month": a["month_ts"],
            "to_month": b["month_ts"],
            "used_level_from": a["used_level"],
            "used_level_to": b["used_level"],
            "method": method,  # 0=percent, 1=names
            "inconclusive": inconclusive,
            "turnover_pct": round(turnover, 4),
            "bucket_code": bucket_code(turnover, inconclusive),
            # An inconclusive transition is never counted as a CHOW.
            "is_chow": bool((turnover >= THRESH) and not inconclusive),
        })

trans_df = pd.DataFrame(transitions, columns=TRANSITION_COLS).sort_values(
    ["cms_certification_number", "from_month"]
).reset_index(drop=True)

# ---------- Per-CCN summary (still no writes) ----------
def _pad(lst, n, fill=None):
    return (lst + [fill]*n)[:n]

MAX_EVENTS = 6  # number of CHOW event slots per CCN (earliest events first)

# Explicit schema: keeps the column order stable and lets the frame (and the
# sort below) survive the case where there are no CCN groups at all.
SUMMARY_COLS = (
    ["cms_certification_number", "num_chows"]
    + [f"cnt_bucket{code}" for code in range(6)]
    + ["cnt_bucket6_inconcl"]
    + [
        f"chow_{field}_{k}"
        for k in range(1, MAX_EVENTS + 1)
        for field in ("date", "magnitude", "method", "inconcl")
    ]
)

summary_rows = []
for ccn, g in trans_df.groupby("cms_certification_number", sort=True):
    g = g.sort_values("to_month")
    chow = g.loc[g["is_chow"]]

    # Pad/truncate each event list once per CCN instead of once per slot.
    dates = _pad(chow["to_month"].dt.date.astype(str).tolist(), MAX_EVENTS, None)
    mags  = _pad(chow["turnover_pct"].tolist(), MAX_EVENTS, None)
    meths = _pad(chow["method"].tolist(), MAX_EVENTS, None)
    incon = _pad(chow["inconclusive"].tolist(), MAX_EVENTS, None)

    # Bucket counts cover ALL transitions of the CCN, not only the CHOWs.
    buckets = g["bucket_code"].value_counts().to_dict()

    out = {
        "cms_certification_number": ccn,
        # Total CHOW count; may exceed the MAX_EVENTS slots listed below.
        "num_chows": len(chow),
        "cnt_bucket0": buckets.get(0, 0),
        "cnt_bucket1": buckets.get(1, 0),
        "cnt_bucket2": buckets.get(2, 0),
        "cnt_bucket3": buckets.get(3, 0),
        "cnt_bucket4": buckets.get(4, 0),
        "cnt_bucket5": buckets.get(5, 0),
        "cnt_bucket6_inconcl": buckets.get(6, 0),
    }
    # Unused event slots are filled with None.
    for k, (date, mag, meth, inc) in enumerate(zip(dates, mags, meths, incon), start=1):
        out[f"chow_date_{k}"]      = date
        out[f"chow_magnitude_{k}"] = mag
        out[f"chow_method_{k}"]    = meth   # 0=percent,1=names
        out[f"chow_inconcl_{k}"]   = inc
    summary_rows.append(out)

summary_ccn = (
    pd.DataFrame(summary_rows, columns=SUMMARY_COLS)
    .sort_values("cms_certification_number")
    .reset_index(drop=True)
)

# ---------- Peeks ----------
# First ten rows of each result frame, printed without the index column.
for title, frame in (
    ("\n=== Transitions sample ===", trans_df),
    ("\n=== CHOW summary sample ===", summary_ccn),
):
    print(title)
    print(frame.head(10).to_string(index=False))

# Overall totals.
n_transitions = len(trans_df)
n_chows = int(trans_df["is_chow"].sum())
print("\nCounts:")
print("Transitions:", n_transitions, "| CHOWs:", n_chows)

# Frequency tables, ordered by code value.
for title, column in (
    ("\nMethod counts (0=percent,1=names):", "method"),
    ("\nBucket distribution (0..6):", "bucket_code"),
):
    print(title)
    print(trans_df[column].value_counts().sort_index())

[load] C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_month_chow_ready_v2.csv

=== Transitions sample ===
 cms_certification_number from_month   to_month used_level_from used_level_to  method  inconclusive  turnover_pct  bucket_code  is_chow
                     5125 2018-10-01 2018-11-01          DIRECT        DIRECT       0         False           0.0            0    False
                     5125 2018-11-01 2018-12-01          DIRECT        DIRECT       0         False           0.0            0    False
                     5125 2018-12-01 2019-01-01          DIRECT        DIRECT       0         False           0.0            0    False
                     5125 2019-01-01 2019-02-01          DIRECT        DIRECT       0         False           0.0            0    False
                     5125 2019-02-01 2019-03-01          DIRECT        DIRECT       0         False           0.0            0    False
                     5125 2019-03-01 2019-04-01          DIRECT