In [2]:
# ──────────────────────────────────────────────────────────────────────────────
# Facility Signatures Builder (Association-Date; Indirect→Direct→Partnership)
# With hospital-based filter: drop CCNs where provider_resides_in_hospital == True
# ──────────────────────────────────────────────────────────────────────────────

import os, re, json, pathlib
import pandas as pd
import numpy as np

# -------------------------- Config / Paths -----------------------------------
ENV_DIR      = os.getenv("NH_DATA_DIR", r"C:\Users\Owner\OneDrive\NursingHomeData")
OWN_DIR      = pathlib.Path(ENV_DIR) / "ownership-files"
INPUT_FP     = OWN_DIR / "ownership_combined.csv"

# hospital-based CCNs file (built earlier)
PROV_DIR     = pathlib.Path(r"C:\Users\Owner\OneDrive\NursingHomeData\provider-info-files")
HOSP_FP      = PROV_DIR / "provider_resides_in_hospital_by_ccn.csv"

PROJECT_ROOT = pathlib.Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent
if not (PROJECT_ROOT / "data").is_dir():
    PROJECT_ROOT = OWN_DIR.parent

INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

OUT_LONG = INTERIM_DIR / "facility_signatures_long.csv"
OUT_WIDE = INTERIM_DIR / "facility_signatures_wide_preview.csv"

# -------------------------- Tunables -----------------------------------------
LEVEL_PRIORITY   = ["indirect", "direct", "partnership"]
ROUND_PCT        = 1
TURNOVER_THRESH  = 0.50
WIDE_MAX_GROUPS  = 8

# -------------------------- Helpers ------------------------------------------
SUFFIXES = r'\b(INC|INCORPORATED|CORP|CORPORATION|LLC|L\.L\.C\.|L\.P\.|LP|LLP|PLC|CO|COMPANY|HOLDINGS?|PARTNERS?|PARTNERSHIP|CAPITAL|INVESTMENTS?|TRUST|GROUP)\b'

def clean_owner_name(s: str) -> str:
    if pd.isna(s) or not str(s).strip():
        return ""
    x = str(s).upper()
    x = re.sub(r"[.,&/()\-']", " ", x)
    x = re.sub(SUFFIXES, "", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x

def level_bucket(role_val: str) -> str:
    s = str(role_val).lower()
    if "indirect" in s:  return "indirect"
    if "direct"   in s:  return "direct"
    if "partner"  in s:  return "partnership"
    return ""

def normalize_weights_allow_missing(df_block: pd.DataFrame) -> pd.DataFrame:
    g = df_block.copy()
    g["ownership_percentage"] = pd.to_numeric(g["ownership_percentage"], errors="coerce")

    agg_num = (
        g.groupby("owner_name_norm", as_index=False)["ownership_percentage"]
         .sum(min_count=1)
    )

    has_numeric   = agg_num["ownership_percentage"].notna().any()
    total_numeric = agg_num["ownership_percentage"].fillna(0).sum()

    if has_numeric and total_numeric > 0:
        vec = agg_num[agg_num["ownership_percentage"].notna()].copy()
        vec["ownership_percentage"] = vec["ownership_percentage"] * (100.0 / total_numeric)
    else:
        owners = agg_num["owner_name_norm"].tolist()
        if not owners:
            return pd.DataFrame(columns=["owner_name_norm","ownership_percentage"])
        equal = 100.0 / len(owners)
        vec = pd.DataFrame({"owner_name_norm": owners,
                            "ownership_percentage": [equal]*len(owners)})

    vec["ownership_percentage"] = vec["ownership_percentage"].round(ROUND_PCT)
    tot2 = vec["ownership_percentage"].sum()
    if tot2 > 0:
        vec["ownership_percentage"] = (vec["ownership_percentage"] * (100.0 / tot2)).round(ROUND_PCT)

    vec = vec[vec["ownership_percentage"] > 0].copy()
    if vec.empty:
        owners = agg_num["owner_name_norm"].tolist()
        equal = 100.0 / len(owners)
        vec = pd.DataFrame({"owner_name_norm": owners,
                            "ownership_percentage": [round(equal, ROUND_PCT)]*len(owners)})
    return vec.sort_values(["ownership_percentage","owner_name_norm"], ascending=[False, True]).reset_index(drop=True)

def pct_overlap(prev_map: dict, curr_map: dict) -> float:
    names = set(prev_map) | set(curr_map)
    overlap = 0.0
    for n in names:
        overlap += min(prev_map.get(n, 0.0), curr_map.get(n, 0.0))
    return max(0.0, min(overlap / 100.0, 1.0))

# -------------------------- Load & Prepare -----------------------------------
print("[load]", INPUT_FP)
df = pd.read_csv(INPUT_FP, low_memory=False)

needed = {"cms_certification_number", "role", "owner_name", "ownership_percentage", "association_date"}
missing = needed - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Normalize CCN & dates
df["cms_certification_number"] = df["cms_certification_number"].astype(str).str.extract(r"(\d+)")[0].str.zfill(6)
df["association_date"] = pd.to_datetime(df["association_date"], errors="coerce")

# Keep rows w/ valid date (we do NOT drop for missing ownership %)
df = df.dropna(subset=["association_date"]).copy()

# -------- NEW: load hospital-based CCNs and filter them out --------
if HOSP_FP.exists():
    hosp = pd.read_csv(HOSP_FP, dtype={"cms_certification_number":"string"})
    if "provider_resides_in_hospital" not in hosp.columns:
        raise ValueError(f"{HOSP_FP} missing 'provider_resides_in_hospital' column")
    hosp["cms_certification_number"] = hosp["cms_certification_number"].astype(str).str.extract(r"(\d+)")[0].str.zfill(6)

    # Normalize boolean (accept strings)
    def to_bool(x):
        if pd.isna(x): return pd.NA
        s = str(x).strip().lower()
        if s in {"1","y","yes","true","t"}:  return True
        if s in {"0","n","no","false","f"}:  return False
        return pd.NA
    hosp["provider_resides_in_hospital"] = hosp["provider_resides_in_hospital"].map(to_bool)

    drop_ccns = set(hosp.loc[hosp["provider_resides_in_hospital"] == True, "cms_certification_number"])
    before_fac = df["cms_certification_number"].nunique()
    df = df[~df["cms_certification_number"].isin(drop_ccns)].copy()
    after_fac  = df["cms_certification_number"].nunique()
    print(f"[hospital filter] dropped {before_fac - after_fac} facilities (flagged True) — kept {after_fac}")
else:
    print(f"[hospital filter] WARNING: {HOSP_FP} not found — no CCNs filtered")

# Normalize owner names and levels
df["owner_name_norm"] = df["owner_name"].map(clean_owner_name)
df["level"] = df["role"].map(level_bucket)

# -------------------------- Build Snapshots ----------------------------------
snapshots = []
for (ccn, adate), g in df.groupby(["cms_certification_number","association_date"], sort=True):
    chosen = None
    for lvl in LEVEL_PRIORITY:
        gl = g[g["level"] == lvl]
        if len(gl):
            chosen = (lvl, gl)
            break
    if chosen is None:
        continue

    lvl, gl = chosen
    vec = normalize_weights_allow_missing(gl[["owner_name_norm","ownership_percentage"]].copy())
    if vec.empty:
        owners = gl["owner_name_norm"].dropna().unique().tolist()
        if not owners:
            continue
        equal = 100.0 / len(owners)
        vec = pd.DataFrame({"owner_name_norm": owners,
                            "ownership_percentage": [round(equal, ROUND_PCT)]*len(owners)})

    weight_map = dict(zip(vec["owner_name_norm"], vec["ownership_percentage"].astype(float)))
    snapshots.append({
        "cms_certification_number": ccn,
        "association_date": adate,
        "source_level": lvl,
        "weights": weight_map
    })

snapshots_df = pd.DataFrame(snapshots).sort_values(["cms_certification_number","association_date"]).reset_index(drop=True)
print("[snapshots built] rows:", len(snapshots_df))

# -------------------------- Grouping into Stable Regimes ---------------------
long_rows = []
for ccn, g in snapshots_df.groupby("cms_certification_number", sort=True):
    g = g.sort_values("association_date").reset_index(drop=True)
    if g.empty: continue

    group_n     = 1
    group_start = g.loc[0, "association_date"]
    group_level = g.loc[0, "source_level"]
    prev_weights = g.loc[0, "weights"]

    def hhi_from_map(wm: dict) -> float:
        return round(sum((p/100.0)**2 for p in wm.values()), 4)

    long_rows.append({
        "cms_certification_number": ccn,
        "group_n": group_n,
        "start": group_start,
        "end": pd.NaT,
        "source_level": group_level,
        "names_list": json.dumps(list(prev_weights.keys()), separators=(",", ":")),
        "pcts_list": json.dumps(list(prev_weights.values()), separators=(",", ":")),
        "owner_count": len(prev_weights),
        "hhi": hhi_from_map(prev_weights),
    })

    for i in range(1, len(g)):
        curr_weights = g.loc[i, "weights"]
        ov = pct_overlap(prev_weights, curr_weights)
        turnover = 1.0 - ov

        if turnover >= TURNOVER_THRESH:
            long_rows[-1]["end"] = g.loc[i-1, "association_date"]
            group_n += 1
            group_start = g.loc[i, "association_date"]
            group_level = g.loc[i, "source_level"]
            long_rows.append({
                "cms_certification_number": ccn,
                "group_n": group_n,
                "start": group_start,
                "end": pd.NaT,
                "source_level": group_level,
                "names_list": json.dumps(list(curr_weights.keys()), separators=(",", ":")),
                "pcts_list": json.dumps(list(curr_weights.values()), separators=(",", ":")),
                "owner_count": len(curr_weights),
                "hhi": hhi_from_map(curr_weights),
            })
            prev_weights = curr_weights
        else:
            prev_weights = curr_weights

    long_rows[-1]["end"] = g.loc[len(g)-1, "association_date"]

long_df = pd.DataFrame(long_rows).sort_values(["cms_certification_number","group_n"]).reset_index(drop=True)

# -------------------------- Wide Preview (QC only) ---------------------------
def as_label(names_json, pcts_json, k=12):
    names = json.loads(names_json)
    pcts  = json.loads(pcts_json)
    pairs = [f"{n} ({round(p, ROUND_PCT)}%)" for n, p in zip(names, pcts)]
    return "; ".join(pairs[:k])

if not long_df.empty:
    wide_blocks = []
    for ccn, g in long_df.groupby("cms_certification_number"):
        g = g.sort_values("group_n")
        row = {"cms_certification_number": ccn}
        for _, r in g.head(WIDE_MAX_GROUPS).iterrows():
            n = int(r["group_n"])
            row[f"group{n}_start"] = pd.to_datetime(r["start"]).date()
            row[f"group{n}_end"]   = pd.to_datetime(r["end"]).date()
            row[f"group{n}_level"] = r["source_level"]
            row[f"group{n}_names"] = as_label(r["names_list"], r["pcts_list"])
            row[f"group{n}_pcts"]  = ",".join(map(lambda x: str(int(round(x,0))), json.loads(r["pcts_list"])))
        wide_blocks.append(row)
    wide_df = pd.DataFrame(wide_blocks).sort_values("cms_certification_number").reset_index(drop=True)
else:
    wide_df = pd.DataFrame(columns=["cms_certification_number"])

# -------------------------- Save --------------------------------------------
long_df.to_csv(OUT_LONG, index=False)
wide_df.to_csv(OUT_WIDE, index=False)

print("[save] long :", OUT_LONG,  " rows=", len(long_df))
print("[save] wide :", OUT_WIDE,  " rows=", len(wide_df))

# -------------------------- Diagnostics --------------------------------------
all_ccns_raw = set(pd.read_csv(INPUT_FP, usecols=["cms_certification_number"])["cms_certification_number"]
                     .astype(str).str.extract(r"(\d+)")[0].str.zfill(6).unique())
kept_ccns    = set(long_df["cms_certification_number"].unique())
print("[diagnostic] total CCNs in raw:", len(all_ccns_raw))
print("[diagnostic] CCNs after hospital filter:", len(kept_ccns))
print("[diagnostic] CCNs removed by hospital filter:", len(all_ccns_raw - kept_ccns))

[load] C:\Users\Owner\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv
[hospital filter] dropped 656 facilities (flagged True) — kept 13419
[snapshots built] rows: 39643
[save] long : C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_long.csv  rows= 37374
[save] wide : C:\Repositories\white-bowblis-nhmc\data\interim\facility_signatures_wide_preview.csv  rows= 13419
[diagnostic] total CCNs in raw: 14075
[diagnostic] CCNs after hospital filter: 13419
[diagnostic] CCNs removed by hospital filter: 656
