In [6]:
# ==============================================================================
# Ownership compact (CCN × month) — percent-availability profiling
#   - Load from data/interim
#   - Parse month_ts (robust, minimal warnings)
#   - Convert list-like columns from JSON strings → Python lists
#   - Coerce % lists to numeric
#   - Tag rows by percent availability: none_null / some_null / all_null
#   - Print overall snapshot and by-year breakdown
# ==============================================================================

import os, re, json, warnings
import numpy as np
import pandas as pd
from pathlib import Path

# --------------------------- Paths / load -------------------------------------
# Walk upward from the working directory until a folder containing "data" is
# found (or the filesystem root is reached).
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "data").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

# The walk ends at the filesystem root when nothing matched. Fail here rather
# than letting the mkdir below create a stray data/interim tree at the root.
if not (PROJECT_ROOT / "data").is_dir():
    raise FileNotFoundError(
        f"Cannot find a 'data' directory in {Path.cwd()} or any of its parents"
    )

INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

IN_CSV = INTERIM_DIR / "ownership_ccn_month_compact.csv"
print(f"[paths] INTERIM_DIR={INTERIM_DIR}")
print(f"[load] {IN_CSV}")

if not IN_CSV.exists():
    raise FileNotFoundError(f"Cannot find {IN_CSV}")

# Load every column as a string; the columns that need real types are parsed
# explicitly further down.
df = pd.read_csv(IN_CSV, dtype=str, low_memory=False)
print(f"[loaded] rows={len(df):,}, cols={df.shape[1]}")

# --------------------------- Helpers ------------------------------------------
# Columns that are converted from JSON strings to Python lists after loading
LIST_COLS = [
    "roles",
    "owner_types",
    "owner_names",
    "ownership_percentages",
    "association_dates",
]

# Mute only the pandas "Could not infer format" warning; other warnings still show
warnings.filterwarnings(action="ignore", message=r"Could not infer format.*")

def to_list_safe(x):
    """Return *x* as a Python list, decoding JSON-array text when needed.

    Args:
        x: A cell value. Lists pass through unchanged; strings that, after
            trimming whitespace, start with ``[`` and end with ``]`` are
            decoded as JSON.

    Returns:
        The list itself, the decoded list, or ``[]`` for anything else
        (non-string values, strings that are not bracketed, malformed JSON).
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []

    text = x.strip()
    if not (text.startswith("[") and text.endswith("]")):
        return []

    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically deep nesting. Any other exception is a real bug and
        # is allowed to surface instead of being turned into an empty list.
        return []

def coerce_pct_list(lst):
    """Coerce every entry of *lst* to float, using NaN for unusable entries.

    Args:
        lst: A list of raw values. Anything that is not a list is treated as
            an empty list.

    Returns:
        A new list of the same length as *lst* where each entry is
        ``float(entry)`` or ``np.nan`` when the conversion fails (empty
        string, ``None``, non-numeric text, nested containers, integers too
        large for a float).
    """
    if not isinstance(lst, list):
        return []

    def _as_float(value):
        # float() already rejects "" (ValueError) and None (TypeError), so no
        # separate equality pre-check is needed. Keeping the conversion as the
        # only test also avoids evaluating ``==`` on values for which the
        # result cannot be used as a boolean.
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    return [_as_float(v) for v in lst]

def pct_fill_tag(lst):
    """Label a percent list by how many of its entries are missing.

    Returns "none_null" when no entry is missing, "all_null" when every entry
    is missing (or the input is empty / not a list), and "some_null" otherwise.
    """
    if not isinstance(lst, list) or not lst:
        return "all_null"

    total = len(lst)
    missing = sum(map(pd.isna, lst))

    if missing == 0:
        tag = "none_null"
    elif missing == total:
        tag = "all_null"
    else:
        tag = "some_null"
    return tag

# --------------------------- Parse / clean ------------------------------------
# month_ts as datetime — try strict formats first, then minimal fallback.
# Each pass parses into its own full-length datetime Series (NaT wherever the
# pass does not apply) and the passes are merged before a single assignment.
# Writing parsed values back into the string-typed column slice by slice would
# leave a mixed str/Timestamp column that then has to be parsed a second time.
mt = df["month_ts"].astype(str)

# ISO yyyy-mm-dd
mask_iso = mt.str.match(r"^\d{4}-\d{2}-\d{2}$", na=False)
ts_iso = pd.to_datetime(mt.where(mask_iso), format="%Y-%m-%d", errors="coerce")

# m/d/yyyy (if any)
mask_us = (~mask_iso) & mt.str.match(r"^\d{1,2}/\d{1,2}/\d{4}$", na=False)
ts_us = pd.to_datetime(mt.where(mask_us), format="%m/%d/%Y", errors="coerce")

# The masks are mutually exclusive, so filling NaT slots never overwrites a
# value produced by an earlier pass.
month_parsed = ts_iso.fillna(ts_us)

# Any stragglers → last-resort parse without an explicit format (the
# "Could not infer format" warning is filtered in the helpers section)
mask_rest = (~mask_iso) & (~mask_us)
if mask_rest.any():
    ts_rest = pd.to_datetime(mt.where(mask_rest), errors="coerce")
    month_parsed = month_parsed.fillna(ts_rest)

# Single assignment: the column becomes a proper datetime column in one step
df["month_ts"] = month_parsed

# Convert list-like columns; a column missing from the file is created as a
# column of empty lists so later steps can rely on every LIST_COLS entry
for c in LIST_COLS:
    if c in df.columns:
        df[c] = df[c].apply(to_list_safe)
    else:
        df[c] = [[] for _ in range(len(df))]

# Percent lists to numeric (NaN marks entries that could not be converted)
df["ownership_percentages"] = df["ownership_percentages"].apply(coerce_pct_list)

# Build fill tag: none_null / some_null / all_null per row
df["pct_fill_tag"] = df["ownership_percentages"].apply(pct_fill_tag)

# --------------------------- Summaries ----------------------------------------
print("\n=== Percent-availability snapshot (overall) ===")
# Share of rows per tag, formatted as a percentage string with one decimal
overall = (
    df["pct_fill_tag"]
    .value_counts(normalize=True)
    .mul(100).round(1)
    .astype(str) + "%"
)
print(overall)

# by-year: derive year from month_ts
# Ensure month_ts is datetime (no-op if already)
df["month_ts"] = pd.to_datetime(df["month_ts"], errors="coerce")

# Derive year column once (idempotent)
df["year"] = df["month_ts"].dt.year

# Row counts per (year, tag); as_index=False keeps both keys as ordinary
# columns so no reset_index is needed
tmp = (
    df.groupby(["year", "pct_fill_tag"], as_index=False)
      .size()
      .rename(columns={"size": "n"})
)
# Each count as a percentage of its year's total, one decimal
tmp["share_%"] = (tmp["n"] / tmp.groupby("year")["n"].transform("sum") * 100).round(1)

# Wide layout: one row per year, one column per tag; a tag that never occurs
# in a year shows as 0
by_year_pivot = (
    tmp.pivot(index="year", columns="pct_fill_tag", values="share_%")
       .sort_index()
       .fillna(0)
)

print("\n=== Percent-availability by year (share %) ===")
print(by_year_pivot)

# Quick sanity peeks
has_ccn = "cms_certification_number" in df.columns
print("\nDistinct CCNs:", df["cms_certification_number"].nunique() if has_ccn else "(missing col)")

month_min = df["month_ts"].min()
month_max = df["month_ts"].max()
print("Date range:", month_min.date() if pd.notna(month_min) else None,
      "→", month_max.date() if pd.notna(month_max) else None)

# Show a few representative rows from each tag. Only columns that actually
# exist are selected, so a file without the CCN column does not raise here
# after the distinct-CCN line above has already tolerated its absence.
example_cols = [
    c
    for c in ["cms_certification_number", "month_ts", "ownership_percentages", "owner_names"]
    if c in df.columns
]
for tag in ["none_null","some_null","all_null"]:
    samp = df.loc[df["pct_fill_tag"]==tag, example_cols].head(3)
    print(f"\n[{tag}] examples:")
    if len(samp):
        print(samp.to_string(index=False))
    else:
        print("(none)")

[paths] INTERIM_DIR=C:\Repositories\white-bowblis-nhmc\data\interim
[load] C:\Repositories\white-bowblis-nhmc\data\interim\ownership_ccn_month_compact.csv
[loaded] rows=1,173,176, cols=9

=== Percent-availability snapshot (overall) ===
pct_fill_tag
none_null    52.3%
some_null    31.2%
all_null     16.5%
Name: proportion, dtype: object

=== Percent-availability by year (share %) ===
pct_fill_tag  all_null  none_null  some_null
year                                        
2017              22.7       45.0       32.4
2018              20.3       48.2       31.5
2019              18.9       50.0       31.1
2020              17.9       51.4       30.7
2021              16.9       52.6       30.5
2022              14.7       54.5       30.8
2023              12.4       56.8       30.8
2024              10.8       57.8       31.5
2025              10.1       58.3       31.6

Distinct CCNs: 14075
Date range: 2017-01-01 → 2025-07-01

[none_null] examples:
cms_certification_number   month_ts ow