In [1]:
import os
import zipfile
from io import BytesIO
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# ------------------------------------------------------------------------------
# 1. Project‐wide paths (portable)
# ------------------------------------------------------------------------------
# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a "src" folder.
_start_dir = Path.cwd()
PROJECT_ROOT = next(
    (d for d in (_start_dir, *_start_dir.parents) if (d / "src").is_dir()),
    # No "src" marker anywhere above us: stay in the working directory instead
    # of silently ending up at the filesystem root (and creating /data/raw there).
    _start_dir,
)

# NH_DATA_DIR overrides the default <project>/data/raw location. An empty value
# is treated as "not set"; otherwise Path("") would quietly resolve to the cwd.
RAW_DIR     = Path(os.getenv("NH_DATA_DIR") or PROJECT_ROOT / "data" / "raw")
NH_ZIP_DIR  = RAW_DIR / "nh-compare"            # searched for nh_archive_*.zip files
OWN_DIR     = RAW_DIR / "ownership-files"       # flat directory for cleaned CSVs
OWN_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# ------------------------------------------------------------------------------
# 2. Column mapping (normalized lowercase keys)
# ------------------------------------------------------------------------------
# Alias -> canonical column name, written as canonical -> aliases and flattened.
# Aliases are lowercase header spellings. The header-cleaning loop in this
# notebook replaces "_" with " " before looking a header up here, so the
# "role_desc" spelling is only reachable through its "role desc" twin.
COLUMN_MAP = {
    alias: canonical
    for canonical, aliases in {
        "cms_certification_number": (
            "cms certification number (ccn)",
            "federal provider number",
            "provnum",
        ),
        "provider_name": (
            "provider name",
            "provname",
        ),
        "role": (
            "role played by owner or manager in facility",
            "role played by owner in facility",
            "role of owner or manager",
            "owner role",
            "role_desc",
            "role desc",
        ),
        "owner_type": ("owner type",),
        "owner_name": ("owner name",),
        "ownership_percentage": (
            "ownership percentage",
            "owner percentage",
        ),
        "association_date": ("association date",),
        "processing_date": (
            "processing date",
            "processingdate",
            "process date",
            "processdate",
            "filedate",
        ),
    }.items()
    for alias in aliases
}

In [4]:
# ------------------------------------------------------------------------------
# 3. Robust CSV reader
# ------------------------------------------------------------------------------
def safe_read_csv(raw: bytes) -> pd.DataFrame:
    """Parse CSV bytes into a DataFrame whose columns are all strings.

    The payload is decoded as UTF-8 first. If that fails with a decoding
    error it is re-read as latin-1, which assigns a character to every
    possible byte and therefore cannot fail on decoding.

    Only decoding failures trigger the fallback. Any other problem (empty
    payload, malformed rows, ...) is raised immediately instead of being
    swallowed and retried under a different encoding.

    Parameters
    ----------
    raw : bytes
        Complete contents of one CSV file.

    Returns
    -------
    pd.DataFrame
        Parsed table, read with ``dtype=str``.

    Raises
    ------
    pandas.errors.EmptyDataError, pandas.errors.ParserError
        Propagated unchanged from ``pd.read_csv``.
    """
    try:
        return pd.read_csv(BytesIO(raw), dtype=str, encoding="utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the byte-transparent single-byte codec.
        return pd.read_csv(BytesIO(raw), dtype=str, encoding="latin-1")

In [5]:
# ------------------------------------------------------------------------------
# 4. Extract → clean → write
# ------------------------------------------------------------------------------
# File names inside a monthly archive that hold ownership data: either the
# exact first name, or any name starting with one of these prefixes.
OWN_PATTERNS = ("Ownership_Download.csv", "NH_Ownership_")

# Columns kept in every cleaned file, in output order. Columns a given source
# file does not provide are simply left out of that file.
KEEP_COLUMNS = [
    "cms_certification_number",
    "provider_name",
    "role",
    "owner_type",
    "owner_name",
    "ownership_percentage",
    "association_date",
    "processing_date",
    "source_file",
    "month",
    "year",
    "date",
]


def _canonical_headers(columns):
    """Translate raw CSV headers into unique canonical column names.

    Each header is stripped, lower-cased and has "_" replaced by " ", then
    resolved in this order: exact COLUMN_MAP alias; any header containing
    "percent" -> "ownership_percentage"; any header containing "role" ->
    "role"; otherwise the normalized header itself.

    The result never contains duplicates. A fuzzy ("percent"/"role") guess
    yields to a column of the same file that matches COLUMN_MAP exactly, and
    when two columns still resolve to the same name the first keeps it while
    later ones fall back to their normalized header (numbered if necessary).
    When no two columns compete, the result equals the plain mapping.
    """
    keys = [str(col).strip().lower().replace("_", " ") for col in columns]
    exact_targets = {COLUMN_MAP[key] for key in keys if key in COLUMN_MAP}

    headers, used = [], set()
    for key in keys:
        if key in COLUMN_MAP:
            name = COLUMN_MAP[key]
        elif "percent" in key:
            name = "ownership_percentage"
        elif "role" in key:
            name = "role"
        else:
            name = key

        if key not in COLUMN_MAP and name in exact_targets:
            name = key          # fuzzy guess loses to an exact alias
        if name in used:
            name = key          # canonical name already claimed
        dup = 1
        while name in used:     # normalized header itself repeated
            dup += 1
            name = f"{key} {dup}"

        used.add(name)
        headers.append(name)
    return headers


def _pct_null(frame, column):
    """Percentage (0-100) of missing values in `column`.

    A column that is absent from `frame` counts as entirely missing (100.0),
    which matches how the file looks once it is concatenated with the others.
    """
    if column not in frame.columns:
        return 100.0
    return frame[column].isna().mean() * 100


summary = []

for yearly_zip in tqdm(sorted(NH_ZIP_DIR.glob("nh_archive_*.zip"))):
    with zipfile.ZipFile(yearly_zip, "r") as yz:
        for mzip in sorted(n for n in yz.namelist() if n.lower().endswith(".zip")):
            # The last two "_"-separated pieces of the nested archive's stem
            # are taken as month and year.
            parts = Path(mzip).stem.split("_")
            month, year = parts[-2], parts[-1]
            with zipfile.ZipFile(BytesIO(yz.read(mzip)), "r") as mz:
                for entry in mz.namelist():
                    fname = Path(entry).name
                    if not any(fname == p or fname.startswith(p) for p in OWN_PATTERNS):
                        continue

                    df = safe_read_csv(mz.read(entry))

                    # --- normalize & fuzzy‐map headers (duplicate-free) ---
                    df.columns = _canonical_headers(df.columns)

                    # --- annotate file/month/year/date ---
                    # NOTE(review): the output name depends only on month/year,
                    # so several matching entries in one monthly archive would
                    # overwrite each other — confirm there is at most one.
                    out_name = f"ownership_download_{month}_{year}.csv"
                    df["source_file"] = out_name
                    df["month"]       = int(month)
                    df["year"]        = int(year)
                    # First day of the reporting month, same value for every row.
                    df["date"]        = pd.Timestamp(year=int(year), month=int(month), day=1)

                    # --- record null rates for key fields ---
                    summary.append({
                        "file": out_name,
                        "pct_null_role": _pct_null(df, "role"),
                        "pct_null_ownership_pct": _pct_null(df, "ownership_percentage"),
                    })

                    # --- keep only the 12 core columns that are present ---
                    df = df.loc[:, [c for c in KEEP_COLUMNS if c in df.columns]]

                    # --- write out cleaned file ---
                    df.to_csv(OWN_DIR / out_name, index=False)

  0%|          | 0/9 [00:00<?, ?it/s]

In [6]:
# ------------------------------------------------------------------------------
# 5. Combine
# ------------------------------------------------------------------------------
# Gather every cleaned monthly file. The glob pattern does not match
# "ownership_combined.csv", so a previous combined output is never re-read.
monthly_paths = sorted(OWN_DIR.glob("ownership_download_*.csv"))
if not monthly_paths:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate".
    raise FileNotFoundError(
        f"No ownership_download_*.csv files found in {OWN_DIR}; "
        "run the extraction step first."
    )

# Everything is read back as text except `date`, which is parsed to datetime
# so the sort below is chronological.
all_dfs = [pd.read_csv(p, dtype=str, parse_dates=["date"]) for p in monthly_paths]

combined = pd.concat(all_dfs, ignore_index=True)
combined = combined.sort_values(["cms_certification_number", "date"])
combined.to_csv(OWN_DIR / "ownership_combined.csv", index=False)

In [7]:
# ------------------------------------------------------------------------------
# 6. Review null‐summary
# ------------------------------------------------------------------------------
# Per-file null percentages collected while the monthly files were cleaned.
null_df = pd.DataFrame(summary)

# Compute null‐rate of ownership_percentage by role.
# Grouping the boolean mask directly avoids copying the whole combined frame
# just to add one helper column. dropna=False keeps rows whose role is itself
# missing as their own group, so the `total` column adds up to len(combined).
role_nulls = (
    combined["ownership_percentage"].isna()
    .groupby(combined["role"], dropna=False)
    .agg(total="size", nulls="sum")
)
role_nulls["pct_null"] = role_nulls["nulls"] / role_nulls["total"] * 100

# Sort and display
role_nulls = role_nulls.sort_values("pct_null", ascending=False)
print(role_nulls[["total","nulls","pct_null"]].round(2))
print("\n--- ROLE NULL % ---")
print(null_df["pct_null_role"].describe())
print("\n--- OWNERSHIP % NULL % ---")
print(null_df["pct_null_ownership_pct"].describe())
print(f"\n✅ Cleaned files in {OWN_DIR}; combined in ownership_combined.csv")

                                             total  nulls  pct_null
role                                                               
Ownership Data Not Available                 72733  72733     100.0
5% OR GREATER DIRECT OWNERSHIP INTEREST    2157271      0       0.0
5% OR GREATER INDIRECT OWNERSHIP INTEREST  2910876      0       0.0
5% OR GREATER MORTGAGE INTEREST              57005      0       0.0
5% OR GREATER SECURITY INTEREST              96978      0       0.0
CONTRACTED MANAGING EMPLOYEE                  5303      0       0.0
CORPORATE DIRECTOR                          203809      0       0.0
CORPORATE OFFICER                           178971      0       0.0
DIRECTOR                                   3287371      0       0.0
GENERAL PARTNERSHIP INTEREST                  1135      0       0.0
LIMITED PARTNERSHIP INTEREST                  1000      0       0.0
MANAGING EMPLOYEE                          1971404      0       0.0
OFFICER                                    27896

In [8]:
# Total number of rows in the combined ownership table.
print(combined.shape[0])

15916207
