In [2]:
import os, re, shutil, zipfile, pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

In [3]:
# ------------------------------------------------------------------------------
# 1.  Project-wide paths (portable)
# ------------------------------------------------------------------------------

import os, re, shutil, zipfile, pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

# Repository root = the nearest directory at or above the working directory
# that contains a "src" folder. If no such directory exists, stay in the
# working directory rather than drifting up to the filesystem root (which
# would make the mkdir calls below create "data/" at the top of the drive).
_START_DIR = Path.cwd()
PROJECT_ROOT = next(
    (candidate for candidate in (_START_DIR, *_START_DIR.parents)
     if (candidate / "src").is_dir()),
    _START_DIR,
)

# Raw data: use the NH_DATA_DIR env-var if present; otherwise fall back to the
# repo-relative path (for tests).
RAW_DIR      = Path(os.getenv("NH_DATA_DIR",  PROJECT_ROOT / "data" / "raw"))
NH_ZIP_DIR   = RAW_DIR / "nh-compare"             # globbed later for nh_archive_*.zip
OWN_DIR      = RAW_DIR / "ownership-files"        # extracted monthly CSVs
OWN_DIR.mkdir(parents=True, exist_ok=True)

# Derived outputs always live inside the repository tree.
INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# ------------------------------------------------------------------------------
# 2.  Delete malformed files already on disk
# ------------------------------------------------------------------------------

# Remove any extracted CSV whose month field reads "20".
# NOTE(review): presumably these came from an earlier run that mis-parsed the
# month out of the archive name — confirm before relying on this.
malformed_pattern = "ownership_download_20_*.csv"
for stale_csv in OWN_DIR.glob(malformed_pattern):
    print("🗑️ Deleting malformed file:", stale_csv.name)
    stale_csv.unlink()

In [5]:
# ------------------------------------------------------------------------------
# 3.  Extract the correct ownership CSV from every inner ZIP
# ------------------------------------------------------------------------------

# Each yearly archive holds one inner ZIP per month; copy that month's
# ownership CSV out to OWN_DIR as ownership_download_MM_YYYY.csv.
for outer_zip in tqdm(sorted(NH_ZIP_DIR.glob("nh_archive_*.zip")), desc="Yearly archives"):
    with zipfile.ZipFile(outer_zip) as z_out:
        for inner_name in [n for n in z_out.namelist() if n.lower().endswith(".zip")]:
            # Grab month / year from inner-ZIP filename. Case-insensitive so it
            # agrees with the lower-cased ".zip" filter above.
            m = re.search(r"_(\d{2})_(\d{4})\.zip$", inner_name, flags=re.IGNORECASE)
            if not m:
                continue
            mm, yyyy = map(int, m.groups())

            # Decide whether there is anything to do *before* opening the
            # nested archive, so re-runs skip finished months cheaply.
            target = OWN_DIR / f"ownership_download_{mm:02d}_{yyyy}.csv"
            if target.exists():   # skip if we already extracted
                continue

            # For 08/2020 and later the member to keep is matched on
            # "nh_ownership_"; earlier months are matched on "ownership_download".
            new_fmt = (yyyy > 2020) or (yyyy == 2020 and mm >= 8)
            wanted = "nh_ownership_" if new_fmt else "ownership_download"

            with z_out.open(inner_name) as inner_bytes, zipfile.ZipFile(inner_bytes) as z_in:
                # First matching CSV wins (later matches were skipped before too,
                # because the target already existed by then).
                member = next(
                    (n for n in z_in.namelist()
                     if n.lower().endswith(".csv") and wanted in n.lower()),
                    None,
                )
                if member is None:
                    continue

                # Write to a side file and rename on success: an interrupted
                # copy must not leave a truncated CSV that the exists() check
                # above would treat as complete on the next run.
                partial = target.with_name(target.name + ".part")
                try:
                    with z_in.open(member) as src, open(partial, "wb") as dst:
                        shutil.copyfileobj(src, dst)
                    partial.replace(target)
                finally:
                    if partial.exists():   # only true when the copy failed
                        partial.unlink()
                print("✅ Saved", target.name)

Yearly archives:   0%|          | 0/9 [00:00<?, ?it/s]

In [6]:
# ------------------------------------------------------------------------------
# 4.  Standardise columns, add metadata columns
# ------------------------------------------------------------------------------

# Normalised raw header -> canonical column name. Keys are written in the form
# produced by clean_col(): lower-case, with every run of non-alphanumeric
# characters collapsed to a single space.
COLUMN_MAP = {
    "cms certification number (ccn)": "cms_certification_number",
    "federal provider number": "cms_certification_number",
    "provnum": "cms_certification_number",
    "provider name": "provider_name",
    "provname": "provider_name",
    "role played by owner or manager in facility": "role",
    "role played by owner in facility": "role",
    "role of owner or manager": "role",
    "owner role": "role",
    "role desc": "role",
    "owner type": "owner_type",
    "owner name": "owner_name",
    "ownership percentage": "ownership_percentage",
    "owner percentage": "ownership_percentage",
    "association date": "association_date",
    "processing date": "processing_date",
    "processingdate": "processing_date",
    "process date": "processing_date",
    "processdate": "processing_date",
    "filedate": "processing_date",
}
# Canonical columns, de-duplicated in first-appearance order. dict.fromkeys is
# used instead of set() because set ordering of strings changes between
# interpreter sessions, which made the output column order non-reproducible.
TARGET_COLS = list(dict.fromkeys(COLUMN_MAP.values()))

def clean_col(c: str):
    """Translate a raw CSV header into its canonical column name.

    The header is lower-cased, each run of characters outside ``a-z0-9`` is
    replaced by one space, and surrounding whitespace is trimmed. The result
    is looked up in ``COLUMN_MAP``.

    Returns the canonical name, or ``None`` when the header is not mapped.
    """
    lowered = c.lower()
    collapsed = re.sub(r"[^a-z0-9]+", " ", lowered)
    lookup_key = collapsed.strip()
    return COLUMN_MAP.get(lookup_key)

# Read every monthly extract, project it onto TARGET_COLS, and tag each row
# with the month/year parsed from the file name.
dfs = []
for csv_path in tqdm(sorted(OWN_DIR.glob("ownership_download_*.csv")), desc="Monthly CSVs"):
    # Parse the period first: a stray file that matches the glob but lacks the
    # _MM_YYYY suffix is skipped before paying for the read (it used to crash
    # with AttributeError after the read).
    period = re.search(r"_(\d{2})_(\d{4})\.csv$", csv_path.name)
    if period is None:
        print("⚠️ Skipping file without _MM_YYYY suffix:", csv_path.name)
        continue
    mm, yyyy = map(int, period.groups())

    try:
        df_raw = pd.read_csv(csv_path, dtype=str, encoding="utf-8")
    except UnicodeDecodeError:
        # Fall back to Latin-1 for files that are not valid UTF-8.
        df_raw = pd.read_csv(csv_path, dtype=str, encoding="ISO-8859-1")

    # canonical name -> raw header. When several raw headers map to the same
    # canonical name the last one wins, matching the previous overwrite order.
    std_to_raw = {}
    for raw in df_raw.columns:
        std = clean_col(raw)
        if std in TARGET_COLS:
            std_to_raw[std] = raw
    if not std_to_raw:
        # Previously such a file silently contributed a zero-row frame.
        print("⚠️ No recognised columns, skipping:", csv_path.name)
        continue

    # Every frame gets the full TARGET_COLS set exactly once; columns missing
    # from this file are filled with NA.
    df = pd.DataFrame(index=df_raw.index)
    for col in TARGET_COLS:
        df[col] = df_raw[std_to_raw[col]] if col in std_to_raw else pd.NA

    df["source_file"] = csv_path.stem
    df["month"] = mm
    df["year"] = yyyy
    # First day of the file's month.
    df["date"] = pd.to_datetime({"year": df["year"], "month": df["month"], "day": 1})
    dfs.append(df)
    print("✅ Loaded", csv_path.name)

if not dfs:
    raise FileNotFoundError(
        f"No usable ownership_download_MM_YYYY.csv files found in {OWN_DIR}"
    )

# Files are globbed in name order (month-major), so sort chronologically.
# mergesort is stable: rows sharing a date keep their within-file order, which
# makes the combined output reproducible.
combined = (
    pd.concat(dfs, ignore_index=True)
      .sort_values("date", kind="mergesort")
      .reset_index(drop=True)
)

Monthly CSVs:   0%|          | 0/97 [00:00<?, ?it/s]

✅ Loaded ownership_download_01_2017.csv
✅ Loaded ownership_download_01_2018.csv
✅ Loaded ownership_download_01_2019.csv
✅ Loaded ownership_download_01_2020.csv
✅ Loaded ownership_download_01_2021.csv
✅ Loaded ownership_download_01_2022.csv
✅ Loaded ownership_download_01_2023.csv
✅ Loaded ownership_download_01_2024.csv
✅ Loaded ownership_download_02_2017.csv
✅ Loaded ownership_download_02_2019.csv
✅ Loaded ownership_download_02_2020.csv
✅ Loaded ownership_download_02_2021.csv
✅ Loaded ownership_download_02_2022.csv
✅ Loaded ownership_download_02_2023.csv
✅ Loaded ownership_download_02_2024.csv
✅ Loaded ownership_download_02_2025.csv
✅ Loaded ownership_download_03_2017.csv
✅ Loaded ownership_download_03_2018.csv
✅ Loaded ownership_download_03_2019.csv
✅ Loaded ownership_download_03_2020.csv
✅ Loaded ownership_download_03_2021.csv
✅ Loaded ownership_download_03_2022.csv
✅ Loaded ownership_download_03_2023.csv
✅ Loaded ownership_download_03_2024.csv
✅ Loaded ownership_download_03_2025.csv


In [7]:
# ------------------------------------------------------------------------------
# 5.  Fill missing CCNs where provider→CCN mapping is unique
# ------------------------------------------------------------------------------

def fill_ccn(df):
    """Fill missing CCNs from provider names that map to exactly one CCN.

    Builds a provider_name -> cms_certification_number lookup from the rows
    whose CCN is present, keeps only the names associated with a single
    distinct CCN, and uses it to fill rows whose CCN is null. Names tied to
    several CCNs, and names never seen with a CCN, are left null.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cms_certification_number`` and ``provider_name``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.

    Side effects
    ------------
    Prints how many CCNs were filled and how many remain null.
    """
    ccn, name = "cms_certification_number", "provider_name"

    # Vectorised replacement for a per-group Python set(): a name is
    # unambiguous when its non-null CCNs have exactly one distinct value.
    known = df.loc[df[ccn].notna(), [name, ccn]]
    ccn_by_name = known.groupby(name, observed=True)[ccn]
    unamb = ccn_by_name.first()[ccn_by_name.nunique() == 1]

    missing = df[ccn].isna()          # computed once, reused below
    before = int(missing.sum())
    df.loc[missing, ccn] = df.loc[missing, name].map(unamb)
    after = int(df[ccn].isna().sum())
    print(f"🔍 filled {before-after:,} missing CCNs (still {after:,} null)")
    return df

# fill_ccn edits the frame in place and returns that same object; the
# re-assignment simply keeps `combined` bound to the filled frame.
combined = fill_ccn(combined)

🔍 filled 3,043,383 missing CCNs (still 314,102 null)


In [8]:
# ------------------------------------------------------------------------------
# 6.  Save outputs
# ------------------------------------------------------------------------------

# Destination files: the CSV sits beside the monthly extracts (replacing any
# earlier export), the Parquet copy goes to the interim data folder.
csv_out = OWN_DIR.joinpath("ownership_combined.csv")
parq_out = INTERIM_DIR.joinpath("ownership_combined.parquet")

# Write both formats without the positional index.
combined.to_csv(csv_out, index=False)
print("💾 CSV written:", csv_out)

combined.to_parquet(parq_out, index=False)
print("💾 Parquet written:", parq_out)

# Report (rows, columns) of the frame that was just written.
n_rows, n_cols = combined.shape
print("📦 Final shape:", (n_rows, n_cols))

💾 CSV written: C:\Users\Owner\OneDrive\NursingHomeData\ownership-files\ownership_combined.csv
💾 Parquet written: C:\Repositories\white-bowblis-nhmc\data\interim\ownership_combined.parquet
📦 Final shape: (15621094, 12)
