In [None]:
import zipfile
import os
from pathlib import Path
import shutil
import re

In [6]:
import zipfile
import os
from pathlib import Path
import shutil
import re

# Define base paths
base_path = Path(r"C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw")
nh_compare_path = base_path / "nh-compare"
output_path = base_path / "ownership-files"
output_path.mkdir(exist_ok=True)

# Clean up previously misnamed files (optional but recommended)
for bad_file in output_path.glob("ownership_download_20_*.csv"):
    print(f"🗑️ Deleting malformed file: {bad_file}")
    bad_file.unlink()

# Inner archives are named ..._MM_YYYY.zip; anchor at the end of the name so
# the month/year always come from the final "_MM_YYYY" component.
inner_zip_pattern = re.compile(r'_(\d{2})_(\d{4})\.zip$')

# Every extracted file is renamed to ownership_download_MM_YYYY<ext>, so two
# matching members for the same month would silently replace each other.
# Remember what this run has written in order to report that.
written_targets = set()

# Loop through all yearly zip files
for year in range(2017, 2026):
    outer_zip_path = nh_compare_path / f"nh_archive_{year}.zip"

    # A year that has not been downloaded should not abort the other years.
    if not outer_zip_path.is_file():
        print(f"⚠️ Skipping missing archive: {outer_zip_path}")
        continue

    with zipfile.ZipFile(outer_zip_path, 'r') as outer_zip:
        inner_zip_names = [name for name in outer_zip.namelist() if name.endswith('.zip')]

        for inner_zip_name in inner_zip_names:
            # Use regex to extract MM and YYYY
            match = inner_zip_pattern.search(inner_zip_name)
            if not match:
                print(f"⚠️ Skipping unrecognized format: {inner_zip_name}")
                continue

            month_str, year_str = match.groups()
            # Archives dated 08/2020 or later are matched on 'NH_Ownership_'
            # (case-sensitive); earlier ones on 'ownership_download'
            # (case-insensitive).
            is_new_format = (int(year_str), int(month_str)) >= (2020, 8)

            # Open and read the inner zip (a zip nested inside the yearly zip)
            with outer_zip.open(inner_zip_name) as inner_zip_file:
                with zipfile.ZipFile(inner_zip_file) as inner_zip:
                    for file_name in inner_zip.namelist():
                        if is_new_format:
                            should_extract = 'NH_Ownership_' in file_name
                        else:
                            should_extract = 'ownership_download' in file_name.lower()
                        if not should_extract:
                            continue

                        # Members without an extension are assumed to be CSV
                        ext = Path(file_name).suffix or '.csv'
                        new_filename = f"ownership_download_{month_str}_{year_str}{ext}"
                        target_path = output_path / new_filename

                        if target_path in written_targets:
                            print(f"⚠️ Overwriting {target_path.name} with {file_name} from {inner_zip_name}")
                        written_targets.add(target_path)

                        with inner_zip.open(file_name) as source_file, open(target_path, 'wb') as target_file:
                            shutil.copyfileobj(source_file, target_file)
                        # Report only after the target has been flushed and closed
                        print(f"✅ Saved: {target_path}")


🗑️ Deleting malformed file: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_download_20_2021.csv
🗑️ Deleting malformed file: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_download_20_2022.csv
🗑️ Deleting malformed file: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_download_20_2023.csv
🗑️ Deleting malformed file: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_download_20_2024.csv
🗑️ Deleting malformed file: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_download_20_2025.csv
✅ Saved: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_download_01_2017.csv
✅ Saved: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_download_02_2017.csv
✅ Saved: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownershi

In [22]:
import pandas as pd
from pathlib import Path
import re

# Folder that holds the per-month ownership extracts, and the extracts
# themselves in lexicographic path order
ownership_dir = Path(r"C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files")
ownership_files = list(ownership_dir.glob("ownership_download_*.csv"))
ownership_files.sort()

# Known column name variants → standardized column names.
# Keys are written the way the headers read once lower-cased. They are run
# through the same normalization as incoming headers before the lookup, so
# punctuation in a key (e.g. the parentheses in "(ccn)") cannot prevent a match.
column_map = {
    # CMS provider ID
    "cms certification number (ccn)": "cms_certification_number",
    "federal provider number": "cms_certification_number",
    "provnum": "cms_certification_number",

    # Provider name
    "provider name": "provider_name",
    "provname": "provider_name",

    # Role of owner/manager
    "role played by owner or manager in facility": "role",
    "role played by owner in facility": "role",
    "role of owner or manager": "role",
    "owner role": "role",
    "role desc": "role",

    # Ownership metadata
    "owner type": "owner_type",
    "owner name": "owner_name",
    "ownership percentage": "ownership_percentage",
    "owner percentage": "ownership_percentage",
    "association date": "association_date",

    # Processing date
    "processing date": "processing_date",
    "processingdate": "processing_date",
    "process date": "processing_date",
    "processdate": "processing_date",
    "filedate": "processing_date"
}

# Final list of target columns: de-duplicated in order of first appearance, so
# the column order is identical on every run (a set has no guaranteed order).
target_cols = list(dict.fromkeys(column_map.values()))


def _normalize_header(col):
    """Lower-case a header and turn every run of non-alphanumerics into one space."""
    return re.sub(r'[^a-z0-9]+', ' ', col.strip().lower()).strip()


# column_map re-keyed by normalized header. Without this, a key containing
# punctuation could never equal a normalized header, and that column would be
# dropped as unrecognized.
_normalized_column_map = {_normalize_header(key): std for key, std in column_map.items()}


# Clean and map raw column name
def clean_column(col):
    """Return the standardized name for a raw CSV header, or None if unknown.

    The header is lower-cased and every run of characters outside a-z / 0-9
    (spaces, punctuation, byte-order-mark residue, ...) becomes a single
    space before it is looked up among the normalized `column_map` keys.
    """
    return _normalized_column_map.get(_normalize_header(col))

# List to store cleaned data
df_list = []

# File stems end in _MM_YYYY. Parse that once per file instead of running a
# regex over a column that repeats the same string on every row.
stem_pattern = re.compile(r'_(\d{2})_(\d{4})$')

# Load and standardize all files
for file in ownership_files:
    stem_match = stem_pattern.search(file.stem)
    if stem_match is None:
        print(f"⚠️ Skipping {file.name}: name does not end in _MM_YYYY")
        continue
    month, year = (int(part) for part in stem_match.groups())
    if not 1 <= month <= 12:
        # e.g. a misnamed ownership_download_20_2021.csv; a month like 20
        # cannot be turned into a date below
        print(f"⚠️ Skipping {file.name}: {month:02d} is not a valid month")
        continue

    # Everything is read as text; ISO-8859-1 is the fallback when the file is
    # not valid UTF-8.
    try:
        df_raw = pd.read_csv(file, dtype=str, encoding='utf-8')
    except UnicodeDecodeError:
        try:
            df_raw = pd.read_csv(file, dtype=str, encoding='ISO-8859-1')
        except Exception as e:
            print(f"❌ Failed to load {file.name}: {e}")
            continue

    # Keep only recognized columns, under their standardized names. If two raw
    # headers map to the same standardized name, the later one wins.
    df_clean = pd.DataFrame()
    for raw_col in df_raw.columns:
        std_col = clean_column(raw_col)
        if std_col in target_cols:
            df_clean[std_col] = df_raw[raw_col]

    if df_clean.columns.empty:
        # Such a file would contribute zero rows; say so instead of
        # reporting it as loaded.
        print(f"⚠️ Skipping {file.name}: no recognized columns")
        continue

    # Add missing target columns
    for col in target_cols:
        if col not in df_clean.columns:
            df_clean[col] = pd.NA

    # Add metadata (the date is the first day of the file's month)
    df_clean["source_file"] = file.stem
    df_clean["month"] = month
    df_clean["year"] = year
    df_clean["date"] = pd.to_datetime(dict(year=df_clean["year"], month=df_clean["month"], day=1))

    df_list.append(df_clean)
    print(f"✅ Loaded: {file.name}")

if not df_list:
    raise ValueError(f"No ownership files could be loaded from {ownership_dir}")

# Combine and sort. A stable sort keeps the rows of each file in their
# original order; only whole months are reordered.
combined_df = pd.concat(df_list, ignore_index=True)
combined_df = combined_df.sort_values("date", kind="stable").reset_index(drop=True)

# Output summary
print(f"\n📦 Final shape: {combined_df.shape}")
print(f"📚 Columns: {combined_df.columns.tolist()}")
print("🧪 Sample with non-null 'role' and 'processing_date':")
print(combined_df[['cms_certification_number', 'provider_name', 'role', 'processing_date']].dropna(subset=['role', 'processing_date']).head())


✅ Loaded: ownership_download_01_2017.csv
✅ Loaded: ownership_download_01_2018.csv
✅ Loaded: ownership_download_01_2019.csv
✅ Loaded: ownership_download_01_2020.csv
✅ Loaded: ownership_download_01_2021.csv
✅ Loaded: ownership_download_01_2022.csv
✅ Loaded: ownership_download_01_2023.csv
✅ Loaded: ownership_download_01_2024.csv
✅ Loaded: ownership_download_02_2017.csv
✅ Loaded: ownership_download_02_2019.csv
✅ Loaded: ownership_download_02_2020.csv
✅ Loaded: ownership_download_02_2021.csv
✅ Loaded: ownership_download_02_2022.csv
✅ Loaded: ownership_download_02_2023.csv
✅ Loaded: ownership_download_02_2024.csv
✅ Loaded: ownership_download_02_2025.csv
✅ Loaded: ownership_download_03_2017.csv
✅ Loaded: ownership_download_03_2018.csv
✅ Loaded: ownership_download_03_2019.csv
✅ Loaded: ownership_download_03_2020.csv
✅ Loaded: ownership_download_03_2021.csv
✅ Loaded: ownership_download_03_2022.csv
✅ Loaded: ownership_download_03_2023.csv
✅ Loaded: ownership_download_03_2024.csv
✅ Loaded: owners

In [24]:
# Write the combined table into the ownership folder, without the index column
output_path = ownership_dir.joinpath("ownership_combined.csv")
combined_df.to_csv(path_or_buf=output_path, index=False)
print(f"\n💾 Saved to: {output_path}")


💾 Saved to: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_combined.csv


In [None]:
# ── Fill NaN cms_certification_number if provider_name maps unambiguously ──
def fill_ccn_from_provider(df: pd.DataFrame, ccn_col="cms_certification_number", name_col="provider_name") -> pd.DataFrame:
    """Fill missing CCNs from rows of the same provider that do have one.

    A provider name is used as a source only when every non-null CCN recorded
    under that name is the same value. Names with two or more distinct CCNs,
    and names that never appear with a CCN, are left untouched.

    Parameters
    ----------
    df : DataFrame containing `ccn_col` and `name_col`.
    ccn_col : column holding the CCN; missing values are what gets filled.
    name_col : column holding the provider name used as the lookup key.

    Returns
    -------
    The same DataFrame object that was passed in. It is modified in place,
    not copied.
    """
    # 1. Per provider name, over rows WITH a CCN: the number of distinct CCNs
    #    and the first CCN seen. Both are vectorized group reductions, so no
    #    Python-level set is built per provider.
    known_ccns = df.dropna(subset=[ccn_col]).groupby(name_col, observed=True)[ccn_col]
    distinct_counts = known_ccns.nunique()

    # 2. Keep only *unambiguous* providers (exactly one distinct CCN); for
    #    those, the first CCN seen is the only one.
    unambig = known_ccns.first()[distinct_counts == 1]

    print(f"🔍 Providers with exactly one CCN on record: {len(unambig):,}")

    # 3. Vectorised fill: only rows whose CCN is missing are assigned; names
    #    absent from `unambig` map to NaN and therefore stay missing.
    missing_mask = df[ccn_col].isna()
    before_missing = int(missing_mask.sum())
    df.loc[missing_mask, ccn_col] = (
        df.loc[missing_mask, name_col].map(unambig)
    )

    after_missing = int(df[ccn_col].isna().sum())
    filled = before_missing - after_missing

    print(f"✅ Filled {filled:,} of {before_missing:,} missing CCNs "
          f"({after_missing:,} still null or ambiguous)")

    return df

# ---- run it
# Start from `combined_df`, the frame assembled by the load-and-standardize
# cell: `df` is not defined anywhere earlier, so on a fresh kernel the old
# `fill_ccn_from_provider(df)` raised NameError. The helper fills its argument
# in place and returns it, so `df` and `combined_df` are the same object after
# this line.
df = fill_ccn_from_provider(combined_df)

In [None]:
# ── AFTER all CCN‑filling steps ───────────────────────────────────────────
# Derive the target from `ownership_dir` rather than repeating an absolute
# path: the literal used here before pointed at a different Windows user
# profile than the one the other cells read from and write to.
RAW_CSV = ownership_dir / "ownership_combined.csv"

df.to_csv(RAW_CSV, index=False)            # overwrites the original file
# (author's estimate: ≈ 5‑10 min for 15 M rows on an SSD)