In [12]:
import pandas as pd 
import numpy as np
import re
import os
from pathlib import Path
from datetime import date

In [4]:
# ── 0. Load raw once ─────────────────────────────────────────────────────
# Build the path to the combined ownership extract under the folder named by
# the OneDrive environment variable (KeyError if that variable is not set).
csv_path = Path(os.environ["OneDrive"]).joinpath(
    "Documents", "Honors_Thesis", "data", "data-raw", "ownership-files",
    "ownership_combined.csv",
)

# Stop immediately, showing the full path, when the extract is not there.
if not csv_path.is_file():
    raise FileNotFoundError(f"Expected CSV not found: {csv_path}")

# The certification number is read as a string so leading zeros survive.
# No date parsing is requested, so date-like columns are loaded as-is.
df = pd.read_csv(
    csv_path,
    low_memory=False,
    dtype={"cms_certification_number": "string"},
)
print(f"STEP 0  raw → {len(df):,} rows from {csv_path}")

STEP 0  raw → 15,621,094 rows from C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data-raw\ownership-files\ownership_combined.csv


In [5]:
# ── 1. Keep only the roles of interest (plus NaNs) ───────────────────────
target_roles = [
    "5% OR GREATER DIRECT OWNERSHIP INTEREST",
    "5% OR GREATER INDIRECT OWNERSHIP INTEREST",
    "PARTNERSHIP INTEREST",
]
# Escape every label so it is matched literally, then OR the labels into a
# single case-insensitive pattern. The match is a substring search, so any
# role text that merely contains one of these labels is kept as well.
role_regex = re.compile(
    "|".join(re.escape(label) for label in target_roles),
    flags=re.IGNORECASE,
)

# Missing roles are blanked before matching so the mask is purely boolean;
# rows with a missing role are then kept explicitly through the isna() term.
mask_roles = df["role"].fillna("").str.contains(role_regex)
df = df.loc[mask_roles | df["role"].isna()].copy()
print(f"STEP 1  role‑filter → {len(df):,}")

STEP 1  role‑filter → 5,121,498


In [6]:
# ── 2. De‑duplicate identical ownership snapshots ───────────────────────
# Rows that agree on every column below are treated as the same ownership
# snapshot; only the copy with the latest processing_date is kept.
dedup_cols = [
    "cms_certification_number", "provider_name",
    "role", "owner_type", "owner_name",
    "ownership_percentage", "association_date"
]

# The load step does not parse dates, so processing_date is presumably text.
# Sorting text is lexicographic (e.g. "12/01/2019" sorts after "01/01/2023"),
# which would make keep="last" retain a stale row. Sort on parsed timestamps
# instead; if the column is already datetime this conversion is a no-op.
processing_ts = pd.to_datetime(df["processing_date"], errors="coerce")

# Surface values that were present but could not be read as dates, since
# those rows are ordered as if they had no date at all.
n_unparsed = int((processing_ts.isna() & df["processing_date"].notna()).sum())
if n_unparsed:
    print(f"STEP 2  warning: {n_unparsed:,} processing_date values could not be parsed")

df = (
    df.assign(_processing_ts=processing_ts)
      # mergesort is stable, so rows sharing a date keep their current order
      # and the surviving duplicate is reproducible across runs.
      # na_position="first" makes undated rows the "oldest", so they never
      # displace a dated snapshot.
      .sort_values("_processing_ts", kind="mergesort", na_position="first")
      .drop_duplicates(subset=dedup_cols, keep="last")
      .drop(columns="_processing_ts")            # helper column only
      .reset_index(drop=True)
)
print(f"STEP 2  de‑dup      → {len(df):,}")

STEP 2  de‑dup      → 162,904


In [14]:
# Destination path for the cleaned ownership table, rooted at the folder named
# by the OneDrive environment variable.
out_csv = Path(os.environ["OneDrive"]).joinpath(
    "Documents", "Honors_Thesis", "data", "data",
    "ownership_file_clean.csv",
)

# Create the target folder and any missing parents; no error if it exists.
out_csv.parent.mkdir(exist_ok=True, parents=True)

# Write the frame without its positional index, then report where it went.
df.to_csv(out_csv, index=False)
print("Saved:", out_csv)

Saved: C:\Users\Owner\OneDrive\Documents\Honors_Thesis\data\data\ownership_file_clean.csv
