In [10]:
# ------------------------------------------------------------------------------
# 1.  Project-wide paths (portable)
# ------------------------------------------------------------------------------

import os, re, shutil, zipfile, pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a `src/` folder.
_START_DIR = Path.cwd()
PROJECT_ROOT = next(
    (cand for cand in (_START_DIR, *_START_DIR.parents) if (cand / "src").is_dir()),
    None,
)
if PROJECT_ROOT is None:
    # Without this guard the search would end at the filesystem root and the
    # mkdir calls below would try to create `<root>/data/...` there.
    print(f"WARNING: no 'src' directory found above {_START_DIR}; using it as PROJECT_ROOT")
    PROJECT_ROOT = _START_DIR

# NH_DATA_DIR (environment variable) overrides the default raw-data location.
RAW_DIR      = Path(os.getenv("NH_DATA_DIR",  PROJECT_ROOT / "data" / "raw"))
OWN_DIR      = RAW_DIR / "ownership-files"
INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"

# Create the working directories up front; existing directories are left as-is.
OWN_DIR.mkdir(parents=True, exist_ok=True)
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

In [17]:
# 2.  Load raw CSV ─────────────────────────────────────────────────────
csv_path = RAW_DIR.joinpath("ownership-files", "ownership_combined.csv")

# Read the CCN as text so that leading zeros are not lost to numeric parsing.
dtype_map = {"cms_certification_number": "string"}

# processing_date is parsed by the reader itself; the other date columns are
# converted in separate steps after loading.
df = pd.read_csv(csv_path, dtype=dtype_map, parse_dates=["processing_date"], low_memory=False)

# parse two messy date columns
def parse_assoc(col: pd.Series) -> pd.Series:
    """Turn "since MM/DD/YYYY"-style text into datetimes.

    A leading "since" marker (matched case-insensitively, together with any
    whitespace around it) is removed from each value, and what remains is
    parsed strictly as ``%m/%d/%Y``. Values that do not fit that format,
    including missing values, come back as ``NaT``.

    Parameters
    ----------
    col : pd.Series
        Text column; it must support the ``.str`` accessor.

    Returns
    -------
    pd.Series
        Parsed datetimes, one per input value.
    """
    since_prefix = r"^\s*since\s*"
    date_text = col.str.replace(since_prefix, "", case=False, regex=True)
    parsed = pd.to_datetime(date_text, errors="coerce", format="%m/%d/%Y")
    return parsed

# association_date goes through the prefix-stripping parser defined above;
# date is handed to pandas directly. In both cases unparseable values become NaT.
df["association_date"] = df["association_date"].pipe(parse_assoc)
df["date"] = df["date"].pipe(pd.to_datetime, errors="coerce")

# Row count straight after loading, before any filtering.
print(f"\nSTEP 0  raw → {len(df):,} rows")


STEP 0  raw → 15,621,094 rows


In [18]:
# 2.a Quick structure / quality overview ──────────────────────────────
print("\n--- BASIC INFO -------------------------------------------------------")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would append a stray "None" line.
df.info(show_counts=True)

print("\n--- NULL % by column --------------------------------------------------")
# Share of missing values per column, as a percentage rounded to one decimal,
# sorted ascending and shown as a single wide row.
null_pct = df.isna().mean().mul(100).round(1).sort_values()
display(null_pct.to_frame("percent_null").T)

print("\n--- DATE RANGE --------------------------------------------------------")
print("association_date:", df["association_date"].min(), "→", df["association_date"].max())
print("processing_date :", df["processing_date"].min(),  "→", df["processing_date"].max())

print("\n--- UNIQUE PROVIDERS & OWNERS ----------------------------------------")
# Missing values are excluded from each distinct count.
print("# unique CCNs      :", df["cms_certification_number"].nunique(dropna=True))
print("# unique providers :", df["provider_name"].nunique(dropna=True))
print("# unique owners    :", df["owner_name"].nunique(dropna=True))

print("\n--- TOP 10 ROLE VALUES -----------------------------------------------")
# dropna=False keeps a missing role as its own bucket in the ranking.
display(df["role"].value_counts(dropna=False).head(10))


--- BASIC INFO -------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15621094 entries, 0 to 15621093
Data columns (total 12 columns):
 #   Column                    Non-Null Count     Dtype         
---  ------                    --------------     -----         
 0   cms_certification_number  15306992 non-null  string        
 1   owner_name                15549728 non-null  object        
 2   association_date          15547537 non-null  datetime64[ns]
 3   owner_type                15549728 non-null  object        
 4   processing_date           15621094 non-null  datetime64[ns]
 5   ownership_percentage      15549728 non-null  object        
 6   role                      15621094 non-null  object        
 7   provider_name             15621089 non-null  object        
 8   source_file               15621094 non-null  object        
 9   month                     15621094 non-null  int64         
 10  year                      15

Unnamed: 0,processing_date,role,provider_name,source_file,month,year,date,owner_name,association_date,owner_type,ownership_percentage,cms_certification_number
percent_null,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,2.0



--- DATE RANGE --------------------------------------------------------
association_date: 1942-01-01 00:00:00 → 2025-05-01 00:00:00
processing_date : 2017-01-01 00:00:00 → 2025-05-01 00:00:00

--- UNIQUE PROVIDERS & OWNERS ----------------------------------------
# unique CCNs      : 16294
# unique providers : 23916
# unique owners    : 91852

--- TOP 10 ROLE VALUES -----------------------------------------------


role
DIRECTOR                                     3287371
5% OR GREATER INDIRECT OWNERSHIP INTEREST    2853623
OFFICER                                      2789665
5% OR GREATER DIRECT OWNERSHIP INTEREST      2118152
MANAGING EMPLOYEE                            1971404
OPERATIONAL/MANAGERIAL CONTROL               1954320
PARTNERSHIP INTEREST                          149723
CORPORATE DIRECTOR                            147091
CORPORATE OFFICER                             128420
5% OR GREATER SECURITY INTEREST                95274
Name: count, dtype: int64

In [19]:
# 3. Keep only the roles of interest (plus NaNs) ───────────────────────
target_roles = [
    "5% OR GREATER DIRECT OWNERSHIP INTEREST",
    "5% OR GREATER INDIRECT OWNERSHIP INTEREST",
    "PARTNERSHIP INTEREST",
]

# One case-insensitive alternation over the escaped role labels; it is used
# with str.contains, so a label matches anywhere inside the role text.
role_regex = re.compile(
    "|".join(re.escape(label) for label in target_roles),
    flags=re.IGNORECASE,
)

# Rows with a missing role are kept alongside the rows that match a target role.
role_is_null = df["role"].isna()
mask_roles = df["role"].fillna("").str.contains(role_regex)
df = df.loc[mask_roles | role_is_null].copy()
print(f"\nSTEP 1  role-filter → {len(df):,} rows")


STEP 1  role-filter → 5,121,498 rows


In [20]:
# 4. De-duplicate identical ownership snapshots ───────────────────────
# Rows that agree on every column listed here count as the same record.
dedup_cols = [
    "cms_certification_number",
    "provider_name",
    "role",
    "owner_type",
    "owner_name",
    "ownership_percentage",
    "association_date",
]

# Sort by processing_date (ascending) so that keep="last" retains the most
# recently processed copy of each record, then renumber the index from 0.
df = df.sort_values(by="processing_date", ascending=True)
df = df.drop_duplicates(subset=dedup_cols, keep="last")
df = df.reset_index(drop=True)
print(f"STEP 2  de-dup      → {len(df):,} rows")

STEP 2  de-dup      → 162,904 rows


In [21]:
# 5. Fill NaN CCNs where provider name maps unambiguously ─────────────
def fill_ccn_from_provider(df, ccn_col="cms_certification_number", name_col="provider_name"):
    """Fill missing CCNs from provider names that map to exactly one CCN.

    A provider name is treated as unambiguous when all of its rows with a
    non-null ``ccn_col`` share a single distinct value. Rows whose
    ``ccn_col`` is missing receive that value when their ``name_col`` is
    unambiguous; all other missing values are left missing. A one-line
    summary of how many values were filled is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``ccn_col`` and ``name_col``. It is modified in
        place and also returned.
    ccn_col : str
        Column holding the certification number to fill.
    name_col : str
        Column holding the provider name used as the lookup key.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``ccn_col`` filled where
        possible.
    """
    # Build the name -> CCN lookup from rows where the CCN is known.
    known = df.dropna(subset=[ccn_col])
    ccn_by_name = known.groupby(name_col, observed=True)[ccn_col]

    # Built-in aggregations replace a per-group Python set(): a name is
    # unambiguous when it has exactly one distinct CCN, and that CCN is then
    # simply the first one in the group.
    n_distinct = ccn_by_name.nunique()
    unamb = ccn_by_name.first()[n_distinct == 1]

    # Compute the missing-value mask once and reuse it for both sides of the
    # assignment; names absent from the lookup map to a missing value.
    missing = df[ccn_col].isna()
    before = int(missing.sum())
    df.loc[missing, ccn_col] = df.loc[missing, name_col].map(unamb)
    after = int(df[ccn_col].isna().sum())
    print(f"🔍 CCN fill: {before-after:,} filled | {after:,} still missing")
    return df

# fill_ccn_from_provider writes into the frame it is given and returns that
# same object, so this rebinding keeps `df` pointing at the updated table.
df = fill_ccn_from_provider(df)

🔍 CCN fill: 443 filled | 12,560 still missing


In [22]:
# ────────────────────────────────────────────────────────────────────────────────
# 6.  Save clean file
# ────────────────────────────────────────────────────────────────────────────────
# Write the cleaned table to the interim data folder, omitting the row index.
out_csv = INTERIM_DIR.joinpath("ownership_file_clean.csv")
df.to_csv(path_or_buf=out_csv, index=False)

# Report where the file went and the final (rows, columns) of the table.
print("\n💾 Saved clean table →", out_csv)
print("📦 Final shape       :", df.shape)


💾 Saved clean table → C:\Repositories\white_bowblis_nhmc\data\interim\ownership_file_clean.csv
📦 Final shape       : (162904, 12)
