In [1]:
# ------------------------------------------------------------------------------
# 1.  Project-wide paths (portable)
# ------------------------------------------------------------------------------

import os, re, shutil, zipfile, pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR      = Path(os.getenv("NH_DATA_DIR",  PROJECT_ROOT / "data" / "raw"))
OWN_DIR      = RAW_DIR / "ownership-files"
INTERIM_DIR  = PROJECT_ROOT / "data" / "interim"

OWN_DIR.mkdir(parents=True, exist_ok=True)
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# 2.  Load raw CSV ─────────────────────────────────────────────────────
csv_path = RAW_DIR / "ownership-files" / "ownership_combined.csv"

dtype_map = {"cms_certification_number": "string"}      # keep leading zeros
df = pd.read_csv(
    csv_path,
    dtype=dtype_map,
    parse_dates=["processing_date"],                    # ISO format
    low_memory=False
)

# parse two messy date columns
def parse_assoc(col: pd.Series) -> pd.Series:
    stripped = col.str.replace(r"^\s*since\s*", "", regex=True, case=False)
    return pd.to_datetime(stripped, format="%m/%d/%Y", errors="coerce")

df["association_date"] = parse_assoc(df["association_date"])
df["date"]             = pd.to_datetime(df["date"], errors="coerce")

print(f"\nSTEP 0  raw → {len(df):,} rows")


STEP 0  raw → 15,916,207 rows


In [3]:
# 2.a Quick structure / quality overview ──────────────────────────────
print("\n--- BASIC INFO -------------------------------------------------------")
print(df.info(show_counts=True))

print("\n--- NULL % by column --------------------------------------------------")
null_pct = df.isna().mean().mul(100).round(1).sort_values()
display(null_pct.to_frame("percent_null").T)

print("\n--- DATE RANGE --------------------------------------------------------")
print("association_date:", df["association_date"].min(), "→", df["association_date"].max())
print("processing_date :", df["processing_date"].min(),  "→", df["processing_date"].max())

print("\n--- UNIQUE PROVIDERS & OWNERS ----------------------------------------")
print("# unique CCNs      :", df["cms_certification_number"].nunique(dropna=True))
print("# unique providers :", df["provider_name"].nunique(dropna=True))
print("# unique owners    :", df["owner_name"].nunique(dropna=True))

print("\n--- TOP 10 ROLE VALUES -----------------------------------------------")
display(df["role"].value_counts(dropna=False).head(10))


--- BASIC INFO -------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15916207 entries, 0 to 15916206
Data columns (total 12 columns):
 #   Column                    Non-Null Count     Dtype         
---  ------                    --------------     -----         
 0   cms_certification_number  15916207 non-null  string        
 1   provider_name             15916202 non-null  object        
 2   role                      15916207 non-null  object        
 3   owner_type                15843474 non-null  object        
 4   owner_name                15843474 non-null  object        
 5   ownership_percentage      15843474 non-null  object        
 6   association_date          15841244 non-null  datetime64[ns]
 7   processing_date           15916207 non-null  datetime64[ns]
 8   source_file               15916207 non-null  object        
 9   month                     15916207 non-null  int64         
 10  year                      15

Unnamed: 0,cms_certification_number,provider_name,role,processing_date,source_file,month,year,date,owner_type,owner_name,ownership_percentage,association_date
percent_null,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5



--- DATE RANGE --------------------------------------------------------
association_date: 1942-01-01 00:00:00 → 2025-07-01 00:00:00
processing_date : 2017-01-01 00:00:00 → 2025-07-01 00:00:00

--- UNIQUE PROVIDERS & OWNERS ----------------------------------------
# unique CCNs      : 16402
# unique providers : 24104
# unique owners    : 94347

--- TOP 10 ROLE VALUES -----------------------------------------------


role
DIRECTOR                                     3287371
5% OR GREATER INDIRECT OWNERSHIP INTEREST    2910876
OFFICER                                      2789665
5% OR GREATER DIRECT OWNERSHIP INTEREST      2157271
OPERATIONAL/MANAGERIAL CONTROL               2005393
MANAGING EMPLOYEE                            1971404
CORPORATE DIRECTOR                            203809
CORPORATE OFFICER                             178971
PARTNERSHIP INTEREST                          149723
5% OR GREATER SECURITY INTEREST                96978
Name: count, dtype: int64

In [4]:
# 3. Keep only the roles of interest (plus NaNs) ───────────────────────
target_roles = [
    "5% OR GREATER DIRECT OWNERSHIP INTEREST",
    "5% OR GREATER INDIRECT OWNERSHIP INTEREST",
    "PARTNERSHIP INTEREST",
]
role_regex = re.compile("|".join(map(re.escape, target_roles)), flags=re.IGNORECASE)

mask_roles = df["role"].fillna("").str.contains(role_regex)
df = df[mask_roles | df["role"].isna()].copy()
print(f"\nSTEP 1  role-filter → {len(df):,} rows")


STEP 1  role-filter → 5,220,005 rows


In [5]:
# 4. De-duplicate identical ownership snapshots ───────────────────────
dedup_cols = [
    "cms_certification_number", "provider_name",
    "role", "owner_type", "owner_name",
    "ownership_percentage", "association_date"
]
df = (
    df.sort_values("processing_date")          # oldest → newest
      .drop_duplicates(subset=dedup_cols, keep="last")
      .reset_index(drop=True)
)
print(f"STEP 2  de-dup      → {len(df):,} rows")

STEP 2  de-dup      → 165,339 rows


In [6]:
# ────────────────────────────────────────────────────────────────────────────────
# 5.  Save clean file
# ────────────────────────────────────────────────────────────────────────────────
out_csv = INTERIM_DIR / "ownership_file_clean.csv"
df.to_csv(out_csv, index=False)
print("\n💾 Saved clean table →", out_csv)
print("📦 Final shape       :", df.shape)


💾 Saved clean table → C:\Repositories\white-bowblis-nhmc\data\interim\ownership_file_clean.csv
📦 Final shape       : (165339, 12)


In [2]:
import os, re, csv, zipfile, shutil, tempfile, warnings
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd

# ============================== Config / Paths ================================
PROJECT_ROOT = Path.cwd()
while not (PROJECT_ROOT / "src").is_dir() and PROJECT_ROOT != PROJECT_ROOT.parent:
    PROJECT_ROOT = PROJECT_ROOT.parent

RAW_DIR     = Path(os.getenv("NH_DATA_DIR", PROJECT_ROOT / "data" / "raw"))
NH_ZIP_DIR  = RAW_DIR / "nh-compare"
OWN_DIR     = RAW_DIR / "ownership-files"
OWN_DIR.mkdir(parents=True, exist_ok=True)

# Behavior flags
DRY_RUN        = False          # True = preview only
NAME_STYLE     = "mm_yyyy"      # "mm_yyyy" or "yyyy_mm"
DO_STANDARDIZE = True           # standardize in-place before combine
DO_COMBINE     = True           # write final combined CSV
COMBINED_CSV   = OWN_DIR / "ownership_combined.csv"

print(f"[paths] NH_ZIP_DIR={NH_ZIP_DIR}")
print(f"[paths] OWN_DIR={OWN_DIR}")

[paths] NH_ZIP_DIR=C:\Users\Owner\OneDrive\NursingHomeData\nh-compare
[paths] OWN_DIR=C:\Users\Owner\OneDrive\NursingHomeData\ownership-files


In [3]:
df = pd.read_csv(OWN_DIR / "ownership_combined.csv")

In [4]:
# Basic shape & column info
print("Shape:", df.shape)
print("\nColumns & dtypes:")
print(df.dtypes)

# Peek at first few rows
print("\nHead:")
print(df.head(5).to_string())

# Null counts
print("\nNull counts:")
print(df.isnull().sum())

# Distinct roles
print("\nUnique role values:", df['role'].unique())

# Ownership percentage summary
print("\nOwnership % summary:")
print(df['ownership_percentage'].describe())

# Date ranges
print("\nAssociation date range:", df['association_date'].min(), "to", df['association_date'].max())
print("Processing date range:", df['processing_date'].min(), "to", df['processing_date'].max())

# Count facilities (unique CCNs) and total owners
print("\nUnique facilities (CCNs):", df['cms_certification_number'].nunique())
print("Unique owners:", df['owner_name'].nunique())

Shape: (5330314, 7)

Columns & dtypes:
cms_certification_number      int64
role                         object
owner_type                   object
owner_name                   object
ownership_percentage        float64
association_date             object
processing_date              object
dtype: object

Head:
   cms_certification_number    role  owner_type owner_name  ownership_percentage association_date processing_date
0                      5125  DIRECT  Individual   VO, TIEN                 100.0       2016-10-24      2019-01-01
1                      5125  DIRECT  Individual   VO, TIEN                 100.0       2016-10-24      2020-01-01
2                      5125  DIRECT  Individual   VO, TIEN                 100.0       2016-10-24      2019-02-01
3                      5125  DIRECT  Individual   VO, TIEN                 100.0       2016-10-24      2020-02-01
4                      5125  DIRECT  Individual   VO, TIEN                 100.0       2016-10-24      2019-03-01

Nul