In [1]:
# ------------------------------------------------
# Understanding the structure of the dataframe
# ------------------------------------------------

import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# ------------------------------------------------
# Display configuration and raw-data load
# ------------------------------------------------

# Never truncate the column list when a frame is rendered, and let text
# output run to 180 characters before wrapping.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 180)

# ── 1. Resolve OneDrive + load the raw file ────────────────────────────────
# The OneDrive root comes from the environment rather than a hard-coded user
# folder, so the same cell runs on several machines. Indexing os.environ
# directly means a missing variable surfaces as a KeyError.
one_drive = Path(os.environ["OneDrive"])
csv_path = one_drive.joinpath(
    "Documents",
    "Honors_Thesis",
    "data",
    "data-raw",
    "ownership-files",
    "ownership_combined.csv",
)

# Fail early with the full resolved path if the file is not where we expect.
if not csv_path.is_file():
    raise FileNotFoundError(f"Expected CSV not found: {csv_path}")

# dtype={0: str} parses the first CSV column as text instead of letting pandas
# infer a numeric type (presumably the facility ID, so leading zeros survive —
# TODO confirm against the file header). low_memory=False makes pandas infer
# each remaining column's dtype from the whole file in one pass.
df_raw = pd.read_csv(csv_path, low_memory=False, dtype={0: str})

In [41]:
# ── 0. Inspect the available role labels (optional but recommended) ───────
# value_counts(dropna=False) also tallies NaN, so missing roles and odd
# spellings are visible in the printed counts.
print("Distinct role values:\n")
print(df_raw["role"].value_counts(dropna=False).to_string())

# ── 1. Define the roles of interest ───────────────────────────────────────
target_roles = [
    "5% OR GREATER DIRECT OWNERSHIP INTEREST",
    "5% OR GREATER INDIRECT OWNERSHIP INTEREST",
    "PARTNERSHIP INTEREST",
]

# One case-insensitive pattern that matches any target label. Each label is
# escaped word by word and the words are re-joined with \s+, so repeated or
# irregular whitespace inside a label still matches. Misspelled labels are
# NOT matched — check the tally printed above for variants.
# (`re` is imported in the notebook's import cell.)
role_regex = re.compile(
    "|".join(
        r"\s+".join(re.escape(word) for word in label.split())
        for label in target_roles
    ),
    flags=re.IGNORECASE,
)

# ── 2. Filter, but keep rows where role is missing (NaN) ──────────────────
# Everything is derived from df_raw, so this cell runs on a fresh kernel.
# NaN roles are blanked before matching so the mask is purely boolean; those
# rows are then added back explicitly through role_is_missing.
role_is_missing = df_raw["role"].isna()
mask = df_raw["role"].fillna("").str.contains(role_regex)
df_core = df_raw[mask | role_is_missing].copy()

print(f"\n✅ df_core shape → {df_core.shape} (kept {mask.sum():,} matching rows plus {role_is_missing.sum():,} NaN‑role rows)")

# df_core is now the working dataset; df_raw itself is left unmodified.

Distinct role values:

role
DIRECTOR                                     3287371
5% OR GREATER INDIRECT OWNERSHIP INTEREST    2853623
OFFICER                                      2789665
5% OR GREATER DIRECT OWNERSHIP INTEREST      2118152
MANAGING EMPLOYEE                            1971404
OPERATIONAL/MANAGERIAL CONTROL               1954320
PARTNERSHIP INTEREST                          149723
CORPORATE DIRECTOR                            147091
CORPORATE OFFICER                             128420
5% OR GREATER SECURITY INTEREST                95274
Ownership Data Not Available                   71366
5% OR GREATER MORTGAGE INTEREST                54685

✅ df_core shape → (5121498, 12) (kept 5,121,498 matching rows plus 0 NaN‑role rows)


In [42]:
# Preview the first five rows of the role-filtered frame.
df_core.head(n=5)

Unnamed: 0,cms_certification_number,provider_name,role,owner_type,owner_name,ownership_percentage,association_date,processing_date,source_file,month,year,date
0,15009,"BURNS NURSING HOME, INC.",5% OR GREATER DIRECT OWNERSHIP INTEREST,Individual,"DEARMAN, LARRY",10%,since 09/01/1969,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
1,365987,CALCUTTA HEALTH CARE CENTER,5% OR GREATER DIRECT OWNERSHIP INTEREST,Organization,"JCTH HOLDINGS, INC.",100%,since 10/01/2009,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
2,365987,CALCUTTA HEALTH CARE CENTER,5% OR GREATER INDIRECT OWNERSHIP INTEREST,Individual,"CILONE, JOSEPH",20%,since 10/01/2009,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
3,365987,CALCUTTA HEALTH CARE CENTER,5% OR GREATER INDIRECT OWNERSHIP INTEREST,Individual,"FRANKOVITCH, CARL",20%,since 10/01/2009,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
4,365987,CALCUTTA HEALTH CARE CENTER,5% OR GREATER INDIRECT OWNERSHIP INTEREST,Individual,"HUBER, MICHAEL",20%,since 10/01/2009,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01


In [85]:

# ── 3. Drop *exact* duplicates across the ownership‑identity fields ——–––
# Rows that agree on every column below are treated as the same ownership
# record; only one representative per group is kept.
# NOTE(review): this de-duplicates df_raw (all roles), not the role-filtered
# df_core — confirm that is intended.
dedup_cols = [
    "cms_certification_number", "provider_name",
    "role", "owner_type", "owner_name",
    "ownership_percentage", "association_date"
]

# Sort oldest → newest, then keep the last row of each duplicate group, i.e.
# the most recent snapshot.
#   * kind="stable": rows sharing a processing_date keep their original
#     relative order, so the surviving row is the same on every run (the
#     default quicksort gives no such guarantee).
#   * na_position="first": a row with a missing processing_date is never
#     preferred over a dated one.
# NOTE(review): if processing_date is stored as text, this ordering is only
# chronological for ISO-style YYYY-MM-DD values — confirm the column format.
df_dedup = (
    df_raw.sort_values("processing_date", kind="stable", na_position="first")
          .drop_duplicates(subset=dedup_cols, keep="last")
          .reset_index(drop=True)
)

# ── 4. Report shrinkage ──────────────────────────────────────────────────
before, after = len(df_raw), len(df_dedup)
# Guard the ratio so an empty input frame reports 0.0% instead of raising
# ZeroDivisionError.
retained = after / before if before else 0.0
print(f"Rows before: {before:,}  →  after de‑dup: {after:,}  "
      f"({retained:.1%} of original)")

Rows before: 15,621,094  →  after de‑dup: 549,979  (3.5% of original)


In [87]:
# ── 2. Slice out one facility's rows (no second read of the CSV) ─────────
# df_raw already holds the whole file, so filtering it in memory avoids
# re-parsing the CSV and holding a second full-size frame just to keep a few
# hundred rows.
ccn = "015009"

ccn_column = df_raw["cms_certification_number"]

# The comparison below is against a text CCN. If the column was parsed as a
# number, leading zeros are gone and the filter would silently match nothing,
# so fail loudly instead.
if pd.api.types.is_numeric_dtype(ccn_column):
    raise TypeError(
        "df_raw['cms_certification_number'] is numeric, so leading zeros were "
        "lost on load; re-read the CSV with that column typed as str."
    )

df_015009 = (
    df_raw.loc[ccn_column == ccn]
          # Cast the ID column to pandas' nullable "string" dtype for the slice.
          .astype({"cms_certification_number": "string"})
          .reset_index(drop=True)
)

# ── 3. Peek at the data ─────────────────────────────────────────────────
print(f"Rows for CCN {ccn}: {len(df_015009):,}")
display(df_015009.head())

Rows for CCN 015009: 619


Unnamed: 0,cms_certification_number,provider_name,role,owner_type,owner_name,ownership_percentage,association_date,processing_date,source_file,month,year,date
0,15009,"BURNS NURSING HOME, INC.",5% OR GREATER DIRECT OWNERSHIP INTEREST,Individual,"DEARMAN, LARRY",10%,since 09/01/1969,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
1,15009,"BURNS NURSING HOME, INC.",5% OR GREATER DIRECT OWNERSHIP INTEREST,Individual,"DEARMAN, MARTHA",81%,since 09/01/1969,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
2,15009,"BURNS NURSING HOME, INC.",DIRECTOR,Individual,"DEARMAN, MARTHA",NOT APPLICABLE,since 09/07/1969,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
3,15009,"BURNS NURSING HOME, INC.",OFFICER,Individual,"DEARMAN, LARRY",NOT APPLICABLE,since 09/01/1969,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
4,15009,"BURNS NURSING HOME, INC.",OFFICER,Individual,"DEARMAN, MARTHA",NOT APPLICABLE,since 09/01/1969,2017-01-01,ownership_download_01_2017,1,2017,2017-01-01
