In [9]:
# 1) Imports & Paths
import pandas as pd
from pathlib import Path

# Project root: the parent of the resolved current working directory.
ROOT = Path(".").resolve().parents[0]
# Raw inputs live in <ROOT>/data/raw, cleaned outputs in <ROOT>/data/clean.
RAW_DIR, CLEAN_DIR = (ROOT.joinpath("data", stage) for stage in ("raw", "clean"))

In [10]:
# 2) Read the raw nursing‐home CSV, requesting string dtype for every column
raw_path = RAW_DIR.joinpath("nh_data.csv")
df = pd.read_csv(filepath_or_buffer=raw_path, dtype=str)

In [11]:
# 3) Convert processing_date to datetimes. Values that cannot be parsed become
#    NaT (errors="coerce"), and rows with a NaT processing_date are discarded.
parsed_dates = pd.to_datetime(df["processing_date"], errors="coerce")
df = df.assign(processing_date=parsed_dates).dropna(subset=["processing_date"])

In [12]:
# 4) Restrict to analysis window (Jan 2018 – Jul 2025)
# The upper bound is exclusive (first day of Aug 2025): comparing with
# `<= "2025-07-31"` tests against midnight of 31 Jul and would drop any row
# stamped later that day. For date-only values both forms select the same rows.
in_window = (
    (df["processing_date"] >= "2018-01-01") &
    (df["processing_date"] < "2025-08-01")
)
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in later cells never write through a slice.
df = df.loc[in_window].copy()

In [13]:
# 5) Convert the key count columns to numeric dtype. Entries that are not valid
#    numbers become NaN (errors="coerce"); listed columns that are absent from
#    the frame are skipped.
numeric_cols = [
    "number_of_certified_beds",
    "average_number_of_residents_per_day",
]
df = df.assign(**{
    name: pd.to_numeric(df[name], errors="coerce")
    for name in numeric_cols
    if name in df.columns
})

In [14]:
# 6) Remove every column that has no non-null value at all
#    (e.g., rating fields that are empty for the whole extract)
df = df.dropna(how="all", axis="columns")

In [15]:
# 7) De-duplicate on the (CMS ID, processing_date) key: when several rows share
#    the key, only the first one encountered is kept, whatever the other
#    columns contain.
key_cols = ["cms_certification_number", "processing_date"]
is_repeat = df.duplicated(subset=key_cols, keep="first")
df = df[~is_repeat]

In [16]:
# 8) Month stamp for merging: each processing_date is mapped to the timestamp
#    at the start of its calendar month
month_periods = df["processing_date"].dt.to_period("M")
df["report_month"] = month_periods.dt.to_timestamp(how="start")

In [17]:
# 9) Save cleaned dataset
output_path = CLEAN_DIR / "nh_data_clean.csv"
# Create the output directory (and any missing parents) first: to_csv does not
# create directories and raises OSError if the target folder is missing,
# e.g. on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
df.to_csv(output_path, index=False)
print(f"Cleaned nursing‐home rows: {len(df)}")
# Last expression of the cell: show the first rows as a sanity check.
df.head()

Cleaned nursing‐home rows: 139


Unnamed: 0,cms_certification_number,provider_name,number_of_certified_beds,average_number_of_residents_per_day,overall_rating,health_inspection_rating,staffing_rating,source_file,reported_nurse_aide_staffing_hours_per_resident_per_day,reported_lpn_staffing_hours_per_resident_per_day,reported_rn_staffing_hours_per_resident_per_day,reported_total_nurse_staffing_hours_per_resident_per_day,processing_date,report_month
0,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,,,,,provider_info_03_2021.csv,,,,,2021-03-01,2021-03-01
1,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,,,,,provider_info_04_2021.csv,,,,,2021-04-01,2021-04-01
2,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,,,,,provider_info_05_2021.csv,,,,,2021-05-01,2021-05-01
3,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,,,,,provider_info_06_2021.csv,,,,,2021-06-01,2021-06-01
4,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,5.6,,,,provider_info_07_2021.csv,,,,,2021-07-01,2021-07-01
