In [9]:
import pandas as pd
from pathlib import Path

In [10]:
# 1. Folder containing your monthly provider_info CSVs
DATA_DIR = Path(r"C:\Users\wrthj\OneDrive\NursingHomeData\provider-info-files")

# 2. The CCN we care about
TARGET_CCN = "366480"

# 3. Collect matches here
matches = []

# 4. Loop over years & months
for year in range(2018, 2026):
    max_month = 7 if year == 2025 else 12
    for month in range(1, max_month + 1):
        fn = DATA_DIR / f"provider_info_{month:02d}_{year}.csv"
        if not fn.exists():
            continue
        # read & filter
        df = pd.read_csv(fn, dtype=str, parse_dates=["date"], low_memory=False)
        sel = df[df["cms_certification_number"] == TARGET_CCN]
        if not sel.empty:
            matches.append(sel)

# 5. Concatenate (or empty DF if no matches)
if matches:
    df_ccn = pd.concat(matches, ignore_index=True)
else:
    # if you want the same columns as your monthly files, pick one representative file:
    sample = next((DATA_DIR / f"provider_info_{m:02d}_{y}.csv" 
                   for y in range(2018,2026) for m in range(1, (5 if y==2025 else 12)+1)
                   if (DATA_DIR / f"provider_info_{m:02d}_{y}.csv").exists()), None)
    df_ccn = pd.read_csv(sample, nrows=0) if sample else pd.DataFrame()

# 6. Review
print(f"Found {len(df_ccn)} rows for CCN {TARGET_CCN}\n")
# this will render as a DataFrame in Jupyter
df_ccn

Found 52 rows for CCN 366480



Unnamed: 0,cms_certification_number,provider_name,address,city,state,zip_code,phone,provider_ssa_county_code,provider_county_name,ownership_type,...,chain_average_overall_5star_rating,chain_average_health_inspection_rating,chain_average_staffing_rating,chain_average_qm_rating,rating_cycle_23_total_number_of_health_deficiencies,rating_cycle_23_number_of_complaint_health_deficiencies,rating_cycle_23_health_deficiency_score,rating_cycle_23_number_of_health_revisits,rating_cycle_23_health_revisit_score,rating_cycle_23_total_health_score
0,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
1,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
2,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
3,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
4,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
5,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
6,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
7,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
8,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,
9,366480,TAYLOR SPRINGS HEALTH CAMPUS,748 TAYLOR ROAD,GAHANNA,OH,43230,6148636384.0,250,Franklin,For profit - Limited Liability company,...,,,,,,,,,,


In [11]:
print(df_ccn.columns.tolist)

<bound method IndexOpsMixin.tolist of Index(['cms_certification_number', 'provider_name', 'address', 'city', 'state',
       'zip_code', 'phone', 'provider_ssa_county_code', 'provider_county_name',
       'ownership_type',
       ...
       'chain_average_overall_5star_rating',
       'chain_average_health_inspection_rating',
       'chain_average_staffing_rating', 'chain_average_qm_rating',
       'rating_cycle_23_total_number_of_health_deficiencies',
       'rating_cycle_23_number_of_complaint_health_deficiencies',
       'rating_cycle_23_health_deficiency_score',
       'rating_cycle_23_number_of_health_revisits',
       'rating_cycle_23_health_revisit_score',
       'rating_cycle_23_total_health_score'],
      dtype='object', length=124)>


In [12]:
for col in df_ccn.columns:
    print(col)

cms_certification_number
provider_name
address
city
state
zip_code
phone
provider_ssa_county_code
provider_county_name
ownership_type
number_of_certified_beds
average_number_of_residents_per_day
average_number_of_residents_per_day_footnote
provider_type
provider_resides_in_hospital
legal_business_name
date_first_approved_to_provide_medicare_and_medicaid_services
continuing_care_retirement_community
special_focus_status
abuse_icon
most_recent_health_inspection_more_than_2_years_ago
provider_changed_ownership_in_last_12_months
with_a_resident_and_family_council
automatic_sprinkler_systems_in_all_required_areas
overall_rating
overall_rating_footnote
health_inspection_rating
health_inspection_rating_footnote
qm_rating
qm_rating_footnote
longstay_qm_rating
longstay_qm_rating_footnote
shortstay_qm_rating
shortstay_qm_rating_footnote
staffing_rating
staffing_rating_footnote
rn_staffing_rating
rn_staffing_rating_footnote
reported_staffing_footnote
physical_therapist_staffing_footnote
reported_

In [13]:
cols_to_keep = ["cms_certification_number",
                "provider_name",
                "number_of_certified_beds",
                "average_number_of_residents_per_day",
                "overall_rating",
                "health_inspection_rating",
                "staffing_rating",
                "source_file",
                "reported_nurse_aide_staffing_hours_per_resident_per_day",
                "reported_lpn_staffing_hours_per_resident_per_day",
                "reported_rn_staffing_hours_per_resident_per_day",
                "reported_total_nurse_staffing_hours_per_resident_per_day",
                "processing_date"
               ]

df_clean = df_ccn[cols_to_keep]

In [14]:
df_clean.head(n=105)

Unnamed: 0,cms_certification_number,provider_name,number_of_certified_beds,average_number_of_residents_per_day,overall_rating,health_inspection_rating,staffing_rating,source_file,reported_nurse_aide_staffing_hours_per_resident_per_day,reported_lpn_staffing_hours_per_resident_per_day,reported_rn_staffing_hours_per_resident_per_day,reported_total_nurse_staffing_hours_per_resident_per_day,processing_date
0,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,,,,,provider_info_03_2021.csv,,,,,2021-03-01
1,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,,,,,provider_info_04_2021.csv,,,,,2021-04-01
2,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,,,,,provider_info_05_2021.csv,,,,,2021-05-01
3,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,,,,,provider_info_06_2021.csv,,,,,2021-06-01
4,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,5.6,,,,provider_info_07_2021.csv,,,,,2021-07-01
5,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,5.6,,,,provider_info_08_2021.csv,,,,,2021-08-01
6,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,5.6,,,,provider_info_09_2021.csv,,,,,2021-09-01
7,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,17.3,,,,provider_info_10_2021.csv,,,,,2021-10-01
8,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,17.3,,,,provider_info_11_2021.csv,,,,,2021-10-01
9,366480,TAYLOR SPRINGS HEALTH CAMPUS,50,17.3,,,,provider_info_12_2021.csv,,,,,2021-11-01


In [15]:
outfile = r"C:\Repositories\jefferson-township-run-forecasting\data\raw\nh_data.csv"
df_clean.to_csv(outfile, index=False)