In [None]:
# Load Files
import pandas as pd
from pathlib import Path
import re

RAW_PATH = Path("../data/raw")
CLEAN_PATH = Path("../data/clean")

def extract_year(path):
    m = re.search(r"(20[0-2][0-9])", path.name)
    return int(m.group(1)) if m else None

service_files = sorted(RAW_PATH.glob("*Services_and_Profiles*.csv"))
service_files


[WindowsPath('../data/raw/2020_Final_Assisted_Reproductive_Technology_(ART)_Services_and_Profiles_20251127.csv'),
 WindowsPath('../data/raw/2021_Final_Assisted_Reproductive_Technology_(ART)_Services_and_Profiles_20251127.csv'),
 WindowsPath('../data/raw/2022_Final_Assisted_Reproductive_Technology_(ART)_Services_and_Profiles_20251127.csv')]

In [None]:
# Load and combined services
services_dfs = []

for f in service_files:
    year = extract_year(f)
    print("Loading:", f.name)
    df = pd.read_csv(f)
    df["year"] = year
    services_dfs.append(df)

services_all = pd.concat(services_dfs, ignore_index=True)
services_all.shape


Loading: 2020_Final_Assisted_Reproductive_Technology_(ART)_Services_and_Profiles_20251127.csv
Loading: 2021_Final_Assisted_Reproductive_Technology_(ART)_Services_and_Profiles_20251127.csv
Loading: 2022_Final_Assisted_Reproductive_Technology_(ART)_Services_and_Profiles_20251127.csv


(17012, 18)

In [None]:
# Inspect columns
services_all.columns


Index(['Year', 'LocationAbbr', 'LocationDesc', 'FacilityName',
       'MedicalDirector', 'Address', 'City', 'Zipcode', 'Phone',
       'Clinic Status', 'Topic', 'SubTopic', 'Data_Value', 'ClinicId',
       'TopicId', 'SubTopicId', 'Geolocation', 'year'],
      dtype='object')

In [None]:
# Drop unused columns
cols_to_drop = [
    "MedicalDirector",
    "Address",
    "Zipcode",
    "Phone",
    "TopicId",
    "SubTopicId"
]

services_clean = services_all.drop(columns=cols_to_drop, errors="ignore")
services_clean.head()


Unnamed: 0,Year,LocationAbbr,LocationDesc,FacilityName,City,Clinic Status,Topic,SubTopic,Data_Value,ClinicId,Geolocation,year
0,2020,,Other,National,,Open,Clinic Services & Profile,Fertility preservation cycles,24120,9999,,2020
1,2020,TX,Texas,IVFMD-Arlington,Arlington,Open,Clinic Services & Profile,SART member,Yes,827,POINT (-97.1133526 32.6905651),2020
2,2020,,Other,National,,Open,Clinic Services & Profile,Gestational carrier services,90%,9999,,2020
3,2020,,National,National,,Open,Clinic Services & Profile,Verified lab accreditation - Yes,88%,9999,,2020
4,2020,,National,National,,Open,Clinic Services & Profile,Fertility preservation cycles,24120,9999,,2020


In [None]:
# Standardize column names
services_clean = services_clean.rename(columns={
    "Clinic Status": "clinic_status",
    "Data_Value": "data_value"
})

services_clean.columns = (
    services_clean.columns
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("__", "_")
)

services_clean.head()
services_clean.columns


Index(['year', 'locationabbr', 'locationdesc', 'facilityname', 'city',
       'clinic_status', 'topic', 'subtopic', 'data_value', 'clinicid',
       'geolocation', 'year'],
      dtype='object')

In [6]:
services_clean.to_csv("../data/clean/services_clean.csv", index=False)


In [7]:
services_clean.shape


(17012, 12)