In [1]:
# === WHAT YOU'LL NEED TO INTERACT WITH ===

# download_dir
# This defines the download location. The default, "/Users/ashlanjo/Downloads/Statcom_final", should be updated to a path
# appropriate for your system. It's recommended to create a dedicated folder for these downloads.
# The code will redownload files to this folder if they are stored elsewhere or don't follow the naming convention.
# To preserve old files, copy them into the new folder (rather than moving or renaming them).

# last_scrap_date
# This marks the last date the script is assumed to have been run. Files dated before this are skipped, saving significant time.
# It's recommended to backdate last_scrap_date by ~2 days to avoid missing late-uploaded files (e.g., uploads made later the same day).
# Files already downloaded won't be downloaded again, so this buffer is harmless.
# To force downloading older files, set last_scrap_date to a date far in the past (e.g., 20 years ago).
# This is easier than modifying the logic that skips older files.

# urls_to_run = sub_urls
# This should generally be left unchanged. However, to run the code for specific agencies, you can redefine sub_urls to a subset.
# Use print(urls_to_run) and check the links to ensure the correct agencies are selected.
# The order in sub_urls matches the order on the website.
# Be sure to reset it back to the full list once you're done.

# Manual interruptions and errors
# If you manually stop the code, make sure to run driver.quit() before doing anything else.
# This ensures that the browser is properly closed so it can reopen for future scraping sessions.
# If the code errors out, driver.quit() is automatically executed.

In [9]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

chrome_server = Service(executable_path='/Users/stefaneng/lib/chromedriver-mac-arm64/chromedriver')
chrome_server.start()
driver = webdriver.Chrome(service=chrome_server)
driver.get('http://www.google.com/')
time.sleep(5) # Let the user actually see something!
search_box = driver.find_element("name", "q")
search_box.send_keys('ChromeDriver')
search_box.submit()
time.sleep(5) # Let the user actually see something!
driver.quit()

In [None]:
#To Do
#Send to Kevin

In [16]:
# Selenium is used for web automation and scraping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
# Time utilities for delays and timestamp handling
import time
from time import sleep
# Date and time parsing
from datetime import datetime
# File and path operations
from pathlib import Path
import os
import glob

In [56]:
#This code collects the links for all of the individual agencies (location where children stay)
#These links will be iterated through in future code chunks

# Initialize a Chrome browser session and navigate to the main licensing search page
chrome_server = Service(executable_path='/Users/stefaneng/lib/chromedriver-mac-arm64/chromedriver')
chrome_server.start()
driver = webdriver.Chrome(service=chrome_server)

try:
    driver.get("https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/")
    time.sleep(5) # Wait for the page to fully load (longer than usual due to dynamic content)
    # Prepare an empty list to store all agency-specific URLs found across pages

    sub_urls = []
    table_header = []
    table_data = []
    header_elements = driver.find_elements(By.XPATH, "//lightning-datatable//table/thead/tr/th")
    # Clean up header text by removing "Sort by:" and "Sorted: None"
    for i, header in enumerate(header_elements):
        text = header.text.strip()
        text = text.replace("Sort by:", "").replace("Sorted: None", "").replace("\n", " ").strip()
        table_header.append(text)

    while True:
        # Find the license number rows:
        # Change this to tr OR th
        table_rows = driver.find_elements(By.XPATH, "//lightning-datatable//table/tbody/tr")
        for row in table_rows:
            # Find all columns (td elements) in the row
            row_data = []
            columns = row.find_elements(By.XPATH, "./td | ./th")
            for col in columns:
                row_data.append(col.text.strip())
            table_data.append(row_data)

        for row in table_rows:
            link_elements = row.find_elements(By.XPATH, ".//lightning-formatted-url/a")
            for link in link_elements:
                href = link.get_attribute('href')
                if href:
                    sub_urls.append(href)

        try:
            # Try to locate and click the "Next" page button to load the next page of results
            next_button = driver.find_element(By.XPATH, "//lightning-button-icon[3]/button/lightning-primitive-icon")
            next_button.click()
        except ElementClickInterceptedException:
            # If the click fails (e.g., no more pages or overlay blocking it), stop the loop
            print("No more pages available.")
            break

finally:
    driver.quit()
# Show the number of agency URLs collected
len(sub_urls)

print(table_header)
print(table_data)

No more pages available.
['License #', 'Agency Name', 'City', 'County', 'Zip Code', 'Agency Type']
[['CI390407805', "Glen's House", 'Kalamazoo', 'Kalamazoo', '49001-3170', 'Child Caring Institution: Private'], ['CI250410439', 'Mission Ranch', 'Gaines', 'Genesee', '48436-9746', 'Child Caring Institution: Private'], ['CI410414464', 'Heartland Center for Autism', 'Grand Rapids', 'Kent', '49507-2772', 'Child Caring Institution: Private'], ['CI670414130', 'Osceola Youth Center', 'Evart', 'Osceola', '49631-7218', 'Child Caring Institution: Private'], ['CI390403045', "Let's Talk About it Girl's Home III", 'Kalamazoo', 'Kalamazoo', '49007-4615', 'Child Caring Institution: Private'], ['CI730404189', 'House of Love Agency', 'Saginaw', 'Saginaw', '48602-1528', 'Child Caring Institution: Private'], ['CI330410650', 'Jackson House', 'Lansing', 'Ingham', '48910-2871', 'Child Caring Institution: Private'], ['CI820294390', 'VISTA MARIA', 'DEARBORN HEIGHTS', 'Wayne', '48127-2622', 'Child Caring Institut

In [60]:
import csv
import re

# Prepare header with additional columns
csv_header = table_header + ["URL", "agency_id"]
# Replace "#" with Number
csv_header = [col.replace("#", "Number") for col in csv_header]

# Prepare data rows with URL and agency_id
csv_rows = []
for row, url in zip(table_data, sub_urls):
    # Extract agency_id from the URL
    match = re.search(r'agency=([^&]+)', url)
    agency_id = match.group(1) if match else ""
    csv_rows.append(row + [url, agency_id])

# Write to CSV
csv_file_path = "agencies_with_urls.csv"
# Quote all fields to handle commas and special characters
with open(csv_file_path, "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(csv_header)
    writer.writerows(csv_rows)

print(f"CSV file saved to {csv_file_path}")

CSV file saved to agencies_with_urls.csv


In [35]:
chrome_server = Service(executable_path='/Users/stefaneng/lib/chromedriver-mac-arm64/chromedriver')
chrome_server.start()
driver = webdriver.Chrome(service=chrome_server)
driver.get("https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/")
time.sleep(10)
entire_table = driver.find_elements(By.XPATH, "//lightning-datatable")
# Parse the lightning-datatable into a list of rows and columns (as text)
table_data = []
# Find all table rows within the datatable
table_rows = driver.find_elements(By.XPATH, "//lightning-datatable//table/tbody/tr")
# Get the table header
table_header = []
header_elements = driver.find_elements(By.XPATH, "//lightning-datatable//table/thead/tr/th")
# Clean up header text by removing "Sort by:" and "Sorted: None"
for i, header in enumerate(header_elements):
    text = header.text.strip()
    text = text.replace("Sort by:", "").replace("Sorted: None", "").replace("\n", " ").strip()
    table_header.append(text)
for row in table_rows:
    row_data = []
    # Find all columns (td elements) in the row
    columns = row.find_elements(By.TAG_NAME, "td")
    for col in columns:
        row_data.append(col.text.strip())
    table_data.append(row_data)
driver.quit()




In [38]:
#print(entire_table)
print(table_rows)
#print(table_data)
print(table_header)

[<selenium.webdriver.remote.webelement.WebElement (session="ad1af21853314b14c8816d8b682b1461", element="f.062E6C7FBC71E1F82885676ED87A6C0C.d.34970BFDC27B3B6AEC7F34C93A70066E.e.40")>, <selenium.webdriver.remote.webelement.WebElement (session="ad1af21853314b14c8816d8b682b1461", element="f.062E6C7FBC71E1F82885676ED87A6C0C.d.34970BFDC27B3B6AEC7F34C93A70066E.e.41")>, <selenium.webdriver.remote.webelement.WebElement (session="ad1af21853314b14c8816d8b682b1461", element="f.062E6C7FBC71E1F82885676ED87A6C0C.d.34970BFDC27B3B6AEC7F34C93A70066E.e.42")>, <selenium.webdriver.remote.webelement.WebElement (session="ad1af21853314b14c8816d8b682b1461", element="f.062E6C7FBC71E1F82885676ED87A6C0C.d.34970BFDC27B3B6AEC7F34C93A70066E.e.43")>]
['License #', 'Agency Name', 'City', 'County', 'Zip Code', 'Agency Type']


In [None]:
#All files prior to this date will not be downloaded
#All files on or after this data will usually be downloaded
#The exception is if the file is already in the download location with the correct name
last_scrap_date = "7-01-2025"
last_scrap_date = datetime.strptime(last_scrap_date, '%m-%d-%Y').date()

#All files will be downloaded here
download_dir = "/Users/stefaneng/Library/CloudStorage/Dropbox-UniversityofMichigan/Stefan Eng/MCYJ_parsing/test_download"

#This line can be adjusted to run only certain agencies
urls_to_run = sub_urls

In [14]:
#Initialize the browser
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory": download_dir}
chromeOptions.add_experimental_option("prefs", prefs)
#driver = webdriver.Chrome(options=chromeOptions)
driver = webdriver.Chrome(service=chrome_server, options=chromeOptions)

#Iterates through each of the urls collected above
try:
    for url in urls_to_run:
            # Open the target page
        driver.get(url)
        time.sleep(20)

        # Get the document agency name
        document_agency_element = driver.find_elements(By.XPATH, "//lightning-layout-item[1]/slot/div[1]/div")
        document_agency = "_".join([element.text.strip().replace(" ", "_").replace("/", "_") for element in document_agency_element]) if document_agency_element else "Unknown_Agency"

        print(document_agency)

        # Locate all rows within the table
        table_rows = driver.find_elements(By.XPATH, "//lightning-datatable/div[2]/div/div/table/tbody/tr")
        print("Number of rows found:", len(table_rows))

        # Iterate through each row/ document
        for row in reversed(table_rows):
            try:
                # Extract document date from the 1st column and reformats it
                document_date_element = row.find_element(By.XPATH, "./td[1]")
                document_date = document_date_element.text.strip().replace(" ", "_")
                sanitized_date = document_date.replace("/", "-")
                sanitized_date = datetime.strptime(sanitized_date, '%m-%d-%Y').date()

                # Extract document name from the 2nd column and reformats it
                document_name_element = row.find_element(By.XPATH, "./td[2]")
                document_name_temp = document_name_element.text.strip().replace(" ", "_")
                document_name = document_name_temp.replace("/", "-")

                #Skips all the files that should have been scrapped on a prior run
                if sanitized_date < last_scrap_date:
                    print(document_name, " and all files before it should have been downloaded before")
                    break

                # Construct the new file name
                new_file_name = f"{document_agency}_{document_name}_{sanitized_date}.pdf"
                new_file_path = os.path.join(download_dir, new_file_name)

                #If the file is already downloaded (and follows the naming convention)
                #It will skip it
                if os.path.exists(new_file_path):
                    print(document_name, "has been downloaded since last scrap")
                    continue

                # Locate the clickable element in the 4th column
                clickable_element = row.find_element(By.XPATH, "./td[4]")

                # Check if the element is clickable
                if clickable_element.is_enabled() and clickable_element.is_displayed():
                    # Click to download
                    clickable_element.click()
                    print(f"Clicked on element in row with document '{document_name}'")

                    # Wait for the file to download (adjust wait time as needed)
                    time.sleep(10)

                    # Find the most recently downloaded file
                    list_of_files = glob.glob(f"{download_dir}/*")
                    latest_file = max(list_of_files, key=os.path.getctime)

                    # Rename the file
                    os.rename(latest_file, new_file_path)
                    print(f"Renamed file to: {new_file_name}")

            except Exception as e:
                print("Failed to click on element in row:", e)
    driver.quit()
finally:
    driver.quit()

Glen's_House
Number of rows found: 4
Glens_House_interim_2024  and all files before it should have been downloaded before
Mission_Ranch
Number of rows found: 11
Mission_Ranch_Renewal_2025  and all files before it should have been downloaded before
Heartland_Center_for_Autism
Number of rows found: 6
2025SIC0000409  and all files before it should have been downloaded before
Osceola_Youth_Center
Number of rows found: 25
Clicked on element in row with document '2025SIC0000657'
Renamed file to: Osceola_Youth_Center_2025SIC0000657_2025-07-09.pdf
Clicked on element in row with document '2025SIC000737'
Renamed file to: Osceola_Youth_Center_2025SIC000737_2025-07-09.pdf
Clicked on element in row with document '2025SIC0000763'
Renamed file to: Osceola_Youth_Center_2025SIC0000763_2025-07-02.pdf
2025SIC0000655  and all files before it should have been downloaded before
Let's_Talk_About_it_Girl's_Home_III
Number of rows found: 17
2025SIC0000512  and all files before it should have been downloaded be

KeyboardInterrupt: 