In [34]:
import re
import time
import subprocess
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Filesystem path of the chromedriver binary; main() passes it to Selenium's Service.
CHROMEDRIVER_PATH = "/opt/homebrew/bin/chromedriver"  # Update if needed
# Listing URL; main() appends "?page=N" to it to request each results page.
BASE_URL = "https://medicine.ucsf.edu/people"

def parse_name(raw_name: str):
    """
    Split a displayed name into (first, middle, last, nickname).

    Steps:
      * A nickname is taken from the first curly-quoted span, else the first
        straight-quoted span, else the first parenthesised span; the matched
        text (delimiters included) is removed from the name.
      * Everything from the first comma onward (e.g. degrees) is discarded.
      * The remainder is split on whitespace: a lone token is the last name,
        two tokens are first/last, and with three or more the first two are
        first/middle while all remaining tokens are joined as the last name.

    Missing parts are returned as empty strings.
    """
    nickname = ""
    # Patterns are tried in priority order; only the first hit is used.
    for pattern in (r'“(.*?)”', r'"(.*?)"', r'\((.*?)\)'):
        hit = re.search(pattern, raw_name)
        if hit:
            nickname = hit.group(1)
            raw_name = raw_name.replace(hit.group(0), "")
            break

    # Keep only the text before the first comma, then tokenise it.
    before_comma = raw_name.partition(",")[0].strip()
    parts = before_comma.split()

    if len(parts) >= 3:
        given, second, *rest = parts
        return given, second, " ".join(rest), nickname
    if len(parts) == 2:
        return parts[0], "", parts[1], nickname
    surname = parts[0] if parts else ""
    return "", "", surname, nickname

def scroll_and_scrape(driver, data_rows):
    """
    Scroll the loaded page to the bottom, then scrape every profile card.

    Scrolling repeats until ``document.body.scrollHeight`` stops changing so
    lazily loaded content is present before the cards are read. One row is
    appended to ``data_rows`` per ``.profile-list-card`` element, in the
    order: University, First, Middle, Last, Nickname, Department, Position,
    Link.

    Args:
        driver: Selenium WebDriver already navigated to a listing page.
        data_rows: List mutated in place; receives one list per card.

    Returns:
        Set of the non-empty profile URLs found on this page. Cards whose
        name link could not be read contribute a row but no URL.
    """
    # Keep scrolling until the document height stops growing.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # pause before re-measuring so newly loaded content can render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    page_urls = set()
    for card in driver.find_elements(By.CSS_SELECTOR, ".profile-list-card"):
        try:
            name_link = card.find_element(By.CSS_SELECTOR, "h2 a")
            raw_name = name_link.text.strip()
            # get_attribute returns None when the anchor has no href.
            profile_url = name_link.get_attribute("href") or ""
        except Exception:
            # Best-effort: still record the card, but keep the placeholder
            # whole instead of letting parse_name split it into
            # First="Name", Middle="not", Last="found".
            profile_url = ""
            first, middle, last, nickname = "", "", "Name not found", ""
        else:
            first, middle, last, nickname = parse_name(raw_name)

        # A failed lookup is not a profile URL; keep it out of the set the
        # caller uses to detect new profiles.
        if profile_url:
            page_urls.add(profile_url)

        try:
            info_elem = card.find_element(By.CSS_SELECTOR, "p")
            lines = info_elem.text.split("\n")
            # First line of the paragraph is the position, second the department.
            position = lines[0].strip()
            department = lines[1].strip() if len(lines) > 1 else ""
        except Exception:
            position = ""
            department = ""

        data_rows.append([
            "University of California, San Francisco",
            first,
            middle,
            last,
            nickname,
            department,
            position,
            profile_url,
        ])
    return page_urls

def main():
    """
    Scrape every page of the people listing and save the rows to Excel.

    Pages are requested as ``BASE_URL?page=N`` starting at N=0. Pagination
    ends at the first page that yields no profile URL not already seen, or
    when a page fails to show the ``.view-people`` element within 20
    seconds. Rows collected up to that point are de-duplicated and written
    to the hard-coded ``output_path``.
    """
    # Function-scope import: only this block needs the exception type.
    from selenium.common.exceptions import TimeoutException

    # Prevent system sleep on macOS; skip quietly where caffeinate is absent.
    try:
        caffeinate_process = subprocess.Popen(["caffeinate", "-d"])
    except FileNotFoundError:
        caffeinate_process = None

    driver = None
    data_rows = []
    scraped_urls = set()
    page_num = 0

    try:
        # Created inside the try so a driver start-up failure still reaches
        # the finally block and the caffeinate process is not leaked.
        driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))
        while True:
            page_url = f"{BASE_URL}?page={page_num}"
            print(f"[DEBUG] Loading page {page_num}: {page_url}")
            driver.get(page_url)
            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".view-people"))
                )
            except TimeoutException:
                # Stop paginating but fall through to the save step so the
                # rows already scraped are not lost.
                print(f"[DEBUG] Timed out waiting for page {page_num}. Ending pagination.")
                break
            current_page_urls = scroll_and_scrape(driver, data_rows)
            # If no new URLs are found, assume we've reached the end
            new_urls = current_page_urls - scraped_urls
            if not new_urls:
                print(f"[DEBUG] No new profiles on page {page_num}. Ending pagination.")
                break
            scraped_urls.update(new_urls)
            page_num += 1
            # Optional: add a sleep to avoid overloading the server
            time.sleep(1)
    finally:
        if driver is not None:
            driver.quit()
        if caffeinate_process is not None:
            caffeinate_process.terminate()

    df = pd.DataFrame(data_rows, columns=[
        "University", "First", "Middle", "Last", "Nickname", "Department", "Position", "Link"
    ])
    # The same profile can be listed on more than one page (and the final
    # page may repeat earlier cards); keep one copy of each identical row.
    df = df.drop_duplicates()
    output_path = "/Users/elliehozhabri/Documents/RBP/scrape/UCSF_Medicine_People.xlsx"
    df.to_excel(output_path, index=False)
    print(f"Scraping complete. Data saved to '{output_path}'.")

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


[DEBUG] Loading page 0: https://medicine.ucsf.edu/people?page=0
[DEBUG] Found 12 profile cards on this page.
[DEBUG] New URLs found on page 0: 12
[DEBUG] Loading page 1: https://medicine.ucsf.edu/people?page=1
[DEBUG] Found 12 profile cards on this page.
[DEBUG] New URLs found on page 1: 12
[DEBUG] Loading page 2: https://medicine.ucsf.edu/people?page=2
[DEBUG] Found 12 profile cards on this page.
[DEBUG] New URLs found on page 2: 12
[DEBUG] Loading page 3: https://medicine.ucsf.edu/people?page=3
[DEBUG] Found 12 profile cards on this page.
[DEBUG] New URLs found on page 3: 12
[DEBUG] Loading page 4: https://medicine.ucsf.edu/people?page=4
[DEBUG] Found 12 profile cards on this page.
[DEBUG] New URLs found on page 4: 12
[DEBUG] Loading page 5: https://medicine.ucsf.edu/people?page=5
[DEBUG] Found 12 profile cards on this page.
[DEBUG] New URLs found on page 5: 11
[DEBUG] Loading page 6: https://medicine.ucsf.edu/people?page=6
[DEBUG] Found 12 profile cards on this page.
[DEBUG] New URL