In [1]:
import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import warnings
import subprocess
import os
import re

# Suppress BeautifulSoup XMLParsedAsHTMLWarning
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# Prevent system sleep using caffeinate
caffeinate_process = subprocess.Popen(["caffeinate", "-d"])  # Keeps Mac awake during script execution

def setup_driver():
    try:
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-gpu")
        # Disable headless mode for debugging
        # options.add_argument("--headless")
        options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid detection
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")
        service = Service("/opt/homebrew/bin/chromedriver")  # Adjust path if needed
        driver = webdriver.Chrome(service=service, options=options)
        return driver
    except Exception as e:
        print(f"Error initializing ChromeDriver: {e}")
        exit()

driver = setup_driver()

# Navigate to UCSF homepage first to establish session
driver.get("https://profiles.ucsf.edu")
time.sleep(3)
print("UCSF homepage loaded successfully.")

driver.get("https://profiles.ucsf.edu/search/advanced")
time.sleep(3)
print("Search page loaded successfully.")

# Select the appropriate researcher types
positions_to_select = [
    "Professor", "Assistant Professor", "Associate Professor", "Clinical Research Coordinator"
]

for position in positions_to_select:
    try:
        checkbox = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, f"//label[contains(text(), '{position}')]/preceding-sibling::input"))
        )
        driver.execute_script("arguments[0].scrollIntoView();", checkbox)  # Ensure visibility
        if not checkbox.is_selected():
            checkbox.click()
        print(f"Selected: {position}")
    except Exception as e:
        print(f"Could not select {position}: {e}")

time.sleep(2)

# Click the search button with corrected logic
try:
    search_button = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "search-button"))
    )
    driver.execute_script("arguments[0].scrollIntoView();", search_button)
    time.sleep(2)
    driver.execute_script("arguments[0].click();", search_button)
    print("Search button clicked successfully.")
except Exception as e:
    print(f"Failed to click search button: {e}")
    driver.quit()
    exit()

# Dynamically wait for search results to appear
try:
    time.sleep(10)  # Extra wait for JavaScript execution
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Force-load results
    WebDriverWait(driver, 90).until(
        EC.presence_of_element_located((By.XPATH, "//a[contains(@class, 'listTableLink')]")
    ))
    print("Search results loaded successfully.")
except Exception as e:
    print(f"Search results did not load: {e}")
    driver.quit()
    exit()

def parse_name(full_name):
    print(f"Extracting name from: {full_name}")
    # Remove degrees (everything after the first comma)
    full_name = re.split(r',', full_name, 1)[0].strip()
    
    # Extract nickname (inside quotes or parentheses)
    nickname_match = re.search(r'“(.*?)”|\((.*?)\)', full_name)
    nickname = nickname_match.group(1) if nickname_match and nickname_match.group(1) else (nickname_match.group(2) if nickname_match and nickname_match.group(2) else "")
    full_name = re.sub(r'“.*?”|\(.*?\)', '', full_name).strip()
    
    # Split into parts
    name_parts = full_name.split()
    
    if len(name_parts) == 1:
        return "", "", name_parts[0], nickname  # First, Middle, Last, Nickname
    elif len(name_parts) == 2:
        return name_parts[0], "", name_parts[1], nickname
    elif len(name_parts) == 3:
        return name_parts[0], name_parts[1].replace(".", ""), name_parts[2], nickname
    else:
        return name_parts[0], name_parts[1].replace(".", ""), " ".join(name_parts[2:]), nickname

def scrape_faculty():
    faculty_elements = driver.find_elements(By.XPATH, "//a[contains(@class, 'listTableLink')]")
    
    if not faculty_elements:
        print("No faculty found in search results.")
        driver.quit()
        return
    
    print(f"Found {len(faculty_elements)} faculty profiles.")
    faculty_data = []
    for idx, faculty_element in enumerate(faculty_elements, start=1):
        profile_url = faculty_element.get_attribute("href")
        full_name = faculty_element.text.strip()
        first, middle, last, nickname = parse_name(full_name)
        
        # Extract department from the adjacent table cell
        try:
            department_element = faculty_element.find_element(By.XPATH, "./ancestor::tr/td[2]")
            department = department_element.text.strip()
        except Exception:
            department = "Department not found"
        
        print(f"[{idx}] Name: {first} {middle} {last} (Nickname: {nickname})")
        print(f"[{idx}] Department: {department}")
        
        faculty_data.append(["University of California, San Francisco", first, middle, last, nickname, department, profile_url])
    
    print(f"Extracted {len(faculty_data)} faculty members from search results.")
    
    # Save to Excel
    save_path = "ucsf_faculty_list.xlsx"
    df = pd.DataFrame(faculty_data, columns=["University", "First", "Middle", "Last", "Nickname", "Department", "Profile URL"])
    df.to_excel(save_path, index=False)
    print(f"Excel file created: {save_path}")
    
    # Close browser to release memory
    driver.quit()
    
    time.sleep(2)

scrape_faculty()

# Stop preventing sleep
caffeinate_process.terminate()


UCSF homepage loaded successfully.
Search page loaded successfully.
Selected: Professor
Selected: Assistant Professor
Selected: Associate Professor
Selected: Clinical Research Coordinator
Search button clicked successfully.
Search results did not load: HTTPConnectionPool(host='localhost', port=62919): Read timed out. (read timeout=120)


MaxRetryError: HTTPConnectionPool(host='localhost', port=62919): Max retries exceeded with url: /session/9d549c71decb898475f30cc6e162f27b/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x112fc2ad0>: Failed to establish a new connection: [Errno 61] Connection refused'))