In [16]:
import subprocess
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import re
from webdriver_manager.chrome import ChromeDriverManager

# Launch `caffeinate -i` as a child process to prevent sleep while the script
# runs; the handle is kept so the process can be terminated once scraping ends.
caffeinate_process = subprocess.Popen(args=["caffeinate", "-i"])

# Common degree / credential abbreviations
degrees = {
    "B.A.",
    "B.S.",
    "D.D.S.",
    "D.O.",
    "F.A.C.S.",
    "J.D.",
    "L.C.S.W.",
    "M.A.",
    "M.D.",
    "M.S.",
    "PAC",
    "Ph.D.",
}

# Common name suffixes, with and without a trailing period
suffixes = {
    "II",
    "III",
    "IV",
    "V",
    "Jr",
    "Jr.",
    "Sr",
    "Sr.",
}

# Setup Chrome WebDriver
def setup_driver():
    """Build and return a headless Chrome WebDriver.

    The driver binary path comes from ``ChromeDriverManager().install()`` and
    is passed to Selenium through a ``Service`` object.
    """
    chrome_options = webdriver.ChromeOptions()
    # Headless mode, and log level 3 to suppress logs.
    for flag in ("--headless", "--log-level=3"):
        chrome_options.add_argument(flag)

    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=chrome_options)

# Extract faculty data from the correct table
def extract_faculty_data(driver):
    """Collect one record per faculty row on the currently loaded results page.

    Args:
        driver: Selenium WebDriver whose current page holds the results table.

    Returns:
        A ``(faculty_data, faculty_count)`` tuple. ``faculty_data`` is a list
        of ``[university, first, middle, last, nickname, department,
        profile_url]`` lists; ``faculty_count`` is the number of records in it.
        Rows that contain no ``listTableLink`` element are skipped.
    """
    # Compiled once per page instead of once per hidden input.
    department_pattern = re.compile(
        r"Department<\/u><br\/>\s*(.*?)\s*<br\/>", re.IGNORECASE
    )

    faculty_data = []
    for row in driver.find_elements(By.XPATH, "//tr[contains(@class, 'Row')]"):
        # find_elements returns an empty list instead of raising, so a matched
        # row without a profile link is skipped rather than aborting the scrape.
        name_links = row.find_elements(By.CLASS_NAME, "listTableLink")
        if not name_links:
            continue
        name_cell = name_links[0]
        full_name = name_cell.text.strip()
        profile_url = name_cell.get_attribute("href")

        # The department is embedded in the HTML held by one of the row's
        # input elements (the "Person Summary").
        department = "Unknown"
        for input_field in row.find_elements(By.TAG_NAME, "input"):
            # get_attribute may return None; the regex needs a string.
            value_content = input_field.get_attribute("value") or ""
            match = department_pattern.search(value_content)
            if match:
                department = match.group(1).strip()
                break  # Stop as soon as we find the department

        first, middle, last, nickname = parse_name(full_name)
        faculty_data.append(
            ["University of Colorado Anschutz", first, middle, last, nickname, department, profile_url]
        )

    return faculty_data, len(faculty_data)

# Function to parse names correctly
def parse_name(full_name):
    """Split a display name into first, middle, last and nickname parts.

    Processing order:
      1. The first fragment wrapped in parentheses or double quotes is taken
         as the nickname and removed from the name.
      2. Every trailing token found in ``degrees`` is dropped.
      3. A trailing token found in ``suffixes`` is kept with the last name
         (only when at least two other tokens remain).
      4. The first token is the first name. With two tokens the second is the
         last name; with more, the second is the middle name and the rest form
         the last name.

    Args:
        full_name: The name as displayed, e.g. ``'John A. Smith M.D.'``.

    Returns:
        A ``(first, middle, last, nickname)`` tuple of strings. Missing parts
        are empty strings; periods are removed from the middle name. An empty
        or whitespace-only name yields four empty strings.
    """
    # Extract nickname (if present in parentheses or quotes)
    nickname = ""
    match = re.search(r'[\(\"](.+?)[\)\"]', full_name)
    if match:
        nickname = match.group(1).strip()
        full_name = full_name.replace(match.group(0), "").strip()

    name_parts = full_name.split()

    # Strip all trailing degrees, not just one ("... M.D. Ph.D."). The
    # emptiness check also prevents an IndexError on an empty name.
    while name_parts and name_parts[-1] in degrees:
        name_parts.pop()

    # Set a trailing suffix aside so "John Smith Jr." is not parsed as
    # middle="Smith", last="Jr."; it is re-attached to the last name below.
    suffix = ""
    if len(name_parts) > 2 and name_parts[-1] in suffixes:
        suffix = name_parts.pop()

    first = name_parts[0] if name_parts else ""
    middle = ""
    last = ""

    if len(name_parts) == 2:
        last = name_parts[1]
    elif len(name_parts) > 2:
        middle = name_parts[1]
        last = " ".join(name_parts[2:])  # Everything else goes into last name

    if suffix:
        last = f"{last} {suffix}"

    # Remove periods from middle name (e.g. the initial "A." becomes "A")
    middle = middle.replace(".", "")

    return first, middle, last, nickname

# Navigate to CU Anschutz directory, select institution, and perform search
def _select_institution(driver, institution_name):
    """Pick *institution_name* in the search form's institution dropdown.

    Best effort: failures are printed and not raised, so the caller continues.
    """
    try:
        institution_dropdown = Select(driver.find_element(By.NAME, "institution"))

        # Find the option by its visible text, then select it by its value.
        selected_value = None
        for option in institution_dropdown.options:
            if option.text.strip() == institution_name:
                selected_value = option.get_attribute("value")
                break

        # Ensure we found the right value before selecting
        if selected_value:
            institution_dropdown.select_by_value(selected_value)
        else:
            print("❌ ERROR: Could not find matching institution value.")

        time.sleep(3)  # Give time for selection to apply
    except Exception as e:
        print(f"❌ ERROR: Institution selection failed: {e}")


def _click_search(driver):
    """Wait for the image search button and click it (best effort)."""
    try:
        # The wait returns the located element, so no second lookup is needed.
        search_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//img[@alt='Search']"))
        )
        search_button.click()
        print("✔ Clicked 'Search' button (Image)")
        time.sleep(3)  # Wait for search results to load
    except Exception as e:
        print(f"❌ ERROR: Could not click search button: {e}")


def _collect_all_pages(driver):
    """Extract every results page, following 'Next' until it no longer appears."""
    from selenium.common.exceptions import TimeoutException

    faculty_data = []
    while True:
        page_faculty, _ = extract_faculty_data(driver)
        faculty_data.extend(page_faculty)

        try:
            next_button = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.LINK_TEXT, "Next"))
            )
            print("✔ Found 'Next' button. Moving to next page...")
            next_button.click()
            time.sleep(3)  # Wait for next page to load
        except TimeoutException:
            print("❌ No 'Next' button found. Scraping complete.")
            break  # Exit loop when no more pages
        except Exception as e:
            # A real failure (e.g. the click itself) is reported as such, not
            # as a missing button. KeyboardInterrupt is deliberately not caught.
            print(f"❌ ERROR: Pagination stopped unexpectedly: {e}")
            break

    return faculty_data


# Navigate to CU Anschutz directory, select institution, and perform search
def scrape_cu_anschutz():
    """Scrape the CU Anschutz faculty directory into an Excel workbook.

    Opens the people-search page, selects the Anschutz Medical Campus
    institution, runs the search, walks every results page and writes the
    collected rows to ``CU_Anschutz_Faculty.xlsx``. The browser is closed even
    if scraping raises.
    """
    driver = setup_driver()
    base_url = "https://profiles.ucdenver.edu/search/people"

    try:
        print("\nNavigating to CU Anschutz Faculty Directory...")
        driver.get(base_url)
        time.sleep(3)

        _select_institution(
            driver, "University of Colorado Denver - Anschutz Medical Campus"
        )
        _click_search(driver)
        faculty_data = _collect_all_pages(driver)
    finally:
        # Always close the headless Chrome instance, even on failure.
        driver.quit()

    print(f"\n✅ Scraping completed. Total faculty scraped: {len(faculty_data)}")

    # Save to Excel
    df = pd.DataFrame(faculty_data, columns=["University", "First", "Middle", "Last", "Nickname", "Department", "Link"])
    df.to_excel("CU_Anschutz_Faculty.xlsx", index=False)

    print("Data saved to CU_Anschutz_Faculty.xlsx.")

# Run the scraper. The finally clause stops caffeinate even when scraping
# raises, so the sleep-prevention process is never left running.
try:
    scrape_cu_anschutz()
finally:
    caffeinate_process.terminate()



Navigating to CU Anschutz Faculty Directory...
✔ Clicked 'Search' button (Image)
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page...
✔ Found 'Next' button. Moving to next page..