# In [None]:
import re
import time
import subprocess
import pandas as pd
import requests
import string
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Filesystem path of the chromedriver executable passed to Selenium's Service.
CHROMEDRIVER_PATH = "/opt/homebrew/bin/chromedriver"  # Adjust if needed
# Listing-page URL prefix; a single lowercase letter is appended as the
# last_name query value to obtain each listing page.
BASE_URL = "https://cancer.ucsf.edu/people/profiles?last_name="

def parse_name(raw_name: str):
    """
    Split a displayed name into (first, middle, last, nickname).

    Steps:
      1. Pull a nickname out of curly quotes, straight quotes, or parentheses
         (checked in that order; only the first kind found is used) and remove
         it from the name.
      2. Discard everything from the first comma onward (e.g., degrees).
      3. Split what remains on whitespace:
           - 0 tokens  -> all name parts empty
           - 1 token   -> last name only
           - 2 tokens  -> first and last
           - 3+ tokens -> first, middle, and every remaining token joined
                          with spaces as the last name
      4. Strip trailing periods from the middle name.
    """
    nickname = ""
    # Patterns are listed in priority order; stop at the first one that hits.
    for pattern in (r'“(.*?)”', r'"(.*?)"', r'\((.*?)\)'):
        hit = re.search(pattern, raw_name)
        if hit is not None:
            nickname = hit.group(1)
            raw_name = raw_name.replace(hit.group(0), "")
            break

    before_comma = raw_name.partition(",")[0].strip()
    pieces = before_comma.split()
    count = len(pieces)

    if count == 0:
        return "", "", "", nickname
    if count == 1:
        return "", "", pieces[0], nickname
    if count == 2:
        return pieces[0], "", pieces[1], nickname

    # Three or more tokens: anything past the middle name belongs to the last name.
    first, middle, *surname_parts = pieces
    last = " ".join(surname_parts)
    middle = middle.rstrip(".")

    return first, middle, last, nickname

def _clean_department(raw_department: str) -> str:
    """Strip institutional boilerplate from a department string.

    Removes ", UCSF" first, then the generic "Department of"-style prefixes
    and the university name, collapses the whitespace left behind, and trims
    stray commas/spaces from both ends.
    """
    cleaned = raw_department.replace(", UCSF", "")
    for phrase in (
        "Department of", "Dept. of", "Departments of", "Dept of",
        "UCSF", "University of California, San Francisco",
    ):
        cleaned = cleaned.replace(phrase, "")
    # Removing a phrase from the middle of the text leaves doubled spaces.
    cleaned = " ".join(cleaned.split())
    # Removing a trailing phrase (e.g. the university name) leaves a dangling
    # comma, so trim separators on both sides, not just the left.
    return cleaned.strip(", ")


def fetch_profile_details_selenium(driver, profile_url: str):
    """
    Navigate to a faculty profile page with Selenium and extract its details.

    Extraction is best-effort: a missing element yields an empty string for
    the corresponding field rather than an exception.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        profile_url: URL of the profile page.

    Returns:
        A ``(raw_name, position, department)`` tuple of strings:
          - ``raw_name``: text of the page's <h1>.
          - ``position``: text before the first comma of the first <p> inside
            ``div.people-callout-content``.
          - ``department``: text after that comma with ", UCSF" and other
            redundant phrases removed.
    """
    driver.get(profile_url)
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
    except TimeoutException:
        # Keep going: a slow or malformed page should cost one row's data,
        # not abort the caller's whole run.
        print(f"[DEBUG] Timeout waiting for <h1> on {profile_url}.")

    try:
        h1_elem = driver.find_element(By.TAG_NAME, "h1")
        raw_name = h1_elem.text.strip()
    except Exception as e:
        print(f"[DEBUG] Error getting <h1> from {profile_url}: {e}")
        raw_name = ""

    position = ""
    department = ""
    try:
        callout_div = driver.find_element(By.CSS_SELECTOR, "div.people-callout-content")
        p_elem = callout_div.find_element(By.TAG_NAME, "p")
        # e.g., "Associate Clinical Professor, Department of Surgery, UCSF"
        p_text = p_elem.text.strip()
        title_part, comma, department_part = p_text.partition(",")
        position = title_part.strip()
        if comma:
            department = _clean_department(department_part.strip())
    except Exception:
        print(f"[DEBUG] .people-callout-content not found or <p> missing in {profile_url}.")

    return raw_name, position, department

DEFAULT_OUTPUT_PATH = "/Users/elliehozhabri/Documents/RBP/scrape/UCSF_Cancer_Center_All.xlsx"


def _start_caffeinate():
    """Start ``caffeinate -d`` and return the process, or None if it cannot be launched."""
    try:
        return subprocess.Popen(["caffeinate", "-d"])
    except OSError as exc:
        # The executable is not available on every platform; scraping can
        # proceed without it.
        print(f"[DEBUG] Could not start caffeinate ({exc}); continuing without it.")
        return None


def _scroll_until_stable(driver, pause_seconds: float = 2):
    """Scroll to the bottom repeatedly until the document height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)  # give lazily loaded entries time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


def main(output_path: str = DEFAULT_OUTPUT_PATH):
    """
    Scrape every faculty profile listed under letters a-z and save to Excel.

    For each letter the listing page is loaded and scrolled to the bottom,
    the profile links are collected, and each profile is visited to extract
    name, position and department. A profile that raises a WebDriver error
    is logged and skipped so that one bad page does not discard the rest.

    Args:
        output_path: Destination .xlsx file for the collected rows.
    """
    # Imported locally because the file-level imports only bring in
    # TimeoutException; this is its base class.
    from selenium.common.exceptions import WebDriverException

    caffeinate_process = _start_caffeinate()
    driver = None
    data_rows = []

    try:
        # Created inside the try so a failed browser start still releases
        # the caffeinate process in the finally clause.
        driver = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))

        for letter in string.ascii_lowercase:
            listing_url = BASE_URL + letter
            print(f"[DEBUG] Loading listing page for letter '{letter}': {listing_url}")
            driver.get(listing_url)

            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "ul.people-list-container"))
                )
            except TimeoutException:
                print(f"[DEBUG] Timeout waiting for listing page for letter '{letter}'. Skipping.")
                continue

            # Scroll to load all profiles
            _scroll_until_stable(driver)

            faculty_links = driver.find_elements(
                By.CSS_SELECTOR, "ul.people-list-container li a.article.people-list.cf"
            )
            print(f"[DEBUG] Found {len(faculty_links)} faculty links for letter '{letter}'.")

            # Resolve hrefs before navigating away (the elements go stale once
            # the page changes) and drop anchors that have no href.
            hrefs = (link.get_attribute("href") for link in faculty_links)
            profile_urls = [href for href in hrefs if href]

            for profile_url in profile_urls:
                print(f"[DEBUG] Processing profile: {profile_url}")
                try:
                    raw_name, position_str, department_str = fetch_profile_details_selenium(
                        driver, profile_url
                    )
                except WebDriverException as exc:
                    print(f"[DEBUG] Failed to scrape {profile_url}: {exc}. Skipping.")
                    continue
                first, middle, last, nickname = parse_name(raw_name)
                data_rows.append([
                    "University of California, San Francisco",
                    first,
                    middle,
                    last,
                    nickname,
                    department_str,
                    position_str,
                    profile_url,
                ])

    finally:
        try:
            if driver is not None:
                driver.quit()
        finally:
            if caffeinate_process is not None:
                caffeinate_process.terminate()

    df = pd.DataFrame(data_rows, columns=[
        "University", "First", "Middle", "Last", "Nickname", "Department", "Position", "Link"
    ])
    df.to_excel(output_path, index=False)
    print(f"[DEBUG] Scraping complete. Data saved to '{output_path}'.")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
