In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager # 自动管理 ChromeDriver
import time

def _extract_professor_name(link_element):
    """Return the professor's name for one listing link, or None if not found.

    Sources are tried in order and the first non-empty result wins:
    the text of the first child with class ``m-listing-faculty__title``,
    the link's ``aria-label`` with the "Read More about " prefix removed,
    and finally the link's own visible text.
    """
    # find_elements returns an empty list when nothing matches, so a missing
    # title element does not need exception handling.
    title_elements = link_element.find_elements(By.CLASS_NAME, 'm-listing-faculty__title')
    if title_elements:
        title_text = (title_elements[0].text or "").strip()
        if title_text:
            return title_text

    # Reached both when the title element is absent and when it exists but
    # yields no text, so such entries are not silently dropped.
    label_prefix = "Read More about "
    aria_label_text = link_element.get_attribute('aria-label')
    if aria_label_text and label_prefix in aria_label_text:
        label_name = aria_label_text.replace(label_prefix, "").strip()
        if label_name:
            return label_name

    link_text = (link_element.text or "").strip()
    return link_text or None


def get_columbia_professor_info_selenium(url, wait_timeout=10):
    """Collect professor names and profile links from a faculty listing page.

    Starts headless Chrome, loads ``url``, waits until elements with the CSS
    class ``m-listing-faculty__link`` are present (or ``wait_timeout``
    elapses), then reads a name and ``href`` from each of those links.

    Args:
        url: Address of the faculty listing page to load.
        wait_timeout: Maximum number of seconds to wait for the listing links
            to appear after the page is requested.

    Returns:
        A list of dicts with the keys ``'name'`` and ``'personal_page_link'``,
        one per link for which both values were found. The list is empty when
        no link matched or when an error occurred; errors are printed rather
        than raised.
    """
    # Imported at function scope so this block carries its own dependencies;
    # the selenium package itself is already required by the file's imports.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    link_class = 'm-listing-faculty__link'
    professors_data = []
    driver = None  # stays None if browser start-up fails, see the finally clause

    try:
        # webdriver_manager downloads/locates a matching ChromeDriver binary.
        service = ChromeService(ChromeDriverManager().install())
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # run without a visible browser window
        options.add_argument('--disable-gpu')  # disable GPU hardware acceleration
        options.add_argument('--no-sandbox')  # disable the Chrome sandbox
        options.add_argument('--disable-dev-shm-usage')  # work around small /dev/shm on Linux
        # Explicit desktop User-Agent instead of the headless default.
        options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36')

        driver = webdriver.Chrome(service=service, options=options)
        driver.get(url)

        # Explicit wait instead of a fixed sleep: continues as soon as the
        # JavaScript-rendered links exist and tolerates slower page loads.
        try:
            WebDriverWait(driver, wait_timeout).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, link_class))
            )
        except TimeoutException:
            # Not fatal: the lookup below returns an empty list and the
            # warning is printed, matching the "nothing found" path.
            pass

        # Expected markup:
        # <a href="/faculty/people/eric-abrahamson" class="m-listing-faculty__link" ...>
        professor_link_wrappers = driver.find_elements(By.CLASS_NAME, link_class)

        if not professor_link_wrappers:
            print("警告：Selenium 未找到任何 class='m-listing-faculty__link' 的元素。")
            # When debugging, dumping driver.page_source here shows what was loaded.

        for wrapper_link in professor_link_wrappers:
            full_link = wrapper_link.get_attribute('href')
            name = _extract_professor_name(wrapper_link)

            # Keep only entries that have both a name and a link.
            if name and full_link:
                professors_data.append({
                    'name': name,
                    'personal_page_link': full_link
                })

    except Exception as e:
        # Top-level boundary: report the problem and return what was gathered.
        print(f"使用 Selenium 发生错误: {e}")
    finally:
        if driver is not None:
            driver.quit()  # always shut the browser and driver process down

    return professors_data

if __name__ == "__main__":
    # Script entry point: scrape the listing page below and print one line
    # per professor, or a hint when nothing could be extracted.
    faculty_url = "https://business.columbia.edu/faculty/divisions/management/faculty"
    print(f"正在使用 Selenium 从 {faculty_url} 获取教授姓名和个人页面链接...")

    professor_list = get_columbia_professor_info_selenium(faculty_url)

    if not professor_list:
        # An empty result covers both "no matching elements" and scraper errors.
        print("未能获取到任何教授信息。请检查 URL 或 HTML 结构解析部分是否需要调整。")
    else:
        print(f"成功获取 {len(professor_list)} 位教授的信息:")
        for prof in professor_list:
            print(f"姓名: {prof['name']}, 个人页面: {prof['personal_page_link']}")

: 

In [None]:
# Notebook cell: install the openpyxl package with pip. Nothing in the code
# shown above uses it; presumably a later cell needs it -- TODO confirm.
pip install openpyxl

: 