In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import subprocess

# Launch the macOS `caffeinate -d` helper in the background for the duration of
# the scrape; the Popen handle is kept so the helper can be stopped afterwards.
caffeinate_process = subprocess.Popen(
    args=["caffeinate", "-d"],
)

# Keywords that mark a department name as STEM.
# NOTE(review): presumably meant for substring matching against the department
# text collected per faculty member -- confirm against the consumer.
stem_keywords = [
    # Broad disciplines
    "Science", "Engineering", "Mathematics", "Technology",
    # Natural and life sciences
    "Biology", "Physics", "Chemistry", "Geology", "Astronomy",
    "Astrophysics", "Neuroscience", "Genetics", "Ecology", "Evolution",
    # Computing
    "Cybersecurity", "Data", "Computer", "Computer Science",
    "Artificial Intelligence", "Machine Learning", "Robotics",
    # Quantitative fields
    "Statistics", "Analytics", "Actuarial", "Operations Research",
    # Applied and interdisciplinary engineering
    "Materials", "Nanotechnology", "Bioinformatics", "Cognitive",
    "Mechatronics", "Systems",
    # Earth and space
    "Geography", "Astrophysical", "Planetary",
    # Health and medicine
    "Public Health", "Clinical", "Integrative Physiology",
    "Chemical", "Biological", "Medicine", "Medical", "Psychiatry",
]

# Function to get faculty details
def _split_faculty_name(full_name):
    """Split a "Last, First Middle" string into (first, middle, last).

    A name without a ", " separator is returned whole as the last name with
    empty first and middle names. Periods are removed from the middle name.
    """
    if ", " not in full_name:
        return "", "", full_name

    last_name, given_names = full_name.split(", ", 1)
    # split() without an argument tolerates repeated or trailing spaces.
    given_parts = given_names.split()
    if not given_parts:
        return "", "", last_name

    first_name = given_parts[0]
    middle_name = " ".join(given_parts[1:]).replace(".", "")
    return first_name, middle_name, last_name


def get_faculty_info(profile_url, listing_name, count):
    """Fetch one faculty profile page and extract name, department and STEM flag.

    Args:
        profile_url: Absolute URL of the faculty member's profile page.
        listing_name: Name as shown in the directory listing; used when it is
            at least as long as the name found in the page's <title>.
        count: Position of this entry in the listing. Currently unused; kept
            so existing callers do not break.

    Returns:
        A 5-tuple ``(first_name, middle_name, last_name, department, is_stem)``.
        ``department`` is a comma-separated string of department names, or
        "Department not found". ``is_stem`` is "Y" or "N". If the page cannot
        be fetched or no name is available, all five values are "".
    """
    # Keep the try body limited to the network call so that programming errors
    # in the parsing code below are not mistaken for fetch failures.
    try:
        response = requests.get(
            profile_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=30,  # without a timeout a stalled connection hangs the run
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {profile_url}: {e}")
        return "", "", "", "", ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract name from title tag (text before the first "|"). Strip it: the
    # separator leaves trailing whitespace that would otherwise make the
    # profile name win the length comparison below on whitespace alone.
    name_section = soup.find("title")
    profile_name = name_section.get_text(strip=True).split("|")[0].strip() if name_section else ""
    listing_name = (listing_name or "").strip()

    # Compare names and keep the more detailed one. A missing title yields an
    # empty profile name, so a valid listing name is always preferred over it.
    full_name = profile_name if len(profile_name) > len(listing_name) else listing_name
    print(f"Selected name: {full_name}")

    if not full_name:
        print(f"Failed to extract name from: {profile_url}")
        return "", "", "", "", ""

    first_name, middle_name, last_name = _split_faculty_name(full_name)

    # Extract department (handling multiple positions): keep the linked
    # department of every position whose text mentions one of these titles.
    departments = []
    positions = soup.select("ul#individual-personInPosition li[role='listitem']")
    for position in positions:
        title_text = position.get_text(strip=True)
        if any(keyword in title_text for keyword in ["Professor", "Dean", "Director"]):
            dept_link = position.find("a")
            if dept_link:
                departments.append(dept_link.get_text(strip=True))
    department = ", ".join(departments) if departments else "Department not found"

    # Determine if the faculty member belongs to a STEM department by
    # substring match against the module-level stem_keywords list.
    is_stem = "Y" if any(stem in department for stem in stem_keywords) else "N"

    return first_name, middle_name, last_name, department, is_stem

# Base URL for the CU Boulder Faculty A-Z Catalog
base_url = "https://catalog.colorado.edu/faculty-a-z/"
faculty_data = []

# Everything runs inside try/finally so the caffeinate helper is always
# stopped, even if a request, the parsing, or the Excel export raises.
try:
    print("Scraping faculty directory...")
    response = requests.get(
        base_url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=30,  # fail instead of hanging forever on a stalled connection
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find faculty listings: every anchor whose href points at the experts site
    faculty_list = soup.select("a[href*='experts.colorado.edu']")
    print(f"Found {len(faculty_list)} faculty links")

    for count, faculty in enumerate(faculty_list, start=1):
        listing_name = faculty.get_text(strip=True)
        print(f"Extracted name from listing: {listing_name}")

        profile_link = faculty["href"]
        if not profile_link.startswith("http"):
            profile_link = "https://experts.colorado.edu" + profile_link  # Ensure full URL

        first_name, middle_name, last_name, department, is_stem = get_faculty_info(profile_link, listing_name, count)

        # Skip entries with errors (get_faculty_info returns empty strings on
        # failure); entries without a parsed first name are skipped as well.
        if first_name and last_name:
            faculty_data.append(["University of Colorado Boulder", first_name, middle_name, last_name, department, is_stem, profile_link])

        time.sleep(1)  # Avoid overwhelming the server

    # Save to Excel
    output_file = "uc_boulder_faculty2.xlsx"
    df = pd.DataFrame(faculty_data, columns=["University", "First", "Middle", "Last", "Department", "isSTEM", "Link"])
    print(f"Total extracted records: {len(faculty_data)}")
    df.to_excel(output_file, index=False)
    print(f"Faculty list saved to {output_file}")
finally:
    # Stop preventing sleep, and reap the child so it does not linger as a zombie
    caffeinate_process.terminate()
    caffeinate_process.wait()


Scraping faculty directory...
Found 1923 faculty links
Extracted name from listing: Aaronson, Norman F.
Selected name: Aaronson, Norman F.
Extracted name from listing: Abbott, Lon D.
Selected name: Abbott, Lon D.
Extracted name from listing: Abdalati, Waleed
Selected name: Abdalati, Waleed 
Extracted name from listing: Abdullah, Aamir
Error fetching https://experts.colorado.edu/display/fisid_167490: 404 Client Error:  for url: https://experts.colorado.edu/display/fisid_167490
Extracted name from listing: Abiragi, Anthony A.
Selected name: Abiragi, Anthony A. 
Extracted name from listing: Ablowitz, Mark J.
Selected name: Ablowitz, Mark J.
Extracted name from listing: Acevedo-Muñoz, Ernesto R.
Selected name: Acevedo-Muñoz, Ernesto R.
Extracted name from listing: Ackerman, John Martin
Selected name: Ackerman, John Martin
Extracted name from listing: Ackland, Len
Selected name: Ackland, Len 
Extracted name from listing: Adams, Heather L.
Selected name: Adams, Heather L.
Extracted name from