# Departmental Scrapers

This notebook contains specific scraping logic for different department faculty pages, structured as individual functions. Each function reads a local HTML file and extracts structured data using `BeautifulSoup`.

We currently support:
- UCSB Department of Art
- UCSB Department of Anthropology
- UCSB Department of Asian American Studies

In [None]:
#| default_exp my_scrapers

In [None]:
#| export
from bs4 import BeautifulSoup
import pandas as pd
import re
from urllib.parse import urljoin

## Domain Mapping

The following dictionary maps the domain names to their respective scrapers.

In [None]:
#| export
def extract_domain(url):
    """
    Extract the domain name from a URL.
    """
    url = str(url)
    # Use regex to extract the domain name, but keep the protocol and www if they exist
    match = re.search(r'^(https?://)?(www\.)?([^/]+)', url)
    if match:
        # Return the domain name with the protocol and www if they exist
        return match.group(0)
    else:
        # If no match is found, return None
        return None
    

DIR = '../faculty_html/'  # Directory to save the HTML files

def create_filename(department):
    """
    Create a filename for the department.
    """
    # Use the department name to create an html filename. Fill spaces with underscores
    return department.replace(" ", "_").replace("/", "_") + ".html"


# Dictionary of department domains for proper URL resolution
# read in the csv file
df = pd.read_csv('../UCSB Departments and Programs - Sheet1.csv')
# create a dictionary from the dataframe
departments = {}
for index, row in df.iterrows():
    # extract the domain name from the URL
    full_url = row['Faculty Listing Link']
    # remove the trailing slash if it exists
    domain = extract_domain(full_url)
    # add the domain name to the dictionary
    # if the domain name is not in the dictionary, add it
    if row['Department/Unit'] not in departments:
        departments[row['Department/Unit']] = {}
    # add the domain name and url to the dictionary
    # if the domain name is not in the dictionary, add it
    if 'domain' not in departments[row['Department/Unit']]:
        departments[row['Department/Unit']]['domain'] = domain
    # add the url to the dictionary
    # if the url is not in the dictionary, add it
    if 'url' not in departments[row['Department/Unit']]:
        departments[row['Department/Unit']]['url'] = full_url
    # add the filename to the dictionary
    # if the filename is not in the dictionary, add it
    if 'filename' not in departments[row['Department/Unit']]:
        departments[row['Department/Unit']]['filename'] = DIR + create_filename(row['Department/Unit'])
    



In [None]:
# print the dictionary
for department in departments:
    print(department)
    print(departments[department])
    print()

Anthropology
{'domain': 'https://www.anth.ucsb.edu', 'url': 'https://www.anth.ucsb.edu/people/academic', 'filename': '../faculty_html/Anthropology.html'}

Art
{'domain': 'https://www.arts.ucsb.edu', 'url': 'https://www.arts.ucsb.edu/faculty/', 'filename': '../faculty_html/Art.html'}

Asian American Studies
{'domain': 'https://www.asamst.ucsb.edu', 'url': 'https://www.asamst.ucsb.edu/people', 'filename': '../faculty_html/Asian_American_Studies.html'}

Biomolecular Science and Engineering
{'domain': 'https://www.bmse.ucsb.edu', 'url': 'https://www.bmse.ucsb.edu/people', 'filename': '../faculty_html/Biomolecular_Science_and_Engineering.html'}

Developmental Biology
{'domain': 'https://www.mcdb.ucsb.edu', 'url': 'https://www.mcdb.ucsb.edu/people/faculty', 'filename': '../faculty_html/Developmental_Biology.html'}

Black Studies
{'domain': 'https://www.blackstudies.ucsb.edu', 'url': 'https://www.blackstudies.ucsb.edu/people/academic', 'filename': '../faculty_html/Black_Studies.html'}

Chemis

## Generic Drupal Site

This scraper works for:

- Black Studies

In [None]:
#| export

def drupal_department(file_html, base_url=None):
    """
    Generic scraper for UCSB departments using Drupal-based sites with a common structure.
    
    Args:
        file_html: Path to the HTML file
        base_url: Base URL for the department's website (for resolving relative links)
    """
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    
    # Try to detect the department domain if not provided
    if not base_url:
        meta_tags = soup.select('meta[content*="ucsb.edu"]')
        for tag in meta_tags:
            content = tag.get('content', '')
            if 'ucsb.edu' in content:
                match = re.search(r'https?://[^/]*\.ucsb\.edu', content)
                if match:
                    base_url = match.group(0)
                    break
    
    # Common selectors for Drupal-based department sites
    faculty_rows = soup.select("div.views-row")
    faculty_data = []

    def get_text(elem):
        return elem.get_text(strip=True) if elem else None

    for row in faculty_rows:
        # Try different selectors for name
        name_elem = (
            row.select_one("div.views-field-title span.field-content a") or 
            row.select_one("h2 a") or
            row.select_one("div.table--name a")
        )
        
        if not name_elem:
            continue
            
        name = get_text(name_elem)
        
        # Extract profile URL
        profile_url = name_elem.get('href', '')
        if profile_url and profile_url.startswith('/') and base_url:
            profile_url = f"{base_url}{profile_url}"
        
        # Try different selectors for title/affiliation
        title_elem = (
            row.select_one("div.views-field-field-affiliation div.field-content") or
            row.select_one("div.table--position") or
            row.select_one("p:nth-of-type(1)")  # Assuming first p tag might contain title
        )
        title = get_text(title_elem)
        
        # Try different selectors for specialization
        specialization_elem = row.select_one("div.views-field-field-specialization div.field-content")
        specialization = get_text(specialization_elem)
        
        # Try different selectors for email
        email_elem = (
            row.select_one("div.views-field-field-contact-email a") or
            row.select_one("a[href^='mailto:']")
        )
        email = get_text(email_elem) if email_elem else None
        
        # Try different selectors for phone
        phone_elem = row.select_one("div.views-field-field-contact-phone div.field-content")
        phone = get_text(phone_elem)
        
        # Try different selectors for office
        office_elem = row.select_one("div.views-field-field-office-location div.field-content")
        office = get_text(office_elem)
        
        # Try different selectors for personal website
        website_elem = (
            row.select_one("div.views-field-field-website div.field-content a") or
            row.select_one("a[href*='http']:not([href^='mailto:'])")
        )
        personal_website = website_elem.get('href', '') if website_elem and 'mailto:' not in website_elem.get('href', '') else None
        
        # If there's no explicit personal website, use the profile URL
        website = personal_website if personal_website else profile_url
        
        # Try different selectors for photo
        photo_elem = (
            row.select_one("div.views-field-field-photo img") or
            row.select_one("img.image-style-people-view") or
            row.select_one("div.table--portrait img") or
            row.select_one("img")
        )
        photo_url = photo_elem.get('src', '') if photo_elem else None

        faculty_data.append({
            "Name": name,
            "Title(s)": title,
            "Specialization": specialization,
            "Email": email,
            "Phone": phone,
            "Office": office,
            "Website": website,
            "Photo URL": photo_url
        })

    return pd.DataFrame(faculty_data)

In [None]:
# Test the generic Drupal department scraper on a few departments
department = 'Black Studies'
faculty = drupal_department(departments[department]['filename'], departments[department]['url'])
print(f"Found {len(faculty)} faculty members")
if len(faculty) > 0:
    display(faculty.head(3))
    
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in faculty['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")

# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(faculty['Name'].head(3), faculty['Title(s)'].head(3))):
    print(f"{name}: {title}")


Found 11 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL
0,Jude Akudinobi,Lecturer,,akudinob@blackstudies.ucsb.edu,,Room 3713 South Hall,https://www.blackstudies.ucsb.edu/people/acade...,https://www.blackstudies.ucsb.edu/sites/defaul...
1,Jaime Alves,Vice Chair and Associate Professor,,jaimealves@blackstudies.ucsb.edu,,Room 3715 South Hall,https://www.blackstudies.ucsb.edu/people/acade...,https://www.blackstudies.ucsb.edu/sites/defaul...
2,Ingrid Banks,Associate Professor,,ibanks@blackstudies.ucsb.edu,,Room 3708 South Hall,https://www.blackstudies.ucsb.edu/people/acade...,https://www.blackstudies.ucsb.edu/sites/defaul...


All URLs are valid: True

Sample of faculty titles:
Jude Akudinobi: Lecturer
Jaime  Alves: Vice Chair and Associate Professor
Ingrid  Banks: Associate Professor


## Drupal Directory-style Scraper

This is a generic scraper that works for departments with a directory-style listing of faculty:


![geog_directory](../faculty_screenshots/Geography.png){width=300px}

### Directory-style Departments (Spring, 2025):
- Earth Science
- Ecology, Evolution, and Marine Biology
- Economics
- Geography
- Marine Science Graduate Program
- Physics
- Electrical and Computer Engineering

In [None]:
#| export
def drupal_directory(file_html:str, # Path to the HTML file to parse
                     base_url:str=None # Base URL for the department website for converting relative URLs to absolute
                     )->pd.DataFrame: # A dataframe of faculty members
    """
    Generic scraper for UCSB department faculty pages that follow the Earth Science
    department structure with group-first, group-second, group-third, and group-fourth divisions.
    
    Parameters:
    -----------
    file_html : str
        Path to the HTML file to parse
    base_url : str, optional
        Base URL for the department website for converting relative URLs to absolute
        (e.g., "https://www.geol.ucsb.edu" for Earth Science)
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame containing faculty information
    """
    def get_department_name(file_html):
        # Extract the department name from the filename
        return file_html.split("/")[-1].split(".")[0].replace("_", " ").title()
    department_name = get_department_name(file_html)
    print(f"Scraping {department_name} department...")
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        
        # Find all faculty rows in the view content
        faculty_blocks = soup.select("div.view-content div.views-row")
        
        if not faculty_blocks:
            print("Warning: No faculty blocks found. Check HTML structure.")
        
        faculty_data = []
        
        for block in faculty_blocks:
            # Get the four groups that make up each faculty entry
            first_group = block.select_one("div.group-first")
            second_group = block.select_one("div.group-second")
            third_group = block.select_one("div.group-third")
            fourth_group = block.select_one("div.group-fourth")
            
            if not second_group:  # Skip if essential info is missing
                continue
            
            # Extract name and profile URL
            name_elem = second_group.select_one("h3 a")
            if not name_elem:
                continue
                
            name = name_elem.get_text(strip=True)
            profile_url = name_elem.get("href", "")
            
            if profile_url and not profile_url.startswith("http") and base_url:
                profile_url = urljoin(base_url, profile_url)
            
            # Extract title - it appears right after the name heading
            title = None
            # Get the HTML as string to analyze structure
            second_group_html = str(second_group)
            
            # Look for <br> tags which often separate title from other info
            br_tags = second_group.find_all('br')
            if br_tags:
                # Title is often between h3 and first br tag
                h3_tag = second_group.select_one('h3')
                if h3_tag:
                    next_elem = h3_tag.next_sibling
                    title_text = ""
                    while next_elem and (isinstance(next_elem, str) or next_elem.name != 'br'):
                        if isinstance(next_elem, str):
                            title_text += next_elem
                        elif next_elem.name not in ['span']:
                            title_text += next_elem.get_text()
                        next_elem = next_elem.next_sibling
                    title = title_text.strip()
            
            # If title not found yet, try another approach
            if not title or not title.strip():
                # Get all text directly under second_group, after h3
                h3_tag = second_group.select_one('h3')
                if h3_tag:
                    # Get the text content after h3
                    content_after_h3 = ""
                    current = h3_tag.next_sibling
                    
                    while current and (isinstance(current, str) or current.name != 'span'):
                        if isinstance(current, str):
                            content_after_h3 += current
                        elif current.name != 'br':
                            content_after_h3 += current.get_text()
                        current = current.next_sibling
                    
                    # Split by line breaks and take the first non-empty line
                    lines = [line.strip() for line in content_after_h3.splitlines() if line.strip()]
                    if lines:
                        title = lines[0]
            
            # Extract office location - it's typically after the icons
            office = None
            icon_span = second_group.select_one('span.directory-fa-icons')
            if icon_span:
                # Office is often after the icon span
                next_elem = icon_span.next_sibling
                office_text = ""
                while next_elem:
                    if isinstance(next_elem, str):
                        office_text += next_elem
                    elif next_elem.name not in ['span']:
                        office_text += next_elem.get_text()
                    next_elem = next_elem.next_sibling
                
                # Look for Hall reference in the text
                hall_pattern = r'(Webb\s+Hall|South\s+Hall|[A-Za-z]+\s+Hall)\s+\d+\w*'
                hall_match = re.search(hall_pattern, office_text)
                if hall_match:
                    office = hall_match.group(0).strip()
            
            # If office is still not found, search the entire second group text
            if not office:
                hall_pattern = r'(Webb\s+Hall|South\s+Hall|[A-Za-z]+\s+Hall)\s+\d+\w*'
                hall_match = re.search(hall_pattern, second_group.get_text())
                if hall_match:
                    office = hall_match.group(0).strip()
            
            # Extract specialization from the third group
            specialization = None
            if third_group:
                specialization = third_group.get_text(strip=True)
            
            # Extract contact information
            email = phone = personal_website = None
            
            # Look for email, phone, and website in the directory-fa-icons span
            fa_icons = second_group.select_one("span.directory-fa-icons")
            if fa_icons:
                # Extract email
                email_elem = fa_icons.select_one('a[data-toggle="tooltip"][title*="Email"]')
                if email_elem:
                    email_text = email_elem.get("title", "")
                    email_match = re.search(r'[\w\.-]+@[\w\.-]+', email_text)
                    email = email_match.group(0) if email_match else None
                
                # Extract phone
                phone_elem = fa_icons.select_one('a[data-toggle="tooltip"][title*="Phone"]')
                if phone_elem:
                    phone_text = phone_elem.get("title", "").replace("Phone:", "").strip()
                    phone = phone_text
                
                # Extract website
                website_elem = fa_icons.select_one('a[data-toggle="tooltip"][title*="Website"]')
                if website_elem:
                    personal_website = website_elem.get("href", "").strip()
            
            # Extract photo URL
            photo_url = None
            if first_group:
                img_elem = first_group.select_one("img")
                if img_elem:
                    # Try different attribute names for the image URL
                    photo_url = img_elem.get("data-src") or img_elem.get("src")
                    if photo_url and not photo_url.startswith("http") and base_url:
                        photo_url = urljoin(base_url, photo_url)
            
            # Extract research areas from fourth group links
            research_areas = []
            if fourth_group and fourth_group.select("a"):
                for link in fourth_group.select("a"):
                    area = link.get_text(strip=True)
                    if area:
                        research_areas.append(area)
            
            research_area_text = ", ".join(research_areas) if research_areas else None
            
            faculty_data.append({
                "Name": name,
                "Title(s)": title,
                "Specialization": specialization,
                "Email": email,
                "Phone": phone,
                "Office": office,
                "Website": personal_website if personal_website else profile_url,
                "Photo URL": photo_url,
                "Research Areas": research_area_text,
                "Department": department_name
            })
        
        return pd.DataFrame(faculty_data)

In [None]:
earth_science_df = drupal_directory("../faculty_html/Earth_Science.html", "https://www.geol.ucsb.edu")
print(f"Found {len(earth_science_df)} faculty members in Earth Science")
if len(earth_science_df) > 0:
    display(earth_science_df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in earth_science_df['Website'] if pd.notnull(url))
print(f"All URLs are valid in Earth Science: {has_valid_urls}")
# Test - extract specialization correctly
print("\nSample of Earth Science faculty specializations:")
for i, row in earth_science_df.head(3).iterrows():
    print(f"{row['Name']}: Specialization = {row['Specialization']}")

# Test this scraper for geography:
geography_df = drupal_directory("../faculty_html/Geography.html", "https://www.geog.ucsb.edu")
print(f"Found {len(geography_df)} faculty members in Geography")
if len(geography_df) > 0:
    display(geography_df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in geography_df['Website'] if pd.notnull(url))
print(f"All URLs are valid in Geography: {has_valid_urls}")
# Test - extract specialization correctly
print("\nSample of Geography faculty specializations:")
for i, row in geography_df.head(3).iterrows():
    print(f"{row['Name']}: Specialization = {row['Specialization']}")



Scraping Earth Science department...
Found 21 faculty members in Earth Science


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL,Research Areas,Department
0,Jordan Clark,Professor,General field of Aqueous Geochemistry.,jfclark@geol.ucsb.edu,(805) 450 1824,Webb Hall 2115,https://jfclark.faculty.geol.ucsb.edu/,https://www.geol.ucsb.edu/sites/default/files/...,"Surface Processes, Sedimentology & Hydrology",Earth Science
1,John Cottle,Professor,Application of geochronology to solve tectonic...,cottle@geol.ucsb.edu,(805) 893 7315,Webb Hall 2031A,https://johncottle.weebly.com/,https://www.geol.ucsb.edu/sites/default/files/...,"Geochronology, Petrology, Geochemistry & Volca...",Earth Science
2,Zach Eilon,Associate Professor,"Seismic tomography, inverse methods, anisotrop...",eilon@ucsb.edu,,Webb Hall 2116,https://zeilon.squarespace.com,https://www.geol.ucsb.edu/sites/default/files/...,"Seismology & Geophysics, Structure & Tectonics",Earth Science


All URLs are valid in Earth Science: True

Sample of Earth Science faculty specializations:
Jordan Clark: Specialization = General field of Aqueous Geochemistry.
John Cottle: Specialization = Application of geochronology to solve tectonic problems
Zach Eilon: Specialization = Seismic tomography, inverse methods, anisotropy, and attenuation.
Scraping Geography department...
Found 27 faculty members in Geography


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL,Research Areas,Department
0,Elizabeth Ackert,Associate Professor,Professor Ackert conducts research in the area...,ackert@ucsb.edu,,,https://lizackert.com/,https://www.geog.ucsb.edu/sites/default/files/...,Population & Health,Geography
1,Kathy Baylis,Professor,"I explore how agricultural, trade, and conserv...",baylis@ucsb.edu,,,https://edge.geog.ucsb.edu/,https://www.geog.ucsb.edu/sites/default/files/...,"Geoanalytics Science & Planning, Population & ...",Geography
2,Leila Carvalho,Professor,An interdisciplinary domain that investigates ...,leila@eri.ucsb.edu,+1 805-679-3216,,http://clivac.eri.ucsb.edu/,https://www.geog.ucsb.edu/sites/default/files/...,Atmospheric & Climate Science,Geography


All URLs are valid in Geography: True

Sample of Geography faculty specializations:
Elizabeth Ackert: Specialization = Professor Ackert conducts research in the areas of population geography, immigration, health geography, and urban geography using quantitative social science research methods.
Kathy Baylis: Specialization = I explore how agricultural, trade, and conservation policy affects human and environmental outcomes.
Leila Carvalho: Specialization = An interdisciplinary domain that investigates coupled systems to advance understanding of the Earth’s climate on multiple scales.


## Anthropology Department

![anthropology](../faculty_screenshots/Anthropology.png){width=300}

In [None]:
#| export

def anthropology(file_html, base_url=None):
    "Extract faculty data from the UCSB Anthropology department HTML file."
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    faculty_rows = soup.select("div.views-row")
    faculty_data = []

    def get_text(elem):
        # replace newlines with spaces:
        return elem.get_text(separator=" ", strip=True) if elem else None


    for row in faculty_rows:
        name = get_text(row.select_one("div.views-field-title span.field-content a"))
        title = get_text(row.select_one("div.views-field-field-affiliation div.field-content"))
        specialization = get_text(row.select_one("div.views-field-field-specialization div.field-content"))
        email = get_text(row.select_one("div.views-field-field-contact-email a"))
        phone = get_text(row.select_one("div.views-field-field-contact-phone div.field-content"))
        office = get_text(row.select_one("div.views-field-field-office-location div.field-content"))
        website_tag = row.select_one("div.views-field-field-website a")
        website = website_tag["href"] if website_tag else None
        photo_tag = row.select_one("div.views-field-field-photo img")
        photo_url = photo_tag["src"] if photo_tag else None

        faculty_data.append({
            "Name": name,
            "Title(s)": title,
            "Specialization": specialization,
            "Email": email,
            "Phone": phone,
            "Office": office,
            "Website": website,
            "Photo URL": photo_url,
            "Department": "Anthropology"
        })

    return pd.DataFrame(faculty_data)


In [None]:
# Test the Anthropology scraper
file_path = "../faculty_html/Anthropology.html"
faculty = anthropology(file_path)
print(f"Found {len(faculty)} faculty members")
if len(faculty) > 0:
    display(faculty.head(3))
    
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in faculty['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")

# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(faculty['Name'].head(3), faculty['Title(s)'].head(3))):
    print(f"{name}: {title}")

Found 15 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL,Department
0,Amy Boddy,Associate Professor,Integrative Anthropological Sciences My work f...,boddy@anth.ucsb.edu,805-893-2456,HSSB 2045 Boddy Lab: HSSB 2041,http://www.boddylab.com,https://www.anth.ucsb.edu/sites/default/files/...,Anthropology
1,Michael Gurven,Professor,Integrative Anthropological Sciences (behavior...,gurven@anth.ucsb.edu,(805) 893-2202,HSSB 2060,http://gurven.anth.ucsb.edu,https://www.anth.ucsb.edu/sites/default/files/...,Anthropology
2,Charles Hale,SAGE Sara Miller McCune Dean of Social Science...,Sociocultural Anthropology ( Political anthrop...,crhale@ltsc.ucsb.edu,805-893-8354,,,https://www.anth.ucsb.edu/sites/default/files/...,Anthropology


All URLs are valid: True

Sample of faculty titles:
Amy Boddy: Associate Professor
Michael Gurven: Professor
Charles  Hale: SAGE Sara Miller McCune Dean of Social Sciences Professor


## Art Department

The Art Department scraper is pretty straight-forward

![art](../faculty_screenshots/Art.png){width=300}

In [None]:
#| export

def art(file_html, base_url=None):
    "Extract faculty data from the UCSB Art department HTML file."
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    paragraphs = soup.find_all("p")
    faculty_entries = []

    for p in paragraphs:
        content = p.get_text(separator="\n", strip=True)
        links = p.find_all("a")

        if links and ("Professor" in content or "Teaching" in content or "Lecturer" in content):
            name = links[0].get_text(strip=True)
            website = links[0]["href"] if links[0].has_attr("href") else None
            # check if the website is a full URL or a relative URL
            if website and not website.startswith("http"):
                # if the website is a relative URL, add the base URL to it
                website = base_url + website
            # Standardize the URL to end with a trailing slash:
            if website and not website.endswith("/"):
                website += "/"
            

            photo_url = None
            email_match = re.search(r"mailto:([^\"]+)", str(p))
            email = None
            if email_match:
                email = bytes(email_match.group(1), "utf-8").decode("unicode_escape").replace("&#x", "").replace(";", "")

            lines = content.splitlines()
            title = specialization = office = phone = None

            for line in lines[1:]:
                if "Professor" in line:
                    # remove specializations from the title (they are in parentheses)
                    title = re.sub(r"\s*\(.*?\)", "", line)
                    # add the specialization to the specialization variable
                    specialization = re.search(r"\((.*?)\)", line)
                    if specialization:
                        specialization = specialization.group(1)
                elif "Arts" in line or "Elings" in line:
                    office = line
                elif "@" in line and not email:
                    email = line
                elif re.search(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", line):
                    phone = line
                elif not specialization:
                    specialization = line

            faculty_entries.append({
                "Name": name,
                "Title(s)": title,
                "Specialization": specialization,
                "Email": email,
                "Phone": phone,
                "Office": office,
                "Website": website,
                "Photo URL": photo_url
            })

    return pd.DataFrame(faculty_entries)


In [None]:
# Test the art scraper
# Test the Computer Science scraper
faculty = art(departments['Art']['filename'], departments['Art']['domain'])
print(f"Found {len(faculty)} faculty members")
if len(faculty) > 0:
    display(faculty.head(3))
    
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in faculty['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")

# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(faculty['Name'].head(3), faculty['Title(s)'].head(3))):
    print(f"{name}: {title}")

Found 8 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL
0,Sarah Rosalena Brady,Assistant Professor,Computational Craft and Haptic Media,rosalena@arts.ucsb.edu,,Arts 0250,http://www.sarahrosalena.com/,
1,Jane Callister,Professor,"Painting, Drawing",jane@arts.ucsb.edu,,Arts 1348,https://www.janecallister.com/,
2,Iman Djouini,Assistant Teaching Professor,"Print, Book Arts and Intermedia",imandjouini@ucsb.edu,,,https://imandjouini.com/,


All URLs are valid: True

Sample of faculty titles:
Sarah Rosalena Brady: Assistant Professor
Jane Callister: Professor
Iman Djouini: Assistant Teaching Professor


## Asian American Studies Department

The Asian American Studies department uses a Drupal-based site with a similar structure to Anthropology.

![asian_american_studies](../faculty_screenshots/Asian_American_Studies.png){width=300}

In [None]:
#| export

def asian_american_studies(file_html, base_url=None):
    "Extract faculty data from the UCSB Asian American Studies department HTML file."
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    faculty_rows = soup.select("div.views-row")
    faculty_data = []

    def get_text(elem):
        return elem.get_text(strip=True) if elem else None

    for row in faculty_rows:
        name_tag = row.select_one("div.views-field-title span.field-content a")
        name = get_text(name_tag)
        
        # Extract the faculty profile URL from the name link
        profile_url = None
        if name_tag and name_tag.has_attr('href'):
            profile_url = "https://www.asamst.ucsb.edu" + name_tag['href'] if name_tag['href'].startswith('/') else name_tag['href']
        
        title = get_text(row.select_one("div.views-field-field-affiliation div.field-content"))
        specialization = get_text(row.select_one("div.views-field-field-specialization div.field-content"))
        email = get_text(row.select_one("div.views-field-field-contact-email a"))
        phone = get_text(row.select_one("div.views-field-field-contact-phone div.field-content"))
        office = get_text(row.select_one("div.views-field-field-office-location div.field-content"))
        
        # Check if there's an explicit website in website field, otherwise use profile URL
        website_tag = row.select_one("div.views-field-field-website a")
        website = website_tag["href"] if website_tag and website_tag.has_attr("href") else profile_url
        
        photo_tag = row.select_one("div.views-field-field-photo img")
        photo_url = photo_tag["src"] if photo_tag and photo_tag.has_attr("src") else None

        faculty_data.append({
            "Name": name,
            "Title(s)": title,
            "Specialization": specialization,
            "Email": email,
            "Phone": phone,
            "Office": office,
            "Website": website,
            "Photo URL": photo_url,
            "Department": "Asian American Studies"
        })

    return pd.DataFrame(faculty_data)

In [None]:
# Test the Asian American Studies department scraper
file_path = "../faculty_html/Asian_American_Studies.html"
aas_faculty = asian_american_studies(file_path)
print(f"Found {len(aas_faculty)} faculty members")
if len(aas_faculty) > 0:
    display(aas_faculty.head(3))
    
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in aas_faculty['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")

# Test - extract specialization correctly
print("\nSample of AAS faculty specializations:")
for i, row in aas_faculty.head(3).iterrows():
    print(f"{row['Name']}: Specialization = {row['Specialization']}")

Found 9 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL,Department
0,Alexander Cho,Assistant Professor,"Ph.D., The University of Texas at Austin, Medi...",alexcho@ucsb.edu,,HSSB 5032,https://www.asamst.ucsb.edu/people/alexander-cho,https://www.asamst.ucsb.edu/sites/default/file...,Asian American Studies
1,Jigna Desai,Professor,"Queer of Color Critique, Women of Color Femini...",jignadesai@ucsb.edu,,,https://www.asamst.ucsb.edu/people/jigna-desai,https://www.asamst.ucsb.edu/sites/default/file...,Asian American Studies
2,Diane C. Fujino,Professor,"Ph.D., University of California, Los AngelesAs...",fujino@ucsb.edu,,HSSB 5034,https://www.asamst.ucsb.edu/people/diane-c-fujino,https://www.asamst.ucsb.edu/sites/default/file...,Asian American Studies


All URLs are valid: True

Sample of AAS faculty specializations:
Alexander Cho: Specialization = Ph.D., The University of Texas at Austin, Media StudiesAsian American media studies, digital and social media, human-centered design, popular culture, gender and sexuality studies, learning and education, ethnography, critical race theory, affect.
Jigna Desai: Specialization = Queer of Color Critique, Women of Color Feminisms, South Asian Diasporic and Migration Studies, Asian American Feminism, Asian American Queer and Sexuality Studies, Critical University Studies, Critical Disability Studies, Public Engagement and Humanities, Media Studies
Diane C. Fujino: Specialization = Ph.D., University of California, Los AngelesAsian American social movement history, Japanese American radical history 1940s-1970s, Black Power studies and the Black Radical Tradition, Afro-Asian solidarities, and activist-scholarship research and pedagogies.


## Biomolecular Science and Engineering Program

This is a program, so most faculty will appear as Affiliated

![black_studies](../faculty_screenshots/Biomolecular_Science_and_Engineering.png){width=300}

In [None]:
#| export
def biomolecular_science_and_engineering(file_html, base_url="https://www.bmse.ucsb.edu"):
    """
    Scraper for the UCSB Biomolecular Science and Engineering department faculty page.
    
    Parameters:
    -----------
    file_html : str
        Path to the HTML file to parse
    base_url : str, optional
        Base URL for the department website for converting relative URLs to absolute
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame containing faculty information
    """
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        
        # Find all faculty entries - each has a class of node-faculty and node-teaser
        faculty_blocks = soup.select("div.node-faculty.node-teaser")
        
        if not faculty_blocks:
            print("Warning: No faculty blocks found. Check HTML structure.")
        
        faculty_data = []
        
        for block in faculty_blocks:
            # Extract name and profile URL from the title field
            title_field = block.select_one("div.field-name-title")
            if not title_field:
                continue
                
            name_link = title_field.select_one("a")
            if not name_link:
                continue
                
            name = name_link.get_text(strip=True)
            profile_url = name_link.get("href", "")
            if profile_url and not profile_url.startswith("http"):
                profile_url = urljoin(base_url, profile_url)
            
            # Extract title/position
            title = None
            job_title_field = block.select_one("div.field-name-field-job-title")
            if job_title_field:
                title = job_title_field.get_text(strip=True)
            
            # Extract department
            department = None
            dept_field = block.select_one("div.field-name-field-dept")
            if dept_field:
                department = dept_field.get_text(strip=True)
            
            # Extract photo URL
            photo_url = None
            image_field = block.select_one("div.field-name-field-image")
            if image_field:
                img = image_field.select_one("img")
                if img:
                    photo_url = img.get("src", "")
                    if photo_url and not photo_url.startswith("http"):
                        photo_url = urljoin(base_url, photo_url)
            
            # Combine our extracted data
            specialization = department  # Using department as specialization
            
            # For email and phone, we'll need to check the actual faculty profile page
            # as they're not directly available in the teaser view
            # For now, we'll leave them as None
            
            faculty_data.append({
                "Name": name,
                "Title(s)": title,
                "Specialization": specialization,
                "Email": None,  # Not available in teaser view
                "Phone": None,  # Not available in teaser view
                "Office": None,  # Not available in teaser view
                "Website": profile_url,
                "Photo URL": photo_url,
                "Department": department
            })
        
        return pd.DataFrame(faculty_data)

In [None]:
# Test the Biomolecular Science and Engineering scraper
df = biomolecular_science_and_engineering(departments['Biomolecular Science and Engineering']['filename'])
print(f"Found {len(df)} faculty members")
if len(df) > 0:
    display(df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in df['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")
# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(df['Name'].head(3), df['Title(s)'].head(3))):
    print(f"{name}: {title}")

Found 54 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL,Department
0,Cherie Briggs,Mellichamp Professor,"Ecology, Evolution and Marine Biology",,,,https://www.bmse.ucsb.edu/people/briggs,https://www.bmse.ucsb.edu/sites/www.bmse.ucsb....,"Ecology, Evolution and Marine Biology"
1,Frank L. Brown,Professor (Affiliated),Chemistry & Biochemistry,,,,https://www.bmse.ucsb.edu/people/brown,https://www.bmse.ucsb.edu/sites/www.bmse.ucsb....,Chemistry & Biochemistry
2,Alison Butler,Professor (Affiliated),Chemistry & Biochemistry,,,,https://www.bmse.ucsb.edu/people/butler,https://www.bmse.ucsb.edu/sites/www.bmse.ucsb....,Chemistry & Biochemistry


All URLs are valid: True

Sample of faculty titles:
Cherie Briggs: Mellichamp Professor
Frank L. Brown: Professor (Affiliated)
Alison Butler: Professor (Affiliated)


## Black Studies Department

This department uses the generic drupal scraper

![black_studies](../faculty_screenshots/Black_Studies.png){width=300}

In [None]:
# Test the generic Drupal department scraper on a few departments
department = 'Black Studies'
faculty = drupal_department(departments[department]['filename'], departments[department]['url'])
print(f"Found {len(faculty)} faculty members")
if len(faculty) > 0:
    display(faculty.head(3))
    
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in faculty['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")

# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(faculty['Name'].head(3), faculty['Title(s)'].head(3))):
    print(f"{name}: {title}")

Found 11 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL
0,Jude Akudinobi,Lecturer,,akudinob@blackstudies.ucsb.edu,,Room 3713 South Hall,https://www.blackstudies.ucsb.edu/people/acade...,https://www.blackstudies.ucsb.edu/sites/defaul...
1,Jaime Alves,Vice Chair and Associate Professor,,jaimealves@blackstudies.ucsb.edu,,Room 3715 South Hall,https://www.blackstudies.ucsb.edu/people/acade...,https://www.blackstudies.ucsb.edu/sites/defaul...
2,Ingrid Banks,Associate Professor,,ibanks@blackstudies.ucsb.edu,,Room 3708 South Hall,https://www.blackstudies.ucsb.edu/people/acade...,https://www.blackstudies.ucsb.edu/sites/defaul...


All URLs are valid: True

Sample of faculty titles:
Jude Akudinobi: Lecturer
Jaime  Alves: Vice Chair and Associate Professor
Ingrid  Banks: Associate Professor


## Bren School of Environmental Science and Management

![bren_school](../faculty_screenshots/Bren_School_of_Environmental_Science.png){width=300}


In [None]:
#| export
def bren_school(file_html, base_url="https://bren.ucsb.edu"):
    """Extract faculty data from the UCSB Bren School of Environmental Science department HTML file."""
    from bs4 import BeautifulSoup
    import pandas as pd
    
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        faculty_rows = soup.select("div.views-row")
        faculty_data = []
        
    def get_text(elem):
        # replace newlines with spaces:
        return elem.get_text(separator=" ", strip=True) if elem else None
    
    for row in faculty_rows:
        # Get name and extract website URL from the name link
        name_link = row.select_one("div.views-field-title-1 h3.field-content a")
        name = get_text(name_link)
        
        # Get the website URL from the href attribute of the name link
        website = name_link["href"] if name_link else None
        # Add base URL if the link is relative
        if website and website.startswith('/'):
            website = base_url + website
            
        title = get_text(row.select_one("div.views-field-field-title div.field-content"))
        email_tag = row.select_one("div.views-field-field-email div.field-content a")
        email = email_tag.get_text(strip=True) if email_tag else None
        
        # Get photo URL
        photo_tag = row.select_one("div.views-field-field-photo img")
        photo_url = photo_tag["src"] if photo_tag else None
        
        # Faculty category based on section headings
        # Find the nearest preceding h2 element to determine the category
        prev_section = row.find_previous("h2")
        category = prev_section.get_text(strip=True) if prev_section else "Unknown"
        
        faculty_data.append({
            "Name": name,
            "Title(s)": title,
            "Email": email,
            "Website": website,
            "Photo URL": photo_url,
            "Category": category,
            "Department": "Bren School of Environmental Science"
        })
    
    return pd.DataFrame(faculty_data)

In [None]:
# Test the Bren School scraper
df = bren_school(departments['Bren School of Environmental Science']['filename'])
print(f"Found {len(df)} faculty members")
if len(df) > 0:
    display(df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in df['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")
# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(df['Name'].head(3), df['Title(s)'].head(3))):
    print(f"{name}: {title}")


Found 76 faculty members


Unnamed: 0,Name,Title(s),Email,Website,Photo URL,Category,Department
0,Steve Gaines,"Dean, Bren School",gaines@bren.ucsb.edu,https://bren.ucsb.edu/people/steve-gaines,/sites/default/files/styles/person/public/2024...,Dean,Bren School of Environmental Science
1,Sarah Anderson,"Professor; Associate Dean of Diversity, Equity...",sanderson@bren.ucsb.edu,https://bren.ucsb.edu/people/sarah-anderson,/sites/default/files/styles/person/public/2020...,Permanent Faculty,Bren School of Environmental Science
2,Mark Buntaine,Professor,buntaine@bren.ucsb.edu,https://bren.ucsb.edu/people/mark-buntaine,/sites/default/files/styles/person/public/2020...,Permanent Faculty,Bren School of Environmental Science


All URLs are valid: True

Sample of faculty titles:
Steve Gaines: Dean, Bren School
Sarah Anderson: Professor; Associate Dean of Diversity, Equity and Inclusion
Mark Buntaine: Professor


## Chemical Engineering

![chemical_engineering](../faculty_screenshots/Chemical_Engineering.png){width=300}

In [None]:
#| export
def chemical_engineering(file_html, base_url=""):
    """
    Extract faculty data from the UCSB Chemical Engineering department HTML file.
    
    Parameters:
    -----------
    file_html : str
        Path to the HTML file
    base_url : str, optional
        Base URL to prepend to relative URLs. Default is empty string.
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame containing faculty information
    """
    from bs4 import BeautifulSoup
    import pandas as pd
    
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        faculty_items = soup.select("div.view-content li")
        faculty_data = []
        
    def get_text(elem):
        # replace newlines with spaces:
        return elem.get_text(separator=" ", strip=True) if elem else None
    
    for item in faculty_items:
        # Extract name from the title field
        name_elem = item.select_one("div.views-field-title span.field-content a")
        name = get_text(name_elem)
        
        # Extract profile URL from the name link
        profile_url = name_elem["href"] if name_elem else None
        # Add base URL if the link is relative and base_url is provided
        if profile_url and profile_url.startswith('/') and base_url:
            profile_url = base_url + profile_url
            
        # Extract title/department information
        title = get_text(item.select_one("div.views-field-field-titles--departments div.field-content"))
        
        # Extract email
        email_elem = item.select_one("div.views-field-field-people-email div.field-content a")
        email = get_text(email_elem)
        
        # Extract photo URL
        photo_elem = item.select_one("div.views-field-field-image div.field-content a img")
        photo_url = photo_elem["src"] if photo_elem else None
        # Add base URL to photo URL if it's relative and base_url is provided
        if photo_url and photo_url.startswith('/') and base_url:
            photo_url = base_url + photo_url
        
        faculty_data.append({
            "Name": name,
            "Title(s)": title,
            "Email": email,
            "Website": profile_url,
            "Photo URL": photo_url,
            "Department": "Chemical Engineering"
        })
    
    return pd.DataFrame(faculty_data)

In [None]:
# Test the Chemical Engineering scraper
df = chemical_engineering(departments['Chemical Engineering']['filename'], departments['Chemical Engineering']['url'])
print(f"Found {len(df)} faculty members")
if len(df) > 0:
    display(df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in df['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")
# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(df['Name'].head(3), df['Title(s)'].head(3))):
    print(f"{name}: {title}")


Found 20 faculty members


Unnamed: 0,Name,Title(s),Email,Website,Photo URL,Department
0,Mahdi M. Abu-Omar,"Mellichamp Chair in Green Chemistry Professor,...",mabuomar@ucsb.edu,https://www.chemengr.ucsb.edu/people/faculty/p...,https://www.chemengr.ucsb.edu/sites/default/fi...,Chemical Engineering
1,Christopher M. Bates,"Assistant Professor, Materials and (by courtes...",cbates@ucsb.edu,https://www.chemengr.ucsb.edu/people/faculty/p...,https://www.chemengr.ucsb.edu/sites/default/fi...,Chemical Engineering
2,Joseph Chada (he/him/his),Associate Teaching Professor Pronounced: Joe C...,jchada@ucsb.edu,https://www.chemengr.ucsb.edu/people/faculty/p...,https://www.chemengr.ucsb.edu/sites/default/fi...,Chemical Engineering


All URLs are valid: True

Sample of faculty titles:
Mahdi M. Abu-Omar: Mellichamp Chair in Green Chemistry Professor, Chemistry and (by courtesy) Chemical Engineering
Christopher M. Bates: Assistant Professor, Materials and (by courtesy) Chemical Engineering
Joseph Chada (he/him/his): Associate Teaching Professor Pronounced: Joe Chad-ah


## Chemistry and Biochemistry Department

![chemistry_and_biochemistry](../faculty_screenshots/Chemistry_and_Biochemistry.png){width=300}

In [None]:
#| export
def chemistry_biochemistry(file_html, base_url=""):
    """
    Extract faculty data from the UCSB Chemistry and Biochemistry department HTML file.
    
    Parameters:
    -----------
    file_html : str
        Path to the HTML file
    base_url : str, optional
        Base URL to prepend to relative URLs. Default is empty string.
        
    Returns:
    --------
    pandas.DataFrame
        DataFrame containing faculty information
    """
    from bs4 import BeautifulSoup
    import pandas as pd
    
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        
        # Find all faculty entries
        faculty_items = soup.select("div.views-row")
        faculty_data = []
        
        def get_text(elem):
            # Replace newlines with spaces
            return elem.get_text(separator=" ", strip=True) if elem else None
        
        for item in faculty_items:
            # Extract name from the title field
            name_elem = item.select_one("div.views-field-title span.field-content a")
            name = get_text(name_elem)
            
            # Extract profile URL from the name link
            profile_url = name_elem["href"] if name_elem else None
            # Add base URL if the link is relative and base_url is provided
            if profile_url and profile_url.startswith('/') and base_url:
                profile_url = base_url + profile_url
            
            # Extract affiliation/title information
            affiliation_elem = item.select_one("div.views-field-field-affiliation div.field-content")
            if affiliation_elem:
                # Get all list items in the affiliation
                affiliation_items = affiliation_elem.select("ul li")
                title = ", ".join([get_text(li) for li in affiliation_items]) if affiliation_items else get_text(affiliation_elem)
            else:
                title = None
            
            # Extract specialization
            specialization_elem = item.select_one("div.views-field-field-specialization div.field-content")
            specialization = get_text(specialization_elem)
            
            # Extract email
            email_elem = item.select_one("div.views-field-field-contact-email div.field-content a")
            email = get_text(email_elem)
            
            # Extract phone
            phone_elem = item.select_one("div.views-field-field-contact-phone div.field-content")
            phone = get_text(phone_elem)
            
            # Extract office location
            office_elem = item.select_one("div.views-field-field-office-location div.field-content")
            office = get_text(office_elem)
            
            # Extract website
            website_elem = item.select_one("div.views-field-field-website div.field-content a")
            website = website_elem["href"] if website_elem else None
            
            # Extract photo URL
            photo_elem = item.select_one("div.views-field-field-photo div.field-content a img")
            photo_url = photo_elem["src"] if photo_elem else None
            # Add base URL to photo URL if it's relative and base_url is provided
            if photo_url and photo_url.startswith('/') and base_url:
                photo_url = base_url + photo_url
            
            faculty_data.append({
                "Name": name,
                "Title(s)": title,
                "Specialization": specialization,
                "Email": email,
                "Phone": phone,
                "Office": office,
                "Website": website,
                "Profile URL": profile_url,
                "Photo URL": photo_url,
                "Department": "Chemistry and Biochemistry"
            })
    
    return pd.DataFrame(faculty_data)

In [None]:
# Test the Chemistry and Biochemistry scraper
df = chemistry_biochemistry(departments['Chemistry and Biochemistry']['filename'], departments['Chemistry and Biochemistry']['url'])
print(f"Found {len(df)} faculty members")
if len(df) > 0:
    display(df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in df['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")
# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(df['Name'].head(3), df['Title(s)'].head(3))):
    print(f"{name}: {title}")


Found 49 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Profile URL,Photo URL,Department
0,Mahdi Abu-Omar,"Professor, Mellichamp Chair of Green Chemistry...","Inorganic & Organometallic Energy, Catalysis &...",abuomar@chem.ucsb.edu,805-893-5043,Chem 3241,https://carlobroderick.wixsite.com/website,https://www.chem.ucsb.edu/people/academic/peop...,https://www.chem.ucsb.edu/sites/default/files/...,Chemistry and Biochemistry
1,Mercy Anyika,Lecturer,Organic Chemistry Education,manyika@ucsb.edu,,CHEM 4314,,https://www.chem.ucsb.edu/people/academic/peop...,https://www.chem.ucsb.edu/sites/default/files/...,Chemistry and Biochemistry
2,Christopher Bates,"Associate Professor Materials, Joint Appt: DCB",,cbates@ucsb.edu,(805) 893-5383,1518 Engineering II,https://labs.materials.ucsb.edu/bates/christop...,https://www.chem.ucsb.edu/people/academic/peop...,https://www.chem.ucsb.edu/sites/default/files/...,Chemistry and Biochemistry


All URLs are valid: True

Sample of faculty titles:
Mahdi Abu-Omar: Professor, Mellichamp Chair of Green Chemistry, Joint Appt: Chemical Engineering
Mercy Anyika: Lecturer
Christopher  Bates: Associate Professor Materials, Joint Appt: DCB


## Chicana and Chicano Studies

![chican_and_chicano_studies](../faculty_screenshots/Chicana_and_Chicano_Studies.png){width=300}

In [None]:
#| export
def chicana_chicano_studies(file_html, base_url=""):
    """
    Extract faculty data from the UCSB Chicana and Chicano Studies department HTML file.
    
    Parameters:
    -----------
    file_html : str
        Path to the HTML file
    base_url : str, optional
        Base URL to prepend to relative URLs. Default is empty string.
        
    Returns:
    --------
    pandas.DataFrame
        DataFrame containing faculty information
    """
    from bs4 import BeautifulSoup
    import pandas as pd
    
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        
        # Find all faculty entries
        faculty_items = soup.select("div.views-row.academic")
        faculty_data = []
        
        def get_text(elem):
            # Replace newlines with spaces
            return elem.get_text(separator=" ", strip=True) if elem else None
        
        for item in faculty_items:
            # Extract name from the title field
            name_elem = item.select_one("div.views-field-title span.field-content a")
            name = get_text(name_elem)
            
            # Extract profile URL from the name link
            profile_url = name_elem["href"] if name_elem else None
            # Add base URL if the link is relative and base_url is provided
            if profile_url and profile_url.startswith('/') and base_url:
                profile_url = base_url + profile_url
            
            # Extract affiliation/title information
            affiliation_elem = item.select_one("div.views-field-field-affiliation div.field-content")
            if affiliation_elem:
                # Get all list items in the affiliation
                affiliation_items = affiliation_elem.select("ul li")
                title = ", ".join([get_text(li) for li in affiliation_items]) if affiliation_items else get_text(affiliation_elem)
            else:
                title = None
            
            # Extract specialization
            specialization_elem = item.select_one("div.views-field-field-specialization div.field-content")
            specialization = get_text(specialization_elem)
            
            # Extract email
            email_elem = item.select_one("div.views-field-field-contact-email div.field-content a")
            email = get_text(email_elem)
            
            # Extract phone
            phone_elem = item.select_one("div.views-field-field-contact-phone div.field-content")
            phone = get_text(phone_elem)
            
            # Extract office location
            office_elem = item.select_one("div.views-field-field-office-location div.field-content")
            office = get_text(office_elem)
            
            # Extract website
            website_elem = item.select_one("div.views-field-field-website div.field-content a")
            website = website_elem["href"] if website_elem else None
            
            # Extract photo URL
            photo_elem = item.select_one("div.views-field-field-photo div.field-content a img")
            photo_url = photo_elem["src"] if photo_elem else None
            # Add base URL to photo URL if it's relative and base_url is provided
            if photo_url and photo_url.startswith('/') and base_url:
                photo_url = base_url + photo_url
            
            faculty_data.append({
                "Name": name,
                "Title(s)": title,
                "Specialization": specialization,
                "Email": email,
                "Phone": phone,
                "Office": office,
                "Website": website,
                "Profile URL": profile_url,
                "Photo URL": photo_url,
                "Department": "Chicana and Chicano Studies"
            })
    
    return pd.DataFrame(faculty_data)

In [None]:
# Test the chicana and chicano studies scraper
df = chicana_chicano_studies(departments['Chicana and Chicano Studies']['filename'], departments['Chicana and Chicano Studies']['url'])
print(f"Found {len(df)} faculty members")
if len(df) > 0:
    display(df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in df['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")
# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(df['Name'].head(3), df['Title(s)'].head(3))):
    print(f"{name}: {title}")


Found 12 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Profile URL,Photo URL,Department
0,Gerardo Aldana,Professor,,gvaldana@ucsb.edu,,1710 South Hall,,https://www.chicst.ucsb.edu/people/academic/pe...,https://www.chicst.ucsb.edu/sites/default/file...,Chicana and Chicano Studies
1,Ralph Armbruster-Sandoval,Professor,"Race, labor, empire, social movements, urban s...",ralpharmbruster@ucsb.edu,(805) 722-4556,1706 South Hall,,https://www.chicst.ucsb.edu/people/academic/pe...,https://www.chicst.ucsb.edu/sites/default/file...,Chicana and Chicano Studies
2,Giovanni Batz,Assistant Professor,"Maya social movements and resistance, extracti...",gbatz@ucsb.edu,,SH 1723,,https://www.chicst.ucsb.edu/people/academic/pe...,https://www.chicst.ucsb.edu/sites/default/file...,Chicana and Chicano Studies


All URLs are valid: True

Sample of faculty titles:
Gerardo Aldana: Professor
Ralph Armbruster-Sandoval: Professor
Giovanni Batz: Assistant Professor


## Classics

![classics](../faculty_screenshots/Classics.png){width=300}


In [None]:
#| export
def classics(file_html, base_url=""):
    """
    Extract faculty data from the UCSB Classics department HTML file.

    Parameters:
    -----------
    file_html : str
        Path to the HTML file
    base_url : str, optional
        Base URL to prepend to relative URLs. Default is empty string.

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing faculty information
    """
    from bs4 import BeautifulSoup
    import pandas as pd

    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    faculty_data = []
    table = soup.select_one("div#primary table.directory.responsive")

    if not table:
        return pd.DataFrame()  # Return empty DataFrame if no table found

    rows = table.find_all("tr")

    def get_text(elem):
        return elem.get_text(separator=" ", strip=True) if elem else None

    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 2:
            continue

        # Extract name and website from name link
        name_link = cols[0].find("a")
        name = get_text(name_link)
        website = urljoin(base_url, name_link['href']) if name_link and name_link.has_attr('href') else None

        # Title info from second column
        title = get_text(cols[1])

        # Email link if available
        email_elem = row.select_one("a[href^='mailto']")
        email = get_text(email_elem)

        # Profile photo URL from image
        img_elem = row.find("img")
        photo_url = urljoin(base_url, img_elem["src"]) if img_elem and img_elem.has_attr("src") else None

        faculty_data.append({
            "Name": name,
            "Title(s)": title,
            "Specialization": None,
            "Email": email,
            "Phone": None,
            "Office": None,
            "Website": website,
            "Profile URL": photo_url,
            "Photo URL": photo_url,
            "Department": "Classics"
        })

    return pd.DataFrame(faculty_data)

In [None]:
# Test the Classics scraper
df = classics(departments['Classics']['filename'], departments['Classics']['url'])
print(f"Found {len(df)} faculty members")
if len(df) > 0:
    display(df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in df['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")
# Test - extract title correctly
print("\nSample of faculty titles:")
for i, (name, title) in enumerate(zip(df['Name'].head(3), df['Title(s)'].head(3))):
    print(f"{name}: {title}")



Found 1 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Profile URL,Photo URL,Department
0,,"Capettini, Emilio Assistant Professor",,ecapettini@classics.ucsb.edu,,,https://www.classics.ucsb.edu/faculty/emilio-c...,https://www.classics.ucsb.edu/wp-content/uploa...,https://www.classics.ucsb.edu/wp-content/uploa...,Classics


All URLs are valid: True

Sample of faculty titles:
: Capettini, Emilio Assistant Professor


## Computer Science Department

The Computer Science department's faculty page uses a different structure than previous departments. Faculty information is displayed in `div` elements with a specific pattern for faculty details.

![computer_science](../faculty_screenshots/Computer_Science.png){width=300}

In [None]:
#| export
def computer_science(file_html, base_url="https://www.cs.ucsb.edu"):
    "Extract faculty data from the UCSB Computer Science department HTML file."
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    faculty_blocks = soup.select("div.views-row div.views-field-nothing span.field-content > div")
    faculty_data = []

    for block in faculty_blocks:
        # Each faculty member has two main div elements - one for photo, one for info
        info_div = block.find_all('div')[1] if len(block.find_all('div')) > 1 else None
        
        if not info_div:
            continue
            
        # Extract name and profile URL
        name_elem = info_div.find('h2')
        if not name_elem or not name_elem.find('a'):
            continue
            
        name = name_elem.find('a').get_text(strip=True)
        profile_url = name_elem.find('a').get('href', '')
        if profile_url and profile_url.startswith('/'):
            profile_url = f"https://www.cs.ucsb.edu{profile_url}"
        
        # Extract paragraphs for other information
        paragraphs = info_div.find_all('p')
        
        # Initialize values
        title = specialization = email = phone = office = personal_website = None
        
        # Parse paragraphs to extract information
        for i, p in enumerate(paragraphs):
            if i == 0 and "He/Him" not in p.text and "She/Her" not in p.text:
                # First paragraph is often the title
                title = p.get_text(strip=True)
            elif i == 1 or (i == 2 and title is None):
                # Usually the title or specialization
                if not title:
                    title = p.get_text(strip=True)
                else:
                    specialization = p.get_text(strip=True)
            elif "@" in p.text:
                # Extract email from paragraph containing @
                email_link = p.find('a')
                if email_link:
                    email = email_link.get_text(strip=True)
            elif p.find('a') and ("Personal Website" in p.text or "Google Scholar" in p.text):
                # Extract personal website
                website_link = p.find('a')
                if website_link:
                    personal_website = website_link.get('href', '')
            elif "Hall" in p.text:
                # Extract office location
                office = p.get_text(strip=True)
            elif p.text and p.text.strip() and not p.find('a') and "Ph.D" not in p.text:
                # Any paragraph with a phone number
                if re.search(r'\d{3}[.-]\d{3}[.-]\d{4}', p.text) or re.search(r'\(\d{3}\)\s*\d{3}-\d{4}', p.text):
                    phone = p.get_text(strip=True)

        faculty_data.append({
            "Name": name,
            "Title(s)": title,
            "Specialization": specialization,
            "Email": email,
            "Phone": phone,
            "Office": office,
            "Website": personal_website if personal_website else profile_url,
            "Photo URL": None,  # Could extract this from the image if needed
            "Department": "Computer Science"
        })

    return pd.DataFrame(faculty_data)

In [None]:
# Test the Computer Science scraper
file_path = "../faculty_html/Computer_Science.html"
cs_faculty = computer_science(file_path)
print(f"Found {len(cs_faculty)} faculty members")
if len(cs_faculty) > 0:
    display(cs_faculty.head(3))
    
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in cs_faculty['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")

# Test - extract title correctly
print("\nSample of CS faculty titles:")
for i, (name, title) in enumerate(zip(cs_faculty['Name'].head(3), cs_faculty['Title(s)'].head(3))):
    print(f"{name}: {title}")

Found 43 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL,Department
0,Divyakant Agrawal,Distinguished Professor & Chair,,agrawal@cs.ucsb.edu,(805)893-4385,3117 Harold Frank Hall,https://www.cs.ucsb.edu/~agrawal/,,Computer Science
1,Prabhanjan Ananth,Assistant Professor,,prabhanjan@ucsb.edu,,1119 Harold Frank Hall,https://sites.google.com/site/prabhanjanva/,,Computer Science
2,Jonathan Balkind,Assistant Professor,,jbalkind@ucsb.edu,,2007 Henley Hall,https://jbalkind.github.io/,,Computer Science


All URLs are valid: True

Sample of CS faculty titles:
Divyakant Agrawal: Distinguished Professor & Chair
Prabhanjan Ananth: Assistant Professor
Jonathan Balkind: Assistant Professor


## English Department


In [None]:
#| export
def english(file_html, base_url="https://www.english.ucsb.edu"):
    """Extract faculty data from the UCSB English department HTML file."""
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        
        # Find all faculty list rows
        faculty_blocks = soup.select("li.table--list--row")
        faculty_data = []
        
        for block in faculty_blocks:
            # Skip the table header row
            if "table--head" in block.get("class", []):
                continue
                
            # Extract information from the flex table
            flex_table = block.select_one("div.flex--table")
            if not flex_table:
                continue
                
            # Extract name and profile URL
            name_elem = flex_table.select_one("div.table--name a")
            if not name_elem:
                continue
                
            name = name_elem.get_text(strip=True)
            profile_url = name_elem.get("href", "")
            if profile_url and not profile_url.startswith("http"):
                profile_url = f"https://www.english.ucsb.edu{profile_url}"
                
            # Extract title/position
            title_elem = flex_table.select_one("div.table--position")
            title = title_elem.get_text(strip=True) if title_elem else None
            
            # Extract office and email information
            contact_elem = flex_table.select_one("div.table--desc")
            office = None
            email = None
            
            if contact_elem:
                # The office location is typically before the <br> tag
                office_text = contact_elem.contents[0] if contact_elem.contents else ""
                if isinstance(office_text, str):
                    office = office_text.strip()
                
                # The email is typically in an <a> tag
                email_elem = contact_elem.select_one("a")
                if email_elem:
                    email = email_elem.get_text(strip=True)
            
            # Extract photo URL if available
            photo_elem = flex_table.select_one("div.img-portrait-archive img")
            photo_url = None
            if photo_elem:
                photo_url = photo_elem.get("src", "")
                if photo_url and not photo_url.startswith("http"):
                    photo_url = f"https://www.english.ucsb.edu{photo_url}"
            
            # Extract research areas
            research_areas_class = block.get("data-category", "").strip()
            research_areas = [area.strip() for area in research_areas_class.split(" ") if area.strip()]
            
            # Format the research areas to be more readable
            readable_research_areas = []
            for area in research_areas:
                # Replace hyphens with spaces and capitalize each word
                readable_area = " ".join(word.capitalize() for word in area.replace("-", " ").split())
                readable_research_areas.append(readable_area)
            
            specialization = ", ".join(readable_research_areas) if readable_research_areas else None
            
            faculty_data.append({
                "Name": name,
                "Title(s)": title,
                "Specialization": specialization,
                "Email": email,
                "Phone": None,  # Phone not available in this HTML
                "Office": office,
                "Website": profile_url,
                "Photo URL": photo_url,
                "Department": "English"
            })
            
        return pd.DataFrame(faculty_data)

In [None]:
# Test the English department scraper
file_path = "../faculty_html/English.html"
faculty = english(file_path)
print(f"Found {len(faculty)} faculty members")
if len(faculty) > 0:
    display(faculty.head(3))
    
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in faculty['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")

# Test - extract office location correctly
print("\nSample of English faculty office locations:")
for i, row in faculty.head(3).iterrows():
    print(f"{row['Name']}: Office = {row['Office']}")

Found 36 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL,Department
0,"Amin-Hong, Heidi",Assistant Professor,"American Literature, American Race And Ethnic ...",hamin-hong@english.ucsb.edu,,SH 2720,https://www.english.ucsb.edu/people/faculty/am...,https://www.english.ucsb.edu/wp-content/upload...,English
1,"Andrea, Bernadette",Professor,"C 1500 And Earlier, C 1500 1800, C 1800 1945, ...",bernadette.andrea@english.ucsb.edu,,South Hall 2501,https://www.english.ucsb.edu/people/faculty/an...,https://www.english.ucsb.edu/wp-content/upload...,English
2,"Batiste, Stephanie",Professor,"C 1945 Present, African American And Or Africa...",sbatiste@english.ucsb.edu,,South Hall 2722,https://www.english.ucsb.edu/people/faculty/ba...,https://www.english.ucsb.edu/wp-content/upload...,English


All URLs are valid: True

Sample of English faculty office locations:
Amin-Hong, Heidi: Office = SH 2720
Andrea, Bernadette: Office = South Hall 2501
Batiste, Stephanie: Office = South Hall 2722


## Mathematics Department Scraper

The Mathematics department uses a structure similar to the Asian American Studies and Anthropology departments. We can adapt that pattern for this scraper.

Note: Mathematics only specifies Ladder Faculty on their web pages, so we do not have full position information (Assistant, Associate, Full Professor)

In [None]:
#| export
def mathematics(file_html, base_url="https://www.math.ucsb.edu"):
    """
    Scraper specifically for the UCSB Mathematics Department faculty page.
    
    Parameters:
    -----------
    file_html : str
        Path to the HTML file to parse
    base_url : str, optional
        Base URL for the Math department website for converting relative URLs to absolute
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame containing faculty information
    """
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        
        # Find all faculty entries in the views-row with class academic
        faculty_blocks = soup.select("div.views-row.academic")
        
        if not faculty_blocks:
            print("Warning: No faculty blocks found. Check HTML structure.")
        
        faculty_data = []
        
        for block in faculty_blocks:
            # Extract name and profile URL from title field
            title_field = block.select_one("div.views-field-title")
            if not title_field:
                continue
            
            name_link = title_field.select_one("a")
            if not name_link:
                continue
            
            name = name_link.get_text(strip=True)
            profile_url = name_link.get("href", "")
            if profile_url and not profile_url.startswith("http"):
                profile_url = urljoin(base_url, profile_url)
            
            # Extract photo URL
            photo_url = None
            photo_field = block.select_one("div.views-field-field-photo")
            if photo_field:
                img = photo_field.select_one("img")
                if img:
                    photo_url = img.get("src", "")
                    if photo_url and not photo_url.startswith("http"):
                        photo_url = urljoin(base_url, photo_url)
            
            # Extract affiliation/title
            title = None
            affiliation_field = block.select_one("div.views-field-field-affiliation")
            if affiliation_field:
                affiliation_items = affiliation_field.select("li")
                title_items = [item.get_text(strip=True) for item in affiliation_items]
                title = ", ".join(title_items) if title_items else None
            
            # Extract specialization
            specialization = None
            spec_field = block.select_one("div.views-field-field-specialization")
            if spec_field:
                # Get all paragraph contents
                spec_paras = spec_field.select("p")
                if spec_paras:
                    specialization = " ".join([p.get_text(strip=True) for p in spec_paras])
                else:
                    # If no paragraphs, get direct text content
                    specialization = spec_field.get_text(strip=True)
            
            # Extract contact information from the contact div
            contact_div = block.select_one("div.contact")
            email = phone = office = website = None
            
            if contact_div:
                # Extract email
                email_field = contact_div.select_one("div.views-field-field-contact-email")
                if email_field:
                    email_link = email_field.select_one("a")
                    if email_link:
                        email = email_link.get_text(strip=True)
                
                # Extract phone
                phone_field = contact_div.select_one("div.views-field-field-contact-phone")
                if phone_field:
                    phone_items = phone_field.select("li")
                    if phone_items:
                        phone = phone_items[0].get_text(strip=True)
                
                # Extract office location
                office_field = contact_div.select_one("div.views-field-field-office-location")
                if office_field:
                    office_items = office_field.select("li")
                    if office_items:
                        office = office_items[0].get_text(strip=True)
                
                # Extract personal website
                website_field = contact_div.select_one("div.views-field-field-website")
                if website_field:
                    website_link = website_field.select_one("a")
                    if website_link:
                        website = website_link.get("href", "")
            
            faculty_data.append({
                "Name": name,
                "Title(s)": title,
                "Specialization": specialization,
                "Email": email,
                "Phone": phone,
                "Office": office,
                "Website": website if website else profile_url,
                "Photo URL": photo_url,
                "Department": "Mathematics"
            })
        
        return pd.DataFrame(faculty_data)


#### Test the Mathematics Department scraper function

In [None]:
# Test the Mathematics department scraper
df = mathematics(departments['Mathematics']['filename'], departments['Mathematics']['domain'])
print(f"Found {len(df)} faculty members")
if len(df) > 0:
    display(df.head(3))
# Check if we have proper URLs
has_valid_urls = all(url.startswith('http') for url in df['Website'] if pd.notnull(url))
print(f"All URLs are valid: {has_valid_urls}")
# Test - extract title correctly
print("\nSample of Mathematics faculty titles:")
for i, (name, title) in enumerate(zip(df['Name'].head(3), df['Title(s)'].head(3))):
    print(f"{name}: {title}")


Found 35 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL,Department
0,Adebisi Agboola,Ladder Faculty,Number Theory,agboola@math.ucsb.edu,(805) 893-3844,RM. 6724,http://www.math.ucsb.edu/~agboola,,Mathematics
1,Paul Atzberger,Ladder Faculty,Applied Mathematics and Computational Mathematics,atzberg@ucsb.edu,(805) 893-3239,RM. 6712,http://web.math.ucsb.edu/~atzberg,https://www.math.ucsb.edu/sites/default/files/...,Mathematics
2,Stephen Bigelow,Ladder Faculty,Low-dimensional Topology,bigelow@math.ucsb.edu,,RM. 6514,http://www.math.ucsb.edu/~bigelow,https://www.math.ucsb.edu/sites/default/files/...,Mathematics


All URLs are valid: True

Sample of Mathematics faculty titles:
Adebisi Agboola: Ladder Faculty
Paul Atzberger: Ladder Faculty
Stephen Bigelow: Ladder Faculty


## Wordpress-based Department Scraper

Some departments like English use Wordpress instead of Drupal. This scraper is designed for those departments.

In [None]:
#| export

def wordpress_department(file_html, base_url=None):
    """
    Scraper for UCSB departments using WordPress sites.
    
    Args:
        file_html: Path to the HTML file
        base_url: Base URL for the department's website (for resolving relative links)
    """
    with open(file_html, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    
    # Try to detect the department domain if not provided
    if not base_url:
        meta_tags = soup.select('meta[content*="ucsb.edu"]')
        for tag in meta_tags:
            content = tag.get('content', '')
            if 'ucsb.edu' in content:
                match = re.search(r'https?://[^/]*\.ucsb\.edu', content)
                if match:
                    base_url = match.group(0)
                    break
    
    # WordPress sites often use different structures, but many use lists or grids for faculty
    faculty_rows = (
        soup.select("li.table--list--row") or  # English department style
        soup.select("div.faculty-member") or   # Another common pattern
        soup.select("article.faculty") or      # Another common pattern
        []
    )
    
    faculty_data = []

    for row in faculty_rows:
        # Try different selectors for name and profile URL
        name_elem = (
            row.select_one("div.table--name a") or
            row.select_one("h2 a") or
            row.select_one("h3 a") or
            row.select_one("a.faculty-name")
        )
        
        if not name_elem:
            continue
            
        name = name_elem.get_text(strip=True)
        
        # Extract profile URL
        profile_url = name_elem.get('href', '')
        if profile_url and profile_url.startswith('/') and base_url:
            profile_url = f"{base_url}{profile_url}"
        
        # Try different selectors for title/position
        title_elem = (
            row.select_one("div.table--position") or
            row.select_one("div.faculty-title") or
            row.select_one("p.title")
        )
        title = title_elem.get_text(strip=True) if title_elem else None
        
        # Look for email - WordPress sites often use mailto links
        email_elem = row.select_one("a[href^='mailto:']")
        email = email_elem.get_text(strip=True) if email_elem else None
        
        # Look for office location - typically in description/contact div or after email
        desc_div = (
            row.select_one("div.table--desc") or
            row.select_one("div.faculty-contact") or
            row.select_one("div.contact-info")
        )
        
        office = None
        if desc_div:
            # Try to parse office location from text
            content = desc_div.get_text(strip=True)
            # Common patterns for office locations: "Room X", "Building Y, Room Z"
            hall_match = re.search(r'(?:Room|Rm\.?|Office:?)\s+\w+[-\d]*|[A-Za-z]+ Hall [^,\.]*', content)
            if hall_match:
                office = hall_match.group(0)
            elif "Hall" in content:
                # Try to extract sentences containing "Hall"
                parts = re.split(r'[,\.]', content)
                office = next((part.strip() for part in parts if "Hall" in part), None)
        
        # Find photo if available
        photo_elem = (
            row.select_one("div.img-portrait-archive img") or
            row.select_one("div.faculty-photo img") or
            row.select_one("img.faculty-image")
        )
        photo_url = photo_elem.get('src', '') if photo_elem else None

        faculty_data.append({
            "Name": name,
            "Title(s)": title,
            "Specialization": None,  # Often not structured in WordPress sites
            "Email": email,
            "Phone": None,  # Often not structured in WordPress sites
            "Office": office,
            "Website": profile_url,
            "Photo URL": photo_url
        })

    return pd.DataFrame(faculty_data)

In [None]:
# Test the WordPress department scraper with English
file_path = "../faculty_html/English.html"
wp_faculty = wordpress_department(file_path, "https://www.english.ucsb.edu")
print(f"Found {len(wp_faculty)} faculty members")
wp_faculty.head(3)

Found 36 faculty members


Unnamed: 0,Name,Title(s),Specialization,Email,Phone,Office,Website,Photo URL
0,"Amin-Hong, Heidi",Assistant Professor,,hamin-hong@english.ucsb.edu,,,https://www.english.ucsb.edu/people/faculty/am...,https://www.english.ucsb.edu/wp-content/upload...
1,"Andrea, Bernadette",Professor,,bernadette.andrea@english.ucsb.edu,,South Hall 2501bernadette,https://www.english.ucsb.edu/people/faculty/an...,https://www.english.ucsb.edu/wp-content/upload...
2,"Batiste, Stephanie",Professor,,sbatiste@english.ucsb.edu,,South Hall 2722sbatiste@english,https://www.english.ucsb.edu/people/faculty/ba...,https://www.english.ucsb.edu/wp-content/upload...


## Main Scraper Function

Now we can define a main function that will attempt to determine the best scraper for each department based on page structure and fallback to the generic ones when needed.

In [None]:
# Create a full implementation of specific department scrapers
import os

def create_scrapers_for_all_departments():
    """Test each department with its own specialized scraper or fallback to generic scraper"""
    html_dir = "../faculty_html"
    all_files = [f for f in os.listdir(html_dir) if f.endswith('.html')]
    
    overall_results = {}
    
    for dept_file in all_files:
        dept_name = dept_file.replace('.html', '').replace('_', ' ')
        file_path = os.path.join(html_dir, dept_file)
        
        try:
            # Try department-specific scraper if it exists
            scraper_func, base_url = get_department_scraper(file_path)
            if base_url:
                faculty = scraper_func(file_path, base_url)
            else:
                faculty = scraper_func(file_path)
                
            # Record results
            faculty_count = len(faculty)
            valid_urls = 0
            if faculty_count > 0:
                valid_urls = sum(1 for url in faculty['Website'] if pd.notnull(url) and url.startswith('http'))
            
            overall_results[dept_name] = {
                'faculty_count': faculty_count,
                'valid_urls': valid_urls,
                'valid_url_pct': round(valid_urls / faculty_count * 100, 1) if faculty_count > 0 else 0
            }
            
        except Exception as e:
            overall_results[dept_name] = {
                'faculty_count': 0,
                'valid_urls': 0,
                'valid_url_pct': 0,
                'error': str(e)
            }
    
    # Create a DataFrame for better display
    results_df = pd.DataFrame.from_dict(overall_results, orient='index')
    results_df = results_df.sort_values('faculty_count', ascending=False)
    return results_df

# Run tests on all departments
all_dept_results = create_scrapers_for_all_departments()
print("Department Scraper Results Summary:")
display(all_dept_results)

Department Scraper Results Summary:


Unnamed: 0,faculty_count,valid_urls,valid_url_pct,error
Chicana and Chicano Studies,0,0,0,name 'get_department_scraper' is not defined
Film and Media Studies,0,0,0,name 'get_department_scraper' is not defined
History,0,0,0,name 'get_department_scraper' is not defined
Economics,0,0,0,name 'get_department_scraper' is not defined
Developmental Biology,0,0,0,name 'get_department_scraper' is not defined
Global and International Studies,0,0,0,name 'get_department_scraper' is not defined
Gevirtz Graduate School of Education,0,0,0,name 'get_department_scraper' is not defined
Chemical Engineering,0,0,0,name 'get_department_scraper' is not defined
College of Creative Studies,0,0,0,name 'get_department_scraper' is not defined
Comparative Literature,0,0,0,name 'get_department_scraper' is not defined


## Department-Specific Improvements

Based on the results above, let's create a dictionary of domain names to help with URL resolution for departments.

In [None]:
#| export
# Dictionary of department domains for proper URL resolution
DEPARTMENT_DOMAINS = {
    'anthropology': 'https://www.anth.ucsb.edu',
    'art': 'https://www.arts.ucsb.edu',
    'asian american studies': 'https://www.asamst.ucsb.edu',
    'black studies': 'https://www.blackstudies.ucsb.edu',
    'chemical engineering': 'https://chemengr.ucsb.edu',
    'chemistry and biochemistry': 'https://www.chem.ucsb.edu',
    'computer science': 'https://www.cs.ucsb.edu',
    'earth science': 'https://www.geol.ucsb.edu',
    'electrical and computer engineering': 'https://ece.ucsb.edu',
    'english': 'https://www.english.ucsb.edu',
    'history': 'https://www.history.ucsb.edu',
    'mathematics': 'https://www.math.ucsb.edu',
    'mechanical engineering': 'https://me.ucsb.edu',
    'physics': 'https://www.physics.ucsb.edu',
    'political science': 'https://www.polsci.ucsb.edu',
    'psychological brain sciences': 'https://www.psych.ucsb.edu',
    'religious studies': 'https://www.religion.ucsb.edu',
    'sociology': 'https://www.soc.ucsb.edu',
    'statistics and applied probability': 'https://www.pstat.ucsb.edu',
}

In [None]:
#| export

def scrape_department(file_html):
    """
    Main function to scrape faculty information from a department HTML file.
    Also ensures URLs are properly formatted with domain names.
    
    Args:
        file_html: Path to the HTML file
        
    Returns:
        A DataFrame with faculty information including properly formatted URLs
    """
    # Extract department name from file path
    department_name = file_html.split('/')[-1].replace('.html', '').replace('_', ' ')
    
    # Get appropriate scraper
    scraper_func, base_url = get_department_scraper(file_html)
    
    # Run the scraper
    if base_url:
        faculty_df = scraper_func(file_html, base_url)
    else:
        faculty_df = scraper_func(file_html)
    
    # Update relative URLs to absolute URLs
    faculty_df = update_department_urls(faculty_df, department_name)
    
    return faculty_df

## Improved Department-Specific Scrapers

Now let's update the get_scraper function to ensure it correctly handles the scrapers and the proper website domain names.

In [None]:
#| export

def get_scraper(name):
    """
    Return the appropriate scraper function for a unit.
    This is the main entry point used by the Unit class.
    
    Args:
        name: Department/unit name 
    
    Returns:
        A function that takes a file path and returns a DataFrame of faculty information
    """
    # The name is used both for selecting a specialized scraper and for URL domain lookup
    name_lower = name.lower()
    
    # Custom specialized scrapers
    if name_lower == "art":
        return art
    elif name_lower == "anthropology":
        return anthropology
    elif name_lower == "asian american studies":
        return asian_american_studies
    elif name_lower == "computer science":
        return computer_science
    elif name_lower == "english":
        return english
    elif name_lower == "mathematics":
        return mathematics
    else:
        # For other departments, use the generic scraper function
        # that will detect the best approach and handle URL fixing
        return scrape_department