# Determine the type of page

In [None]:
import glob
import os
import re

import pandas as pd
from bs4 import BeautifulSoup

In [None]:
def is_geog_msi_style(html):
    """
    Report whether a page uses the MSI-style Geography faculty listing markup.

    The page qualifies when its first ``div.view-content`` contains at least
    one ``div.views-row`` holding all of the following:

    * a ``div.group-first``;
    * a ``div.media--blazy`` or a ``div.media--image``;
    * a ``div.group-second`` whose first ``<h3>`` contains an ``<a>``.

    Args:
        html: Markup of the page, in any form BeautifulSoup accepts.

    Returns:
        True if a matching row is found, False otherwise (including when the
        page has no ``div.view-content`` at all).
    """
    listing = BeautifulSoup(html, "html.parser").find("div", class_="view-content")
    if listing is None:
        return False

    def row_matches(entry):
        # Both layout columns must be present in the row.
        if entry.find("div", class_="group-first") is None:
            return False
        name_column = entry.find("div", class_="group-second")
        if name_column is None:
            return False

        # Either flavour of image wrapper is accepted.
        if (
            entry.find("div", class_="media--blazy") is None
            and entry.find("div", class_="media--image") is None
        ):
            return False

        # The person's name is expected as a link inside the first <h3>.
        heading = name_column.find("h3")
        return heading is not None and heading.find("a") is not None

    # A single well-formed row is enough to classify the whole page.
    return any(
        row_matches(entry)
        for entry in listing.find_all("div", class_="views-row")
    )

In [None]:
# Run the function against every file in the faculty_html directory
def test_is_geog_msi_style():
    """
    Test the is_geog_msi_style function against HTML files in the faculty_html directory.

    Every ``*.html`` file directly inside ``faculty_html`` (relative to the
    current working directory) is read as UTF-8, and one line is printed for
    each file that ``is_geog_msi_style`` accepts. Files that do not match are
    skipped silently. Nothing is returned.
    """
    from pathlib import Path

    faculty_html_dir = Path("faculty_html")
    # Directory listing order is not guaranteed, so sort the paths to make the
    # printed report identical from run to run.
    for html_file in sorted(faculty_html_dir.glob("*.html")):
        # read_text opens, reads and closes the file in one step, so the
        # handle is not kept open while the page is being parsed.
        html_content = html_file.read_text(encoding="utf-8")
        if is_geog_msi_style(html_content):
            print(f"{html_file} is geog_msi style")

In [None]:

def _theme_from_href(href):
    """Return the theme name suggested by a stylesheet URL, or None if none is found."""
    # Preferred signal: the path segment that follows `/themes/`. Drupal 8+
    # conventionally groups themes under `themes/custom/` or `themes/contrib/`,
    # so that grouping folder is skipped when another segment follows it;
    # otherwise "custom"/"contrib" would be reported as the theme name.
    match = re.search(r'/themes/(?:(?:custom|contrib)/)?([^/]+)/', href)
    if match is None:
        # Fallback: a CSS file whose name contains "theme". The dot is escaped
        # so only a literal ".css" extension matches (not e.g. "xcss").
        match = re.search(r'/([^/]*theme[^/]*)\.css', href)
    return match.group(1) if match else None


def detect_drupal_theme_robust(html_path):
    """
    Collect Drupal version and theme hints from a saved HTML page.

    Args:
        html_path: Path of the HTML file to inspect; it is read as UTF-8.

    Returns:
        A dict with these keys (each value is None when nothing was found):

        * ``file`` - base name of ``html_path``.
        * ``drupal_version`` - content of the ``<meta name="Generator">`` tag
          when that content mentions "Drupal".
        * ``theme_from_meta`` - always None; no meta-based theme detection
          is implemented here.
        * ``theme_from_stylesheet`` - theme name taken from the first
          ``<link rel="stylesheet">`` whose href reveals one.
        * ``theme_from_body_class`` - first ``<body>`` class containing
          "theme", "ucsb", "sands" or "brick".
    """
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    result = {
        "file": os.path.basename(html_path),
        "drupal_version": None,
        "theme_from_meta": None,
        "theme_from_stylesheet": None,
        "theme_from_body_class": None
    }

    # 1. Drupal Version
    meta_gen = soup.find("meta", attrs={"name": "Generator"})
    if meta_gen and "Drupal" in meta_gen.get("content", ""):
        result["drupal_version"] = meta_gen["content"]

    # 2. Theme from stylesheets: stop at the first href that yields a name
    #    instead of collecting every match and discarding all but the first.
    stylesheet_themes = (
        _theme_from_href(sheet.get("href", ""))
        for sheet in soup.find_all("link", rel="stylesheet")
    )
    result["theme_from_stylesheet"] = next(
        (theme for theme in stylesheet_themes if theme), None
    )

    # 3. Theme from body class: first class containing any known marker.
    markers = ('theme', 'ucsb', 'sands', 'brick')
    body = soup.find("body")
    if body:
        result["theme_from_body_class"] = next(
            (
                cls
                for cls in body.get("class", [])
                if any(marker in cls for marker in markers)
            ),
            None,
        )

    return result

In [None]:

# Detect theme hints for every saved faculty page and export them as CSV.
# glob.glob returns paths in arbitrary order, so sort them to keep the rows
# of detected_themes.csv in a stable order from run to run.
html_files = sorted(glob.glob("faculty_html/*.html"))
theme_results = [detect_drupal_theme_robust(f) for f in html_files]
df_themes = pd.DataFrame(theme_results)
df_themes.to_csv("detected_themes.csv", index=False)

In [None]:
# Write the theme table to detected_themes.csv without the index column.
# This repeats the export already done at the end of the previous cell.
df_themes.to_csv("detected_themes.csv", index=False)

In [None]:
# Print which pages in faculty_html match the geog/MSI listing style.
test_is_geog_msi_style()

faculty_html/Physics.html is geog_msi style
faculty_html/Molecular__Cellular__and_Developmental_Biology.html is geog_msi style
faculty_html/Earth_Science.html is geog_msi style
faculty_html/Economics.html is geog_msi style
faculty_html/Developmental_Biology.html is geog_msi style
faculty_html/Marine_Science_Graduate_Program.html is geog_msi style
faculty_html/Ecology__Evolution__and_Marine_Biology.html is geog_msi style
faculty_html/Electrical_and_Computer_Engineering.html is geog_msi style
faculty_html/Geography.html is geog_msi style
