In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import time

# Request headers shared by every HTTP call in this file. The User-Agent
# value imitates a desktop browser.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}

def scrape_faculty_page(url, timeout=10):
    """
    Scrapes faculty names from a given URL.

    Args:
        url (str): The URL of the faculty page.
        timeout (float | tuple, optional): Seconds to wait for the server
            before giving up; passed unchanged to ``requests.get``.
            Defaults to 10.

    Returns:
        list[str]: A list of faculty names found on the page. The list is
            empty when the request raises a ``requests`` error, when the
            status code is not 200, or when no matching heading is found.
    """
    try:
        # Without a timeout requests can wait on a stalled server forever.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout) are
        # reported the same way as a bad status code instead of crashing.
        print(f"Failed to retrieve {url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): the class string is used exactly as written; confirm it
    # still matches the markup of the live page.
    faculty_tags = soup.find_all('h2', class_='node__title node-title')
    return [tag.get_text(strip=True) for tag in faculty_tags]

def scrape_all_faculty_names(base_url, delay=2, max_pages=None):
    """
    Scrapes all faculty names across multiple paginated pages.

    The first page is ``base_url`` itself; later pages add a ``page=N``
    query parameter (N = 1, 2, ...). Pagination stops when a page yields no
    names, when a page repeats the previous page's names, or when
    ``max_pages`` pages have been requested.

    Args:
        base_url (str): The base URL of the faculty listing.
        delay (float, optional): Seconds to sleep between consecutive
            requests. Defaults to 2.
        max_pages (int | None, optional): Upper bound on the number of pages
            requested. ``None`` (the default) means no bound.

    Returns:
        list[str]: All faculty names found.
    """
    all_names = []
    previous_page_names = None
    # Append to an existing query string instead of adding a second '?'.
    separator = '&' if '?' in base_url else '?'
    page = 0

    while max_pages is None or page < max_pages:
        if page:
            time.sleep(delay)  # Politeness delay between requests

        url = base_url if page == 0 else f"{base_url}{separator}page={page}"
        print(f"Scraping page {page + 1}: {url}")
        faculty_names = scrape_faculty_page(url)

        if not faculty_names:
            print("No more faculty names found, ending pagination.")
            break

        # Guard against an endless loop when the server answers an
        # out-of-range page number with the same content again.
        if faculty_names == previous_page_names:
            print("Page repeats the previous one, ending pagination.")
            break

        all_names.extend(faculty_names)
        previous_page_names = faculty_names
        page += 1

    return all_names

def format_name(full_name):
    """
    Formats a full name into 'LastName Initials' format.

    Everything after the first comma (for example degree suffixes) is
    ignored. The last whitespace-separated word before the comma is used as
    the last name and the first character of every earlier word becomes an
    initial.

    Args:
        full_name (str): Full name as it appears on the site.

    Returns:
        str: Formatted name, e.g. 'Doe JQ' for 'Jane Q. Doe, MD'. A
            single-word name is returned on its own without a trailing
            space, and a blank name yields an empty string.
    """
    parts = full_name.split(',')[0].split()
    if not parts:
        # Nothing usable before the first comma; avoid IndexError below.
        return ""

    *given_names, last_name = parts
    initials = ''.join(word[0] for word in given_names)
    # rstrip drops the separator space when there are no initials.
    return f"{last_name} {initials}".rstrip()


In [None]:
def load_mesh_data(mesh_csv_path):
    """
    Reads the MeSH lookup table from a CSV file.

    The file must provide a 'MeSH' column and a 'Category' column; each row
    contributes one term -> category entry.

    Args:
        mesh_csv_path (str): Path to the mesh.csv file.

    Returns:
        dict[str, str]: Dictionary mapping MeSH terms to their categories.
    """
    table = pd.read_csv(mesh_csv_path)
    terms = table['MeSH']
    categories = table['Category']
    return {term: category for term, category in zip(terms, categories)}

def categorize_keywords(keywords, mesh_dict):
    """
    Matches keywords with their MeSH category using exact or partial matches.

    Comparison is case-insensitive and ignores surrounding whitespace. An
    exact match on a MeSH term wins; otherwise the first MeSH term (in
    dictionary order) that contains the keyword, or is contained in it, is
    used. Keywords without any match contribute nothing to the result, so
    the returned list can be shorter than ``keywords``.

    Args:
        keywords (list[str]): List of keywords from the abstracts.
        mesh_dict (dict): Dictionary of MeSH terms and categories.

    Returns:
        list[str]: List of matched MeSH categories.
    """
    # Normalise the MeSH terms once instead of once per keyword.
    normalized_terms = []
    exact_lookup = {}
    for mesh_term, category in mesh_dict.items():
        if not isinstance(mesh_term, str):
            continue  # e.g. NaN from an empty CSV cell
        mesh_lower = mesh_term.strip().lower()
        if not mesh_lower:
            continue  # an empty term would be a substring of every keyword
        normalized_terms.append((mesh_lower, category))
        # Keep the first category when two terms differ only by case.
        exact_lookup.setdefault(mesh_lower, category)

    categories = []
    for word in keywords:
        if not isinstance(word, str):
            continue
        word_lower = word.strip().lower()
        if not word_lower:
            continue  # an empty keyword would be a substring of every term

        if word_lower in exact_lookup:
            categories.append(exact_lookup[word_lower])
            continue

        for mesh_lower, category in normalized_terms:
            if mesh_lower in word_lower or word_lower in mesh_lower:
                categories.append(category)
                break  # Stop at first match
    return categories

def analyze_keyword_categories(keyword_csv_path, mesh_dict):
    """
    Analyzes keywords in a CSV file and assigns MeSH categories.

    Each keyword is categorized on its own so that the 'Keyword' and
    'Category' columns always line up, even when some keywords have no
    match (``categorize_keywords`` leaves unmatched keywords out of its
    result).

    Args:
        keyword_csv_path (str): Path to the file containing keyword lists.
            The file must provide a 'Keyword' column.
        mesh_dict (dict): Dictionary of MeSH terms and categories.

    Returns:
        pd.DataFrame: DataFrame with one row per non-empty keyword and the
            columns 'Keyword' and 'Category'. 'Category' is None for a
            keyword that matched no MeSH term.
    """
    keywords_df = pd.read_csv(keyword_csv_path)
    # Drop empty cells and force text so every keyword supports .lower().
    keywords = keywords_df['Keyword'].dropna().astype(str).tolist()

    matched_categories = []
    for keyword in keywords:
        matches = categorize_keywords([keyword], mesh_dict)
        # One slot per keyword keeps both columns the same length.
        matched_categories.append(matches[0] if matches else None)

    result_df = pd.DataFrame({
        'Keyword': keywords,
        'Category': matched_categories
    })
    return result_df

def save_category_counts(df, output_csv_path):
    """
    Writes a frequency table of the MeSH categories to a CSV file.

    The output has the columns 'Category' and 'Count' and no index column.

    Args:
        df (pd.DataFrame): DataFrame containing categorized keywords.
        output_csv_path (str): Path to save the category count CSV.
    """
    frequencies = df['Category'].value_counts()
    # Name the index and the values directly rather than renaming afterwards.
    table = frequencies.rename_axis('Category').reset_index(name='Count')
    table.to_csv(output_csv_path, index=False)

def visualize_categories(df, output_path=None):
    """
    Plots a bar chart of MeSH category frequencies.

    Args:
        df (pd.DataFrame): DataFrame with 'Category' column.
        output_path (str, optional): If provided, saves the plot to this path
            and closes the figure; otherwise the plot is shown.
    """
    category_counts = df['Category'].value_counts()
    fig = plt.figure(figsize=(10, 6))
    sns.barplot(x=category_counts.values, y=category_counts.index, palette='viridis')
    plt.xlabel('Frequency')
    plt.ylabel('MeSH Category')
    plt.title('Keyword Category Distribution')
    plt.tight_layout()

    if output_path:
        plt.savefig(output_path)
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)
    else:
        plt.show()


In [None]:
def main():
    """
    Runs the full pipeline.

    Scrapes the faculty listing and writes names.csv, then categorizes the
    keywords in keywords_list.csv against mesh.csv, writes
    matched_keywords.csv and category_counts.csv, and shows the category
    bar chart.
    """
    # --- Faculty roster ---------------------------------------------------
    listing_url = 'https://medicine.umich.edu/dept/family-medicine/faculty/faculty'
    scraped_names = scrape_all_faculty_names(listing_url)
    print(f"Total faculty members found: {len(scraped_names)}")

    roster = pd.DataFrame({
        'Full Name': scraped_names,
        'Formatted Name': [format_name(name) for name in scraped_names],
    })
    roster.to_csv('names.csv', index=False)

    # --- MeSH keyword analysis --------------------------------------------
    mesh_lookup = load_mesh_data('mesh.csv')
    categorized = analyze_keyword_categories('keywords_list.csv', mesh_lookup)
    categorized.to_csv('matched_keywords.csv', index=False)

    # --- Category summary and chart ---------------------------------------
    save_category_counts(categorized, 'category_counts.csv')
    visualize_categories(categorized)

if __name__ == "__main__":
    main()