# In [None]:
# Import necessary libraries
import pandas as pd
import requests

# Faculty roster: indexed by full name, with the 'Full Name' column kept as well
names_df = pd.read_csv('names.csv').set_index('Full Name', drop=False)

# ORCID roster: build a single 'Full Name' column from the two name-part columns
orcid_df = pd.read_csv('ORCID_Faculty.csv')
orcid_df['Full Name'] = orcid_df['FIRST, MIDDLE NAME(S)'].add(' ').add(orcid_df['LAST NAME'])

def _nested_value(record, keys, default):
    """Follow *keys* through nested dicts in *record*; return *default* on any gap.

    A gap is a missing key, a value that is not a dict part-way down the
    path, or a final value of None (a JSON null).
    """
    node = record
    for key in keys:
        if not isinstance(node, dict):
            return default
        node = node.get(key)
    return default if node is None else node


def get_orcid_publications(orcid_id):
    """
    Given an ORCID ID, retrieve publication metadata from the ORCID public API.

    The researcher's name is looked up in the module-level ``orcid_df`` table
    and attached to every publication as its ``author``.

    Parameters:
        orcid_id (str): The ORCID identifier of a researcher.

    Returns:
        list: A list of dictionaries with the keys ``title``, ``author``,
        ``journal_title`` and ``year``. Fields that are absent or null in the
        API response are filled with a "No ... found" placeholder. An empty
        list is returned when the ID has no name in ``orcid_df`` or when the
        request fails or does not return a JSON object.
    """
    # Resolve the display name first; without it the record has no author.
    try:
        r_name = orcid_df[orcid_df['ORCID ID#'] == orcid_id]['Full Name'].values[0]
    except IndexError:
        print(f"No name found for ORCID ID: {orcid_id}")
        return []

    url = f"https://pub.orcid.org/v3.0/{orcid_id}/works"
    headers = {"Accept": "application/json"}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Decode inside the try: a body that is not valid JSON raises a
        # ValueError subclass and should be reported like any other failure.
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching data for {orcid_id}: {e}")
        return []

    if not isinstance(data, dict):
        return []

    publications = []

    # `or []` also covers keys that are present but null.
    for work in data.get("group") or []:
        if not isinstance(work, dict):
            continue  # Skip malformed entries
        for summary in work.get("work-summary") or []:
            if not isinstance(summary, dict):
                continue  # Skip malformed entries
            # dict.get(key, {}) returns None (not {}) when the key exists with
            # a null value, which used to raise AttributeError and drop the
            # whole summary. _nested_value substitutes the placeholder instead.
            publications.append({
                "title": _nested_value(summary, ("title", "title", "value"), "No title found"),
                "author": r_name,
                "journal_title": _nested_value(summary, ("journal-title", "value"), "No journal title found"),
                "year": _nested_value(summary, ("publication-date", "year", "value"), "No year found"),
            })

    return publications

# Get list of valid ORCID IDs (rows without an ID are dropped)
valid_orcid_ids = orcid_df['ORCID ID#'].dropna().astype(str)

# Gather all publications, one API lookup per researcher
all_publications = []
for orcid_id in valid_orcid_ids:
    all_publications.extend(get_orcid_publications(orcid_id))

# Convert to DataFrame. The columns are named explicitly so the frame still
# has a 'title' column when nothing was retrieved (e.g. every request failed);
# otherwise pd.DataFrame([]) has no columns and the df['title'] lookup in the
# MeSH section raises KeyError.
df = pd.DataFrame(
    all_publications,
    columns=["title", "author", "journal_title", "year"],
)

# -------------------- MeSH Keyword Section --------------------

# Load MeSH term-to-code mappings from file
def load_mesh_terms(filepath):
    """
    Load MeSH terms and their codes from a two-line alternating text file.

    Blank lines are ignored and surrounding whitespace is stripped. Of the
    remaining lines, the 1st, 3rd, 5th, ... are terms and the line after each
    is its code. If a term appears more than once, the last code wins. A
    trailing term with no code line is reported on stdout and left out.

    Parameters:
        filepath (str): Path to the MeSH term file (read as UTF-8).

    Returns:
        dict: Dictionary of term -> code.
    """
    # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if one is
    # present. Without an explicit encoding the result depends on the platform
    # locale, and a BOM would stay attached to the first term.
    with open(filepath, "r", encoding="utf-8-sig") as file:
        lines = [line for line in map(str.strip, file) if line]

    # Even positions are terms, odd positions are codes; zip stops at the
    # shorter slice, so an unpaired final term is not included.
    mesh_dict = dict(zip(lines[0::2], lines[1::2]))

    if len(lines) % 2:
        print(f"Unmatched term without code: '{lines[-1]}'")
    return mesh_dict

# Function to match MeSH keywords to a publication title
def find_mesh_keywords(title, mesh_terms):
    """
    Match MeSH keywords to a publication title (case-insensitive).

    A term matches when its lower-cased text occurs anywhere in the
    lower-cased title (plain substring test, not whole-word matching).

    Parameters:
        title (str): Publication title. A non-string value (e.g. None or NaN
            from missing data) yields no matches instead of raising.
        mesh_terms (iterable): MeSH terms to check against; non-string
            entries are skipped.

    Returns:
        list: Matched MeSH keywords, in the order they appear in mesh_terms
        and with their original casing.
    """
    # Missing titles can arrive as None/NaN; they have no .lower().
    if not isinstance(title, str):
        return []

    title_lower = title.lower()
    return [
        term
        for term in mesh_terms
        if isinstance(term, str) and term.lower() in title_lower
    ]

# Read the term/code pairs and tabulate them as a two-column frame
mesh_dict = load_mesh_terms("mesh_words_for pubmed.txt")
mesh_df = pd.DataFrame(
    [(term, code) for term, code in mesh_dict.items()],
    columns=["Term", "Code"],
)

# Tag every publication with the MeSH terms that occur in its title
df['keywords'] = df['title'].map(
    lambda title: find_mesh_keywords(title, mesh_df['Term'])
)

# -------------------- Output --------------------

# Write the tagged publications to disk without the row index
df.to_csv(path_or_buf='Cleaned_ORCID.csv', index=False)
