# OpenAlex Author Search

This notebook allows you to search for an author in OpenAlex by providing author information (first name, last name, ORCID).

The search follows the same pipeline as `authors_match.py`:
1. If ORCID is provided, search by ORCID first
2. If no ORCID or ORCID search fails, search by name/institution (Politecnico di Torino)
3. If multiple matches are found and matricola is provided, use DOI-based work analysis to find the best match
4. Display the match results with OpenAlex IDs


In [1]:
import os
import sys
import time
import requests
from pathlib import Path
from collections import Counter

# Add the project root directory to Python path to enable imports from utilities
# Get the current notebook's directory (authors_matching) and go up one level to project root
notebook_dir = Path.cwd()  # Current working directory
# Try to find project root by looking for utilities folder
if (notebook_dir / 'utilities').exists():
    project_root = str(notebook_dir)
elif (notebook_dir.parent / 'utilities').exists():
    project_root = str(notebook_dir.parent)
else:
    # Fallback: assume we're in authors_matching folder
    project_root = str(notebook_dir.parent)

if project_root not in sys.path:
    sys.path.insert(0, project_root)

from utilities.db_utils import test_connection, execute_query_with_connection
from utilities.sim_lib import author_similarity

# Constants
ROR_POLITO = 'https://ror.org/00bgk9508'  # ROR identifier for Politecnico di Torino
OPENALEX_API_BASE = 'https://api.openalex.org'  # Base URL for OpenAlex API
API_DELAY = 0.1  # Delay between API calls to respect rate limits (in seconds)
WORK_API_DELAY = 0.05  # Delay when fetching work data by DOI (in seconds)

# Test database connection (only needed if matricola is provided for DOI-based analysis)
conn = None
cursor = None
try:
    conn, cursor = test_connection()
    print("‚úÖ Database connection established")
except Exception as e:
    print(f"‚ö†Ô∏è  Database connection not available: {e}")
    print("   DOI-based analysis will not be available without database connection")


ModuleNotFoundError: No module named 'mysql'

## Input Author Information

Fill in the information about the author you want to search for. First name and last name are required.


In [None]:
# ===== USER INPUT =====
# Fill in the information about the author you want to search for

# First name (required)
first_name = ""

# Last name (required)
last_name = ""

# ORCID (optional) - if provided, will be used for the first search attempt
# Format: just the ORCID number, e.g., "0000-0001-2345-6789" or "0000000123456789"
orcid = None  # e.g., "0000-0001-2345-6789"

# Matricola (optional) - Employee ID from IRIS database
# If provided, enables DOI-based analysis when multiple matches are found
matricola = None  # e.g., 12345

# ===== END USER INPUT =====


## Search and Display Results

Run the cell below to search for the author in OpenAlex and display the results.


In [None]:
# Validate input
if not first_name:
    raise ValueError("First name is required. Please provide a first name.")
if not last_name:
    raise ValueError("Last name is required. Please provide a last name.")

# Display input information
print("=" * 80)
print("SEARCHING FOR AUTHOR:")
print("=" * 80)
print(f"First Name: {first_name}")
print(f"Last Name: {last_name}")
print(f"ORCID: {orcid if orcid else 'Not provided'}")
print(f"Matricola: {matricola if matricola else 'Not provided'}")
print("=" * 80)
print()

# STEP 1: Search OpenAlex for this author
# Try ORCID search first (more reliable), then fall back to name/affiliation search
search_successful = False
oa_authors = []  # List of (display_name, oa_id) tuples for matching authors

# If ORCID is present, search OpenAlex by ORCID first (more reliable than name search)
if orcid:
    print(f"üîç Searching by ORCID...")
    
    # Search OpenAlex using ORCID filter
    # ORCID format in OpenAlex API: https://orcid.org/{orcid}
    # Also include name in search query to improve accuracy
    search_query = f"{first_name}%20{last_name}"
    url = f"{OPENALEX_API_BASE}/authors?search={search_query}&filter=orcid:https://orcid.org/{orcid}"
    
    try:
        response = requests.get(url)
        if response.status_code == 200:
            response_data = response.json()
            count = response_data.get("meta", {}).get("count", 0)
            if count >= 1:
                # Extract matching authors from results
                results = response_data.get('results', [])
                for match_author in results:
                    if match_author.get('id') and match_author.get('display_name'):
                        oa_authors.append((match_author['display_name'], match_author['id']))
                search_successful = True
                print(f"‚úÖ Found {len(oa_authors)} author(s) by ORCID")
            else:
                print(f"‚ö†Ô∏è  No match found by ORCID, falling back to name/affiliation search")
        else:
            print(f"‚ö†Ô∏è  API error with ORCID search, falling back to name/affiliation search")
    except Exception as e:
        print(f"‚ö†Ô∏è  Error searching by ORCID: {e}, falling back to name/affiliation search")

# If no ORCID or ORCID search failed, search by name and affiliation (Politecnico di Torino)
# This is a broader search that may return multiple potential matches
if not search_successful:
    if orcid:
        print(f"üîç Searching by name/institution...")
    else:
        print(f"üîç Searching by name/institution (no ORCID available)...")
    
    # Search by author name and filter by Politecnico di Torino ROR identifier
    search_query = f"{first_name}%20{last_name}"
    url = f"{OPENALEX_API_BASE}/authors?search={search_query}&filter=affiliations.institution.ror:{ROR_POLITO}"
    
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Extract matching authors from results
            results = response.json().get('results', [])
            for match_author in results:
                if match_author.get('id') and match_author.get('display_name'):
                    oa_authors.append((match_author['display_name'], match_author['id']))
            if oa_authors:
                print(f"‚úÖ Found {len(oa_authors)} author(s) by name/institution")
            else:
                print(f"‚ö†Ô∏è  No match found by name/institution")
        else:
            print(f"‚ö†Ô∏è  API error with name/institution search")
    except Exception as e:
        print(f"‚ùå Error searching OpenAlex for {first_name} {last_name}: {e}")

# Rate limiting: delay between API calls to avoid overwhelming the API
time.sleep(API_DELAY)

print()
print("=" * 80)

# If no matches found
if not oa_authors:
    print("‚ùå NO MATCHES FOUND")
    print("=" * 80)
    print(f"\nNo authors were found in OpenAlex matching your search criteria.")
    print("You may want to try:")
    print("  - Checking the spelling of the name")
    print("  - Providing an ORCID if available")
    print("  - Verifying the author is affiliated with Politecnico di Torino")
else:
    # Display all found matches
    print(f"üìö FOUND {len(oa_authors)} OPENALEX CANDIDATE(S):")
    print("=" * 80)
    for oa_idx, (display_name_choose, id_choose) in enumerate(oa_authors, 1):
        print(f"  {oa_idx}. {display_name_choose}")
        print(f"     OpenAlex ID: {id_choose}")
    print()
    
    # STEP 2: If multiple matches found and matricola is provided, use DOI-based work analysis
    if len(oa_authors) > 1 and matricola and cursor and conn:
        print("=" * 80)
        print("MULTIPLE MATCHES FOUND - PERFORMING DOI-BASED ANALYSIS")
        print("=" * 80)
        print()
        
        # Query to get all DOIs for publications by this author (matricola)
        select_doi_by_matricola_query = """
        SELECT DISTINCT iw.cd_doi as doi
        FROM pub_ri_prodotti_autori AS it
        INNER JOIN pub_ri_prodotti_base AS iw
        ON it.handle = iw.HANDLE 
        WHERE iw.cd_doi IS NOT NULL AND it.matricola = %s
        """
        
        try:
            dois = execute_query_with_connection(cursor, select_doi_by_matricola_query, params=(matricola,), conn=conn)
            print(f"Found {len(dois)} publication(s) with DOI")
            
            # Need at least one DOI to perform analysis
            if len(dois) == 0:
                print("‚ö†Ô∏è  No publications with DOI found - cannot perform detailed analysis")
                print("   Showing all candidates above")
            else:
                # Collect all authors from works to find which OpenAlex author appears most frequently
                all_authors_data = []  # List of all author tuples found in works
                counter = Counter()  # Count occurrences of each author across all works
                doi_count = 0
                
                print("Analyzing works...")
                # For each DOI, fetch the work from OpenAlex and extract all authors
                for d_item in dois:
                    doi_count += 1
                    doi = d_item.get("doi")
                    if not doi:
                        continue
                    
                    # Fetch work data from OpenAlex using DOI
                    url = f"{OPENALEX_API_BASE}/works/https://doi.org/{doi}"
                    
                    try:
                        response = requests.get(url)
                        if response.status_code == 200:
                            data = response.json()
                            authorships = data.get("authorships", [])
                            
                            # Extract all authors from this work
                            for authorship in authorships:
                                author = authorship.get("author")
                                if author and author.get("display_name") and author.get("id"):
                                    couple = (author["display_name"], author["id"])
                                    all_authors_data.append(couple)
                                    counter[couple] += 1  # Count how many times this author appears
                    except Exception as e:
                        print(f"  ‚ö†Ô∏è  Error fetching work by DOI {doi}: {e}")
                    
                    # Progress indicator
                    if doi_count % 10 == 0 or doi_count == len(dois):
                        print(f"  Processed {doi_count}/{len(dois)} works...", end="\r")
                    time.sleep(WORK_API_DELAY)  # Rate limiting
                
                print()  # New line after progress indicator
                
                # If no authors found in any works, cannot perform analysis
                if not counter:
                    print("‚ö†Ô∏è  No authors found in works")
                    print("   Showing all candidates above")
                else:
                    print()
                    print("=" * 80)
                    print("ANALYSIS RESULTS:")
                    print("=" * 80)
                    
                    # First pass: look for exact matches in top 3 most common authors from works
                    flag = False
                    ranking_position = 0
                    for item in counter.most_common(3):  # Check top 3 most frequent authors
                        ranking_position += 1
                        item_data, count = item
                        item_display_name = item_data[0]  # Author display name
                        item_id = item_data[1]  # OpenAlex author ID
                        
                        # Check if this author ID matches any of our OpenAlex candidates
                        for oa_display_name, oa_id in oa_authors:
                            if item_id == oa_id:
                                print(f"‚úì Compatible match found: {item_display_name}")
                                print(f"  OpenAlex ID: {item_id}")
                                print(f"  Appears in {count} work(s), rank {ranking_position}")
                                flag = True
                                break
                        
                        if flag:
                            break
                    
                    # Second pass: if no exact match found, look for similar names using similarity scoring
                    last_flag = False
                    if not flag:
                        for item in counter.most_common():
                            item_data, count = item
                            item_display_name = item_data[0]
                            item_id = item_data[1]
                            
                            # Calculate similarity between input name and OpenAlex author name
                            iris_full_name = f"{first_name} {last_name}"
                            score_similarity = author_similarity(item_display_name, iris_full_name)
                            
                            # If similarity is high enough (>0.7), consider it a match
                            if score_similarity > 0.7:
                                print(f"‚úì Similar match found: {item_display_name}")
                                print(f"  OpenAlex ID: {item_id}")
                                print(f"  Appears in {count} work(s), similarity: {score_similarity:.2f}")
                                last_flag = True
                                break
                        
                        # If still no match found, report the most frequent author (even if incompatible)
                        if not last_flag:
                            most_common = counter.most_common(1)
                            if most_common:
                                item_data, count = most_common[0]
                                print(f"‚úó No compatible match found")
                                print(f"  Most frequent (incompatible): {item_data[0]}")
                                print(f"  OpenAlex ID: {item_data[1]}")
                                print(f"  Appears in {count} work(s)")
                            else:
                                print(f"‚úó No result")
        except Exception as e:
            print(f"‚ùå Error during DOI-based analysis: {e}")
            print("   Showing all candidates above")
    elif len(oa_authors) == 1:
        # Single match found - no need for further analysis
        print("=" * 80)
        print("‚ú® SINGLE MATCH FOUND")
        print("=" * 80)
        oa_display_name, oa_id = oa_authors[0]
        print(f"\n‚úì Match: {oa_display_name}")
        print(f"  OpenAlex ID: {oa_id}")
        print(f"  No further analysis needed")
    elif len(oa_authors) > 1 and (not matricola or not cursor or not conn):
        print("=" * 80)
        print("‚ö†Ô∏è  MULTIPLE MATCHES FOUND")
        print("=" * 80)
        print("\nTo perform DOI-based analysis to find the best match:")
        print("  - Provide a matricola (employee ID) in the input cell above")
        print("  - Ensure database connection is available")
        print("\nAll candidates are shown above.")

print("=" * 80)
