# Setup and API Key Configuration
Import the required libraries (the `elsapy` package must already be installed) and load the API key needed to authenticate with Elsevier's APIs from `config.json`.

In [12]:
# Import necessary libraries
import json
import csv
import time

from tqdm import tqdm
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elsclient import ElsClient

In [13]:
# Load API key from a configuration file
# (explicit UTF-8 so parsing does not depend on the platform's default encoding)
with open("config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Fail fast on a missing or empty key; otherwise the problem would presumably
# only surface later as rejected requests during the detail lookups.
if not isinstance(config, dict) or not config.get("apikey"):
    raise KeyError("config.json must contain a non-empty 'apikey' entry")

# Initialize the ElsClient with the API key
client = ElsClient(config['apikey'])

# Initialize the Elsevier Client
Configure the connection settings on the Elsevier client instance created from your API key and confirm that the client object exists.

In [14]:
# Set the connection settings for the client
# NOTE(review): this assigns a plain `base_url` attribute on the client object;
# whether elsapy reads it when building request URLs is not visible here — confirm.
client.base_url = 'https://api.elsevier.com/content/search/scopus'

# Verify the client connection
# NOTE(review): `if client:` only tests the truthiness of the object; no request is
# sent, so this does not prove that the API key or the connection actually work.
if client:
    print("Client initialized successfully.")
else:
    print("Failed to initialize client.")

Client initialized successfully.


# Process and Analyze Search Results
Process the search results to extract relevant information such as titles, authors, journals, DOIs, publication years, abstracts, author keywords, and subject areas.

In [15]:
# List the tags within the results
def _collect_nested_keys(node, found, prefix=""):
    """Add every key of ``node`` and of its nested dicts to ``found``.

    Nested keys are recorded as dotted paths (``"parent.child"``). Values that
    are not dictionaries (including lists) are not descended into.
    """
    if not isinstance(node, dict):
        return

    for key, value in node.items():
        full_key = f"{prefix}{key}" if prefix else key
        found.add(full_key)

        # Recursively process nested dictionaries
        if isinstance(value, dict):
            _collect_nested_keys(value, found, f"{full_key}.")


def list_tags(results, els_client=None, sample_size=3):
    """Collect the field names present in search results and abstract records.

    Args:
        results: List of search-result entries; entries that are not
            dictionaries are ignored.
        els_client: Client passed to ``AbsDoc.read``. When omitted, the
            notebook-level ``client`` variable is used, and it is only looked
            up if a document actually has to be fetched.
        sample_size: Number of leading results for which the abstract record
            is fetched. Kept small to limit the number of API calls.

    Returns:
        A ``(tags, full_tags)`` tuple of sorted lists: the top-level keys seen
        in ``results`` and the dotted key paths seen in the sampled abstract
        records.
    """
    tags = set()  # Keys from search results
    full_tags = set()  # Keys from abstract documents

    # Extract keys from search results
    for result in results:
        if isinstance(result, dict):
            tags.update(result.keys())

    # Only the first few results are fetched, to avoid too many API calls
    sample = results[:max(sample_size, 0)]

    print(f"Sampling {len(sample)} documents to extract abstract document tags...")

    for position, result in enumerate(sample, start=1):
        if not isinstance(result, dict) or "dc:identifier" not in result:
            continue

        identifier = result["dc:identifier"]
        if identifier.startswith("SCOPUS_ID:"):
            scopus_id = identifier.replace("SCOPUS_ID:", "")
        else:
            scopus_id = identifier

        # Fall back to the notebook-level client only when a fetch is needed
        active_client = client if els_client is None else els_client

        try:
            abs_doc = AbsDoc(scp_id=scopus_id)
            if abs_doc.read(active_client):
                _collect_nested_keys(abs_doc.data, full_tags)
                print(f"Successfully extracted tags from document {position}")
            else:
                print(f"Failed to read abstract document for Scopus ID: {scopus_id}")
        except Exception as e:
            print(f"Error processing document {position} with Scopus ID {scopus_id}: {str(e)}")

    # Sorted so that repeated runs list the tags in the same order
    return sorted(tags), sorted(full_tags)

In [16]:
# Function to extract relevant information from search results with filtering
def _abstract_from_payload(payload):
    """Return the abstract text of an abstract-record payload, or None."""
    coredata = payload.get("coredata")
    if isinstance(coredata, dict) and coredata.get("dc:description"):
        return coredata["dc:description"]
    return payload.get("dc:description")


def _author_keywords_from_payload(payload):
    """Return the author keyword strings of an abstract-record payload."""
    container = payload.get("authkeywords")
    # The section may be missing or null; neither case is an error
    if not isinstance(container, dict):
        return []

    entries = container.get("author-keyword")
    if isinstance(entries, dict):
        entries = [entries]  # a single keyword is not wrapped in a list
    elif not isinstance(entries, list):
        return []

    return [entry["$"] for entry in entries if isinstance(entry, dict) and "$" in entry]


def _subject_areas_from_payload(payload):
    """Return the subject-area names of an abstract-record payload."""
    container = payload.get("subject-areas")
    if not isinstance(container, dict):
        return []

    areas = container.get("subject-area", [])
    if isinstance(areas, dict):
        areas = [areas]  # a single area is not wrapped in a list
    elif not isinstance(areas, list):
        return []

    return [area["$"] for area in areas if isinstance(area, dict) and "$" in area]


def _percentage(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return count * 100 / total if total else 0.0


def extract_info(results, client, fetch_details=True, verbose=False, request_delay=0.2):
    """Build one flat metadata record per search result.

    Args:
        results: Iterable of search-result dictionaries.
        client: Client object handed to ``AbsDoc.read`` for detail lookups.
        fetch_details: When true, the abstract record of every document that
            has a Scopus ID is fetched to add the abstract, author keywords
            and subject areas.
        verbose: When true, print a message for every failed detail lookup.
        request_delay: Seconds to pause after each detail lookup to avoid
            hitting rate limits; ``0`` disables the pause.

    Returns:
        List of dictionaries, one per entry of ``results``, in input order.
        The keys ``abstract``, ``author_keywords`` and ``subject_areas`` are
        only present when a non-empty value was found (``abstract`` is also
        kept as delivered by the search result itself).

    A summary of how many documents carry each optional field is printed at
    the end; failed detail lookups are reported there as well.
    """
    extracted_data = []

    # Statistics tracking
    stats = {
        "total": 0,
        "with_abstract": 0,
        "with_keywords": 0,
        "with_subject_areas": 0,
        "failed_lookups": 0,
    }

    # Use tqdm for progress tracking
    for i, result in enumerate(tqdm(results, desc="Processing documents")):
        stats["total"] += 1

        cover_date = result.get("prism:coverDate") or ""
        identifier = result.get("dc:identifier") or ""

        # Basic metadata from search results
        data = {
            "title": result.get("dc:title", ""),
            "authors": result.get("dc:creator"),
            "journal": result.get("prism:publicationName"),
            "doi": result.get("prism:doi"),
            # Only the year part of the cover date is kept
            "publication_date": cover_date.split("-")[0],
            "document_type": result.get("subtypeDescription", ""),
            "prism:url": result.get("prism:url"),
            "scopus_id": identifier.replace("SCOPUS_ID:", "") or None,
        }

        # See if there's a description in the initial search results
        if "dc:description" in result:
            data["abstract"] = result["dc:description"]

        # Detail lookups need a Scopus ID
        if fetch_details and data["scopus_id"]:
            try:
                abs_doc = AbsDoc(scp_id=data["scopus_id"])
                payload = abs_doc.data if abs_doc.read(client) else None

                if isinstance(payload, dict):
                    # A detailed abstract replaces the one from the search hit,
                    # but an empty one never overwrites it
                    abstract = _abstract_from_payload(payload)
                    if abstract:
                        data["abstract"] = abstract

                    keyword_list = _author_keywords_from_payload(payload)
                    if keyword_list:
                        data["author_keywords"] = keyword_list

                    subject_areas = _subject_areas_from_payload(payload)
                    if subject_areas:
                        data["subject_areas"] = subject_areas
                else:
                    stats["failed_lookups"] += 1
                    if verbose:
                        print(
                            f"No abstract record retrieved for document {i+1} (ScopusID: {data['scopus_id']})"
                        )

            except Exception as e:
                # Best effort: keep the basic record, but remember the failure
                stats["failed_lookups"] += 1
                if verbose:
                    print(f"Error retrieving details for document {i+1}: {str(e)}")
                    print(
                        f"Failed document: {data['title']} (DOI: {data.get('doi')}, ScopusID: {data['scopus_id']})"
                    )
            finally:
                # Avoid hitting rate limits (also after a failed request)
                if request_delay and request_delay > 0:
                    time.sleep(request_delay)

        # Count every document at most once per field
        if data.get("abstract"):
            stats["with_abstract"] += 1
        if data.get("author_keywords"):
            stats["with_keywords"] += 1
        if data.get("subject_areas"):
            stats["with_subject_areas"] += 1

        extracted_data.append(data)

    # Print capture statistics
    total = stats["total"]
    print("\nData Capture Statistics:")
    print(f"Total documents processed: {total}")
    print(
        f"Documents with abstracts: {stats['with_abstract']} ({_percentage(stats['with_abstract'], total):.1f}%)"
    )
    print(
        f"Documents with keywords: {stats['with_keywords']} ({_percentage(stats['with_keywords'], total):.1f}%)"
    )
    print(
        f"Documents with subject areas: {stats['with_subject_areas']} ({_percentage(stats['with_subject_areas'], total):.1f}%)"
    )
    if stats["failed_lookups"]:
        print(f"Detail lookups failed: {stats['failed_lookups']}")

    return extracted_data


In [17]:
# Extract information from document search results with filtering and details
print("Retrieving detailed information for documents...")
# Read the previously saved search results from the JSON file
with open("../data/01_scopus_results.json", "r", encoding="utf-8") as file:
    results = json.load(file)

# Optional diagnostic, kept disabled: lists the keys found in the results
# (uncommenting it triggers additional abstract lookups for a few documents)
# tags, full_tags = list_tags(results)
# print(f"Tags in the results: {tags}")
# print(f"Full tags in the results: {full_tags}")

print(f"Total documents: {len(results)}")

# fetch_details=True performs one abstract lookup per document with a Scopus ID,
# so this is the slow step of the notebook (about 12 minutes for 512 documents
# in the recorded run); verbose=False hides per-document error messages.
doc_info = extract_info(results, client, fetch_details=True, verbose=False)

Retrieving detailed information for documents...
Total documents: 512


Processing documents: 100%|██████████| 512/512 [12:04<00:00,  1.42s/it]


Data Capture Statistics:
Total documents processed: 512
Documents with abstracts: 512 (100.0%)
Documents with keywords: 475 (92.8%)
Documents with subject areas: 475 (92.8%)





# Export Search Results
Export the search results to both CSV and JSON formats for further analysis.

In [20]:
# Function to export results to CSV with dynamic fields
def export_to_csv(data, filename, list_separator=None):
    """Write a list of record dictionaries to a UTF-8 CSV file.

    The header is the sorted union of the keys of all records, so records may
    have different fields; fields missing from a record are left empty.

    Args:
        data: List of dictionaries to export.
        filename: Path of the CSV file to create or overwrite.
        list_separator: When given, list and tuple values (for example author
            keywords) are written as their items joined by this string. With
            the default ``None`` they are written as their Python
            representation (e.g. ``['a', 'b']``), as before.
    """
    # Sorted union of all keys gives a consistent column order
    fieldnames = sorted({key for item in data for key in item})

    if list_separator is None:
        rows = data
    else:
        # Build new rows instead of editing the caller's records in place
        rows = [
            {
                key: list_separator.join(str(part) for part in value)
                if isinstance(value, (list, tuple))
                else value
                for key, value in item.items()
            }
            for item in data
        ]

    with open(filename, "w", newline="", encoding="utf-8") as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(rows)

    print(f"CSV exported with {len(data)} records")


# Function to export results to JSON
def export_to_json(data, filename):
    """Serialize ``data`` to ``filename`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped. The number of
    exported records is printed once the file has been written.
    """
    with open(filename, "w", encoding="utf-8") as json_out:
        json.dump(data, json_out, ensure_ascii=False, indent=4)

    record_count = len(data)
    print(f"JSON exported with {record_count} records")

In [None]:
# Export document search results to CSV and JSON
# Both files are written from the same `doc_info` list, so they hold the same records.
export_to_csv(doc_info, "../data/02_document_search_results.csv")
export_to_json(doc_info, "../data/02_document_search_results.json")

print("Search results exported successfully.")

CSV exported with 512 records
JSON exported with 512 records
Search results exported successfully.
