# Setup and API Key Configuration
Install the elsapy library and configure the API key required for authentication with Elsevier's APIs.

In [1]:
# Import necessary libraries
import csv
import html
import json
import re
import time

import pandas as pd
from elsapy.elsclient import ElsClient
from elsapy.elsdoc import FullDoc, AbsDoc
from tqdm import tqdm
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
# Load API key from a configuration file.
# "config.json" is a relative path, so it is resolved against the notebook's
# working directory; the file must be a JSON object with an "apikey" entry.
with open("config.json") as config_file:
    config = json.load(config_file)

# Initialize the ElsClient with the API key.
# `client` is reused by the later cells that fetch abstract documents.
client = ElsClient(config['apikey'])

# Initialize the Elsevier Client
Configure the connection settings of the Elsevier client created in the previous cell and confirm that the client object exists.

In [3]:
# Set the connection settings for the client
# NOTE(review): this assigns a `base_url` attribute on the client object; no other
# code in this notebook reads it — confirm that elsapy actually honours it.
client.base_url = 'https://api.elsevier.com/content/search/scopus'

# Verify the client connection
# NOTE(review): this only tests the truthiness of the `client` object. No request
# is sent, so it does not prove that the API key or the connection works.
if client:
    print("Client initialized successfully.")
else:
    print("Failed to initialize client.")

Client initialized successfully.


# Process and Analyze Search Results
Process the search results to extract relevant information such as titles, authors, journals, abstracts, author keywords, and subject areas.

In [4]:
# Helper for list_tags: walk a nested dict and record dot-joined key paths
def _collect_nested_keys(data, into, prefix=""):
    """Add every key of ``data`` to the set ``into``, descending into nested dicts.

    Nested keys are recorded as dot-joined paths (``"coredata.dc:title"``).
    Values that are not dicts (including lists) are not descended into.
    """
    if not isinstance(data, dict):
        return

    for key, value in data.items():
        full_key = f"{prefix}{key}" if prefix else key
        into.add(full_key)

        # Recursively process nested dictionaries
        if isinstance(value, dict):
            _collect_nested_keys(value, into, f"{full_key}.")


# List the tags within the results
def list_tags(results, client=None):
    """List the field names found in search results and in abstract documents.

    Args:
        results: Sequence of search-result entries. Only ``dict`` entries
            contribute tags; anything else is ignored.
        client: Client object handed to ``AbsDoc.read``. When omitted, the
            module-level ``client`` variable is used, which is what this
            function relied on before the parameter existed.

    Returns:
        A ``(tags, full_tags)`` tuple of lists, each sorted so that repeated
        runs give the same order:
        * ``tags`` - top-level keys seen in any search-result dict.
        * ``full_tags`` - dot-joined key paths found in the abstract documents
          of up to the first three results.

    Per-document failures are printed and skipped; they never abort the call.
    """
    # Fall back to the notebook-level client so existing calls keep working.
    els_client = client if client is not None else globals().get("client")

    # Keys from search results
    tags = {
        key
        for result in results
        if isinstance(result, dict)
        for key in result
    }
    # Keys from abstract documents
    full_tags = set()

    # Sample a few results to get abstract document structure.
    # Limit to avoid too many API calls.
    sample_size = min(3, len(results))

    print(f"Sampling {sample_size} documents to extract abstract document tags...")

    for i in range(sample_size):
        result = results[i]
        if not isinstance(result, dict) or "dc:identifier" not in result:
            continue

        identifier = result["dc:identifier"]
        # A missing identifier can show up as None/NaN; calling string methods on
        # it used to raise outside the try block and abort the whole sampling.
        if not isinstance(identifier, str):
            continue

        scopus_id = (
            identifier.replace("SCOPUS_ID:", "")
            if identifier.startswith("SCOPUS_ID:")
            else identifier
        )

        try:
            abs_doc = AbsDoc(scp_id=scopus_id)
            if abs_doc.read(els_client):
                _collect_nested_keys(abs_doc.data, full_tags)
                print(f"Successfully extracted tags from document {i+1}")
            else:
                print(f"Failed to read abstract document for Scopus ID: {scopus_id}")
        except Exception as e:
            print(f"Error processing document {i+1} with Scopus ID {scopus_id}: {str(e)}")

    # key=str keeps sorting safe even if a key is not a string.
    return sorted(tags, key=str), sorted(full_tags, key=str)

In [5]:
def _is_missing(value):
    """Return True for the placeholders that stand for "no value" (None, NaN, pd.NA)."""
    if value is None or value is pd.NA:
        return True
    # NaN is the only float that is not equal to itself.
    return isinstance(value, float) and value != value


def _field(record, key, default=None):
    """Read ``record[key]``, mapping absent keys and missing placeholders to ``default``."""
    value = record.get(key, default)
    return default if _is_missing(value) else value


def _dollar_values(node):
    """Collect the "$" payloads from a dict or a list of dicts; anything else gives []."""
    if isinstance(node, dict):
        node = [node]
    if not isinstance(node, list):
        return []
    return [item["$"] for item in node if isinstance(item, dict) and "$" in item]


def _extract_abstract(abs_data):
    """Return the abstract text from abstract-document data, or None when there is none."""
    coredata = abs_data.get("coredata")
    if isinstance(coredata, dict) and coredata.get("dc:description") is not None:
        return coredata["dc:description"]
    return abs_data.get("dc:description")


def _extract_subject_areas(abs_data):
    """Return the subject-area names as a list, or None when the section is unusable."""
    container = abs_data.get("subject-areas")
    if not isinstance(container, dict):
        return None
    areas = container.get("subject-area", [])
    if isinstance(areas, list):
        return _dollar_values(areas)
    if isinstance(areas, dict) and "$" in areas:
        return [areas["$"]]
    return None


def extract_info(results, client, fetch_details=True, verbose=False, request_delay=0.2):
    """Build one flat metadata record per search result.

    Args:
        results: Search results as a ``pandas.DataFrame`` (one row per document)
            or an iterable of dicts / ``pandas.Series``.
        client: Client object passed to ``AbsDoc.read`` for detail lookups.
        fetch_details: When True, every document that has a Scopus ID is also
            read through ``AbsDoc`` to obtain the abstract (if the search hit
            had none), the author keywords and the subject areas.
        verbose: When True, print a message for every failed detail lookup.
        request_delay: Seconds to pause after each detail lookup to stay below
            rate limits. Use 0 to disable.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``journal``,
        ``doi``, ``publication_date`` (the part of the cover date before the
        first "-"), ``document_type``, ``prism:url`` and ``scopus_id``, plus
        ``abstract``, ``author_keywords`` and ``subject_areas`` when available.

    Notes:
        * Missing values (absent key, None, NaN) are treated alike. DataFrame
          rows carry NaN for fields a document lacks, and these must neither
          count as an abstract nor reach string methods.
        * Keywords and subject areas are looked up even when the search hit
          already carries an abstract.
        * A failed detail lookup never aborts the run. Failures are counted and
          reported with the capture statistics.
    """
    # Normalise the input to a list of plain dicts.
    if isinstance(results, pd.DataFrame):
        records = results.to_dict("records")
    else:
        records = [
            row.to_dict() if isinstance(row, pd.Series) else row
            for row in (results if results is not None else [])
        ]

    # Statistics tracking
    stats = {
        "total": 0,
        "with_abstract": 0,
        "with_keywords": 0,
        "with_subject_areas": 0,
        "errors": 0,
    }

    # Function to process a single document
    def process_document(result):
        stats["total"] += 1

        cover_date = _field(result, "prism:coverDate", "")
        identifier = _field(result, "dc:identifier")

        # Basic metadata from search results
        data = {
            "title": _field(result, "dc:title", ""),
            "authors": _field(result, "dc:creator"),
            "journal": _field(result, "prism:publicationName"),
            "doi": _field(result, "prism:doi"),
            "publication_date": str(cover_date).split("-")[0] if cover_date else "",
            "document_type": _field(result, "subtypeDescription", ""),
            "prism:url": _field(result, "prism:url"),
            "scopus_id": (
                str(identifier).replace("SCOPUS_ID:", "") if identifier else None
            ),
        }

        # See if there's a description in the initial search results
        search_abstract = _field(result, "dc:description")
        if search_abstract is not None:
            data["abstract"] = search_abstract
            stats["with_abstract"] += 1

        # Detail lookup needs a Scopus ID; without one there is nothing to request.
        if fetch_details and data["scopus_id"]:
            try:
                abs_doc = AbsDoc(scp_id=data["scopus_id"])
                if abs_doc.read(client):
                    abs_data = abs_doc.data if isinstance(abs_doc.data, dict) else {}

                    # Only fill in the abstract when the search hit had none.
                    if "abstract" not in data:
                        abstract = _extract_abstract(abs_data)
                        if abstract is not None:
                            data["abstract"] = abstract
                            stats["with_abstract"] += 1

                    # Extract keywords. A null "authkeywords" section is simply
                    # skipped instead of raising, so the subject areas below
                    # are still extracted.
                    keywords_section = abs_data.get("authkeywords")
                    if isinstance(keywords_section, dict):
                        keyword_list = _dollar_values(
                            keywords_section.get("author-keyword")
                        )
                        if keyword_list:
                            data["author_keywords"] = keyword_list
                            stats["with_keywords"] += 1

                    # Extract subject areas
                    subject_areas = _extract_subject_areas(abs_data)
                    if subject_areas is not None:
                        data["subject_areas"] = subject_areas
                        stats["with_subject_areas"] += 1

                # Avoid hitting rate limits
                if request_delay:
                    time.sleep(request_delay)

            except Exception as e:
                stats["errors"] += 1
                if verbose:
                    print(f"Error retrieving details for document: {str(e)}")
                    print(
                        f"Failed document: {data['title']} (DOI: {data.get('doi')}, ScopusID: {data['scopus_id']})"
                    )

        return data

    # Process documents with a progress bar. An explicit loop calls
    # process_document exactly once per record and does not depend on
    # tqdm.pandas() having been run first.
    print("Processing documents...")
    extracted_data = [
        process_document(record) for record in tqdm(records, total=len(records))
    ]

    total = stats["total"]

    def percent(count):
        # Guard against division by zero when there were no documents.
        return count * 100 / total if total else 0.0

    # Print capture statistics
    print("\nData Capture Statistics:")
    print(f"Total documents processed: {total}")
    print(
        f"Documents with abstracts: {stats['with_abstract']} ({percent(stats['with_abstract']):.1f}%)"
    )
    print(
        f"Documents with keywords: {stats['with_keywords']} ({percent(stats['with_keywords']):.1f}%)"
    )
    print(
        f"Documents with subject areas: {stats['with_subject_areas']} ({percent(stats['with_subject_areas']):.1f}%)"
    )
    if stats["errors"]:
        print(f"Documents with detail retrieval errors: {stats['errors']}")

    return extracted_data

In [6]:
# Extract information from the saved search results, including per-document details

# Read the raw search results from the JSON file
with open("../data/01_scopus_results.json", "r", encoding="utf-8") as file:
    results = json.load(file)

# Convert to DataFrame for pandas processing
results_df = pd.DataFrame(results)
# fetch_details=True also requests the abstract document of each result;
# verbose=False keeps per-document error messages out of the output.
doc_info = extract_info(results_df, client, fetch_details=True, verbose=False)

Processing documents...


  0%|          | 0/973 [00:00<?, ?it/s]


Data Capture Statistics:
Total documents processed: 973
Documents with abstracts: 971 (99.8%)
Documents with keywords: 893 (91.8%)
Documents with subject areas: 893 (91.8%)


In [7]:
import re

# Analyze and clean the document data

# Create a DataFrame from doc_info if it's a list.
# Records that lack a key get a missing-value placeholder in that column.
if isinstance(doc_info, list):
    doc_info_df = pd.DataFrame(doc_info)
    print(f"Created DataFrame with {len(doc_info_df)} records and {len(doc_info_df.columns)} columns")
else:
    # Anything that is not a list is used as-is (no copy is made).
    doc_info_df = doc_info
    print(f"Using existing DataFrame with {len(doc_info_df)} records")

# Display the columns to understand the structure
print("\nColumns in the document info DataFrame:")
print(doc_info_df.columns.tolist())

# Function to clean text by removing markup, decoding entities and tidying whitespace
def clean_text(text):
    """Return ``text`` with markup tags removed, HTML entities decoded and whitespace tidied.

    Args:
        text: Value to clean. Anything that is not a ``str`` (None, NaN,
            numbers, lists) is returned unchanged.

    Returns:
        The cleaned string, or the original object for non-string input.

    Steps, in order:
        1. Replace XML/HTML tags with a space. A tag must start with a letter
           right after ``<`` (or ``</``), so comparisons in prose such as
           ``p < 0.05 and r > 0.9`` are left intact.
        2. Decode HTML entities with ``html.unescape`` (``&amp;``, ``&lt;``,
           ``&quot;``, ``&nbsp;``, numeric references, ...). This runs after
           tag removal so that an escaped ``&lt;b&gt;`` stays literal text.
        3. Collapse every run of whitespace, including non-breaking spaces,
           into a single space and trim both ends.
    """
    if not isinstance(text, str):
        return text

    # Remove XML/HTML tags (but not bare "<" / ">" used as comparison signs)
    text = re.sub(r'</?[A-Za-z][^<>]*>', ' ', text)

    # Decode HTML entities
    text = html.unescape(text)

    # Replace whitespace runs (\s also matches \u00a0) with a single space
    text = re.sub(r'\s+', ' ', text)

    # Trim whitespace
    return text.strip()

# Apply cleaning to text columns
columns_to_clean = ['title', 'abstract']
if 'author_keywords' in doc_info_df.columns:
    # For lists of keywords, we need to clean each item.
    # Cells that are not lists (e.g. missing values) are left untouched.
    doc_info_df['author_keywords'] = doc_info_df['author_keywords'].apply(
        lambda keywords: [clean_text(k) for k in keywords] if isinstance(keywords, list) else keywords
    )

# Apply the clean_text function to text columns.
# The columns are overwritten in place on doc_info_df.
for col in columns_to_clean:
    if col in doc_info_df.columns:
        doc_info_df[col] = doc_info_df[col].apply(clean_text)
        print(f"Cleaned {col} column")

# Show sample of cleaned data
print("\nSample of cleaned data:")
sample_cols = [col for col in columns_to_clean if col in doc_info_df.columns]
if 'author_keywords' in doc_info_df.columns:
    sample_cols.append('author_keywords')
    
if sample_cols:
    print(doc_info_df[sample_cols].head(3))

# Update the original doc_info list with the cleaned data.
# NOTE(review): the rebuilt records contain every column for every document;
# values a document lacks appear as NaN (see the sample output) rather than
# being absent — keep this in mind when exporting.
if isinstance(doc_info, list):
    doc_info = doc_info_df.to_dict('records')
    print("\nUpdated original doc_info with cleaned data")

Created DataFrame with 973 records and 11 columns

Columns in the document info DataFrame:
['title', 'authors', 'journal', 'doi', 'publication_date', 'document_type', 'prism:url', 'scopus_id', 'abstract', 'author_keywords', 'subject_areas']
Cleaned title column
Cleaned abstract column

Sample of cleaned data:
                                               title  \
0  An interactive address matching method based o...   
1  Intelligent pattern design using 3D modelling ...   
2  Digital divides in scene recognition: uncoveri...   

                                            abstract  \
0  Problem: Modernizing and standardizing place n...   
1  3D modeling is actuality hired more and more b...   
2  Automatic scene classification has application...   

                                     author_keywords  
0  [Address matching, Attention-based feature int...  
1  [3D modelling, Urban sculpture designing, 3D-S...  
2                                                NaN  

Updated original doc_info with cleaned data

# Export Search Results
Export the search results to CSV or JSON format for further analysis.

In [8]:
# Function to export results to CSV with dynamic fields
def export_to_csv(data, filename):
    """Write a list of dict records to ``filename`` as UTF-8 CSV.

    The header is the alphabetically sorted union of all keys that occur in
    any record, so records may have different key sets; a record that lacks a
    column gets an empty cell there. A one-line summary is printed at the end.
    """
    # Union of the keys of every record, in a stable (sorted) column order
    fieldnames = sorted({key for record in data for key in record.keys()})

    with open(filename, "w", newline="", encoding="utf-8") as csv_handle:
        writer = csv.DictWriter(csv_handle, fieldnames=fieldnames)
        writer.writeheader()
        for record in data:
            writer.writerow(record)

    print(f"CSV exported with {len(data)} records")


# Helper for export_to_json: replace non-finite floats with None
def _json_safe(value):
    """Return a copy of ``value`` in which NaN / +-Infinity floats become None.

    Dicts, lists and tuples are walked recursively (tuples come back as
    lists, which is how JSON would store them anyway); every other value is
    returned unchanged.
    """
    if isinstance(value, float) and (
        value != value or value in (float("inf"), float("-inf"))
    ):
        return None
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    return value


# Function to export results to JSON
def export_to_json(data, filename):
    """Write ``data`` to ``filename`` as indented UTF-8 JSON.

    Missing-value placeholders (NaN) and infinities are written as ``null``.
    ``json.dump`` would otherwise emit the bare tokens ``NaN`` / ``Infinity``,
    which are not valid JSON and are rejected by strict parsers.
    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    A one-line summary is printed at the end.
    """
    # Sanitise before opening the file so a failure cannot leave it truncated.
    json_ready = _json_safe(data)

    with open(filename, "w", encoding="utf-8") as output_file:
        json.dump(json_ready, output_file, indent=4, ensure_ascii=False)

    print(f"JSON exported with {len(data)} records")

In [9]:
# Export document search results to JSON.
# The CSV export below is kept for reference but is currently disabled.
# export_to_csv(doc_info, "../data/02_document_search_results.csv")
export_to_json(doc_info, "../data/02_document_search_results.json")

print("Search results exported successfully.")

JSON exported with 973 records
Search results exported successfully.
