# Fetch Reference Data from Scopus API

Resources:

- [Scopus Abstract Retrieval Views](https://dev.elsevier.com/sc_abstract_retrieval_views.html)
- [Scopus Retrieval API](https://dev.elsevier.com/documentation/AbstractRetrievalAPI.wadl)
- [Interactive Scopus API](https://dev.elsevier.com/scopus.html)
- [API Settings (rate limits)](https://dev.elsevier.com/api_key_settings.html)
- Remember to log in to the Cisco VPN before running this notebook!


In this notebook, we use the `ScopusReferenceFetcher` class (`from src.data.ScopusReferenceFetcher import ScopusReferenceFetcher`). The class was created to send requests to the Scopus search API and retrieve the references for a given Scopus ID.

In this notebook, we apply that functionality to a list of Scopus IDs.

1. We provide the full dataframe with the scopus ids - the code runs until the api rate limit of 10,000 requests per week is reached.
2. It saves after every 500 requests.
3. After the first run, once the first rate limit is reached, we use additional API keys to continue the process.
4. To do that, we first load all previously saved files from the data folder.
5. We save the 'eid's of the files that were already processed in a list to filter the dataframe to those that are not yet processed.
6. We retrieve the highest number of batches to know where to continue from.


In [1]:
import pandas as pd
import json
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm
import json
import logging
from math import ceil
import sys
import logging
import datetime
import os
import os
import json
import pandas as pd
import logging
import datetime
import json
from tqdm import tqdm
from math import ceil
import sys


from src.data.ScopusReferenceFetcher import ScopusReferenceFetcher

In [2]:
class ScopusRefFetcherPrep:
    """
    Class to prepare the data for fetching references from Scopus.

    All methods are static, so they can be called either on the class or on
    an instance.
    """

    @staticmethod
    def get_api_keys(path="../notebooks/api_key_scopus.json"):
        """
        Load the API keys from a JSON file.

        Args:
        path (str): Location of the JSON file holding the API keys.

        Returns:
        The parsed content of the JSON file.
        """
        # The context manager closes the file handle even if parsing fails.
        with open(path, "r", encoding="utf-8") as fp:
            return json.load(fp)

    @staticmethod
    def _batch_number(filename):
        """Return the trailing ``_<number>`` of a file name as int, or None."""
        stem = os.path.splitext(filename)[0]
        try:
            return int(stem.rsplit("_", 1)[-1])
        except ValueError:
            return None

    @staticmethod
    def load_fetched_reference_data(data_path):
        """
        This is only necessary after the initial fetch of the reference data.
        It loads the data and returns a list of EIDs and the highest batch number.

        Args:
        data_path (str): Directory containing the previously saved batch files
            (``scopus_references_batch_<n>.json``). A trailing separator is
            optional.

        Returns:
        tuple: ``(eids, max_batch)`` where ``eids`` is the list of top-level
            keys of all JSON files and ``max_batch`` is the highest batch
            number found in the file names (0 if there is none).
        """
        # Sorted so the order of the returned EIDs does not depend on the
        # arbitrary order of os.listdir.
        files = sorted(
            name for name in os.listdir(data_path) if name.endswith(".json")
        )

        # Extract batch numbers and find the maximum. Files without a numeric
        # suffix are ignored here, and an empty folder yields 0 instead of
        # raising on max() of an empty sequence.
        batch_numbers = [
            number
            for number in map(ScopusRefFetcherPrep._batch_number, files)
            if number is not None
        ]
        max_batch = max(batch_numbers, default=0)
        print(f"Found {len(files)} files with batch numbers up to {max_batch}.")

        eids = []
        for name in files:
            # os.path.join works with and without a trailing separator.
            with open(os.path.join(data_path, name), "r", encoding="utf-8") as fp:
                eids.extend(json.load(fp).keys())
        return eids, max_batch

    @staticmethod
    def load_and_filter_articles(path, eids):
        """
        Load the articles CSV and drop the rows whose EID was already fetched.

        Args:
        path (str): Path to the CSV file; it must contain an ``eid`` column.
        eids (list): EIDs that were already processed.

        Returns:
        DataFrame: The remaining rows with a fresh 0-based index.
        """
        df = pd.read_csv(path)
        df_filtered = df[~df["eid"].isin(eids)]
        df_filtered = df_filtered.reset_index(drop=True)
        print(f"Number of articles to fetch: {len(df_filtered)}")
        return df_filtered


class ScopusRefFetcherProcessor:
    """
    Class to fetch references from Scopus.

    All methods are static, so they can be called either on the class or on
    an instance.
    """

    @staticmethod
    def setup_logging(log_directory, log_level=logging.INFO):
        """
        Set up logging to write to a file with a timestamp in its name.

        Args:
        log_directory (str): Directory where the log file will be saved.
        log_level (logging.Level): Logging level to capture. Default is logging.INFO.
        """

        # Create a directory for logs if it doesn't exist
        os.makedirs(log_directory, exist_ok=True)

        # Configure logging to write to a file with a timestamp in its name
        current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        log_filename = f"{log_directory}/scopus_fetcher_{current_time}.log"
        # NOTE(review): per the logging docs, basicConfig does nothing when the
        # root logger already has handlers, so a second call in the same
        # session keeps writing to the first log file. Confirm whether that
        # is intended.
        logging.basicConfig(
            filename=log_filename,
            level=log_level,
            format="%(asctime)s - %(levelname)s - %(message)s",
        )

    @staticmethod
    def _save_batch(data_path, batch_number, data_dict):
        """Write one batch of results to its JSON file and return the file path."""
        # os.path.join works with and without a trailing separator.
        file_path = os.path.join(
            data_path, f"scopus_references_batch_{batch_number}.json"
        )
        with open(file_path, "w") as fp:
            json.dump(data_dict, fp)
        return file_path

    @staticmethod
    def process_scopus_batches(
        api_key, df_to_fetch, data_path, last_processed_batch, batch_size=500
    ):
        """
        Process batches of data and fetch references using ScopusReferenceFetcher.

        Each batch is written to ``scopus_references_batch_<n>.json`` inside
        ``data_path``, where ``n`` continues counting after
        ``last_processed_batch``. An error whose message contains "429" saves
        the partially filled batch and stops via ``sys.exit``; any other error
        is logged and the affected EID is skipped.

        Args:
        api_key (str): API key for ScopusReferenceFetcher.
        df_to_fetch (DataFrame): DataFrame containing data to be processed.
            Must contain an ``eid`` column.
        data_path (str): Path where the processed data will be saved.
        last_processed_batch (int): The last processed batch number for naming files.
        batch_size (int, optional): Number of records to process in each batch. Default is 500.

        Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        SystemExit: If a request fails with a "429 Too Many Requests" error.
        """

        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        # Create the output folder up front: the first write only happens
        # after a full batch of requests, so failing there would lose them.
        if data_path:
            os.makedirs(data_path, exist_ok=True)

        fetcher = ScopusReferenceFetcher(api_key)
        num_batches = ceil(len(df_to_fetch) / batch_size)

        for batch in tqdm(range(num_batches)):
            batch_number = last_processed_batch + batch + 1
            batch_rows = df_to_fetch.iloc[
                batch * batch_size : (batch + 1) * batch_size
            ]
            data_dict = {}

            for _, row in batch_rows.iterrows():
                eid = row["eid"]
                try:
                    data_dict[eid] = fetcher.request_eid(eid)
                except Exception as e:
                    logging.error("Error processing EID: %s. Error: %s", eid, e)
                    if "429" in str(e):
                        # Rate limit reached: keep what was fetched so far,
                        # then stop the whole run.
                        logging.error("Too many requests, breaking the loop.")
                        ScopusRefFetcherProcessor._save_batch(
                            data_path, batch_number, data_dict
                        )
                        logging.info(
                            "Last possible batch %s saved uncompleted.",
                            batch_number,
                        )
                        sys.exit("Stopping script due to 429 Too Many Requests error.")

            ScopusRefFetcherProcessor._save_batch(data_path, batch_number, data_dict)
            logging.info("Data for batch %s saved to file.", batch_number)

# Initial Reference Fetching


## get api key and load data


In [None]:
# first api key: read the key file and pick the entry named "api_key_A"
api_keys = ScopusRefFetcherPrep.get_api_keys()
api_key = api_keys["api_key_A"]

# load data: the full articles table; nothing is filtered out on the initial run
df_path = "../data/01-raw/scopus/articles/final_scopus_results.csv"
df = pd.read_csv(df_path)

## setup logging and process the data


In [None]:
# Write the log of this run to a timestamped file in the log directory.
log_directory = "../notebooks/logs/scopus_fetcher"
ScopusRefFetcherProcessor.setup_logging(log_directory, log_level=logging.INFO)

# max batch is 0 since no data has been fetched yet
max_batch = 0
# Process batches. The static method is called on the class because no
# `processor` instance exists yet at this point of the notebook.
ScopusRefFetcherProcessor.process_scopus_batches(
    api_key=api_key,
    df_to_fetch=df,
    data_path="../data/01-raw/scopus/references/",
    last_processed_batch=max_batch,
    batch_size=500,
)

# Subsequent Reference Fetching


# prep


In [4]:
# Pick a different key from the key file for this run.
prepper = ScopusRefFetcherPrep()
api_keys = prepper.get_api_keys()
api_key = api_keys["api_key_deb"]

# Collect the EIDs stored in the saved batch files and the highest batch
# number used so far.
refs_path = "../data/01-raw/scopus/references/"
eids, max_batch = prepper.load_fetched_reference_data(refs_path)

# Reload the articles and keep only those whose EID is not in `eids`.
df_path = "../data/01-raw/scopus/articles/final_scopus_results.csv"
df_filtered = prepper.load_and_filter_articles(df_path, eids)

Found 78 files with batch numbers up to 77.
Number of articles to fetch: 3744


# process


In [5]:
# import class
from src.data.ScopusReferenceFetcher import ScopusReferenceFetcher

In [8]:
processor = ScopusRefFetcherProcessor()

# Set up logging
log_directory = "../notebooks/logs/scopus_fetcher"
processor.setup_logging(log_directory, log_level=logging.INFO)

# Process batches: only the articles that are still missing are fetched, and
# file numbering continues after the highest batch number found on disk.
processor.process_scopus_batches(
    api_key=api_key,
    df_to_fetch=df_filtered,
    data_path="../data/01-raw/scopus/references/",
    last_processed_batch=max_batch,
    batch_size=500,
)

100%|██████████| 8/8 [1:41:40<00:00, 762.51s/it]
