First, we need to read the articles, which are stored in a citation format. We stored the document on GitHub for easy access.

In [18]:
# Load the original TXT citation export
import requests

# URL of the text file
file_url = 'https://raw.githubusercontent.com/jsanc223/dataset/main/africa_research.txt'

# Send a GET request to the URL and read the response content.
# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error (e.g. 404) instead of
# letting the error page be parsed below as if it were citation data.
response = requests.get(file_url, timeout=30)
response.raise_for_status()
file_contents = response.text

# Split the file contents by blank lines to separate the records
records = file_contents.split('\n\n')

We load the API key and institutional token. Both are stored in a local config.json file.

In [19]:
import json

# Read the whole local config.json file and decode it into a dict.
with open('config.json') as config_file:
    config = json.loads(config_file.read())

# Credentials sent as request headers by the Elsevier API calls further down.
api_key = config['api_key']
inst_token = config['inst_token']

Iterate over each journal article and extract only the title, DOI, Elsevier URL, and article year. Then store them in a DataFrame to manage the data properly.

In [None]:
import pandas as pd

# One output list per field of interest
ti_values = []
do_values = []
ur_values = []
py_values = []

# Six-character tag prefix that introduces each field -> the list it feeds
columns_by_prefix = {
    'TI  - ': ti_values,  # Title
    'DO  - ': do_values,  # DOI
    'UR  - ': ur_values,  # URLs
    'PY  - ': py_values,  # Year
}

for record in records:
    # Start every field at None; if a tag repeats inside a record,
    # the last occurrence overwrites the earlier ones.
    parsed = dict.fromkeys(columns_by_prefix)
    for line in record.split('\n'):
        prefix = line[:6]
        if prefix in parsed:
            parsed[prefix] = line[6:].strip()

    # Records in which none of the four tags appeared are skipped entirely
    if all(value is None for value in parsed.values()):
        continue

    # Append to all four lists together so they stay the same length
    for prefix, column in columns_by_prefix.items():
        column.append(parsed[prefix])

# Assemble the DataFrame from the four parallel lists
df = pd.DataFrame({'TI': ti_values, 'DO': do_values, 'UR': ur_values, 'PY': py_values})
df


For testing purposes, I grab a sample of the first ten records from the actual dataframe.

In [None]:
# Keep only the first ten rows. NOTE: this rebinds df, so every later cell that
# reads df works on this ten-row sample rather than on the full set of records.
df = df.head(10)

In [23]:
import numpy as np

# Build a cleaned DataFrame without touching df itself: replace() without
# inplace returns a new object in which empty strings, single spaces and
# None have all been turned into NaN.
df_copy = df.replace({"": np.nan, " ": np.nan, None: np.nan})

# Split the rows on whether a DOI is present
has_doi = df_copy['DO'].notnull()
df_null = df_copy[~has_doi]
df_not_null = df_copy[has_doi]


We will generate a new dataframe without duplicate DOIs from the initial dataframe. In addition, the df_subset dataframe will be exported to CSV format for the visualization tool. We also generate a new dataframe, Author_df, in which we initialize the columns ID (unique identifier) and Study_ID, which is the ID from the df_subset dataframe and is needed by the visualization tool for reference purposes. I also initialized the other columns, to be filled in later.

In [None]:
# Drop rows whose DOI was already seen, keeping the first occurrence of each
df_not_null_no_duplicates = df_not_null.drop_duplicates(subset='DO', keep='first')

# Move the surviving original row labels into an explicit 'ID' column and
# renumber the rows from zero
df_subset = df_not_null_no_duplicates.reset_index().rename(columns={'index': 'ID'})

# Skeleton of the author table: one row per article in df_subset
Author_df = pd.DataFrame()
Author_df['ID'] = df_subset.index
Author_df['Study_ID'] = df_subset['ID']

# Placeholder columns, all NaN for now
for placeholder_column in (
    'author',
    'affiliation_id',
    'affiliation_info',
    'affiliation_city',
    'affiliation_country',
):
    Author_df[placeholder_column] = np.nan

df_subset

Export df_subset to CSV. It contains only article information: ID (unique identifier), TI (title), DO (DOI), UR (Elsevier URL), and PY (article year).

In [25]:
# Export df_subset (the articles table) to Articles.csv;
# index=False leaves the DataFrame's row index out of the file
df_subset.to_csv('Articles.csv', index=False)

The provided code is the critical part of this process. It has two functions to retrieve and organize author and affiliation information from the Elsevier API using article DOI values.

1. get_author_info(DO_value):
   - Inputs: DO_value (a DOI value).
   - Functionality: Fetches the abstract of the article with the given DOI from the Elsevier API and extracts authors and their affiliations. Sends additional requests to obtain detailed affiliation info.
   - Outputs: Lists of authors' names, affiliation IDs, names, cities, and countries.

2. populate_author_df(df_subset):
   - Inputs: df_subset (a DataFrame subset).
   - Functionality: Iterates over the DataFrame rows, calls get_author_info() to obtain author and affiliation info for each DOI, creates a mini DataFrame for each DOI, and appends it to a list.
   - Outputs: A concatenated DataFrame, Author_df, of all the mini DataFrames.


In [40]:
import requests
import pandas as pd
import time  # import the time module

def get_author_info(DO_value, timeout=30):
    """Fetch authors and their affiliation details for one DOI from the Elsevier API.

    The abstract-retrieval endpoint is queried for the DOI, then the
    affiliation-search endpoint is queried once per distinct affiliation ID.
    Credentials come from the module-level ``api_key`` and ``inst_token``.

    The five returned lists are parallel and always have the same length:
    one entry per (author, affiliation) pair. An author with several
    affiliations appears once per affiliation; an author with no affiliation
    appears once with affiliation ID ``None`` and ``'N/A'`` details. Keeping
    the lists aligned is what allows a caller to put them side by side as
    DataFrame columns.

    Args:
        DO_value: DOI of the article to look up.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A tuple ``(authors, affiliation_ids, affiliation_info_list,
        affiliation_city_list, affiliation_country_list)``. All five lists
        are empty when the abstract response does not contain the expected
        author structure.

    Raises:
        requests.RequestException: On a network failure or timeout.
        ValueError: If the abstract response body is not valid JSON.
    """
    headers = {
        "X-ELS-APIKey": api_key,
        "X-ELS-Insttoken": inst_token,
        "Accept": "application/json"
    }

    def as_list(value):
        """Return *value* as a list; this code expects a field to hold either one object or a list."""
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def get_affiliation_info(affiliation_id):
        """Return (name, city, country) for an affiliation ID, with None for anything unavailable."""
        url = f"https://api.elsevier.com/content/search/affiliation?query=af-id({affiliation_id})"
        response = requests.get(url, headers=headers, timeout=timeout)
        try:
            entry = response.json()['search-results']['entry'][0]
        except (KeyError, IndexError, TypeError, ValueError):
            # A malformed or empty lookup only costs this affiliation's
            # details; it must not discard the article's whole author list.
            return None, None, None
        if not isinstance(entry, dict):
            return None, None, None
        return entry.get('affiliation-name'), entry.get('city'), entry.get('country')

    url = f"https://api.elsevier.com/content/abstract/doi/{DO_value}"
    # Pause before every abstract request (presumably to stay under the API
    # rate limit - confirm against the account's quota).
    time.sleep(1)
    response = requests.get(url, headers=headers, timeout=timeout)
    data = response.json()

    authors = []
    affiliation_ids = []
    try:
        author_data = data['abstracts-retrieval-response']['authors']['author']
        for author in as_list(author_data):
            name = author['preferred-name']['ce:indexed-name']
            ids = [
                aff.get('@id') if isinstance(aff, dict) else None
                for aff in as_list(author.get('affiliation'))
            ]
            # Emit at least one row per author so that authors without an
            # affiliation are not lost and the lists stay aligned.
            for affiliation_id in ids or [None]:
                authors.append(name)
                affiliation_ids.append(affiliation_id)
    except (KeyError, TypeError):
        # The response does not have the expected author structure
        # (for example an error payload, or a null 'authors' field).
        return [], [], [], [], []

    affiliation_info_list = []
    affiliation_city_list = []
    affiliation_country_list = []
    lookup_cache = {}  # affiliation ID -> (name, city, country); one request per distinct ID
    for affiliation_id in affiliation_ids:
        if affiliation_id:
            cache_key = str(affiliation_id)
            if cache_key not in lookup_cache:
                lookup_cache[cache_key] = get_affiliation_info(affiliation_id)
            info, city, country = lookup_cache[cache_key]
        else:
            info = city = country = None
        affiliation_info_list.append(info or 'N/A')
        affiliation_city_list.append(city or 'N/A')
        affiliation_country_list.append(country or 'N/A')

    return authors, affiliation_ids, affiliation_info_list, affiliation_city_list, affiliation_country_list

def populate_author_df(df_subset):
    """Build the author table for every article in *df_subset*.

    For each row, ``get_author_info`` is called with the row's DOI and the
    returned lists become the rows of a small DataFrame; all of these are
    concatenated into the result. A DOI for which no authors are returned,
    or whose processing raises, is reported with ``print`` and skipped so
    that one bad article does not abort the whole run.

    Args:
        df_subset: DataFrame with a 'DO' column (DOI) and an 'ID' column
            (article identifier).

    Returns:
        A DataFrame with the columns ID, Study_ID, author, affiliation_id,
        affiliation_info, affiliation_city and affiliation_country. When no
        authors were found at all, the DataFrame is empty but still carries
        these columns, so an exported CSV keeps its header row.
    """
    columns = [
        'ID',
        'Study_ID',
        'author',
        'affiliation_id',
        'affiliation_info',
        'affiliation_city',
        'affiliation_country',
    ]

    frames = []
    for _, row in df_subset.iterrows():
        doi = row['DO']
        study_id = row['ID']  # 'ID' column from df_subset

        # Per-article boundary: any failure is reported and the loop moves on.
        try:
            authors, affiliation_ids, affiliation_info_list, affiliation_city_list, affiliation_country_list = get_author_info(doi)

            # Check if authors were found
            if not authors:
                print(f"No authors found for ID: {study_id}, DO: {doi}")
                continue

            # NOTE(review): 'ID' and 'Study_ID' both carry the article ID, so
            # 'ID' is not unique per author row - confirm this is what the
            # visualization tool expects.
            frames.append(pd.DataFrame({
                'ID': [study_id] * len(authors),
                'Study_ID': [study_id] * len(authors),
                'author': authors,
                'affiliation_id': affiliation_ids,
                'affiliation_info': affiliation_info_list,
                'affiliation_city': affiliation_city_list,
                'affiliation_country': affiliation_country_list
            }))
        except Exception as e:
            print(f"Error processing DOI: {doi}. Error: {str(e)}")

    if frames:
        Author_df = pd.concat(frames, ignore_index=True)
    else:
        print("No authors found in the provided data subset.")
        # Keep the schema even when there is no data
        Author_df = pd.DataFrame(columns=columns)

    return Author_df

Export Author_df to CSV. It contains the author information: ID (unique identifier), Study_ID (ID from the articles), author full name, affiliation ID, affiliation information, affiliation city, and affiliation country.

In [None]:
# Build the author table from df_subset; this rebinds the name Author_df
Author_df = populate_author_df(df_subset)
# Write the author table to Author.csv, leaving out the DataFrame's row index
Author_df.to_csv('Author.csv', index=False)
# Author_df  (uncomment to display the table in the notebook)