In [1]:
import contextlib
import csv
from csv import DictReader
import requests
import json
import joblib
from joblib import Parallel, delayed
import os
from pathlib import Path
import time
from tqdm import tqdm

Relation types are listed at https://support.datacite.org/docs/eventdata-guide#relation-type-id. This script looks for the resources (subjects) that are related somehow to each dataset PID (object) in the given CSV or TXT file.

I'm considering only the relation types that the Dataverse software will consider when displaying counts of citations for each dataset:
- is-cited-by
- cites
- is-supplement-to
- is-supplemented-by
- is-referenced-by
- references

In [2]:
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Temporarily make joblib report finished batches to a tqdm progress bar.

    While the context is active, joblib.parallel.BatchCompletionCallBack is
    replaced by a subclass that advances tqdm_object by the size of each
    completed batch before delegating to the original callback. On exit the
    original callback class is restored and tqdm_object is closed.

    Yields the same tqdm_object that was passed in.
    """
    parallel_module = joblib.parallel
    original_callback = parallel_module.BatchCompletionCallBack

    class ProgressReportingCallback(original_callback):
        def __call__(self, *args, **kwargs):
            # Advance the bar first, then let joblib do its normal bookkeeping
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    parallel_module.BatchCompletionCallBack = ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        # Undo the patch even if the parallel run raised
        parallel_module.BatchCompletionCallBack = original_callback
        tqdm_object.close()


# Seconds to wait on a DataCite API request before treating it as failed
DATACITE_REQUEST_TIMEOUT = 60


def _append_csv_row(filePath, row):
    """Append a single row to the CSV file at filePath."""
    with open(filePath, mode='a', newline='') as openFile:
        rowWriter = csv.writer(openFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        rowWriter.writerow(row)


def _log_citation_error(doi, errorType, exception, errorLogFile=None):
    """Append a (pid, error_type, error_message, error_timestamp) row to the error log."""
    # When no path is given, use the notebook-level errorLog variable, which
    # must have been defined before this function is called
    logPath = errorLog if errorLogFile is None else errorLogFile
    errorTimeString = time.strftime('%Y.%m.%d_%H.%M.%S')
    _append_csv_row(logPath, [doi, errorType, exception, errorTimeString])


def _get_events_page(url, params):
    """Request one page from the events API and return the decoded JSON body."""
    response = requests.get(url, params=params, timeout=DATACITE_REQUEST_TIMEOUT)
    # Turn HTTP error statuses into exceptions so the log shows the real cause
    response.raise_for_status()
    return response.json()


def _iter_event_pages(url, params):
    """Yield every result page for the given query, requesting each page once."""
    firstPage = _get_events_page(url, params)
    totalPages = firstPage['meta']['total-pages']
    if totalPages > 0:
        yield firstPage
    for pageNumber in range(2, totalPages + 1):
        yield _get_events_page(url, dict(params, **{'page[number]': pageNumber}))


def get_citation_counts(doi, relationTypesList, citationsOutputFile, errorLogFile=None):
    """Look up DataCite events for a DOI and append them to a CSV file.

    The events API is first asked whether it holds any events for the DOI.
    If it does, the events of each relation type in relationTypesList are
    paged through, and every event whose subject is not a DataCite report URL
    is appended to citationsOutputFile as the row
    [subj-id, relation-type-id, obj-id, occurred-at, timestamp].

    Request or parsing failures are not raised; they are appended to the
    error log together with the DOI exactly as it was passed in.

    Parameters
    ----------
    doi : str
        Dataset DOI. Any 'doi:' prefix is removed before querying.
    relationTypesList : iterable of str
        relation-type-id values to query, one API query per value.
    citationsOutputFile : str
        Path of the CSV file that rows are appended to.
    errorLogFile : str, optional
        Path of the error log CSV. Defaults to the notebook-level errorLog.
    """
    dataciteEventsAPI = 'https://api.datacite.org/events'
    # 'https://api.test.datacite.org/events?doi=10.7910/DVN/28075&relation-type-id=references&page[number]=1'
    bareDoi = doi.replace('doi:', '')

    # Cheap probe (one record) to skip DOIs that have no events at all
    try:
        probeParams = {'doi': bareDoi, 'page[number]': 1, 'page[size]': 1}
        probeData = _get_events_page(dataciteEventsAPI, probeParams)
        hasEvents = probeData['meta']['total'] > 0
    except Exception as e:
        _log_citation_error(doi, 'Failed to get any data', e, errorLogFile)
        return

    if not hasEvents:
        return

    for requestedRelationType in relationTypesList:
        params = {
            'doi': bareDoi,
            'page[number]': 1,
            'page[size]': 25,
            'relation-type-id': requestedRelationType
        }

        try:
            for relationData in _iter_event_pages(dataciteEventsAPI, params):
                for event in relationData['data']:
                    attributes = event['attributes']
                    subjectId = attributes['subj-id']

                    # Record relationship only if the subject is not a DataCite report
                    if 'https://api.datacite.org/reports/' in subjectId:
                        continue

                    _append_csv_row(citationsOutputFile, [
                        subjectId,
                        attributes['relation-type-id'],
                        attributes['obj-id'],
                        attributes['occurred-at'],
                        attributes['timestamp']
                    ])
        except Exception as e:
            # A failure for one relation type must not stop the remaining ones
            errorType = f'Call for {requestedRelationType} data failed'
            _log_citation_error(doi, errorType, e, errorLogFile)

In [8]:
# DataCite relation-type-id values to query for each dataset.
# The identifiers are written in lower case, like the other entries.
relationTypesList = [
    'is-cited-by',
    'cites',
    'is-supplement-to',
    'is-supplemented-by',
    'is-referenced-by',
    'references'
]

currrentWorkingDirectory = os.getcwd()

# Enter path to text file containing list of dataset DOIs
datasetPIDFile = str(Path(currrentWorkingDirectory) / 'datasets.txt')

errorGettingEventData = []

# Read in list of dataset PIDs from given CSV or text file, one PID per line.
# The with-block closes the file, and blank lines are skipped so that no
# empty PID is ever sent to the API.
with open(datasetPIDFile, 'r') as openPIDFile:
    datasetPIDs = [line.strip() for line in openPIDFile if line.strip()]

In [None]:
# Announce how many datasets are about to be looked up
pidTotal = len(datasetPIDs)
print(f'Searching for citations for {pidTotal} datasets')

# A single timestamp is shared by both file names created below
currentTimeString = time.strftime('%Y.%m.%d_%H.%M.%S')

# Start the CSV file that receives the data requested from the DataCite API
# by writing its header row
citationsOutputFile = str(Path(f'{currrentWorkingDirectory}/citations_of_hdv_datasets_{currentTimeString}.csv'))
citationsHeader = ['subject_id', 'relation_type', 'object_id', 'occured_at', 'timestamp']
with open(citationsOutputFile, mode='w', newline='') as citationsHandle:
    opencsvfile = csv.writer(citationsHandle, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    opencsvfile.writerow(citationsHeader)

# Start the error log that records when get_citation_counts fails to get
# citation information, again with just a header row
errorLog = str(Path(f'{currrentWorkingDirectory}/error_log_{currentTimeString}.csv'))
errorLogHeader = ['pid', 'error_type', 'error_message', 'error_timestamp']
with open(errorLog, 'w', newline='') as errorLogHandle:
    openErrorFile = csv.writer(errorLogHandle, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    openErrorFile.writerow(errorLogHeader)

In [None]:

# Run get_citation_counts for every dataset PID on four joblib worker
# threads, with a tqdm progress bar that advances as batches complete
citationProgressBar = tqdm(bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}', total=pidTotal)
citationJobs = (
    delayed(get_citation_counts)(pid, relationTypesList, citationsOutputFile)
    for pid in datasetPIDs
)
with tqdm_joblib(citationProgressBar) as progress_bar:
    runInParallel = Parallel(n_jobs=4, backend='threading')
    runInParallel(citationJobs)