This script uses the DataCite Events API to get the count of each relation type for each dataset listed in a given CSV file, where the PIDs are formatted as "doi:####", e.g. doi:10.7910/DVN/123456, in a 'persistent_id' column. PIDs of datasets in Dataverse repositories can be retrieved using the Dataverse APIs, by querying the repository's database if you have access to the database, or by scraping OAI-PMH feeds if the repository is publishing dataset metadata over OAI-PMH.

The relation types are listed at https://support.datacite.org/docs/eventdata-guide#relation-type-id.

In [None]:
import csv
from csv import DictReader
import requests
import json


In [None]:
# Enter name of CSV file containing list of dataset PIDs
# The file needs a header row with a 'persistent_id' column; each value is a PID such as doi:10.7910/DVN/123456
datasetPIDFile = 'dataset_pids_Harvard Dataverse_2021.08.06_18.36.06.csv'

# Name of CSV file that will store count of relationtypes
# It is opened in write mode further down, so an existing file with this name is replaced
# NOTE(review): the name mentions 'qdr' while the input file above is from Harvard Dataverse - confirm this is the intended name
output = 'qdr_citation_counts.csv'

In [None]:
# Read in list of dataset PIDs from the 'persistent_id' column of the given CSV file
datasetPIDs = []

# newline='' lets the csv module handle line endings itself, and 'utf-8-sig' drops a
# leading byte order mark (written by some spreadsheet exports) that would otherwise
# become part of the first column name and make the 'persistent_id' lookup fail
with open(datasetPIDFile, mode='r', encoding='utf-8-sig', newline='') as f:
    csvDictReader = DictReader(f, delimiter=',')
    for row in csvDictReader:
        # Short rows give None for missing columns; treat those like empty cells
        persistentId = (row['persistent_id'] or '').strip()
        # Skip rows without a PID, since each PID is later inserted into a request URL
        if persistentId:
            datasetPIDs.append(persistentId)

In [None]:
# Show how many dataset PIDs were read from the CSV file, as a quick check before requesting data
print(len(datasetPIDs))

In [None]:
# Create CSV file for writing data requested from DataCite API
# Write mode replaces any existing file of the same name, so each run starts with just the header row
# newline='' is what the csv module expects; without it, rows are followed by blank lines on Windows
with open(output, mode='w', newline='', encoding='utf-8') as opencsvfile:
    headerWriter = csv.writer(opencsvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headerWriter.writerow(['pid', 'relation-type', 'count'])


In [None]:
# Create variables for keeping count of script's progress
pidTotal = len(datasetPIDs)
print('Saving counts of relation types of %s to CSV file' % (pidTotal))

pidCount = 0

# PIDs whose counts could not be retrieved; reported once the loop has finished
failedPIDs = []

# Open the output file once for the whole run instead of once per row
# newline='' is what the csv module expects; without it, rows are followed by blank lines on Windows
with open(output, mode='a', newline='', encoding='utf-8') as datasets:
    relationWriter = csv.writer(datasets, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for pid in datasetPIDs:
        # Turn "doi:10.xxxx/..." into the "doi=10.xxxx/..." query parameter
        # Only the first colon separates the prefix; any later colon belongs to the DOI itself
        pidForDatacite = pid.replace(':', '=', 1)
        # Only the response's "meta" section is used below; page[size]=0 presumably keeps
        # the individual event records out of the response - confirm against the API docs
        dataciteEventsAPI = 'https://api.datacite.org/events?%s&page[size]=0' % (pidForDatacite)

        try:
            # A timeout keeps one unresponsive request from hanging the whole run
            response = requests.get(dataciteEventsAPI, timeout=60)
            response.raise_for_status()
            relationData = response.json()
        except (requests.exceptions.RequestException, ValueError) as error:
            # Network failures, HTTP error statuses and non-JSON bodies end up here
            # Record the PID and carry on so one bad response does not discard the rest
            failedPIDs.append(pid)
            print('Could not get relation types of %s: %s' % (pid, error))
            relationTypes = []
        else:
            # Missing or empty "meta"/"relation-types" simply means there is nothing to write
            relationTypes = (relationData.get('meta') or {}).get('relation-types') or []

        for relationTypeInfo in relationTypes:
            relationType = relationTypeInfo['id']
            count = relationTypeInfo['count']
            relationWriter.writerow([pid, relationType, count])

        # Push this PID's rows to disk so an interrupted run keeps what it has collected
        datasets.flush()

        pidCount += 1
        print('%s of %s' % (pidCount, pidTotal), end='\r', flush=True)

if failedPIDs:
    # Leading newline moves past the carriage-return progress line
    print('\nCounts could not be retrieved for %s of %s PIDs' % (len(failedPIDs), pidTotal))
