In [1]:
import csv
from csv import DictReader
import requests
import json


Relation types are listed at https://support.datacite.org/docs/eventdata-guide#relation-type-id. This script looks for the resources (subjects) that are related somehow to each dataset PID (object) in the given CSV or TXT file.

I'm ignoring the following relation types since I'm fairly confident that publishers aren't using these to describe the relationship between two different studies:
- is-version-of
- is-new-version-of
- is-previous-version-of
- is-part-of
- has-part
- is-identical-to

In [2]:
relationships = [
    'Is-cited-by',
    'cites',
    'is-supplement-to',
    'is-supplemented-by',
    'is-referenced-by',
    'references',
    'is-continued-by',
    'continues',
    'describes',
    'is-described-by',
    'has-metadata',
    'is-metadata-for',
    'has-version',
    'is-documented-by',
    'documents',
    'is-compiled-by',
    'compiles',
    'is-variant-form-of',
    'is-original-form-of',
    'is-reviewed-by',
    'reviews',
    'is-derived-from',
    'is-source-of',
    'is-required-by',
    'requires'
]


In [3]:
# datasetPIDs = [
#     'doi:10.7910/dvn/28075',
#     'doi:10.7910/dvn/ohhukh',
#     'doi:10.7910/dvn/gdf6z0']

# datasetPIDs = [
#     'doi:10.7910/DVN/28075']



In [36]:
# Enter path to CSV or text file containing list of dataset PIDs
datasetPIDFile = 'dataset_pids_QDR Main Collection_2021.08.05_18.55.01.csv'

# Read in list of dataset PIDs from given CSV or text file
datasetPIDs = []

if '.csv' in datasetPIDFile:
    with open(datasetPIDFile, mode='r', encoding='utf-8') as f:
        csvDictReader = DictReader(f, delimiter=',')
        for row in csvDictReader:
            datasetPIDs.append(row['persistent_id'].rstrip())
elif 'txt' in datasetPIDFile:
    for datasetPID in datasetPIDFile:
        datasetPIDs.append(datasetPID.rstrip())

In [4]:
print(len(datasetPIDs))

1


In [51]:
# Create CSV file for writing data requested from DataCite API
with open('data.csv', mode='w') as opencsvfile:
    opencsvfile = csv.writer(opencsvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    opencsvfile.writerow(['subject', 'relation', 'object (dataset)', 'occured_at', 'timestamp'])

In [5]:
# Create variables for keeping count of script's progress
pidCount = 1
pidTotal = len(datasetPIDs)

dataciteEventsAPI = 'https://api.datacite.org/events'
for pid in datasetPIDs:
    params = {
        'pid': pid.replace(':', '='),
        'page[number]': 1,
        'page[size]': 0        
    }
    response = requests.get(dataciteEventsAPI)
    relationData = response.json()
    print(json.dumps(relationData, indent=4))
#     print('Searching for metrics for %s (%s of %s)' %(pid, pidCount, pidTotal))

{
    "data": [
        {
            "id": "e312e438-ebd1-4dac-aac3-4365c2521a40",
            "type": "events",
            "attributes": {
                "subj-id": "https://doi.org/10.6084/m9.figshare.8330180.v1",
                "obj-id": "https://doi.org/10.6084/m9.figshare.8330180",
                "source-id": "datacite-related",
                "relation-type-id": "is-identical-to",
                "total": 1,
                "message-action": "create",
                "source-token": "29a9a478-518f-4cbd-a133-a0dcef63d547",
                "license": "https://creativecommons.org/publicdomain/zero/1.0/",
                "occurred-at": "2019-06-27T00:41:10.000Z",
                "timestamp": "2019-06-27T00:41:37.356Z"
            },
            "relationships": {
                "subj": {
                    "data": {
                        "id": "https://doi.org/10.6084/m9.figshare.8330180.v1",
                        "type": "objects"
                    }
                },

In [None]:
    if len(relationData['data']) > 0:
        totalPages = relationData['meta']['total-pages']

        # Initialization for paginating through Search API results and showing progress
#         pageNumber = 1
#         pageSize = 10
        condition = True
        relationCount = 0

        while condition:
#             params = params
            response = requests.get(dataciteEventsAPI)
            relationData = response.json()
            for i in relationData['data']:
                if i['attributes']['relation-type-id'] in relationships:
                    subject = i['attributes']['subj-id']
                    relationTypeId = i['attributes']['relation-type-id']
                    object = i['attributes']['obj-id']
                    occurredAt = i['attributes']['occurred-at']
                    timeStamp = i['attributes']['timestamp']
                    
                    print(subject, relationTypeId, object, occurredAt, timeStamp)
                    
                    # Write values of the three variables to a new row in the CSV
#                     with open('data.csv', mode='a') as datasets:
#                         datasets = csv.writer(datasets, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#                         datasets.writerow([subject, relationTypeId, object, occurredAt, timeStamp])
                    
                    relationCount += 1

            params['page[number]'] += 1
            condition = params['page[number]'] <= totalPages
    else:
        print('No relation data found')
    pidCount += 1