For each source below, create a file containing all date-related metadata for each record.

### SHARE

In [189]:
import os

# SHARE input (one record per line) and the CSV that receives the extracted dates.
# Both live in the same directory, two levels above the working directory.
_share_dir = ('..', '..', 'data')
SHARE_FILE = os.path.join(*_share_dir, 'share-jan-2019.json')
SHARE_OUTPUT_FILE = os.path.join(*_share_dir, 'share_dates.csv')

In [190]:
# Keys read from each SHARE record and copied into the output row unchanged.
date_fields = [
    'date',
    'date_created',
    'date_modified',
    'date_updated',
    'date_published',
]

# CSV header: the three identifier columns first, then every date column.
fields_to_extract = ['osf_id', 'arxiv_id', 'doi', *date_fields]

In [193]:
import json, csv

# Extract identifiers and date metadata from every SHARE record into a CSV.
#
# Reads SHARE_FILE (one JSON record per line) and writes one row per record to
# SHARE_OUTPUT_FILE using the columns in fields_to_extract. A record can carry
# several identifiers of the same kind; they share one cell, pipe-delimited.

# Identifier URL prefixes that are recognised, paired with the output column
# that collects the corresponding bare IDs. The first matching prefix wins.
identifier_prefixes = (
    ('http://osf.io/', 'osf_id'),
    ('http://arxiv.org/abs/', 'arxiv_id'),
    ('http://dx.doi.org/', 'doi'),
)


def _bare_identifier(identifier, prefix):
    """Return *identifier* without its leading *prefix* and one trailing slash."""
    # Slice instead of str.replace so only the leading prefix is removed.
    bare = identifier[len(prefix):]
    if bare.endswith('/'):
        bare = bare[:-1]
    return bare


def _strip_arxiv_version(arxiv_id):
    """Return *arxiv_id* without a trailing version suffix ('v' plus digits).

    IDs with no such suffix, including empty or very short ones, are returned
    unchanged instead of raising IndexError. At most one suffix is removed.
    """
    base, marker, version = arxiv_id.rpartition('v')
    if marker and base and version.isdigit():
        return base
    return arxiv_id


# newline='' is what the csv module requires of files handed to a writer;
# without it, extra carriage returns appear on platforms that translate '\n'.
with open(SHARE_OUTPUT_FILE, 'w', newline='') as o, open(SHARE_FILE) as f:
    writer = csv.DictWriter(o, fieldnames=fields_to_extract)
    writer.writeheader()

    for line in f:
        data = json.loads(line)

        ids_by_column = {column: set() for _, column in identifier_prefixes}
        for identifier in data['identifiers']:
            for prefix, column in identifier_prefixes:
                if not identifier.startswith(prefix):
                    continue

                bare_id = _bare_identifier(identifier, prefix)
                if column == 'arxiv_id':
                    # Collapse all versions of a paper onto one ID.
                    bare_id = _strip_arxiv_version(bare_id)

                # The pipe is the in-cell delimiter, so it must never occur in an ID.
                if '|' in bare_id:
                    raise ValueError('Pipe character used in identifier. Change delimiter')

                # A URL consisting of the prefix alone yields no usable ID.
                if bare_id:
                    ids_by_column[column].add(bare_id)
                break

        # Sort so the cell content does not depend on set iteration order.
        row = {column: '|'.join(sorted(ids)) for column, ids in ids_by_column.items()}

        # Every date field is expected on every record; a missing one raises KeyError.
        for date_field in date_fields:
            row[date_field] = data[date_field]

        writer.writerow(row)

### OSF PREPRINT API

In [195]:
import os

# Raw OSF preprint dump (input) and the CSV of extracted dates (output),
# both addressed relative to a root two directory levels up.
_osf_root = ('..', '..')
OSF_PREPRINT_FILE = os.path.join(*_osf_root, 'raw_data', 'osf.json')
OSF_PREPRINT_OUTPUT_FILE = os.path.join(*_osf_root, 'data', 'osf_dates.csv')

In [196]:
# Keys read from each OSF record's 'attributes' mapping.
date_fields = [
    'date_last_transitioned',
    'date_modified',
    'original_publication_date',
    'date_published',
    'date_withdrawn',
    'preprint_doi_created',
    'date_created',
]

# CSV header: the record id followed by every date column.
fields_to_extract = ['id', *date_fields]

In [197]:
import json, csv

# Flatten the OSF preprint dump into a CSV with one row per record.
#
# Each line of OSF_PREPRINT_FILE is a JSON object whose 'data' list holds the
# records; every record contributes its 'id' plus the date_fields found under
# its 'attributes' mapping. A missing attribute raises KeyError.
#
# newline='' is what the csv module requires of files handed to a writer;
# without it, extra carriage returns appear on platforms that translate '\n'.
with open(OSF_PREPRINT_OUTPUT_FILE, 'w', newline='') as o, open(OSF_PREPRINT_FILE, 'r') as f:
    writer = csv.DictWriter(o, fieldnames=fields_to_extract)
    writer.writeheader()

    for line in f:
        data = json.loads(line)

        for record in data['data']:
            attributes = record['attributes']

            row = {'id': record['id']}
            for date_field in date_fields:
                row[date_field] = attributes[date_field]

            writer.writerow(row)

### ARXIV

In [213]:
import os

# Folders holding the two arXiv harvests (arXiv and arXivRaw metadata formats).
_arxiv_raw_dir = ('..', '..', 'raw_data')
ARXIV_FOLDER = os.path.join(*_arxiv_raw_dir, 'arXiv')
ARXIVRAW_FOLDER = os.path.join(*_arxiv_raw_dir, 'arXivRaw')

# CSV that receives the merged date information from both harvests.
ARXIV_OUTPUT_FILE = os.path.join('..', '..', 'data', 'arxiv_dates.csv')

In [198]:
import xml.etree.ElementTree as ET, csv

# Build arxiv_data: one dict per OAI record found in ARXIV_FOLDER, keyed by the
# arXiv ID, holding the header datestamp and the 'created' / 'updated' dates.

# Every directory entry is parsed as XML, so the macOS Finder file is dropped.
arxiv_files = os.listdir(ARXIV_FOLDER)
if '.DS_Store' in arxiv_files:
    arxiv_files.remove('.DS_Store')

# Prefix -> namespace URI used by the element paths below.
namespaces = {
    'oai': 'http://www.openarchives.org/OAI/2.0/',
    'arxiv': 'http://arxiv.org/OAI/arXiv/',
}

arxiv_data = {}
for arxiv_file in arxiv_files:
    root = ET.parse(os.path.join(ARXIV_FOLDER, arxiv_file))

    for record in root.findall('oai:ListRecords/oai:record', namespaces):
        oai_identifier = record.find('oai:header/oai:identifier', namespaces).text
        row = {
            'id': oai_identifier.replace('oai:arXiv.org:', ''),
            'datestamp': record.find('oai:header/oai:datestamp', namespaces).text,
        }

        # 'created' and 'updated' may be absent from a record; store None then.
        for tag in ('created', 'updated'):
            element = record.find('oai:metadata/arxiv:arXiv/arxiv:' + tag, namespaces)
            row[tag] = None if element is None else element.text

        arxiv_data[row['id']] = row

In [210]:
# Add a 'versions' entry to each arxiv_data record that also appears in the
# arXivRaw harvest: the dates of version 1..N, in order, pipe-delimited.

# Every directory entry is parsed as XML, so the macOS Finder file is dropped.
arxivraw_files = os.listdir(ARXIVRAW_FOLDER)
if '.DS_Store' in arxivraw_files:
    arxivraw_files.remove('.DS_Store')

# Same prefixes as for the arXiv format, but 'arxiv' now maps to the arXivRaw URI.
namespaces = {
    'oai': 'http://www.openarchives.org/OAI/2.0/',
    'arxiv': 'http://arxiv.org/OAI/arXivRaw/',
}

for arxivraw_file in arxivraw_files:
    root = ET.parse(os.path.join(ARXIVRAW_FOLDER, arxivraw_file))

    for record in root.findall('oai:ListRecords/oai:record', namespaces):
        id_ = record.find('oai:header/oai:identifier', namespaces).text.replace('oai:arXiv.org:', '')

        # Only records already collected from the arXiv-format harvest are extended.
        if id_ not in arxiv_data:
            continue

        # Map version number (the 'version' attribute without its first character) -> date.
        versions = {
            int(version.attrib['version'][1:]): version.find('arxiv:date', namespaces).text
            for version in record.findall('oai:metadata/arxiv:arXivRaw/arxiv:version', namespaces)
        }

        # concatenate all version dates, delimited by pipe character;
        # numbering must run 1..N without gaps, otherwise the lookup raises KeyError
        version_str = '|'.join(versions[number] for number in range(1, len(versions) + 1))

        arxiv_data[id_]['versions'] = version_str

In [212]:
# Write every collected arXiv record to ARXIV_OUTPUT_FILE, one row per record.
#
# newline='' is what the csv module requires of files handed to a writer;
# without it, extra carriage returns appear on platforms that translate '\n'.
with open(ARXIV_OUTPUT_FILE, 'w', newline='') as o:
    fieldnames = ['id', 'datestamp', 'created', 'updated', 'versions']

    writer = csv.DictWriter(o, fieldnames=fieldnames)
    writer.writeheader()

    # Records without a 'versions' key, or with None for 'created'/'updated',
    # get an empty cell: DictWriter fills missing keys and writes None as ''.
    writer.writerows(arxiv_data.values())

### CROSSREF

In [215]:
import os

# Crossref dump (input) and the CSV of extracted dates (output),
# both addressed relative to a root two directory levels up.
_crossref_root = ('..', '..')
CROSSREF_FILE = os.path.join(*_crossref_root, 'raw_data', 'CrossRef.json')
CROSSREF_OUTPUT_FILE = os.path.join(*_crossref_root, 'data', 'crossref_dates.csv')

In [226]:
# get all date fields first

import json

# Not populated in this cell; a set so duplicate ids would collapse.
crossref_ids = set()

# Scan every record in CROSSREF_FILE (one JSON object per line) and collect the
# top-level keys whose value is a dict carrying at least one date marker.
date_markers = ('date-parts', 'date-time', 'timestamp')

date_keys = set()
with open(CROSSREF_FILE, 'r') as f:
    for line in f:
        data = json.loads(line)

        for key, value in data.items():
            if not isinstance(value, dict):
                continue
            if any(marker in value for marker in date_markers):
                date_keys.add(key)

In [228]:
date_keys

{'accepted', 'created', 'deposited', 'indexed', 'issued', 'posted'}

In [229]:
# date_keys is a set of strings, and its iteration order can differ from one
# interpreter session to the next. Sorting fixes the CSV column order so the
# output file is reproducible across runs.
date_fields = sorted(date_keys)

# CSV header: the DOI-based id followed by every discovered date column.
fields_to_extract = ['id'] + date_fields

In [235]:
import json, csv

# Write one CSV row per Crossref record: its DOI plus one cell per date field.
#
# For each date field present on a record, 'date-time' is preferred; otherwise
# the single 'date-parts' entry is joined with '-' exactly as given (no zero
# padding). Fields that are absent, or that carry neither key, stay empty.
#
# newline='' is what the csv module requires of files handed to a writer;
# without it, extra carriage returns appear on platforms that translate '\n'.
with open(CROSSREF_OUTPUT_FILE, 'w', newline='') as o, open(CROSSREF_FILE, 'r') as f:
    writer = csv.DictWriter(o, fieldnames=fields_to_extract)
    writer.writeheader()

    for line in f:
        data = json.loads(line)

        row = {'id': data['DOI']}

        for date_field in date_fields:
            if date_field not in data:
                continue

            value = data[date_field]
            if 'date-time' in value:
                row[date_field] = value['date-time']
            elif 'date-parts' in value:
                date_parts = value['date-parts']
                # date-parts is a list but should only have one value
                if len(date_parts) > 1:
                    raise ValueError('Multiple date-parts')

                # An empty list yields an empty cell instead of IndexError, and
                # null parts are dropped so they are not written as 'None'.
                parts = date_parts[0] if date_parts else []
                row[date_field] = '-'.join(str(part) for part in parts if part is not None)

        writer.writerow(row)