For each source below, create a file containing IDs of the records present in both SHARE and that source.

** basically do an inner join between SHARE and each source

In [4]:
import os

SHARE_FILE = os.path.join('..', '..', 'data', 'share-jan-2019.json')

OUTPUT_FOLDER = os.path.join('..', '..', 'data', 'share_source_ids')
if not os.path.exists(OUTPUT_FOLDER):
    os.mkdir(OUTPUT_FOLDER)

In [12]:
def save_to_file(ids, source_name):
    with open(os.path.join(OUTPUT_FOLDER, source_name + '.txt'), 'w') as o:
        for id_ in ids:
            o.write(id_ + '\n')

### OSF PREPRINT API

In [13]:
OSF_PREPRINT_FILE = os.path.join('..', '..', 'raw_data', 'osf.json')

In [35]:
import json

osf_ids = set() #using a set in case there are any duplicate ids
with open(OSF_PREPRINT_FILE, 'r') as f:
    for line in f:
        data = json.loads(line)

        for record in data['data']:
            osf_ids.add(record['id'])

share_osf_ids = set()
with open(SHARE_FILE, 'r') as f:
    for line in f:
        data = json.loads(line)

        for identifier in data['identifiers']:
            if identifier.startswith('http://osf.io/'):
                id_ = identifier.replace('http://osf.io/', '')
                if id_.endswith('/'):
                    id_ = id_[:-1]
                
                share_osf_ids.add(id_)   

ids_in_common = osf_ids.intersection(share_osf_ids)
print('{} ids in common.'.format(len(ids_in_common)))
print('{} ids in SHARE but not in OSF Preprint API data.'.format(len(share_osf_ids - osf_ids)))

save_to_file(ids_in_common, 'osf_preprint_api')

21186 ids in common.
40 ids in SHARE but not in OSF Preprint API data.


In [38]:
share_osf_ids - osf_ids

{'3q5if',
 '3x2nc',
 '3ygkq',
 '6p2ws',
 '6vpjm',
 '74dte',
 '7bsp4',
 '7tc6p',
 '8krmb',
 '92xts',
 'b5j3w',
 'dgmzf',
 'envb7',
 'g7vry',
 'gcnmj',
 'hvzej',
 'j2abz',
 'kfv2h',
 'phjxg',
 'preprints/engrxiv/s47fh',
 'preprints/psyarxiv/4dcw6',
 'preprints/psyarxiv/98v3f',
 'preprints/psyarxiv/jkc4n',
 'preprints/psyarxiv/n4yy2',
 'preprints/psyarxiv/zzbka',
 'preprints/socarxiv/d67x5',
 'preprints/socarxiv/vuhhp',
 'q2x87',
 'qjuky',
 'qtu37',
 'r4nmy',
 's2kym',
 'tgez8',
 'ufjwv',
 'uh5c8',
 'urxqs',
 'xtavb',
 'ys5jk',
 'ytc5d',
 'z4knb'}

### arXiv

In [36]:
ARXIV_FOLDER = os.path.join('..', '..', 'raw_data', 'arXiv')

In [40]:
import xml.etree.ElementTree as ET, csv

arxiv_files = os.listdir(ARXIV_FOLDER)
if '.DS_Store' in arxiv_files:
    arxiv_files.remove('.DS_Store')
    
namespaces = {'oai' : 'http://www.openarchives.org/OAI/2.0/',
              'arxiv' : 'http://arxiv.org/OAI/arXiv/'}

arxiv_ids = set()
for arxiv_file in arxiv_files:
    root = ET.parse(os.path.join(ARXIV_FOLDER, arxiv_file))

    for record in root.findall('oai:ListRecords/oai:record', namespaces):
        arxiv_ids.add(record.find('oai:header/oai:identifier', namespaces).text.replace('oai:arXiv.org:', ''))

share_arxiv_ids = set()
with open(SHARE_FILE, 'r') as f:
    for line in f:
        data = json.loads(line)

        for identifier in data['identifiers']:
            if identifier.startswith('http://arxiv.org/abs/'):
                id_ = identifier.replace('http://arxiv.org/abs/', '')
                if id_.endswith('/'):
                    id_ = id_[:-1]
                if id_[-2] == 'v' and id_[-1].isdigit():
                    id_ = id_[:-2]
                if id_[-3] == 'v' and id_[-2].isdigit() and id_[-1].isdigit():
                    id_ = id_[:-3]
                    
                share_arxiv_ids.add(id_)
                
ids_in_common = arxiv_ids.intersection(share_arxiv_ids)
print('{} ids in common.'.format(len(ids_in_common)))
print('{} ids in SHARE but not in arXiv data.'.format(len(share_arxiv_ids - arxiv_ids)))

save_to_file(ids_in_common, 'arxiv')

1301577 ids in common.
0 ids in SHARE but not in arXiv data.


In [41]:
share_arxiv_ids - arxiv_ids

set()

### CrossRef

In [42]:
CROSSREF_FILE = os.path.join('..', '..', 'raw_data', 'CrossRef.json')

In [50]:
import json

crossref_ids = set() #using a set in case there are any duplicate ids
with open(CROSSREF_FILE, 'r') as f:
    for line in f:
        data = json.loads(line)
        crossref_ids.add(data['DOI'])

share_crossref_ids = set()
with open(SHARE_FILE, 'r') as f:
    for line in f:
        data = json.loads(line)

        for identifier in data['identifiers']:
            if identifier.startswith('http://dx.doi.org/'):
                id_ = identifier.replace('http://dx.doi.org/', '')
                if id_.endswith('/'):
                    id_ = id_[:-1]
                
                share_crossref_ids.add(id_.lower()) 
                
            if identifier.startswith('http://doi.org/'):
                id_ = identifier.replace('http://doi.org/', '')
                if id_.endswith('/'):
                    id_ = id_[:-1]
                    
                share_crossref_ids.add(id_.lower())

ids_in_common = crossref_ids.intersection(share_crossref_ids)
print('{} ids in common.'.format(len(ids_in_common)))
print('{} ids in SHARE but not in CrossRef.'.format(len(share_crossref_ids - crossref_ids)))

save_to_file(ids_in_common, 'crossref')

62839 ids in common.
681886 ids in SHARE but not in CrossRef.
