In [3]:
# Imported in this cell too: it sits above the notebook's import cell, so a
# top-to-bottom run would otherwise fail with NameError on `os`.
import os

# Folder that holds the combined record files written by this notebook.
OUTPUT_FOLDER = os.path.join('..', '..', 'data', 'combined_records')
# makedirs creates any missing parent folders as well, and exist_ok=True makes
# the cell safe to re-run when the folder is already there.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

### COMBINE SHARE AND OSF PREPRINT DATA

In [2]:
import os

# Text file of IDs (one per line) present in both SHARE and the OSF Preprint API
ID_LIST_FILE = os.path.join(
    os.pardir, os.pardir, 'data', 'share_source_ids', 'osf_preprint_api.txt'
)

In [7]:
# Load the ID list: one entry per line of the file, surrounding whitespace removed.
with open(ID_LIST_FILE, 'r') as id_file:
    ids = [raw_line.strip() for raw_line in id_file]

In [8]:
# Input files; both are read line by line as JSON later in the notebook.
SHARE_FILE = os.path.join(os.pardir, os.pardir, 'data', 'share-jan-2019.json')
OSF_PREPRINT_FILE = os.path.join(os.pardir, os.pardir, 'raw_data', 'osf.json')

In [9]:
# Destination of the merged records inside OUTPUT_FOLDER; written as one JSON object per line.
SHARE_OSF_OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, 'share_osf.json')

In [13]:
import json

# URL prefix that marks a SHARE identifier as pointing at OSF.
OSF_URL_PREFIX = 'http://osf.io/'


def _extract_osf_ids(identifiers):
    """Return the set of OSF IDs found among a SHARE record's identifiers.

    Only identifiers that start with ``http://osf.io/`` are considered. The
    prefix and at most one trailing slash are removed; the remainder is the ID.

    Args:
        identifiers: iterable of identifier strings from one SHARE record.

    Returns:
        A set of ID strings (empty when no identifier has the OSF prefix).

    Raises:
        Exception: if an extracted ID contains a pipe character.
    """
    found = set()
    for identifier in identifiers:
        if not identifier.startswith(OSF_URL_PREFIX):
            continue
        # Slice the prefix off rather than str.replace(), which would also
        # delete any later occurrence of the same text inside the identifier.
        osf_id = identifier[len(OSF_URL_PREFIX):]
        if osf_id.endswith('/'):
            osf_id = osf_id[:-1]

        if '|' in osf_id:
            raise Exception('Pipe character used in identifier. Change delimiter')
        found.add(osf_id)
    return found


# Set copy of `ids`: membership is tested for every SHARE record, and a set
# lookup is constant-time whereas `in` on a list scans the whole list.
known_ids = set(ids)

# Maps OSF ID -> {'id': <OSF ID>, 'share': <SHARE record>, 'osf': <OSF record>}.
# The 'osf' key is only added when a matching record is found in OSF_PREPRINT_FILE.
records = {}

# Pass 1: collect SHARE records whose OSF ID is in the known ID list.
with open(SHARE_FILE) as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        data = json.loads(line)

        osf_ids = _extract_osf_ids(data['identifiers'])

        if len(osf_ids) > 1:
            print('Multiple OSF IDs in one SHARE record: {}'.format(osf_ids))
            ids_found = sum(1 for osf_id in osf_ids if osf_id in known_ids)
            print('{} of {} IDs found in OSF'.format(ids_found, len(osf_ids)))

        for osf_id in osf_ids:
            if osf_id in known_ids:
                # A later SHARE record with the same OSF ID replaces the earlier one.
                records[osf_id] = {'id': osf_id, 'share': data}

# Pass 2: attach the OSF record to every entry collected in pass 1.
with open(OSF_PREPRINT_FILE, 'r') as f:
    for line in f:
        if not line.strip():
            continue
        data = json.loads(line)

        # Each line is an object whose 'data' entry is a list of records.
        for record in data['data']:
            if record['id'] in records:
                records[record['id']]['osf'] = record

# Write the combined records, one JSON object per line.
with open(SHARE_OSF_OUTPUT_FILE, 'w') as o:
    for record in records.values():
        json.dump(record, o)
        o.write('\n')

Multiple OSF IDs in one SHARE record: {'z49tk', 'tgez8'}
1 of 2 IDs found in OSF
Multiple OSF IDs in one SHARE record: {'dqrrx', 'preprints/socarxiv/d67x5'}
1 of 2 IDs found in OSF
Multiple OSF IDs in one SHARE record: {'zzbka', 'preprints/psyarxiv/zzbka'}
1 of 2 IDs found in OSF
Multiple OSF IDs in one SHARE record: {'preprints/socarxiv/vuhhp', 'vuhhp'}
1 of 2 IDs found in OSF
Multiple OSF IDs in one SHARE record: {'dgmzf', 'w3s3s'}
1 of 2 IDs found in OSF
Multiple OSF IDs in one SHARE record: {'preprints/engrxiv/s47fh', 's47fh'}
1 of 2 IDs found in OSF
Multiple OSF IDs in one SHARE record: {'3q5if', 'fma3p'}
1 of 2 IDs found in OSF
