In [1]:
import os, json

# Location of the combined OSF dump, relative to this notebook.
OSF_DATA = os.path.join('..', '..', 'data', 'osf_all_data.json')

# The dump is read line by line; every line is parsed as its own JSON
# document and kept, in file order, as one entry of `osf_records`.
with open(OSF_DATA, 'r') as f:
    osf_records = [json.loads(line) for line in f]

In [43]:
# Location of the preprint-provider listing, relative to this notebook.
OSF_PREPRINT_SERVER_FILE = os.path.join('..', '..', 'raw_data', 'osf-preprint-providers.json')

# The whole file is a single JSON document; its 'data' list holds one entry
# per preprint server.
with open(OSF_PREPRINT_SERVER_FILE, 'r') as f:
    data = json.load(f)

# Only the server ids are needed downstream; keep them in file order.
servers = [provider['id'] for provider in data['data']]

In [44]:
# Parse a date string given as 'YYYY-MM-DD', 'YYYY-MM' or 'YYYY'.
# Full timestamps must be cut down to their date part by the caller first.

from datetime import datetime

def parse_date(datestr):
    """Parse a date string of day, month or year precision.

    The formats '%Y-%m-%d', '%Y-%m' and '%Y' are tried in that order and the
    first one that matches is used.

    Parameters
    ----------
    datestr : str
        Date text such as '2019-06-01', '2019-06' or '2019'.

    Returns
    -------
    tuple
        ``(dt, dt_format)`` where ``dt`` is the parsed ``datetime`` and
        ``dt_format`` is 'ymd', 'ym' or 'y', naming the format that matched.

    Raises
    ------
    ValueError
        If ``datestr`` matches none of the three formats.
    """
    # Only ValueError signals "wrong format"; anything else (a non-string
    # argument, a keyboard interrupt, ...) must propagate instead of being
    # silently retried with the next format.
    for pattern, label in (('%Y-%m-%d', 'ymd'), ('%Y-%m', 'ym')):
        try:
            return datetime.strptime(datestr, pattern), label
        except ValueError:
            continue

    # Last resort: a bare year. A failure here is the caller's error.
    return datetime.strptime(datestr, '%Y'), 'y'

In [75]:
import numpy as np

# Per-server statistics, keyed by server id. While records are being counted,
# 'authors per record' is a list with one number per counted record; the
# summary loop at the end of this cell replaces it with a compact summary.
preprint_servers = {
    server: {'records' : 0,
             'date of first submission' : None,
             'records with submitted DOIs' : 0,
             'authors per record' : []}
    for server in servers
}

# Only records published strictly before this date are counted.
cutoff_date = parse_date('2019-6-1')[0]

# Stripping these two fragments from a record's provider link leaves the
# server id that is used as key in `preprint_servers`.
PROVIDER_URL_PREFIX = 'https://api.osf.io/v2/providers/preprints/'
PROVIDER_URL_SUFFIX = '/?format=json'
# Stripped from the preprint DOI link so it can be compared with the 'doi'
# attribute.
DOI_URL_PREFIX = 'https://doi.org/'

for record in osf_records:
    osf_record = record['osf_record']
    attributes = osf_record['attributes']

    provider_href = osf_record['relationships']['provider']['links']['related']['href']
    server = provider_href.replace(PROVIDER_URL_PREFIX, '').replace(PROVIDER_URL_SUFFIX, '')

    # Keep only the first 10 characters ('YYYY-MM-DD') of the timestamp so
    # that parse_date can handle it.
    date_published = parse_date(attributes['date_published'][:10])[0]
    if date_published >= cutoff_date:
        continue

    stats = preprint_servers[server]
    stats['records'] += 1

    # Track the earliest publication date seen for this server.
    first_submission = stats['date of first submission']
    if first_submission is None or date_published < first_submission:
        stats['date of first submission'] = date_published

    # A DOI counts as "submitted" when the record carries one that differs
    # from the DOI in the record's own preprint_doi link.
    doi = attributes['doi']
    if doi is not None and \
            doi != osf_record['links']['preprint_doi'].replace(DOI_URL_PREFIX, ''):
        stats['records with submitted DOIs'] += 1

    # NOTE(review): osf_authors[0] looks like the first page of the author
    # listing, and total_bibliographic presumably is the record's number of
    # bibliographic authors -- confirm against the harvesting code.
    first_author_page = record['osf_authors'][0]
    if len(first_author_page) > 0:
        stats['authors per record'].append(
            first_author_page[0]['links']['meta']['total_bibliographic'])
    else:
        stats['authors per record'].append(0)

# Turn the raw per-server values into their display form.
for server, stats in preprint_servers.items():
    first_submission = stats['date of first submission']
    if first_submission is not None:
        stats['date of first submission'] = first_submission.isoformat()

    author_counts = stats['authors per record']
    if len(author_counts) > 0:
        # Summarise the author counts by their 25th and 75th percentiles,
        # truncated to integers: a single number when both coincide,
        # otherwise a 'low-high' string.
        q75, q25 = np.percentile(author_counts, [75, 25])

        if q25 == q75:
            stats['authors per record'] = int(q25)
        else:
            stats['authors per record'] = '{}-{}'.format(int(q25), int(q75))
    else:
        # No record was counted for this server.
        stats['authors per record'] = '--'

In [76]:
import pandas as pd

# Where the per-server summary table is written, relative to this notebook.
results_csv = os.path.join('..', '..', 'data_analysis_results', 'osf.csv')

# `preprint_servers` maps server -> statistics, which pandas reads as one
# column per server; transposing gives one row per server instead.
df = pd.DataFrame(preprint_servers).T
df.to_csv(results_csv)

df

Unnamed: 0,authors per record,date of first submission,records,records with submitted DOIs
africarxiv,1-2,2018-06-22T00:00:00,50,7
agrixiv,1-2,2017-02-15T00:00:00,126,10
arabixiv,1,2018-01-14T00:00:00,297,162
bodoarxiv,1,2019-03-19T00:00:00,22,13
eartharxiv,2-5,2017-10-23T00:00:00,822,364
ecoevorxiv,3-6,2018-03-21T00:00:00,70,14
ecsarxiv,1-5,2018-05-11T00:00:00,62,1
engrxiv,1-4,2016-07-27T00:00:00,548,158
focusarchive,1-3,2017-09-20T00:00:00,39,0
frenxiv,1-2,2018-07-06T00:00:00,42,24
