## READ DATA

In [3]:
# parse date string and ignore the microseconds

from datetime import datetime

def parse_date(datestr):
    try:
        dt = datetime.strptime(datestr, '%Y-%m-%d')
        dt_format = 'ymd'
    except:
        try:
            dt = datetime.strptime(datestr, '%Y-%m')
            dt_format = 'ym'
        except:
            dt = datetime.strptime(datestr, '%Y')
            dt_format = 'y'
                    
    
    return dt, dt_format

In [5]:
import os, json

OSF_DATA = os.path.join('..', '..', 'data', 'osf', 'osf_all_data.json')

cutoff_date = parse_date('2019-6-1')[0]

main_subjects = set()

osf_records = []
with open(OSF_DATA, 'r') as f:
    for line in f:
        record = json.loads(line)

        record['date_published'] = parse_date(record['osf_record']['attributes']['date_published'][:10])[0]
        
        if record['date_published'] < cutoff_date and \
            (record['osf_record']['attributes']['date_withdrawn'] is None or 
            parse_date(record['osf_record']['attributes']['date_withdrawn'][:10])[0] >= cutoff_date):
            osf_records.append(record)
            
            for subject in record['osf_record']['attributes']['subjects']:
                main_subjects.add(subject[0]['text'])

In [3]:
len(osf_records)

28398

In [4]:
OSF_PREPRINT_SERVER_FILE = os.path.join('..', '..', 'raw_data', 'osf-preprint-providers.json')

osf_servers = []
with open(OSF_PREPRINT_SERVER_FILE, 'r') as f:
    data = json.load(f)
    
    for server in data['data']:
        osf_servers.append(server['id'])
        #preprint_servers[server['id']] = { 'name' : server['attributes']['name'] } 

In [5]:
osf_dois = []
for record in osf_records:
    osf_dois.append(record['doi'])

## SAVE ANALYSIS FILE CONTAINING SUBSET OF METADATA

In [11]:
import pandas as pd

OSF_ANALYSIS_FILE = os.path.join('..', '..', 'data', 'osf_analysis.csv')

osf_analysis_records = []
for record in osf_records:
    server = record['osf_record']['relationships']['provider']['links']['related']['href'].replace( \
                'https://api.osf.io/v2/providers/preprints/', '').replace('/?format=json', '')
    
    record_subjects = set()
    for subject in record['osf_record']['attributes']['subjects']:
        record_subjects.add(subject[0]['text'])
        
    if record['osf_record']['attributes']['doi'] is not None:
        record['osf_record']['attributes']['doi'] = record['osf_record']['attributes']['doi'].lower()
    
        if record['osf_record']['attributes']['doi'] == \
            record['osf_record']['links']['preprint_doi'].replace('https://doi.org/', ''):
            record['osf_record']['attributes']['doi'] = None
            
        if 'osf.io/' in record['osf_record']['attributes']['doi']:
            published_doi_is_osf_doi = True
            
            if record['osf_record']['attributes']['doi'] in osf_dois:
                published_doi_is_osf_record = True
            else:
                published_doi_is_osf_record = False
        else:
            published_doi_is_osf_doi = False
    
    if record['osf_peerrev_publications'] is not None and 'message' in record['osf_peerrev_publications']:
        osf_peerrev_publication = record['osf_peerrev_publications']['message']
        try:
            published_online_date = osf_peerrev_publication['published-online']['date-parts'][0]
        except:
            published_online_date = None

        try:
            published_print_date = osf_peerrev_publication['published-print']['date-parts'][0]
        except:
            published_print_date = None

        try:
            issued_date = osf_peerrev_publication['issued']['date-parts'][0]
        except:
            issued_date = None

        try:
            created_date = osf_peerrev_publication['created']['date-parts'][0]
        except:
            created_date = None
        
        
        if osf_peerrev_publication['type'] == 'posted-content':
            published_doi_is_preprint = True
        else:
            published_doi_is_preprint = False
            
        published_title = osf_peerrev_publication['title']
        
        published_authors = []
        
        if 'author' in osf_peerrev_publication:
            for author in osf_peerrev_publication['author']:
                author_name = ''
                if 'given' in author:
                    author_name = author['given']

                if 'family' in author:
                    author_name = author_name + ' ' + author['family']

                if 'suffix' in author:
                    author_name = author_name + ' ' + author['suffix']

                # it seems that if name is present, there are no other name fields (e.g. given, family)
                if 'name' in author:
                    author_name = author_name + ' ' + author['name']

                published_authors.append(author_name)
            
    else:
        published_online_date = None
        published_print_date = None
        issued_date = None
        created_date = None
        
        published_doi_is_preprint = None
        published_doi_is_osf_record = True
        published_doi_is_osf_record = None
        published_doi_is_osf_doi = None
        
        published_title = None
        published_authors = []
    
    preprint_authors = []
    for author_list in record['osf_authors'][0]:
        for author in author_list['data']:
            if author['attributes']['bibliographic']:
                try:
                    author_data = author['embeds']['users']['data']['attributes']
                except Exception as e:
                    author_data = author['embeds']['users']['errors'][0]['meta']
        
                preprint_authors.append(author_data['full_name'])
    
    row = {
        'id' : 'osf.io/' + record['id'],
        'preprint_title' : record['osf_record']['attributes']['title'],
        'server' : server,
        'preprint_date_published' : record['date_published'],
        'submitted_doi' : record['osf_record']['attributes']['doi'],
        'published_online_date' : published_online_date,
        'published_print_date' : published_print_date,
        'issued_date' : issued_date,
        'created_date' : created_date,
        'main_subjects' : len(record_subjects),
        'specific_subjects' : len(record['osf_record']['attributes']['subjects']),
        'preprint_authors' : preprint_authors,
        'preprint_total_authors' : len(preprint_authors),
        'published_title' : published_title,
        'published_authors' : published_authors,
        'published_total_authors' : len(published_authors),
        'published_doi_is_preprint' : published_doi_is_preprint,
        'published_doi_is_osf_record' : published_doi_is_osf_record,
        'published_doi_is_osf_doi' : published_doi_is_osf_doi, # not necessarily an OSF record
    }
    
    for subject in main_subjects:
        if subject in record_subjects:
            row['SUBJ_' + subject] = 1
        else:
            row['SUBJ_' + subject] = 0
    
    osf_analysis_records.append(row)
    
df_osf_analysis_records = pd.DataFrame(osf_analysis_records)
df_osf_analysis_records.to_csv(OSF_ANALYSIS_FILE)

## PRELIMINARY STATISTICS

### TOTAL NUMBER OF PUBLICATIONS AND AUTHORS PER PUBLICATION

In [41]:
import numpy as np
import statistics

preprint_servers = {}
for server in osf_servers:
    preprint_servers[server] = {'records' : 0,
                                'date of first submission' : None,
                                'records with submitted DOIs' : 0,
                                'records with OSF-related DOIs' : 0,
                                'records with OSF publication DOIs' : 0,
                                'authors per record' : [] }

submitted_osf_dois = []
    
for record in osf_records:
    server = record['osf_record']['relationships']['provider']['links']['related']['href'].replace( \
                'https://api.osf.io/v2/providers/preprints/', '').replace('/?format=json', '')
    
    preprint_servers[server]['records'] += 1
    
    if preprint_servers[server]['date of first submission'] == None or \
        record['date_published'] < preprint_servers[server]['date of first submission']:
        preprint_servers[server]['date of first submission'] = record['date_published']

    if record['osf_record']['attributes']['doi'] is not None and \
        record['osf_record']['attributes']['doi'] != \
        record['osf_record']['links']['preprint_doi'].replace('https://doi.org/', ''):
        preprint_servers[server]['records with submitted DOIs'] += 1
        
        if 'osf.io' in record['osf_record']['attributes']['doi'].lower():
            preprint_servers[server]['records with OSF-related DOIs'] += 1
            
            if record['osf_record']['attributes']['doi'].lower() in osf_dois:
                preprint_servers[server]['records with OSF publication DOIs'] += 1
                
                try:
                    peerrev_dp = parse_date('-'.join([str(dt) for dt in \
                                      record['osf_peerrev_publications']['message']['issued']['date-parts'][0]]))[0]
                except:
                    '-'.join([str(dt) for dt in \
                                      record['osf_peerrev_publications']['message']['issued']['date-parts'][0]])
                             
                submitted_osf_dois.append({
                    'preprint' : 'osf.io/' + record['id'],
                    'submitted_doi' : record['osf_record']['attributes']['doi'].lower(),
                    'preprint_date_published' : record['date_published'],
                    'peerrev_date_published' : peerrev_dp,
                    'date_diff' : peerrev_dp - record['date_published']
                })
                
    if len(record['osf_authors'][0]) > 0:
        preprint_servers[server]['authors per record'].append(\
            record['osf_authors'][0][0]['links']['meta']['total_bibliographic'])
    else:
        preprint_servers[server]['authors per record'].append(0)
            
# get author statistics
for server in preprint_servers.keys():
    if preprint_servers[server]['date of first submission'] is not None:
        preprint_servers[server]['date of first submission'] = \
            preprint_servers[server]['date of first submission'].isoformat()
    
    if len(preprint_servers[server]['authors per record']) > 0:
        q75, q25 = np.percentile(preprint_servers[server]['authors per record'], [75 ,25])
        
        if q25 == q75:
            preprint_servers[server]['authors-IQR'] = int(q25)
        else:
            preprint_servers[server]['authors-IQR'] = '{}-{}'.format(int(q25), int(q75))
            
        preprint_servers[server]['authors-median'] = statistics.median(preprint_servers[server]['authors per record'])
        preprint_servers[server]['authors-range'] = '{}-{}'.format(\
           min(preprint_servers[server]['authors per record']), max(preprint_servers[server]['authors per record']))
    else:
        preprint_servers[server]['authors-IQR'] = '--'
        preprint_servers[server]['authors-median'] = '--'
        preprint_servers[server]['authors-range'] = '--'

In [413]:
import pandas as pd

pd.DataFrame(submitted_osf_dois)

Unnamed: 0,date_diff,peerrev_date_published,preprint,preprint_date_published,submitted_doi
0,0 days,2019-05-29,osf.io/pz23f,2019-05-29,10.31227/osf.io/62rks
1,-536 days,2017-10-21,osf.io/hz9m4,2019-04-10,10.31227/osf.io/he853
2,-536 days,2017-10-21,osf.io/kshf3,2019-04-10,10.31227/osf.io/he853
3,-160 days,2018-09-28,osf.io/nua2x,2019-03-07,10.31219/osf.io/79ext
4,-2 days,2019-03-04,osf.io/y5du2,2019-03-06,10.31227/osf.io/ksy4d
5,-208 days,2018-08-06,osf.io/gs67u,2019-03-02,10.31227/osf.io/wxsrg
6,-1 days,2019-02-27,osf.io/4mkts,2019-02-28,10.31227/osf.io/7hm4p
7,-1 days,2019-02-27,osf.io/6pxcy,2019-02-28,10.31227/osf.io/7hm4p
8,-1 days,2019-02-27,osf.io/75z3n,2019-02-28,10.31227/osf.io/7hm4p
9,-74 days,2018-12-15,osf.io/tca4z,2019-02-27,10.31227/osf.io/rfsyx


In [407]:
import pandas as pd

df = pd.DataFrame(preprint_servers).transpose()
df = df.drop(columns = 'authors per record')

df.to_csv(os.path.join('..', '..', 'data_analysis_results', 'osf.csv'))

df

Unnamed: 0,authors-IQR,authors-median,authors-range,date of first submission,records,records with OSF publication DOIs,records with OSF-related DOIs,records with submitted DOIs
africarxiv,1-2,1,1-21,2018-06-22T00:00:00,50,0,0,7
agrixiv,1-2,1,1-23,2017-02-15T00:00:00,126,0,1,10
arabixiv,1,1,1-8,2018-01-14T00:00:00,297,0,0,162
bodoarxiv,1,1,1-2,2019-03-19T00:00:00,21,0,0,13
eartharxiv,2-5,4,1-35,2017-10-23T00:00:00,821,0,0,364
ecoevorxiv,3-6,4,1-53,2018-03-21T00:00:00,66,0,1,14
ecsarxiv,1-5,3,1-18,2018-05-11T00:00:00,61,0,0,1
engrxiv,1-4,2,1-21,2016-07-27T00:00:00,541,0,0,154
focusarchive,1-3,2,1-9,2017-09-20T00:00:00,39,0,0,0
frenxiv,1-2,1,1-22,2018-07-06T00:00:00,42,0,0,24


### GROWTH IN THE NUMBER OF PUBLICATIONS

In [355]:
OUTPUT_FOLDER_GROWTH = os.path.join('..', '..', 'data_analysis_results', 'growth_per_month')
if not os.path.exists(OUTPUT_FOLDER_GROWTH):
    os.mkdir(OUTPUT_FOLDER_GROWTH)

In [328]:
preprint_servers = {}
for server in servers:
    preprint_servers[server] = {}

all_servers = {}

for record in osf_records:
    server = record['osf_record']['relationships']['provider']['links']['related']['href'].replace( \
                'https://api.osf.io/v2/providers/preprints/', '').replace('/?format=json', '')
    
    if record['date_published'].year not in all_servers:
        all_servers[record['date_published'].year] = defaultdict(int)
    
    all_servers[record['date_published'].year][record['date_published'].month] += 1
    
    if record['date_published'].year not in preprint_servers[server]:
        preprint_servers[server][record['date_published'].year] = defaultdict(int)
    
    preprint_servers[server][record['date_published'].year][record['date_published'].month] += 1

In [348]:
import calendar
all_servers_pub_per_month = []
for year in sorted(all_servers.keys()):
    for month, count in sorted(all_servers[year].items()):
        all_servers_pub_per_month.append({ 'year' : year, 
                                   'month' : calendar.month_name[month],
                                   'publications' : count})

In [356]:
df = pd.DataFrame(all_servers_pub_per_month)
df.to_csv(os.path.join(OUTPUT_FOLDER_GROWTH, 'all_servers_pub_per_month.csv'))

display(df)

Unnamed: 0,month,publications,year
0,July,290,2016
1,August,230,2016
2,September,158,2016
3,October,69,2016
4,November,67,2016
5,December,221,2016
6,January,253,2017
7,February,188,2017
8,March,292,2017
9,April,225,2017


In [358]:
import calendar

for server, data in preprint_servers.items():
    pub_per_month = []
    for year in sorted(data.keys()):
        for month, count in sorted(data[year].items()):
            pub_per_month.append({ 'year' : year, 
                                       'month' : calendar.month_name[month],
                                       'publications' : count})
    df = pd.DataFrame(pub_per_month)
    df.to_csv(os.path.join(OUTPUT_FOLDER_GROWTH, server + '.csv'))

#### GROWTH IN THE NUMBER OF PUBLICATIONS (FOR BIOLOGY AND LIFE SCIENCES)

In [395]:
OUTPUT_FOLDER_GROWTH_BIO = os.path.join('..', '..', 'data_analysis_results', 'osf_bio_growth_per_month')
if not os.path.exists(OUTPUT_FOLDER_GROWTH_BIO):
    os.mkdir(OUTPUT_FOLDER_GROWTH_BIO)

In [397]:
preprint_servers = {}
for server in osf_servers:
    preprint_servers[server] = {}

all_servers = {}

for record in osf_records:
    record_is_biology_related = False
    for subject in record['osf_record']['attributes']['subjects']:
        if subject[0]['text'] == 'Life Sciences':
            record_is_biology_related = True
            break
            
    if record_is_biology_related:
        server = record['osf_record']['relationships']['provider']['links']['related']['href'].replace( \
                    'https://api.osf.io/v2/providers/preprints/', '').replace('/?format=json', '')

        if record['date_published'].year not in all_servers:
            all_servers[record['date_published'].year] = defaultdict(int)

        all_servers[record['date_published'].year][record['date_published'].month] += 1

        if record['date_published'].year not in preprint_servers[server]:
            preprint_servers[server][record['date_published'].year] = defaultdict(int)

        preprint_servers[server][record['date_published'].year][record['date_published'].month] += 1

In [398]:
import calendar
all_servers_pub_per_month = []
for year in sorted(all_servers.keys()):
    for month, count in sorted(all_servers[year].items()):
        all_servers_pub_per_month.append({ 'year' : year, 
                                   'month' : calendar.month_name[month],
                                   'publications' : count})

In [399]:
df = pd.DataFrame(all_servers_pub_per_month)
df.to_csv(os.path.join(OUTPUT_FOLDER_GROWTH_BIO, 'all_servers_pub_per_month.csv'))

display(df)

Unnamed: 0,month,publications,year
0,August,2,2016
1,September,1,2016
2,October,1,2016
3,November,1,2016
4,December,55,2016
5,January,12,2017
6,February,9,2017
7,March,12,2017
8,April,11,2017
9,May,13,2017


In [400]:
import calendar

for server, data in preprint_servers.items():
    pub_per_month = []
    for year in sorted(data.keys()):
        for month, count in sorted(data[year].items()):
            pub_per_month.append({ 'year' : year, 
                                       'month' : calendar.month_name[month],
                                       'publications' : count})
    df = pd.DataFrame(pub_per_month)
    df.to_csv(os.path.join(OUTPUT_FOLDER_GROWTH_BIO, server + '.csv'))

### DUPLICATE RECORDS

In [365]:
OUTPUT_FOLDER_DUPLICATE = os.path.join('..', '..', 'data_analysis_results', 'duplicate_records')
if not os.path.exists(OUTPUT_FOLDER_DUPLICATE):
    os.mkdir(OUTPUT_FOLDER_DUPLICATE)

In [6]:
titles = {}

for record in osf_records:
    server = record['osf_record']['relationships']['provider']['links']['related']['href'].replace( \
                'https://api.osf.io/v2/providers/preprints/', '').replace('/?format=json', '')
    
    if record['osf_record']['attributes']['title'] not in titles:
        titles[record['osf_record']['attributes']['title']] = []
        
    authors = []
    for author_list in record['osf_authors'][0]:
        for author in author_list['data']:
            if author['attributes']['bibliographic']:
                try:
                    author_data = author['embeds']['users']['data']['attributes']
                    author_id = author['embeds']['users']['data']['id']
                except Exception as e:
                    author_data = author['embeds']['users']['errors'][0]['meta']
        
                authors.append(author_data['full_name'])
            
    titles[record['osf_record']['attributes']['title']].append({'id' : record['id'],
                                                               'authors' : authors,
                                                               'server' : server,
                                                               'doi' : record['osf_record']['attributes']['doi'],
                                                               'date' : record['osf_record']['attributes']['date_published'][:10]})

In [8]:
a = 0
for title, r in titles.items():
    if len(r) >1:
        a += 1
a

848

In [304]:
def have_same_authors(author_list_1, author_list_2):   
    if len(author_list_1) == len(author_list_2):
        for k in range(len(author_list_1)):
            if author_list_1[k] != author_list_2[k]:
                return False

        return True
    else:
        return False

In [381]:
# this processes all records with duplicate titles

from collections import defaultdict

# SAME: authors, date, server, doi
sa_sdp_ss_sdoi = []

# SAME: authors, date, doi DIFF: server
sa_sdp_ds_sdoi = []

# SAME: authors, date, doi DIFF: server
sa_sdp_ds_sdoi = []

for title, records in titles.items():
    if len(records) > 1:
        date_published = set()
        servers = set()
        dois = set()
        
        for record in records:
            date_published.add(record['date'])
            servers.add(record['server'])
            dois.add(record['doi'])
         
        all_same_authors = True
        for i in range(len(records)):
            for j in range(i + 1, len(records)):
                if not have_same_authors(records[i]['authors'], records[j]['authors']):
                    all_same_authors = False
                    break
            if not all_same_authors:
                break

        if all_same_authors and len(servers) == 1 and len(date_published) == 1 and len(dois) == 1:
            ids = []
            for record in records:
                ids.append(record['id'])
            
            sa_sdp_ss_sdoi.append({
                'title' : title,
                'server' : list(servers)[0],
                'date_published' : list(date_published)[0],
                'ids' : ids,
                'doi' : list(dois)[0]
            })
            
        if all_same_authors and len(servers) > 1 and len(date_published) == 1 and len(dois) == 1:
            ids = []
            for record in records:
                ids.append(record['id'])
            
            sa_sdp_ds_sdoi.append({
                'title' : title,
                'server' : servers,
                'date_published' : list(date_published)[0],
                'ids' : ids,
                'doi' : list(dois)[0]
            })
        
        if all_same_authors and len(servers) > 1 and len(date_published) == 1 and len(dois) == 1:
            ids = []
            for record in records:
                ids.append(record['id'])
            
            sa_sdp_ds_sdoi.append({
                'title' : title,
                'server' : servers,
                'date_published' : list(date_published)[0],
                'ids' : ids,
                'doi' : list(dois)[0]
            })

In [382]:
df_sa_sdp_ss_sdoi = pd.DataFrame(sa_sdp_ss_sdoi)
df_sa_sdp_ss_sdoi.to_csv(os.path.join(OUTPUT_FOLDER_DUPLICATE, 'same_au_dp_serv_doi.csv'))

In [383]:
df_sa_sdp_ds_sdoi = pd.DataFrame(sa_sdp_ds_sdoi)
df_sa_sdp_ds_sdoi.to_csv(os.path.join(OUTPUT_FOLDER_DUPLICATE, 'same_au_dp_doi_diff_serv.csv'))

In [311]:
from collections import defaultdict

date_diff = defaultdict(int)

samesies = 0
not_same = 0
not_recs = 0 
recs = 0
for title, records in titles.items():
    if len(records) > 1:
        date_published = defaultdict(int)
        
        servers = set()
        for record in records:
            date_published[record['date']] += 1
            servers.add(record['server'])
            
        if len(date_published.keys()) == 1:
            if len(servers) > 1:
#                 print(servers)
                not_same += 1
                not_recs += len(records)
                
#                 if len(records) > 2:
#                     print(records)
            else:
                
                
                all_same_authors = True
                for i in range(len(records)):
                    for j in range(i + 1, len(records)):
                        if not have_same_authors(records[i]['authors'], records[j]['authors']):
                            all_same_authors = False
                            break
                    if not all_same_authors:
                        break
                
                if all_same_authors:
                    samesies += 1
                    recs += len(records)
                else:
                    for record in records:
                        print(record['id'])
                        print(record['authors'])
                
                print('----')
            
#         date_diff[len(date_published.keys())] += 1
        
#         if len(date_published.keys()) > 1:
#             print(date_published)
#             for record in records:
#                 print('osf.io/' + record['id'])

# date_diff
print(recs)
print(not_same)
print(not_recs)
samesies

jfegz
['Rafika andella']
npfq4
['Dela Wulandari']
----
3d8uk
['Putri Pattitria Ningrum Nasution', 'Hade Afriansyah']
6er4b
['Aulia Alqiva', 'Hade Afriansyah']
----
vpk5z
['Achmad Affandi', 'Hade Afriansyah']
7hqfx
['Achmad Affandi']
----
w4c8b
['gusti', 'ratna', 'titin afwirdha']
kqc2d
['gusti']
----
ja9u6
['Adhitya dwi putra']
ayvjm
['galant emerald']
----
----
f6q4g
['nuruzzaitun adawiyah']
tebjh
['clairine gitta zerlina sutedjo ']
cksn3
['Gita Sirini Purwanto']
----
----
psqrv
['Rahmawati Nur Arifah']
a2qgy
['Dewi Nopitaningrum']
----
----
----
----
----
----
----
----
----
----
----
----
----
----
----
c4whb
['Arif Fadillah']
zhf28
['Salma Sakinah']
----
----
y2fcs
['Refki Prayoga']
b63ac
['Ranita Sari']
ndgek
['Mayang sunesti']
bnd5s
['Muhammad Lubis Angsori']
vhr83
['Muhamad Alwi Alfagih']
dm29e
['Dwi Febriana']
----
a2wfe
['Muhammad Lubis Angsori']
da3ns
['Muhamad Alwi Alfagih']
6u7fg
['Dwi Febriana']
----
9txhn
['Muhammad Lubis Angsori']
qkw5a
['Muhamad Alwi Alfagih']
2krzc
['D

141

In [295]:
from collections import defaultdict

date_diff = defaultdict(int)

for title, records in titles.items():
    if len(records) > 1:
        date_published = defaultdict(int)
        
        for record in records:
            date_published[record['date']] += 1
        
        date_diff[len(date_published.keys())] += 1
        
        if len(date_published.keys()) > 1:
            print(date_published)
            for record in records:
                print('osf.io/' + record['id'])

date_diff

defaultdict(<class 'int'>, {'2019-05-31': 1, '2018-10-12': 1})
osf.io/scj5d
osf.io/v93xm
defaultdict(<class 'int'>, {'2019-05-30': 1, '2019-03-07': 1})
osf.io/7zw5g
osf.io/4a9xt
defaultdict(<class 'int'>, {'2019-05-30': 1, '2019-05-28': 1})
osf.io/h3q2r
osf.io/tfqhp
defaultdict(<class 'int'>, {'2019-05-29': 1, '2019-05-27': 1, '2019-05-21': 2, '2019-04-28': 1})
osf.io/htnp2
osf.io/w3uc6
osf.io/p4dtb
osf.io/dgyhp
osf.io/aejhy
defaultdict(<class 'int'>, {'2019-05-29': 1, '2019-05-21': 1, '2019-05-20': 1, '2019-05-19': 1, '2019-03-25': 1})
osf.io/9zqwh
osf.io/wc76d
osf.io/mg97x
osf.io/ufrh8
osf.io/cb8zp
defaultdict(<class 'int'>, {'2019-05-28': 1, '2019-05-25': 1})
osf.io/hz48a
osf.io/c8f3b
defaultdict(<class 'int'>, {'2019-05-28': 1, '2019-04-29': 1})
osf.io/5dnw9
osf.io/bjz8f
defaultdict(<class 'int'>, {'2019-05-27': 1, '2019-05-25': 1})
osf.io/wfxm5
osf.io/ndpxv
defaultdict(<class 'int'>, {'2019-05-27': 2, '2019-05-26': 1})
osf.io/ype3k
osf.io/7e4u8
osf.io/ymu7f
defaultdict(<class 'int

osf.io/dg692
osf.io/gyd6q
defaultdict(<class 'int'>, {'2017-09-03': 1, '2017-01-29': 1})
osf.io/pfxtd
osf.io/es9d7
defaultdict(<class 'int'>, {'2017-09-01': 1, '2017-08-29': 1})
osf.io/p46jg
osf.io/9swvu
defaultdict(<class 'int'>, {'2017-09-01': 1, '2017-08-29': 1})
osf.io/cfzpd
osf.io/7b5n6
defaultdict(<class 'int'>, {'2017-09-01': 1, '2017-08-29': 1})
osf.io/wjzv4
osf.io/257zs
defaultdict(<class 'int'>, {'2017-09-01': 1, '2017-08-29': 1})
osf.io/tke2p
osf.io/w7s9d
defaultdict(<class 'int'>, {'2017-09-01': 1, '2017-08-29': 1})
osf.io/7hez3
osf.io/zcv5t
defaultdict(<class 'int'>, {'2017-08-31': 1, '2017-08-29': 2})
osf.io/q3gu7
osf.io/ua348
osf.io/2se9x
defaultdict(<class 'int'>, {'2017-08-31': 1, '2017-08-29': 1})
osf.io/3r6kn
osf.io/zadn9
defaultdict(<class 'int'>, {'2017-08-31': 1, '2017-08-29': 1})
osf.io/fq569
osf.io/j4ny2
defaultdict(<class 'int'>, {'2017-08-31': 1, '2017-08-29': 1})
osf.io/zhnqs
osf.io/4stvk
defaultdict(<class 'int'>, {'2017-08-31': 1, '2017-08-24': 1})
osf.io/x

defaultdict(int, {2: 528, 1: 238, 4: 14, 5: 6, 6: 3, 3: 58, 15: 1})

In [236]:
duplicate_titles = 0

records_with_duplicate_title = []

duplicate_titles_same_authors = {}
duplicate_titles_same_first_authors = {}

records_with_duplicate_titles_same_authors = []
records_with_duplicate_titles_same_first_author = []

records_from_inarxiv = set()

for title, records in titles.items():
    if len(records) > 1:
        duplicate_titles += 1
        
        records_with_duplicate_title.append({ 'title' : title, 'records' : len(records) })

        for i in range(len(records)):
            for j in range(i + 1, len(records)):

                if len(records[i]['authors']) == len(records[j]['authors']):
                    same_authors = True
                    
                    for k in range(len(records[i]['authors'])):
                        if records[i]['authors'][k] != records[j]['authors'][k]:
                            same_authors = False
                            break
                    
                    if same_authors:
                        records_with_duplicate_titles_same_authors.append({ 'title' : title,
                                                                          'id_1' : 'osf.io/' + records[i]['id'],
                                                                          'id_2' : 'osf.io/' + records[j]['id'],
                                                                          'server_1' : records[i]['server'],
                                                                          'server_2' : records[j]['server'],
                                                                          'doi_1' : records[i]['doi'],
                                                                          'doi_2' : records[j]['doi']})
                        
                        if title not in duplicate_titles_same_authors:
                            duplicate_titles_same_authors[title] = set()
                        
                        duplicate_titles_same_authors[title].add(records[i]['id'])
                        duplicate_titles_same_authors[title].add(records[j]['id'])
                        
#                         if records[i]['server'] == 'inarxiv':
#                             records_from_inarxiv.add(records[i]['id'])
#                         if records[j]['server'] == 'inarxiv':
#                             records_from_inarxiv.add(records[j]['id'])
                        
                        
                    elif len(records[i]['authors']) > 0 and len(records[j]['authors']) > 0 and \
                            records[i]['authors'][0] == records[j]['authors'][0]:
                            records_with_duplicate_titles_same_first_author.append({ 'title' : title,
                                                                          'id_1' : 'osf.io/' + records[i]['id'],
                                                                          'id_2' : 'osf.io/' + records[j]['id'],
                                                                          'server_1' : records[i]['server'],
                                                                          'server_2' : records[j]['server'],
                                                                          'doi_1' : records[i]['doi'],
                                                                          'doi_2' : records[j]['doi']})
                            
                            if title not in duplicate_titles_same_first_authors:
                                duplicate_titles_same_first_authors[title] = set()

                            duplicate_titles_same_first_authors[title].add(records[i]['id'])
                            duplicate_titles_same_first_authors[title].add(records[j]['id'])
                            
                elif len(records[i]['authors']) > 0 and len(records[j]['authors']) > 0 and \
                            records[i]['authors'][0] == records[j]['authors'][0]:
                            records_with_duplicate_titles_same_first_author.append({ 'title' : title,
                                                                          'id_1' : 'osf.io/' + records[i]['id'],
                                                                          'id_2' : 'osf.io/' + records[j]['id'],
                                                                          'server_1' : records[i]['server'],
                                                                          'server_2' : records[j]['server'],
                                                                          'doi_1' : records[i]['doi'],
                                                                          'doi_2' : records[j]['doi']})
                            if title not in duplicate_titles_same_first_authors:
                                duplicate_titles_same_first_authors[title] = set()

                            duplicate_titles_same_first_authors[title].add(records[i]['id'])
                            duplicate_titles_same_first_authors[title].add(records[j]['id'])
                    
duplicate_titles

848

In [237]:
dtsa = []

for title, records in duplicate_titles_same_authors.items():
    dtsa.append({ 'title' : title, 'records' : len(records)})

df = pd.DataFrame(dtsa)
df.to_csv('duplicate_titles_same_authors.csv')

In [238]:
df = pd.DataFrame(records_with_duplicate_titles_same_authors)
df.to_csv('records_with_exact_titles_and_all_authors.csv')

In [239]:
df = pd.DataFrame(records_with_duplicate_titles_same_first_author)
df.to_csv('records_with_exact_titles_and_first_authors.csv')

In [222]:
len(records_from_inarxiv)

640

In [225]:
dtsfa = []

for title, records in duplicate_titles_same_first_authors.items():
    dtsfa.append({ 'title' : title, 'records' : len(records)})

df = pd.DataFrame(dtsfa)
df.to_csv('duplicate_titles_same_first_authors.csv')

In [226]:
len(records_with_duplicate_titles_same_authors)

1071

In [271]:
import numpy as np
from collections import defaultdict

cutoff_date = parse_date('2019-6-1')[0]

author_names = {}
author_ids = {}

name_id_map = {}

rec = 0
no_rec = 0

for record in osf_records:
    server = record['osf_record']['relationships']['provider']['links']['related']['href'].replace( \
                'https://api.osf.io/v2/providers/preprints/', '').replace('/?format=json', '')
    
    if server not in author_names:
        author_names[server] = defaultdict(int)
        author_ids[server] = defaultdict(int)
    
    for author_list in record['osf_authors'][0]:
        for author in author_list['data']:
            if author['attributes']['bibliographic']:
                try:
#                     parse_date(author['embeds']['users']['data']['attributes']['date_registered'][:10])
                    author_data = author['embeds']['users']['data']['attributes']
                    author_id = author['embeds']['users']['data']['id']
                    
                    rec += 1
                    
                    author_names[server][author_data['full_name']] += 1
                    author_names[server][author_id] += 1
                    
#                     input(author)
                    if author_data['full_name'] in name_id_map and name_id_map[author_data['full_name']] != author_id:
                        print('Same name diff ids: {}--{}'.format(name_id_map[author_data['full_name']], author_id))
                        print(author_data['full_name'])
                    name_id_map[author_data['full_name']] = author_id
                    
                except Exception as e:
                    author_data = author['embeds']['users']['errors'][0]['meta']
                    
                    no_rec +=1
#                     input( author['embeds']['users'])

Same name diff ids: 4q23e--tvnzp
Benjamin Davies
Same name diff ids: b37ef--zu6wy
Abdul Malik
Same name diff ids: p4grw--9sgwp
Abbas Bahroudi
Same name diff ids: qctpn--4ctka
Ninuk Wiliani
Same name diff ids: f3qad--x4sgq
Asrul Sani
Same name diff ids: ypbmh--5pu3r
Suharto
Same name diff ids: bnepj--tvp8m
miftahul jannah
Same name diff ids: 4tuk2--4j57a
Habibulla Yunuskhodjaev
Same name diff ids: by9s6--ser2w
Maryam Musfiroh
Same name diff ids: kav87--heqtw
Umida Khamidullayevna Saydikramova
Same name diff ids: qne2z--zs4j6
Khurmat Sabirovna Valieva
Same name diff ids: t3pae--qyv28
Shakhnovza Bakhtiyorovna Sagatova
Same name diff ids: 5thvs--hf2rg
Nilufar Bakhadirovna Mukhamadalieva
Same name diff ids: ep6ds--zqxch
Мухаббат Рахмоналиевна Исаева
Same name diff ids: b9av3--rf5n9
Fahrida Inayati
Same name diff ids: 9j3xn--bwdfk
Esdavina Elvandari
Same name diff ids: prdyt--8zru7
Hafizatul Bahri
Same name diff ids: 8zru7--g56nf
Hafizatul Bahri
Same name diff ids: knurz--7zpaf
Christer Joha

Same name diff ids: 82tpb--t2y3m
Harry Sanjaya
Same name diff ids: nhdeq--r8xcy
Veronica Toffolutti
Same name diff ids: 2tsx7--yuw6n
Gary R. Turner
Same name diff ids: axzc8--2e4wh
Klaas Sijtsma
Same name diff ids: y4vuf--avwsj
Samliok Ndobe
Same name diff ids: a75gq--2snq5
Lisa Feldman Barrett
Same name diff ids: pf4nt--3sw8k
Hardianto Djanggih
Same name diff ids: 9r4g7--vy59d
Amber L. Martin
Same name diff ids: rb3xq--x3wbq
Youn-Jeng Choi
Same name diff ids: 926zs--trn67
Jayashree Balakrishna
Same name diff ids: 4f39b--h8jqb
Alia Azmi
Same name diff ids: 6qpyg--ja56x
Ike Sylvia
Same name diff ids: yq9r3--d98hm
Desy Mardhiah
Same name diff ids: hdj63--5w6cp
Han van der Maas
Same name diff ids: 5a2yk--483ht
Idrus Hentihu
Same name diff ids: xtzvd--7dyrf
Rosita Umanailo
Same name diff ids: mxrth--5cbev
Hamiru
Same name diff ids: 9n6cu--bdsnj
Mansyur Nawawi
Same name diff ids: grv98--c2szx
Sukainap Pulhehe
Same name diff ids: ge5u4--jm942
Mirja Ohoibor
Same name diff ids: qm3fx--n2e4t
Lu

Same name diff ids: xyqs2--yu9kt
Richard E. Lucas
Same name diff ids: q8mzj--b3xuj
Kirby Sainsbury
Same name diff ids: jzcr5--bw75z
Deddy Wahyudin Purba
Same name diff ids: y479c--csw2z
Abdulrahman M. Alshabeb
Same name diff ids: kgctv--43fc9
Rusma Kalra
Same name diff ids: 9hdwf--jsyx3
Susilo Susilo
Same name diff ids: 4wnpz--a6dc9
Sri Wuli Fitriati
Same name diff ids: gb3xv--usrx3
Lilik Eko Widodo
Same name diff ids: pdcwr--ugkyq
Azim Shariff
Same name diff ids: 9qtf2--5t4gp
Kevin Robinson
Same name diff ids: ecknx--qtasy
Deming Wang
Same name diff ids: bu9s3--8zy2u
Filip Lievens
Same name diff ids: 9w8v5--9edk7
Anne Fernald
Same name diff ids: eu2tv--avmc5
La Ode Husen
Same name diff ids: s4e6p--s3kb9
Tengku Erwinsyahbana
Same name diff ids: usrx3--kqtn3
Lilik Eko Widodo
Same name diff ids: fynx4--6m9au
Tedy Agung Cahyadi
Same name diff ids: kqtn3--j9ang
Lilik Eko Widodo
Same name diff ids: 6m9au--p92xn
Tedy Agung Cahyadi
Same name diff ids: vuc4k--x7s94
Anna M. Borghi
Same name dif

In [191]:
rec

61152

In [192]:
no_rec

62

In [94]:
for record in osf_records:
    if record['id'] == 'bmp89':
        print(record.keys())

dict_keys(['id', 'doi', 'osf_record', 'osf_authors', 'crossref_preprint', 'crossref_peerrev_publications', 'osf_peerrev_publications', 'crossref_search_results'])


In [272]:
duplicate_titles = 0

records_with_duplicate_title = []

duplicate_titles_same_authors = {}
duplicate_titles_same_first_authors = {}

records_with_duplicate_titles_same_authors = []
records_with_duplicate_titles_same_first_author = []

records_from_inarxiv = set()

for title, records in titles.items():
    if len(records) > 1:
        duplicate_titles += 1
        
        records_with_duplicate_title.append({ 'title' : title, 'records' : len(records) })

        for i in range(len(records)):
            for j in range(i + 1, len(records)):

                if len(records[i]['authors']) == len(records[j]['authors']):
                    same_authors = True
                    
                    for k in range(len(records[i]['authors'])):
                        if records[i]['authors'][k] != records[j]['authors'][k]:
                            same_authors = False
                            break
                    
                    if same_authors:
                        records_with_duplicate_titles_same_authors.append({ 'title' : title,
                                                                          'id_1' : 'osf.io/' + records[i]['id'],
                                                                          'id_2' : 'osf.io/' + records[j]['id']})
                        
                        if title not in duplicate_titles_same_authors:
                            duplicate_titles_same_authors[title] = set()
                        
                        duplicate_titles_same_authors[title].add(records[i]['id'])
                        duplicate_titles_same_authors[title].add(records[j]['id'])
                        
                        if records[i]['server'] == 'inarxiv':
                            records_from_inarxiv.add(records[i]['id'])
                        if records[j]['server'] == 'inarxiv':
                            records_from_inarxiv.add(records[j]['id'])
                        
                        
                    elif len(records[i]['authors']) > 0 and len(records[j]['authors']) > 0 and \
                            records[i]['authors'][0] == records[j]['authors'][0]:
                            records_with_duplicate_titles_same_first_author.append({ 'title' : title,
                                                                          'id_1' : 'osf.io/' + records[i]['id'],
                                                                          'id_2' : 'osf.io/' + records[j]['id']})
                            
                            if title not in duplicate_titles_same_first_authors:
                                duplicate_titles_same_first_authors[title] = set()

                            duplicate_titles_same_first_authors[title].add(records[i]['id'])
                            duplicate_titles_same_first_authors[title].add(records[j]['id'])
                            
                elif len(records[i]['authors']) > 0 and len(records[j]['authors']) > 0 and \
                            records[i]['authors'][0] == records[j]['authors'][0]:
                            records_with_duplicate_titles_same_first_author.append({ 'title' : title,
                                                                          'id_1' : 'osf.io/' + records[i]['id'],
                                                                          'id_2' : 'osf.io/' + records[j]['id']})
                            if title not in duplicate_titles_same_first_authors:
                                duplicate_titles_same_first_authors[title] = set()

                            duplicate_titles_same_first_authors[title].add(records[i]['id'])
                            duplicate_titles_same_first_authors[title].add(records[j]['id'])
                    
duplicate_titlesWe 

NameError: name 'duplicate_titlesWe' is not defined

In [273]:
osf_records[0]

{'id': 'a8ewc',
 'doi': '10.31227/osf.io/a8ewc',
 'osf_record': {'relationships': {'node': {'links': {'self': {'href': 'https://api.osf.io/v2/preprints/a8ewc/relationships/node/?format=json',
      'meta': {}}}},
   'files': {'links': {'related': {'href': 'https://api.osf.io/v2/preprints/a8ewc/files/?format=json',
      'meta': {}}}},
   'identifiers': {'links': {'related': {'href': 'https://api.osf.io/v2/preprints/a8ewc/identifiers/?format=json',
      'meta': {}}}},
   'contributors': {'links': {'related': {'href': 'https://api.osf.io/v2/preprints/a8ewc/contributors/?format=json',
      'meta': {}}}},
   'license': {'data': {'type': 'licenses', 'id': '563c1cf88c5e4a3877f9e96a'},
    'links': {'related': {'href': 'https://api.osf.io/v2/licenses/563c1cf88c5e4a3877f9e96a/?format=json',
      'meta': {}}}},
   'citation': {'links': {'related': {'href': 'https://api.osf.io/v2/preprints/a8ewc/citation/?format=json',
      'meta': {}}}},
   'primary_file': {'data': {'type': 'files',
     'i