# CONTRIBUTOR STATISTICS

1. [Median, min, max](#gen-stats)
2. [Identifiers](#identifiers)
3. [Contributor field issues](#issues)
4. [Order cited issues](#order-cited)

In [1]:
import os

# Location of the SHARE dump (one JSON record per line), given relative to
# this notebook: two directory levels up, inside the 'data' folder.
SHARE_FILE = os.path.join(os.pardir, os.pardir, 'data', 'share-jan-2019.json')

In [8]:
import json
from collections import defaultdict

# Per-record contributor counts. 'all' counts every contributor on a record,
# 'person' only those whose type is 'person'. Both lists receive exactly one
# entry per record, so their statistics describe the same set of records.
contributors_per_record = {'all' : [], 'person' : []}

# Same counts grouped by main subject: {'all' | 'person': {subject: [counts]}}
contributors_per_record_per_subject = {'all' : {}, 'person' : {}}

# total_contributors[type][relation] -> number of contributor instances
total_contributors = {}

with open(SHARE_FILE) as f:
    for line in f:
        data = json.loads(line)

        # A record without data['lists']['contributors'] is counted as having
        # zero contributors (of any type). Looking the list up once here also
        # keeps the per-subject bookkeeping below from raising a KeyError for
        # such a record.
        contributors = data['lists'].get('contributors', [])

        person_contributors = 0
        for contributor in contributors:
            if contributor['type'] == 'person':
                person_contributors += 1

            if contributor['type'] not in total_contributors:
                total_contributors[contributor['type']] = defaultdict(int)
            total_contributors[contributor['type']][contributor['relation']] += 1

        contributors_per_record['all'].append(len(contributors))
        contributors_per_record['person'].append(person_contributors)

        # A record may list several subjects under the same main subject;
        # the set makes sure the record is counted once per main subject.
        main_subjects = set()

        for subject in data['subjects']:
            # subject format: taxonomy|main_subject|rest of subject
            main_subject = subject.split('|', 1)[1].split('|')[0]
            main_subjects.add(main_subject)

        for subject in main_subjects:
            if subject not in contributors_per_record_per_subject['all']:
                contributors_per_record_per_subject['all'][subject] = []
                contributors_per_record_per_subject['person'][subject] = []

            contributors_per_record_per_subject['all'][subject].append(len(contributors))
            contributors_per_record_per_subject['person'][subject].append(person_contributors)

## <a id='gen-stats'>GENERAL STATISTICS</a>

### MEDIAN, MIN, MAX

In [9]:
import statistics

# Number of entries equal to 0 in the 'all' series, i.e. records that
# contributed a count of zero contributors.
all_counts = contributors_per_record['all']
print('RECORDS WITH MISSING CONTRIBUTOR FIELD: {}'.format(all_counts.count(0)))

# Each statistic is computed right before it is printed, one line per statistic.
for template, statistic in (
    ('MEDIAN NUMBER OF CONTRIBUTORS PER RECORD: {}', statistics.median),
    ('MIN NUMBER OF CONTRIBUTORS PER RECORD: {}', min),
    ('MAX NUMBER OF CONTRIBUTORS PER RECORD: {}', max),
):
    print(template.format(statistic(all_counts)))

print('\n')

person_counts = contributors_per_record['person']
for template, statistic in (
    ('MEDIAN NUMBER OF PERSON CONTRIBUTORS PER RECORD: {}', statistics.median),
    ('MIN NUMBER OF PERSON CONTRIBUTORS PER RECORD: {}', min),
    ('MAX NUMBER OF PERSON CONTRIBUTORS PER RECORD: {}', max),
):
    print(template.format(statistic(person_counts)))

RECORDS WITH MISSING CONTRIBUTOR FIELD: 33
MEDIAN NUMBER OF CONTRIBUTORS PER RECORD: 2
MIN NUMBER OF CONTRIBUTORS PER RECORD: 0
MAX NUMBER OF CONTRIBUTORS PER RECORD: 86670


MEDIAN NUMBER OF PERSON CONTRIBUTORS PER RECORD: 2.0
MIN NUMBER OF PERSON CONTRIBUTORS PER RECORD: 0
MAX NUMBER OF PERSON CONTRIBUTORS PER RECORD: 86666


In [10]:
import pandas as pd

# {'all' | 'person': {main subject: median number of contributors per record}}
median_contributors_per_subject = {
    key: {
        subject: statistics.median(counts)
        for subject, counts in counts_by_subject.items()
    }
    for key, counts_by_subject in contributors_per_record_per_subject.items()
}

# One row per subject, one column per series ('all', 'person').
pd.DataFrame(median_contributors_per_subject)

Unnamed: 0,all,person
Architecture,1.0,1.0
Arts and Humanities,2.0,2.0
Business,2.0,2.0
Earth and Life Sciences,3.0,3.0
Education,1.0,1.0
Engineering,3.0,3.0
Engineering Psychology,2.0,2.0
Law,2.0,2.0
Life Sciences,4.0,4.0
Medicine and Health Sciences,3.0,3.0


### TOTAL NUMBER OF CONTRIBUTOR INSTANCES BY TYPE AND RELATION

In [11]:
import pandas as pd

# Columns are contributor types, rows are relations; a type/relation pair that
# never occurred is NaN after construction and is replaced by 0 here.
# (The original called `df.fillna(0)`, but no `df` is defined in this notebook.)
df_total_contributors = pd.DataFrame(total_contributors)
df_total_contributors = df_total_contributors.fillna(0)

# Add a 'Total' row (sum per type) first, then a 'Total' column (sum per
# relation); the bottom-right cell therefore holds the grand total.
df_total_contributors.loc['Total',:]= df_total_contributors.sum(axis=0)
df_total_contributors.loc[:,'Total'] = df_total_contributors.sum(axis=1)

df_total_contributors.astype(int)

Unnamed: 0,person,institution,organization,consortium,Total
contributor,536122,62794,16497,0,615413
creator,29673379,1849,21735,7,29696970
Total,30209501,64643,38232,7,30312383


## <a id='identifiers'>IDENTIFIERS</a>

In [17]:
import os, json
from collections import defaultdict

# CAVEAT: a record can come from multiple sources, so the per-source
# bookkeeping below may not be reliable.

# contributor id -> set of sources the contributor appears under
# (contributors are told apart by their id only)
contributor_sources = {}

# source -> {contributor id -> number of contributions}
# (also based on the contributor id only)
contributions_per_source = {}

# identifier type (one of the keys of `identifiers`) -> number of identifiers
# of that type seen on contributors
contributor_identifiers = defaultdict(int)

# every identifier type found in the dataset, with the URL prefix marking it
identifiers = {
    'figshare' : 'http://figshare.com/authors/',
    'orcid' : 'http://orcid.org/',
    'gravatar' : 'http://secure.gravatar.com/avatar/',
    'mendeley' : 'http://www.mendeley.com/profiles/',
    'usgs' : 'urn://share/gov.usgs:',
    'doi' : 'http://dx.doi.org/',
    'api.osf' : 'http://api.osf.io/v2/institutions/',
    'osf' : 'http://osf.io/',
    'grid.ac' : 'http://www.grid.ac/institutes/',
    'issn' : 'urn://issn/',
    'isbn' : 'urn://isbn/',
    'csupomona' : 'http://www.csupomona.edu'
}

with open(SHARE_FILE) as f:
    for line in f:
        data = json.loads(line)

        if 'contributors' not in data['lists']:
            continue

        for contributor in data['lists']['contributors']:
            for contributor_identifier in contributor['identifiers']:
                # only the first matching prefix is counted
                for code, url in identifiers.items():
                    if contributor_identifier.startswith(url):
                        contributor_identifiers[code] += 1
                        break
                else:
                    # no known prefix matched: print the identifier so its
                    # type can be added to the table above
                    print(contributor_identifier)

            contributor_id = contributor['id']
            if contributor_id not in contributor_sources:
                contributor_sources[contributor_id] = set()

            for source in data['sources']:
                contributor_sources[contributor_id].add(source)

                if source not in contributions_per_source:
                    contributions_per_source[source] = defaultdict(int)

                contributions_per_source[source][contributor_id] += 1

### NUMBER OF CONTRIBUTOR INSTANCES WITH IDENTIFIERS

In [21]:
# (identifier type, count) pairs, most frequent identifier type first
identifier_counts = sorted(contributor_identifiers.items(), key = lambda kv : kv[1], reverse = True)
df_contributor_identifiers = pd.DataFrame(identifier_counts, columns = ['Identifier', 'Contributors'])

df_contributor_identifiers

Unnamed: 0,Identifier,Contributors
0,orcid,62580
1,osf,49557
2,gravatar,35495
3,issn,4114
4,doi,2842
5,api.osf,69
6,figshare,59
7,mendeley,39
8,usgs,18
9,isbn,16


In [27]:
# contributor_sources has one key per distinct contributor id
unique_contributor_count = len(contributor_sources)
print('Number of unique contributors based on contributor id: {}'.format(unique_contributor_count))

Number of unique contributors based on contributor id: 30177981


In [None]:
# Check for duplicate contributor ids within a single record.
# This shouldn't happen.

import os, json
from collections import defaultdict

# number of records in which at least one contributor id occurs more than once
records_with_duplicate_contributor_ids = 0

# Read the same data file as every other cell (SHARE_FILE) instead of
# repeating the path here.
with open(SHARE_FILE) as f:
    for line in f:
        data = json.loads(line)

        if 'contributors' in data['lists']:
            # A set keeps the membership test constant-time; records can hold
            # tens of thousands of contributors, which makes a list scan slow.
            contributor_ids = set()
            for contributor in data['lists']['contributors']:
                if contributor['id'] in contributor_ids:
                    # count the record once, however many ids are repeated
                    records_with_duplicate_contributor_ids += 1
                    break
                contributor_ids.add(contributor['id'])

print('RECORDS WITH DUPLICATE CONTRIBUTOR IDS: {}'.format(records_with_duplicate_contributor_ids))

## <a id='issues'>CONTRIBUTOR FIELD INCONSISTENCIES + MISSING FIELDS</a>

There are two places where the list of contributors can be found:
- data['contributors'] (just an array of strings)
- data['lists']['contributors'] (an array of objects - contains contributor information)
    
The next code block looks for the following issues:
1. Inconsistencies with counts of data['contributors'] vs data['lists']['contributors']
2. If data['contributors'] is there but data['lists']['contributors'] isn't
3. If data['lists']['contributors'] is there but data['contributors'] isn't

In [6]:
import os, json
from collections import defaultdict

# records where exactly one of data['contributors'] and
# data['lists']['contributors'] is present
records_with_missing_contributor_field = 0

# records where both are present but their lengths differ
records_with_inconsistent_contributor_numbers = 0

# contributor field name -> number of contributor instances carrying that field
contributor_fields = defaultdict(int)

with open(SHARE_FILE) as f:
    for line in f:
        data = json.loads(line)

        has_contributor_names = 'contributors' in data
        has_contributor_objects = 'contributors' in data['lists']

        if has_contributor_names and has_contributor_objects:
            if len(data['contributors']) != len(data['lists']['contributors']):
                records_with_inconsistent_contributor_numbers += 1

        elif has_contributor_names or has_contributor_objects:
            # Exactly one of the two fields is present. The original code only
            # incremented the counter when data['lists']['contributors'] was
            # the missing one; the opposite case used `+ 1` (a no-op
            # expression) instead of `+= 1` and was silently never counted.
            records_with_missing_contributor_field += 1

        if has_contributor_objects:
            for contributor in data['lists']['contributors']:
                for field in contributor.keys():
                    contributor_fields[field] += 1

In [7]:
# records where the two contributor lists are both present but differ in length
inconsistent_message = 'RECORDS WITH INCONSISTENT CONTRIBUTOR FIELDS: {}'.format(
    records_with_inconsistent_contributor_numbers)
print(inconsistent_message)

# records where only one of the two contributor lists is present
missing_message = 'RECORDS WITH ONE MISSING CONTRIBUTOR FIELD: {}'.format(
    records_with_missing_contributor_field)
print(missing_message)

RECORDS WITH INCONSISTENT CONTRIBUTOR FIELDS: 0
RECORDS WITH ONE MISSING CONTRIBUTOR FIELD: 0


In [10]:
import pandas as pd

# (field name, count) pairs, most frequent field first
field_counts = sorted(contributor_fields.items(), key = lambda kv : kv[1], reverse = True)
df_contributor_fields = pd.DataFrame(field_counts, columns = ['Field', 'Contributor Instances'])

df_contributor_fields

Unnamed: 0,Field,Contributor Instances
0,id,30312383
1,type,30312383
2,name,30312383
3,identifiers,30312383
4,cited_as,30312383
5,affiliations,30312383
6,awards,30312383
7,types,30312383
8,relation,30312383
9,given_name,30207132


## <a id='order-cited'>ORDER CITED</a>

In [42]:
import json, numpy as np
from collections import defaultdict

# contributors_with_no_order_cited[type][relation] -> number of contributor
# instances that have no 'order_cited' field
contributors_with_no_order_cited = {}

# records with at least one contributor lacking 'order_cited'
records_with_missing_order_cited = 0
# records whose highest order_cited + 1 differs from the number of contributors
# (or that have contributors but no order_cited value at all)
records_with_incorrect_order_cited_number = 0
# records in which an order_cited value is used by more than one contributor
records_with_duplicate_order_cited_number = 0
# records that have BOTH a contributor without order_cited AND a duplicated
# order_cited value (the name is kept because a later cell prints it)
records_with_incorrect_and_duplicate_order_cited_number = 0

with open(SHARE_FILE) as f:
    for line in f:
        data = json.loads(line)

        if 'contributors' in data['lists']:
            contributors = data['lists']['contributors']
            record_has_missing_order_cited = False

            order_cited_values = []
            for contributor in contributors:
                if not 'order_cited' in contributor:
                    record_has_missing_order_cited = True

                    if contributor['type'] not in contributors_with_no_order_cited:
                        contributors_with_no_order_cited[contributor['type']] = defaultdict(int)

                    contributors_with_no_order_cited[contributor['type']][contributor['relation']] += 1
                else:
                    order_cited_values.append(int(contributor['order_cited']))

            if order_cited_values:
                # order_cited is 0-based, so the highest value is expected to
                # be one less than the number of contributors
                record_has_incorrect_order_cited = \
                    max(order_cited_values) + 1 != len(contributors)
            else:
                # No order_cited value at all: inconsistent only if there are
                # contributors. Handling this case separately also avoids
                # calling max() on an empty list, which raised ValueError for
                # records with an empty contributors list.
                record_has_incorrect_order_cited = len(contributors) > 0

            # an empty list has no duplicates: both lengths are 0
            record_has_duplicate_order_cited = \
                len(order_cited_values) != len(set(order_cited_values))

            if record_has_incorrect_order_cited:
                records_with_incorrect_order_cited_number += 1

            if record_has_missing_order_cited:
                records_with_missing_order_cited += 1

            if record_has_duplicate_order_cited:
                records_with_duplicate_order_cited_number += 1

            # get the overlap of records with missing order cited and duplicate order cited
            if record_has_missing_order_cited and record_has_duplicate_order_cited:
                records_with_incorrect_and_duplicate_order_cited_number += 1

### NO ORDER CITED FIELD

In [32]:
# Columns are contributor types, rows are relations. Type/relation pairs that
# never lacked an order_cited field are filled with 0 before the integer cast.
df_no_order_cited = (
    pd.DataFrame(contributors_with_no_order_cited)
    .fillna(0)
    .astype(int)
)

df_no_order_cited

Unnamed: 0,person,institution,organization,consortium
contributor,526725,62794,16468,0
creator,0,0,0,2


### HIGHEST ORDER CITED NUMBER != TOTAL NUMBER OF CONTRIBUTORS

Three cases can happen here:
1. A number is skipped: e.g. 2 authors - one has order_cited = 0 and the other has order_cited = 2 (see https://osf.io/preprints/inarxiv/h2v7n/ for example) - This is actually fine. There are not really any issues that we should be concerned with in this case.
2. There is a contributor with no order cited number
3. A number is duplicated: e.g. 2 authors have order_cited = 0

In [46]:
# highest order_cited + 1 differs from the number of contributors
inconsistent_message = 'RECORDS WITH INCONSISTENCIES ON ORDER CITED NUMBER : {}'.format(
    records_with_incorrect_order_cited_number)
print(inconsistent_message)

print('\nNOTE: THE FOLLOWING RESULTS ARE NOT A DEFINITE SUBSET OF THE RESULT ABOVE')

missing_message = 'RECORDS WITH AT LEAST 1 CONTRIBUTOR MISSING ORDER CITED FIELD : {}'.format(
    records_with_missing_order_cited)
print(missing_message)

duplicate_message = 'RECORDS WITH AT LEAST 1 DUPLICATE ORDER CITED NUMBER : {}'.format(
    records_with_duplicate_order_cited_number)
print(duplicate_message)

# overlap of the two counts above; only the second literal holds the
# placeholder, so .format() is applied to it before concatenation
overlap_message = ('RECORDS WITH AT LEAST 1 CONTRIBUTOR MISSING ORDER CITED FIELD '
                   + 'AND LEAST 1 DUPLICATE ORDER CITED NUMBER : {}'.format(
                       records_with_incorrect_and_duplicate_order_cited_number))
print(overlap_message)

RECORDS WITH INCONSISTENCIES ON ORDER CITED NUMBER : 478452

NOTE: THE FOLLOWING RESULTS ARE NOT A DEFINITE SUBSET OF THE RESULT ABOVE
RECORDS WITH AT LEAST 1 CONTRIBUTOR MISSING ORDER CITED FIELD : 86090
RECORDS WITH AT LEAST 1 DUPLICATE ORDER CITED NUMBER : 455373
RECORDS WITH AT LEAST 1 CONTRIBUTOR MISSING ORDER CITED FIELD AND LEAST 1 DUPLICATE ORDER CITED NUMBER : 63273


### DUPLICATE ORDER_CITED NUMBERS

In [60]:
import json, numpy as np
from collections import defaultdict

# One entry per repeated order_cited value in a record: how many extra
# contributors share it (e.g. order_cited = 0 on three contributors -> 2).
order_cited_duplicate_times = []

# For order_cited values shared by exactly two contributors: how often each
# (sorted) pair of relations / types occurs, keyed as 'first-second'.
single_duplicate_contributor_relation = defaultdict(int)
single_duplicate_contributor_type = defaultdict(int)

with open(SHARE_FILE) as f:
    for line in f:
        data = json.loads(line)

        # order_cited value -> contributors of this record carrying that value
        contributors_by_order_cited = defaultdict(list)
        if 'contributors' in data['lists']:
            for contributor in data['lists']['contributors']:
                if 'order_cited' in contributor:
                    contributors_by_order_cited[contributor['order_cited']].append(contributor)

            for contributors in contributors_by_order_cited.values():
                group_size = len(contributors)
                if group_size <= 1:
                    # this order_cited value is not repeated
                    continue

                order_cited_duplicate_times.append(group_size - 1)

                if group_size == 2:
                    first, second = contributors

                    relation_pair = sorted([first['relation'], second['relation']])
                    single_duplicate_contributor_relation['-'.join(relation_pair)] += 1

                    type_pair = sorted([first['type'], second['type']])
                    single_duplicate_contributor_type['-'.join(type_pair)] += 1

In [56]:
# order_cited_duplicate_times holds one entry per repeated order_cited value,
# so its length is the number of repeated values over all records
repeated_order_cited_count = len(order_cited_duplicate_times)
print('NUMBER OF TIMES EACH REPEATED ORDER_CITED WAS DUPLICATED: {}'.format(repeated_order_cited_count))

NUMBER OF TIMES EACH REPEATED ORDER_CITED WAS DUPLICATED: 5904007


In [54]:
# (distinct numbers of extra contributors sharing an order_cited value,
#  how many times each of those numbers occurred)
np.unique(np.asarray(order_cited_duplicate_times), return_counts=True)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
        36, 37, 40, 41, 42, 45]),
 array([3519060,  820487,  328085,  122953,  187517,  139816,  151305,
         103936,  110854,   92722,   91097,   52278,   32108,   27155,
          24000,   28066,   27560,    7916,    8061,   12133,    6767,
           3706,     286,    1738,     472,      28,     656,      25,
             68,    2199,       2,       3,       2,       3,     873,
              2,       1,      65,       1,       1]))

In [65]:
# (relation pair, count) tuples in descending tuple order, i.e. ordered by the
# pair name first (not by the count)
relation_pair_counts = sorted(single_duplicate_contributor_relation.items(), reverse = True)
pd.DataFrame(relation_pair_counts)

Unnamed: 0,0,1
0,creator-creator,3512858
1,contributor-creator,6200
2,contributor-contributor,2


In [66]:
# (type pair, count) tuples in descending tuple order, i.e. ordered by the
# pair name first (not by the count)
type_pair_counts = sorted(single_duplicate_contributor_type.items(), reverse = True)
pd.DataFrame(type_pair_counts)

Unnamed: 0,0,1
0,person-person,3517519
1,organization-person,1328
2,organization-organization,132
3,institution-person,59
4,institution-organization,14
5,institution-institution,7
6,consortium-person,1
