# SUBJECT STATISTICS

1. [Median, min, max](#gen-stats)
2. [Number of subjects per record (table)](#subjects-per-record) 
3. [Number of records per subject (table)](#records-per-subject) 
4. [No subjects](#no-subjects)
5. [Subject taxonomies](#taxonomy)
6. [Taxonomy issues](#taxonomy-issues)

In [3]:
import os

# Location of the SHARE dump, relative to this notebook.
# The file is line-delimited: every line holds one record.
SHARE_FILE = os.path.join(
    '..', '..',
    'data',
    'share-jan-2019.json',
)

In [153]:
import json
from collections import defaultdict

# Single pass over SHARE_FILE that fills every accumulator defined below.
# Each line of the file is parsed as one JSON record; the code reads the
# record's 'id', 'sources' (a list) and 'subjects' (a list of strings).

# number of subjects for each record
# note that a record may have multiple subjects and subjects are up to 3 different levels of specificity
# since bepress is the most common taxonomy, also store the number of bepress subjs per record
# if the same subject appears but from two diff taxonomies (e.g. arXiv|Life Sciences, biorXiv|Life Sciences),
#   count it only once
subjects_per_record = {'specific': [], 'main': [], 'bepress_specific': [], 'bepress_main': []}

# number of records per MAIN subject and source
# layout: {main_subject: {source or '-Total Records-': number of records}}
records_per_subject_and_source = {}

# number of records per MAIN BEPRESS subject and source (same layout as above)
records_per_bepress_subject_and_source = {}

# number of records with no subjects
# layout: {category: {source or '-Total Records-': number of records}}
records_with_no_subjects_per_source = {
    'without subjects from any taxonomy': defaultdict(int),
    'with subjects from any taxonomy except bepress': defaultdict(int),
}

# number of records per taxonomy
records_per_taxonomy = defaultdict(int)

# contains all specific subjects that appear in the dataset
all_specific_subjects = set()

# contains all specific bepress subjects that appear in the dataset
all_specific_bepress_subjects = set()

# JSON text is UTF-8, so do not depend on the platform's default encoding.
with open(SHARE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        sources = data['sources']

        # checks if there are duplicate subjects in a single record
        # this should not happen
        if len(data['subjects']) != len(set(data['subjects'])):
            print('Duplicate subjects: {}'.format(data['id']))
            print(data['subjects'])

        # Per-record sets: a subject repeated under several taxonomies is
        # therefore counted once for this record.
        specific_subjects = set()
        specific_bepress_subjects = set()
        main_subjects = set()
        main_bepress_subjects = set()

        taxonomies = set()

        for subject in data['subjects']:
            # subject format: taxonomy|main_subject|rest of subject
            # specific subject = main_subject|rest of subject
            # (a subject string without any '|' makes the unpacking fail loudly)
            taxonomy, specific_subject = subject.split('|', 1)
            main_subject = specific_subject.split('|')[0]

            specific_subjects.add(specific_subject)
            main_subjects.add(main_subject)

            taxonomies.add(taxonomy)

            if taxonomy == 'bepress':
                specific_bepress_subjects.add(specific_subject)
                main_bepress_subjects.add(main_subject)

        # A record counts once for each of its sources and once in the total row.
        for main_subject in main_subjects:
            per_source = records_per_subject_and_source.setdefault(main_subject, defaultdict(int))

            for source in sources:
                per_source[source] += 1

            per_source['-Total Records-'] += 1

        for main_subject in main_bepress_subjects:
            per_source = records_per_bepress_subject_and_source.setdefault(main_subject, defaultdict(int))

            for source in sources:
                per_source[source] += 1

            per_source['-Total Records-'] += 1

        for taxonomy in taxonomies:
            records_per_taxonomy[taxonomy] += 1

        # check if record uses a taxonomy other than bepress or its own source taxonomy
        # e.g. bioRxiv records should only use bioRxiv and bepress taxonomies
        # NOTE(review): the threshold below reports a record only when it has
        # two or more unexpected taxonomies; a single unexpected taxonomy is not
        # printed. Left unchanged -- confirm whether `> 0` was intended.
        expected_taxonomies = set(sources + ['bepress'])
        taxonomy_source = taxonomies - expected_taxonomies
        if len(taxonomy_source) > 1:
            print(taxonomies)
            print(expected_taxonomies)
            print(taxonomy_source)

        if not main_subjects:
            no_subjects = records_with_no_subjects_per_source['without subjects from any taxonomy']
            no_subjects['-Total Records-'] += 1
            for source in sources:
                no_subjects[source] += 1
        elif not main_bepress_subjects:
            # the record has subjects, but none of them from the bepress taxonomy
            no_bepress = records_with_no_subjects_per_source['with subjects from any taxonomy except bepress']
            no_bepress['-Total Records-'] += 1
            for source in sources:
                no_bepress[source] += 1

        subjects_per_record['specific'].append(len(specific_subjects))
        subjects_per_record['main'].append(len(main_subjects))
        subjects_per_record['bepress_specific'].append(len(specific_bepress_subjects))
        subjects_per_record['bepress_main'].append(len(main_bepress_subjects))

        # Grow the global sets in place; rebuilding them with union() would
        # copy every subject seen so far once per record.
        all_specific_subjects.update(specific_subjects)
        all_specific_bepress_subjects.update(specific_bepress_subjects)

## <a id='gen-stats'>GENERAL STATISTICS</a>

### ALL SUBJECTS

In [143]:
import statistics
# Summary of the per-record subject counts across all taxonomies:
# first the specific subjects, then the main (top-level) subjects.
specific_counts = subjects_per_record['specific']
print('TOTAL NUMBER OF UNIQUE SPECIFIC SUBJECTS: {}'.format(len(all_specific_subjects)))
for template, summarise in (
    ('MEDIAN NUMBER OF SPECIFIC SUBJECTS PER RECORD: {}', statistics.median),
    ('MIN NUMBER OF SPECIFIC SUBJECTS PER RECORD: {}', min),
    ('MAX NUMBER OF SPECIFIC SUBJECTS PER RECORD: {}', max),
):
    print(template.format(summarise(specific_counts)))

print('\n')

# The keys of records_per_subject_and_source are the distinct main subjects.
main_counts = subjects_per_record['main']
print('TOTAL NUMBER OF UNIQUE MAIN SUBJECTS: {}'.format(len(records_per_subject_and_source)))
for template, summarise in (
    ('MEDIAN NUMBER OF MAIN SUBJECTS PER RECORD: {}', statistics.median),
    ('MIN NUMBER OF MAIN SUBJECTS PER RECORD: {}', min),
    ('MAX NUMBER OF MAIN SUBJECTS PER RECORD: {}', max),
):
    print(template.format(summarise(main_counts)))

TOTAL NUMBER OF UNIQUE SPECIFIC SUBJECTS: 1449
MEDIAN NUMBER OF SPECIFIC SUBJECTS PER RECORD: 0
MIN NUMBER OF SPECIFIC SUBJECTS PER RECORD: 0
MAX NUMBER OF SPECIFIC SUBJECTS PER RECORD: 123


TOTAL NUMBER OF UNIQUE MAIN SUBJECTS: 19
MEDIAN NUMBER OF MAIN SUBJECTS PER RECORD: 0
MIN NUMBER OF MAIN SUBJECTS PER RECORD: 0
MAX NUMBER OF MAIN SUBJECTS PER RECORD: 10


### BEPRESS SUBJECTS

In [144]:
import statistics
# Summary of the per-record subject counts restricted to the bepress taxonomy:
# first the specific subjects, then the main (top-level) subjects.
# Two labels were corrected: the main-subject total now says BEPRESS, so it
# can be told apart from the all-taxonomy total, and the duplicated word in
# the median label was removed.
bepress_specific_counts = subjects_per_record['bepress_specific']
print('TOTAL NUMBER OF UNIQUE BEPRESS SPECIFIC SUBJECTS: {}'.format(len(all_specific_bepress_subjects)))
print('MEDIAN NUMBER OF SPECIFIC BEPRESS SUBJECTS PER RECORD: {}'.format(statistics.median(bepress_specific_counts)))
print('MIN NUMBER OF SPECIFIC BEPRESS SUBJECTS PER RECORD: {}'.format(min(bepress_specific_counts)))
print('MAX NUMBER OF SPECIFIC BEPRESS SUBJECTS PER RECORD: {}'.format(max(bepress_specific_counts)))

print('\n')

# The keys of records_per_bepress_subject_and_source are the distinct main bepress subjects.
bepress_main_counts = subjects_per_record['bepress_main']
print('TOTAL NUMBER OF UNIQUE MAIN BEPRESS SUBJECTS: {}'.format(len(records_per_bepress_subject_and_source)))
print('MEDIAN NUMBER OF MAIN BEPRESS SUBJECTS PER RECORD: {}'.format(statistics.median(bepress_main_counts)))
print('MIN NUMBER OF MAIN BEPRESS SUBJECTS PER RECORD: {}'.format(min(bepress_main_counts)))
print('MAX NUMBER OF MAIN BEPRESS SUBJECTS PER RECORD: {}'.format(max(bepress_main_counts)))

TOTAL NUMBER OF UNIQUE BEPRESS SPECIFIC SUBJECTS: 944
MEDIAN NUMBER OF SPECIFIC BEPRESS SUBJECTS PER RECORD: 0
MIN NUMBER OF SPECIFIC BEPRESS SUBJECTS PER RECORD: 0
MAX NUMBER OF SPECIFIC BEPRESS SUBJECTS PER RECORD: 50


TOTAL NUMBER OF UNIQUE MAIN SUBJECTS: 10
MEDIAN NUMBER OF MAIN BEPRESS BEPRESS SUBJECTS PER RECORD: 0
MIN NUMBER OF MAIN BEPRESS SUBJECTS PER RECORD: 0
MAX NUMBER OF MAIN BEPRESS SUBJECTS PER RECORD: 10


## <a id='subjects-per-record'>SUBJECTS PER RECORD</a>

In [135]:
# CSV path for the subjects-per-record frequency table.
ALL_SPR_OUTPUT_FILE = os.path.join(
    '..', '..',
    'data_exploration_results',
    'subjects_per_record.csv',
)

In [150]:
import collections
import pandas as pd

# For every counting scheme in subjects_per_record ('specific', 'main',
# 'bepress_specific', 'bepress_main'), count how many records have 0, 1, 2, ...
# subjects.
subjects_per_record_freq = {
    key: collections.Counter(counts) for key, counts in subjects_per_record.items()
}

# rows = number of subjects in a record, columns = counting scheme.
# A count that never occurs for a scheme comes out as NaN, hence fillna(0).
# The original cell called fillna/astype/to_csv on an undefined name `df`;
# every step now operates on df_subjects_per_record.
df_subjects_per_record = pd.DataFrame(subjects_per_record_freq)
df_subjects_per_record = df_subjects_per_record.fillna(0)
df_subjects_per_record = df_subjects_per_record.astype(int)
# Sort by number of subjects so that the slice below always shows the
# smallest counts, whatever order the counts were first encountered in.
df_subjects_per_record = df_subjects_per_record.sort_index()

df_subjects_per_record.to_csv(ALL_SPR_OUTPUT_FILE)

# notebook display: the first 11 rows of the table
df_subjects_per_record[:11]

Unnamed: 0,specific,main,bepress_specific,bepress_main
0,1517874,1517874,1532954,1532954
1,441927,669492,440819,656420
2,202813,31364,199096,29690
3,44355,3027,40094,2799
4,7933,646,6390,599
5,2973,32,2056,17
6,1164,6,578,4
7,565,5,195,2
8,391,5,103,3
9,280,2,69,1


## <a id='records-per-subject'>RECORDS PER SUBJECT</a>

### ALL SUBJECTS

In [118]:
# CSV path for the records-per-main-subject table (all taxonomies).
ALL_RPS_OUTPUT_FILE = os.path.join(
    '..', '..',
    'data_exploration_results',
    'records_per_subject.csv',
)

In [149]:
# Table of record counts: one column per main subject, one row per source
# plus the '-Total Records-' row. Missing subject/source pairs become 0.
df_records_per_subject = (
    pd.DataFrame(records_per_subject_and_source)
    .fillna(0)
    .astype(int)
)

df_records_per_subject.to_csv(ALL_RPS_OUTPUT_FILE)

# notebook display: total records per main subject, largest first
totals_by_subject = df_records_per_subject.loc[['-Total Records-']].transpose()
totals_by_subject.sort_values('-Total Records-', ascending=False)

Unnamed: 0,-Total Records-
Physical Sciences and Mathematics,640712
Life Sciences,38359
Social and Behavioral Sciences,32734
Engineering,11268
Arts and Humanities,9366
Education,3886
Medicine and Health Sciences,2816
Business,2266
Law,2169
Neuroscience,411


### BEPRESS SUBJECTS

In [126]:
# CSV path for the records-per-main-subject table (bepress taxonomy only).
BEPRESS_RPS_OUTPUT_FILE = os.path.join(
    '..', '..',
    'data_exploration_results',
    'records_per_bepress_subject.csv',
)

In [148]:
# Table of record counts for bepress subjects only: one column per main
# bepress subject, one row per source plus the '-Total Records-' row.
# Missing subject/source pairs become 0.
df_records_per_bepress_subject = (
    pd.DataFrame(records_per_bepress_subject_and_source)
    .fillna(0)
    .astype(int)
)

df_records_per_bepress_subject.to_csv(BEPRESS_RPS_OUTPUT_FILE)

# notebook display: total records per main bepress subject, largest first
bepress_totals_by_subject = df_records_per_bepress_subject.loc[['-Total Records-']].transpose()
bepress_totals_by_subject.sort_values('-Total Records-', ascending=False)

Unnamed: 0,-Total Records-
Physical Sciences and Mathematics,639572
Life Sciences,37505
Social and Behavioral Sciences,24772
Engineering,9949
Arts and Humanities,7998
Medicine and Health Sciences,2428
Education,1939
Business,1650
Law,877
Architecture,119


## <a id='no-subjects'>NO SUBJECTS</a>

In [154]:
# CSV path for the table of records that lack subjects.
NO_SUBJECT_OUTPUT_FILE = os.path.join(
    '..', '..',
    'data_exploration_results',
    'no_subjects.csv',
)

In [163]:
# One column per "no subjects" category, one row per source plus the
# '-Total Records-' row. Sources missing from a category become 0.
df_records_with_no_subjects = (
    pd.DataFrame(records_with_no_subjects_per_source)
    .fillna(0)
    .astype(int)
)

df_records_with_no_subjects.to_csv(NO_SUBJECT_OUTPUT_FILE)

# notebook display: the 11 rows with the most records lacking any subject
ranked_no_subjects = df_records_with_no_subjects.sort_values(
    'without subjects from any taxonomy', ascending=False
)
ranked_no_subjects[:11]

Unnamed: 0,without subjects from any taxonomy,with subjects from any taxonomy except bepress
-Total Records-,1517874,15080
Research Papers in Economics,788175,0
arXiv,665430,3
CrossRef,302698,13350
Hyper Articles en Ligne (HAL),57718,1
DoE's SciTech Connect Database,23670,0
bioRxiv,21635,0
CERN Document Server,19757,0
AgEcon Search,14278,0
Department of Energy Pages,5265,0


## <a id='taxonomy'>TAXONOMIES</a>

In [134]:
# Rank the taxonomies by the number of records that use them, most used first.
# The 'Sources' column holds the taxonomy names (the keys of records_per_taxonomy).
taxonomy_counts = sorted(
    records_per_taxonomy.items(),
    key=lambda name_and_count: name_and_count[1],
    reverse=True,
)
df_records_per_taxonomy = pd.DataFrame(taxonomy_counts, columns=['Sources', 'Records'])

# notebook display
df_records_per_taxonomy

Unnamed: 0,Sources,Records
0,bepress,689541
1,INA-Rxiv,5990
2,PsyArXiv,3730
3,SocArXiv,3151
4,LawArXiv,972
5,EarthArXiv,604
6,engrXiv,392
7,Thesis Commons,334
8,MarXiv,330
9,LIS Scholarship Archive,147


## <a id='taxonomy-issues'>TAXONOMY ISSUES</a>

In [164]:
# Text file holding the reference list of bepress subjects, one per line.
BEPRESS_SUBJECTS_FILE = os.path.join(
    '..', '..',
    'data_exploration_results',
    'bepress_subjects.txt',
)

In [175]:
# Reference set of bepress subjects: every line of BEPRESS_SUBJECTS_FILE,
# stripped of surrounding whitespace, is one entry.
# NOTE(review): this file is read with the platform default encoding, as
# before -- confirm its encoding if subject names can contain non-ASCII text.
with open(BEPRESS_SUBJECTS_FILE, 'r') as f:
    bepress_subject_master_list = {line.strip() for line in f}

# Count, per taxonomy, how often each specific subject that is missing from the
# reference set occurs in the SHARE data.
# layout: {taxonomy: {specific_subject: number of occurrences}}
subjects_not_in_bepress = {}
# JSON text is UTF-8, so do not depend on the platform's default encoding.
with open(SHARE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)

        for subject in data['subjects']:
            # subject format: taxonomy|main_subject|rest of subject
            # specific subject = main_subject|rest of subject
            # (a subject string without any '|' makes the unpacking fail loudly)
            taxonomy, specific_subject = subject.split('|', 1)

            if specific_subject not in bepress_subject_master_list:
                per_taxonomy = subjects_not_in_bepress.setdefault(taxonomy, defaultdict(int))
                per_taxonomy[specific_subject] += 1

In [179]:
# CSV path for the table of subjects missing from the bepress reference list.
ISSUES_OUTPUT_FILE = os.path.join(
    '..', '..',
    'data_exploration_results',
    'taxonomy_issues.csv',
)

In [184]:
# Show the full subject paths in the rendered table instead of truncating them.
# `None` means "no limit"; the former `-1` spelling has been deprecated since
# pandas 1.0 and newer releases may reject it.
pd.set_option('display.max_colwidth', None)

# One column per taxonomy, one row per specific subject that is missing from
# the bepress reference list. Subjects a taxonomy never uses become 0.
df_subjects_not_in_bepress = pd.DataFrame(subjects_not_in_bepress)
df_subjects_not_in_bepress = df_subjects_not_in_bepress.fillna(0)
df_subjects_not_in_bepress = df_subjects_not_in_bepress.astype(int)


df_subjects_not_in_bepress.to_csv(ISSUES_OUTPUT_FILE)

# notebook display: the 10 unknown subjects used most often under the
# 'bepress' taxonomy itself (requires a 'bepress' column to exist)
df_subjects_not_in_bepress.sort_values('bepress', ascending = False)[:10]

Unnamed: 0,SocArXiv,PsyArXiv,SportRxiv,engrXiv,AgriXiv,ECSarXiv,INA-Rxiv,EarthArXiv,PaleorXiv,bepress,LawArXiv,Arabixiv,Thesis Commons
Life Sciences|Laboratory and Basic Science Research Life Sciences,0,0,0,0,3,0,0,0,0,17,0,0,0
Life Sciences|Animal Sciences|Aquaculture and Fisheries Life Sciences,0,0,0,0,0,0,27,1,0,8,0,0,0
"Business|Business Law, Public Responsibility, and Ethics Business",0,0,0,0,0,0,13,0,0,7,6,0,0
Life Sciences|Plant Sciences|Agronomy and Crop Sciences Life Sciences,0,0,0,0,3,0,10,1,0,3,0,2,0
Life Sciences|Physiology|Systems and Integrative Physiology Life Sciences,0,0,0,0,0,0,0,0,0,3,0,0,0
"Arts and Humanities|History of Art, Architecture, and Archaeology|Ancient, Medieval, Renaissance and Baroque Art and Architecture",19,0,0,0,0,0,0,0,0,2,0,0,0
"Social and Behavioral Sciences|Public Affairs, Public Policy and Public Administration|Peace and Conflict",2,0,0,0,0,0,4,0,0,2,0,0,0
Life Sciences|Plant Sciences|Plant Breeding and Genetics Life Sciences,0,0,0,0,2,0,0,0,0,2,0,4,0
Social and Behavioral Sciences|Sociology|Community-based Research,0,0,0,0,0,0,0,0,0,1,0,0,0
Life Sciences|Microbiology|Environmental Microbiology and Microbial Ecology Life Sciences,0,0,0,0,3,0,0,4,0,1,0,0,0
