# SUBJECT SYNONYM STATISTICS

1. [Median, min, max](#gen-stats)
2. [Number of subject synonyms per record (table)](#subjects-per-record) 
3. [Number of records per subject synonym (table)](#records-per-subject) 
4. [No subject synonyms](#no-subjects)
5. [Taxonomy issues](#taxonomy-issues)

In [2]:
import os

# Location of the SHARE dump; the file holds one record per line
SHARE_FILE = os.path.join(
    os.path.join('..', '..'),
    'data',
    'share-jan-2019.json',
)

In [3]:
import json
from collections import defaultdict

# Per-record subject counts: one list entry per record, keyed by level of specificity.
# A record may have multiple subjects, and subjects have up to 3 levels of specificity.
# Subjects are collected in sets after the taxonomy prefix is removed, so the same subject
# coming from two different taxonomies (e.g. arXiv|Life Sciences, biorXiv|Life Sciences)
# is counted only once.
# NOTE(review): 'bepress_specific' and 'bepress_main' are never appended to in this cell,
#   so they stay empty here -- confirm whether they should be filled or dropped.
subjects_per_record = { 'specific' : [], 'main' : [], 'bepress_specific' : [], 'bepress_main' : [] }

# number of records per MAIN subject and source: {main_subject: {source: count}}
# each inner dict also has a '-Total Records-' entry counting every record with that subject
records_per_subject_and_source = {}

# number of records with no subjects, per source (plus a '-Total Records-' entry)
records_with_no_subjects_per_source = defaultdict(int)

# contains all specific subjects that appear in the dataset
all_specific_subjects = set()

# JSON text is UTF-8; name the encoding so decoding does not depend on the platform default
with open(SHARE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        record_subjects = data['subject_synonyms']

        # checks if there are duplicate subjects in a single record
        # this should not happen, so report the record if it does
        if len(record_subjects) != len(set(record_subjects)):
            print('Duplicate subjects: {}'.format(data['id']))
            print(record_subjects)

        specific_subjects = set()
        main_subjects = set()

        for subject in record_subjects:
            # subject format: taxonomy|main_subject|rest of subject
            # specific subject = main_subject|rest of subject
            taxonomy, specific_subject = subject.split('|', 1)
            main_subject = specific_subject.split('|')[0]

            # this should not happen
            # all subjects should be from the bepress taxonomy
            if taxonomy != 'bepress':
                print(taxonomy)

            specific_subjects.add(specific_subject)
            main_subjects.add(main_subject)

        for main_subject in main_subjects:
            if main_subject not in records_per_subject_and_source:
                records_per_subject_and_source[main_subject] = defaultdict(int)

            for source in data['sources']:
                records_per_subject_and_source[main_subject][source] += 1

            # counted once per record, however many sources the record has
            records_per_subject_and_source[main_subject]['-Total Records-'] += 1

        if not main_subjects:
            records_with_no_subjects_per_source['-Total Records-'] += 1
            for source in data['sources']:
                records_with_no_subjects_per_source[source] += 1

        subjects_per_record['specific'].append(len(specific_subjects))
        subjects_per_record['main'].append(len(main_subjects))

        # grow the global set in place instead of building a new set for every record
        all_specific_subjects.update(specific_subjects)

## <a id='gen-stats'>GENERAL STATISTICS</a>

In [5]:
import statistics
# Summary figures for specific subjects (main subject plus the rest of the subject path)
specific_counts = subjects_per_record['specific']
print('TOTAL NUMBER OF UNIQUE SPECIFIC SUBJECTS: {}'.format(len(all_specific_subjects)))
for template, aggregate in (
    ('MEDIAN NUMBER OF SPECIFIC SUBJECTS PER RECORD: {}', statistics.median),
    ('MIN NUMBER OF SPECIFIC SUBJECTS PER RECORD: {}', min),
    ('MAX NUMBER OF SPECIFIC SUBJECTS PER RECORD: {}', max),
):
    print(template.format(aggregate(specific_counts)))

# blank separator between the two groups of figures
print('\n')

# Same figures for main subjects only; the unique main subjects are the keys of the per-subject tally
main_counts = subjects_per_record['main']
print('TOTAL NUMBER OF UNIQUE MAIN SUBJECTS: {}'.format(len(records_per_subject_and_source)))
for template, aggregate in (
    ('MEDIAN NUMBER OF MAIN SUBJECTS PER RECORD: {}', statistics.median),
    ('MIN NUMBER OF MAIN SUBJECTS PER RECORD: {}', min),
    ('MAX NUMBER OF MAIN SUBJECTS PER RECORD: {}', max),
):
    print(template.format(aggregate(main_counts)))

TOTAL NUMBER OF UNIQUE SPECIFIC SUBJECTS: 1013
MEDIAN NUMBER OF SPECIFIC SUBJECTS PER RECORD: 0
MIN NUMBER OF SPECIFIC SUBJECTS PER RECORD: 0
MAX NUMBER OF SPECIFIC SUBJECTS PER RECORD: 55


TOTAL NUMBER OF UNIQUE MAIN SUBJECTS: 10
MEDIAN NUMBER OF MAIN SUBJECTS PER RECORD: 0
MIN NUMBER OF MAIN SUBJECTS PER RECORD: 0
MAX NUMBER OF MAIN SUBJECTS PER RECORD: 10


## <a id='subjects-per-record'>SUBJECTS PER RECORD</a>

In [6]:
# CSV destination for the subjects-per-record frequency table
ALL_SPR_OUTPUT_FILE = os.path.join(
    os.path.join('..', '..', 'data_exploration_results'),
    'subjects_synonyms_per_record.csv',
)

In [8]:
import collections
import pandas as pd

# For each subject level: {number of subjects in a record -> how many records have that number}
subjects_per_record_freq = {
    level: collections.Counter(counts)
    for level, counts in subjects_per_record.items()
}

# A count that a level never produced comes out as NaN; store it as an integer zero instead
df_subjects_per_record = (
    pd.DataFrame(subjects_per_record_freq)
    .fillna(0)
    .astype(int)
)

df_subjects_per_record.to_csv(ALL_SPR_OUTPUT_FILE)

# show the first 11 rows only
df_subjects_per_record.head(11)

Unnamed: 0,specific,main,bepress_specific,bepress_main
0,2206129,2206129,0,0
1,1317,14023,0,0
2,5156,1968,0,0
3,5227,271,0,0
4,1970,45,0,0
5,1035,14,0,0
6,613,2,0,0
7,349,4,0,0
8,204,2,0,0
9,144,1,0,0


## <a id='records-per-subject'>RECORDS PER SUBJECT</a>

In [9]:
# CSV destination for the records-per-subject table
ALL_RPS_OUTPUT_FILE = os.path.join(
    os.path.join('..', '..', 'data_exploration_results'),
    'records_per_subject_synonym.csv',
)

In [10]:
# One column per main subject, one row per source (plus the '-Total Records-' row);
# a source that never carried a subject becomes an integer zero rather than NaN
df_records_per_subject = (
    pd.DataFrame(records_per_subject_and_source)
    .fillna(0)
    .astype(int)
)

df_records_per_subject.to_csv(ALL_RPS_OUTPUT_FILE)

# display only the totals, one row per subject, largest first
totals_by_subject = df_records_per_subject.loc[['-Total Records-']].T
totals_by_subject.sort_values('-Total Records-', ascending = False)

Unnamed: 0,-Total Records-
Social and Behavioral Sciences,8249
Education,2208
Engineering,1646
Life Sciences,1581
Physical Sciences and Mathematics,1532
Arts and Humanities,1475
Law,1425
Business,695
Medicine and Health Sciences,560
Architecture,76


## <a id='no-subjects'>NO SUBJECT SYNONYMS</a>

In [11]:
# CSV destination for the per-source tally of records without subject synonyms
NO_SUBJECT_OUTPUT_FILE = os.path.join(
    os.path.join('..', '..', 'data_exploration_results'),
    'no_subject_synonyms.csv',
)

In [17]:
# The keys of records_with_no_subjects_per_source are source names (plus the
# '-Total Records-' entry), not subjects, so the first column is labelled 'Source'.
# Rows are ordered by record count, largest first.
df_records_with_no_subjects = pd.DataFrame(
    sorted(records_with_no_subjects_per_source.items(), key = lambda kv : kv[1], reverse = True),
    columns = ['Source', 'Records'])

df_records_with_no_subjects.to_csv(NO_SUBJECT_OUTPUT_FILE)

# show the ten largest entries
df_records_with_no_subjects[:10]

Unnamed: 0,Subject,Records
0,-Total Records-,2206129
1,arXiv,1317689
2,Research Papers in Economics,799595
3,CrossRef,428986
4,Hyper Articles en Ligne (HAL),73724
5,bioRxiv,31501
6,CERN Document Server,28929
7,DoE's SciTech Connect Database,28137
8,AgEcon Search,16839
9,Preprints.org,8603


## <a id='taxonomy-issues'>TAXONOMY ISSUES</a>

In [18]:
# Text file with the master list of bepress subjects, read line by line further down
BEPRESS_SUBJECTS_FILE = os.path.join(
    os.path.join('..', '..', 'data_exploration_results'),
    'bepress_subjects.txt',
)

In [20]:
# Master list of valid bepress subjects, one entry per line of the file.
# NOTE(review): still opened with the platform default encoding -- confirm how the file was written.
bepress_subject_master_list = set()
with open(BEPRESS_SUBJECTS_FILE, 'r') as f:
    for line in f:
        entry = line.strip()
        # a blank line is not a subject; keeping '' in the set would make an empty subject look valid
        if entry:
            bepress_subject_master_list.add(entry)

# number of occurrences of each specific subject that is missing from the master list
subjects_not_in_bepress = defaultdict(int)
# JSON text is UTF-8; name the encoding so decoding does not depend on the platform default
with open(SHARE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)

        for subject in data['subject_synonyms']:
            # subject format: taxonomy|main_subject|rest of subject
            # specific subject = main_subject|rest of subject
            specific_subject = subject.split('|', 1)[1]

            if specific_subject not in bepress_subject_master_list:
                subjects_not_in_bepress[specific_subject] += 1

In [25]:
# Rank the unknown subjects by how often they occur, most frequent first
ranked_unknown_subjects = sorted(
    subjects_not_in_bepress.items(),
    key = lambda pair : pair[1],
    reverse = True,
)
df_subjects_not_in_bepress = pd.DataFrame(ranked_unknown_subjects, columns = ['Subject', 'Records'])

df_subjects_not_in_bepress

Unnamed: 0,Subject,Records
0,Life Sciences|Animal Sciences|Aquaculture and Fisheries Life Sciences,28
1,"Business|Business Law, Public Responsibility, and Ethics Business",19
2,"Arts and Humanities|History of Art, Architecture, and Archaeology|Ancient, Medieval, Renaissance and Baroque Art and Architecture",19
3,Life Sciences|Plant Sciences|Agronomy and Crop Sciences Life Sciences,16
4,Social and Behavioral Sciences|Sociology|Community-based Research,10
5,Life Sciences|Microbiology|Environmental Microbiology and Microbial Ecology Life Sciences,7
6,Life Sciences|Plant Sciences|Plant Breeding and Genetics Life Sciences,6
7,"Social and Behavioral Sciences|Public Affairs, Public Policy and Public Administration|Peace and Conflict",6
8,Life Sciences|Laboratory and Basic Science Research Life Sciences,3
9,"Life Sciences|Pharmacology, Toxicology and Environmental Health|Environmental Health Life Sciences",2
