# LANGUAGE STATISTICS

1. [Number of records per language (table)](#records-per-lang) 
2. [Number of records per language with all English records combined (table)](#records-per-lang-eng) 
3. [Sources of records with languages](#source-per-record-lang)
4. [Misc analysis - check sources of 'eee' and 'en' languages](#misc) 

In [3]:
import os

# Location of the SHARE dump that the cells below read line by line as JSON.
# The path is relative, so it resolves against the kernel's working directory.
DATA_DIR = os.path.join('..', '..', 'data')
SHARE_FILE = os.path.join(DATA_DIR, 'share-jan-2019.json')

In [4]:
import json
from collections import defaultdict

# Number of records per declared language value. Records whose 'language' is
# None or blank/whitespace-only are grouped together under the key None.
languages = defaultdict(int)

# The file is consumed one line at a time, each line being parsed as a JSON
# document. The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default, which open() uses when no encoding is given.
with open(SHARE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        language = data['language']

        if language is None or language.strip() == '':
            languages[None] += 1
        else:
            # Keyed on the raw value (not stripped) so the counts line up with
            # the exact-match comparisons used elsewhere in this notebook.
            languages[language] += 1

## <a id='records-per-lang'>NUMBER OF RECORDS PER LANGUAGE</a>

In [24]:
import pandas as pd
# Rank the language tallies from most to fewest records and tabulate them.
# NOTE(review): the first column holds language values but is labelled
# 'Source'; the label is kept because the English-combining cell below
# filters on df.Source.
language_counts_desc = sorted(languages.items(), key=lambda item: item[1], reverse=True)
df = pd.DataFrame(language_counts_desc, columns=['Source', 'Records'])

display(df)

Unnamed: 0,Source,Records
0,,2170828
1,eng,36848
2,en,10950
3,eee,3861
4,spa,2
5,fra,2
6,deu,2
7,English,1
8,ita,1


## <a id='records-per-lang-eng'>NUMBER OF RECORDS PER LANGUAGE (ALL ENGLISH RECORDS COMBINED)</a> 

In [25]:
# Fold every spelling of English into a single 'eng' row.
#
# Rows are selected by label value rather than by hard-coded index positions,
# so the cell keeps working if the sort order of the table changes, and it can
# be re-run safely: on a second run only the merged 'eng' row matches, so the
# total is left unchanged instead of raising or double counting.
ENGLISH_LABELS = ['eng', 'en', 'English']

english_rows = df.index[df['Source'].isin(ENGLISH_LABELS)]

if len(english_rows) > 0:
    # The first English row in the current ordering survives and absorbs the
    # record counts of the others; with the table sorted by record count this
    # is the 'eng' row.
    surviving_row = english_rows[0]
    english_total = df.loc[english_rows, 'Records'].sum()

    # drop() returns a new frame, so the assignments below never write into a
    # slice of the previously displayed table.
    df = df.drop(index=english_rows[1:])
    df.loc[surviving_row, 'Source'] = 'eng'
    df.loc[surviving_row, 'Records'] = english_total

display(df)

Unnamed: 0,Source,Records
0,,2170828
1,eng,47799
3,eee,3861
4,spa,2
5,fra,2
6,deu,2
8,ita,1


## <a id='source-per-record-lang'>SOURCES OF RECORDS WITH LANGUAGES</a> 

In [41]:
import json
from collections import defaultdict

# Number of language-tagged records that list each source. A record listing
# several sources is counted once under each of them, so the per-source
# figures can add up to more than the number of records.
sources = defaultdict(int)

# UTF-8 is requested explicitly so decoding does not depend on the platform's
# locale default, which open() uses when no encoding is given.
with open(SHARE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        language = data['language']

        # Records with no usable language (None or blank) are not of interest here.
        if language is None or language.strip() == '':
            continue

        for source in data['sources']:
            sources[source] += 1

In [42]:
import pandas as pd
# Tabulate the per-source tallies, largest first.
source_counts_desc = sorted(sources.items(), key=lambda item: item[1], reverse=True)
df = pd.DataFrame(source_counts_desc, columns=['Source', 'Records'])

display(df)

Unnamed: 0,Source,Records
0,arXiv,32021
1,DoE's SciTech Connect Database,28073
2,CrossRef,25728
3,bioRxiv,9867
4,Department of Energy Pages,6590
5,Research Papers in Economics,4816
6,Munich Personal RePEc Archive,4786
7,Preprints.org,3578
8,Hyper Articles en Ligne (HAL),3082
9,Digital.CSIC,2342


## <a id='misc'>MISC ANALYSIS</a> 

In [32]:
# check the sources of 'eee' and 'en' since they appear to be errors, especially eee

import json
from collections import defaultdict

# Per-source record counts, restricted to records whose language is exactly
# 'eee' (eee_sources) or exactly 'en' (en_sources). A record listing several
# sources is counted once under each of them.
eee_sources = defaultdict(int)
en_sources = defaultdict(int)

# UTF-8 is requested explicitly so decoding does not depend on the platform's
# locale default, which open() uses when no encoding is given.
with open(SHARE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        language = data['language']

        # The two languages are mutually exclusive, so pick the matching tally
        # once and share a single counting loop.
        if language == 'eee':
            tally = eee_sources
        elif language == 'en':
            tally = en_sources
        else:
            continue

        for source in data['sources']:
            tally[source] += 1

### SINGLE SOURCES FOR LANGUAGE = 'eee'

In [34]:
import pandas as pd
# Tabulate the sources of 'eee' records, largest first.
eee_counts_desc = sorted(eee_sources.items(), key=lambda item: item[1], reverse=True)
df = pd.DataFrame(eee_counts_desc, columns=['Source', 'Records'])

display(df)

Unnamed: 0,Source,Records
0,Preprints.org,3578
1,CrossRef,2102
2,arXiv,284
3,University of Texas at Austin Digital Repository,237
4,Virginia Tech VTechWorks,46
5,Hyper Articles en Ligne (HAL),9
6,DSpace@MIT,8
7,CERN Document Server,4
8,Digital Access to Scholarship at Harvard,3
9,OSF,2


### SINGLE SOURCES FOR LANGUAGE = 'en'

In [35]:
import pandas as pd
# Tabulate the sources of 'en' records, largest first.
en_counts_desc = sorted(en_sources.items(), key=lambda item: item[1], reverse=True)
df = pd.DataFrame(en_counts_desc, columns=['Source', 'Records'])

display(df)

Unnamed: 0,Source,Records
0,bioRxiv,9862
1,CrossRef,7668
2,PeerJ,1088
3,DataCite MDS,38
4,Hyper Articles en Ligne (HAL),15
5,arXiv,11
6,Zenodo,9
7,figshare,9
8,PubMed Central,3
9,Digital Access to Scholarship at Harvard,2


In [36]:
# tally which combinations of sources appear together on 'eee' and 'en' records (each record's source list is sorted and joined with '--')

import json
from collections import defaultdict

# Number of 'eee' / 'en' records per *combination* of sources. Each record
# contributes exactly one count, keyed by its source names sorted and joined
# with '--'.
eee_sources_combined = defaultdict(int)
en_sources_combined = defaultdict(int)

# UTF-8 is requested explicitly so decoding does not depend on the platform's
# locale default, which open() uses when no encoding is given.
with open(SHARE_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        language = data['language']

        if language == 'eee':
            tally = eee_sources_combined
        elif language == 'en':
            tally = en_sources_combined
        else:
            continue

        # Sorting makes the key independent of the order in which a record
        # lists its sources; sorted() builds a new list, leaving the parsed
        # record untouched.
        tally['--'.join(sorted(data['sources']))] += 1

### COMBINED SOURCES FOR LANGUAGE = 'eee'

In [37]:
import pandas as pd
# Tabulate the source combinations of 'eee' records, largest first.
eee_combined_desc = sorted(eee_sources_combined.items(), key=lambda item: item[1], reverse=True)
df = pd.DataFrame(eee_combined_desc, columns=['Source', 'Records'])

display(df)

Unnamed: 0,Source,Records
0,CrossRef--Preprints.org,1917
1,Preprints.org,1658
2,CrossRef--University of Texas at Austin Digita...,139
3,University of Texas at Austin Digital Reposito...,77
4,CrossRef--Virginia Tech VTechWorks--arXiv,27
5,Virginia Tech VTechWorks--arXiv,15
6,CrossRef--DSpace@MIT--University of Texas at A...,6
7,Hyper Articles en Ligne (HAL)--University of T...,4
8,CERN Document Server--CrossRef--Virginia Tech ...,3
9,CrossRef--Digital Access to Scholarship at Har...,2


### COMBINED SOURCES FOR LANGUAGE = 'en'

In [38]:
import pandas as pd
# Tabulate the source combinations of 'en' records, largest first.
en_combined_desc = sorted(en_sources_combined.items(), key=lambda item: item[1], reverse=True)
df = pd.DataFrame(en_combined_desc, columns=['Source', 'Records'])

display(df)

Unnamed: 0,Source,Records
0,CrossRef--bioRxiv,6547
1,bioRxiv,3255
2,CrossRef--PeerJ,1072
3,CrossRef--Hyper Articles en Ligne (HAL)--bioRxiv,11
4,CrossRef--DataCite MDS--bioRxiv,9
5,DataCite MDS--bioRxiv,9
6,CrossRef--DataCite MDS--PeerJ,8
7,CrossRef--DataCite MDS--Zenodo--bioRxiv,6
8,PeerJ--arXiv,4
9,Hyper Articles en Ligne (HAL)--bioRxiv,3
