In [1]:
import os
import re
import json
import tqdm
import fasttext
import numpy as np
import pandas as pd
import networkx as nx
from numpy import array # needed for eval()

In [2]:
def get_primary_discipline(category_str):
    try:
        category_dict = eval(category_str)
        first_level_disciplines = category_dict["first_level"]["full"]
        if len(first_level_disciplines) > 0:
            return first_level_disciplines[0]["name"]
        else:
            return None
    except (KeyError, IndexError, SyntaxError, TypeError, ValueError):
        return None

In [3]:
def get_country(country_str):
    try:
        country_list = [s.strip("'") for s in re.split(r"'\s+", country_str.strip("[]")) if s.strip("'")]
        if len(country_list) > 0:
            return country_list[0]
        else:
            return None
    except (KeyError, IndexError, SyntaxError, TypeError, ValueError):
        return None

In [4]:
fasttext.FastText.eprint = lambda *args, **kwargs: None
fasttext_model_path = "s2_fos/data/lid.176.bin"
fasttext_model = fasttext.load_model(fasttext_model_path)

In [5]:
def get_language(row):
    try:
        text = eval(row["title"]).get("preferred", "") + eval(row["abstract"]).get("preferred", "")
        if text:
            pred = fasttext_model.predict(text.replace("\n", " "))
            label = pred[0][0].replace("__label__", "")
            return label
        else:
            return None
    except (KeyError, IndexError, SyntaxError, TypeError, ValueError):
        return None

## citations_of_all_pubs.csv

### second pass (languages)

In [6]:
filtered_df = pd.read_csv("data/filtered_citations.csv")

With references:

In [17]:
languages = filtered_df[filtered_df["reference_ids"] != '[]'].apply(get_language, axis=1)

In [18]:
languages.count()

916881

In [19]:
languages.value_counts() / languages.count()

en    0.779176
pt    0.066269
es    0.062652
id    0.061454
uk    0.005358
        ...   
ba    0.000001
bn    0.000001
sw    0.000001
mk    0.000001
he    0.000001
Name: count, Length: 65, dtype: float64

With DOIS:

In [20]:
languages = filtered_df[filtered_df["doi"].notnull()].apply(get_language, axis=1)

In [21]:
languages.count()

912706

In [22]:
languages.value_counts() / languages.count()

en    0.778199
pt    0.066572
es    0.062933
id    0.061735
uk    0.005383
        ...   
ba    0.000001
bn    0.000001
sw    0.000001
mk    0.000001
he    0.000001
Name: count, Length: 65, dtype: float64

### first pass (disciplines and countries)

In [4]:
with open("data/ojs_issns.json", "r") as infile:
    ojs_issns = json.load(infile)
ojs_issns = {k: 1 for k in ojs_issns}

In [5]:
cite_df = pd.read_csv("data/citations_of_all_pubs.csv", header=None)

In [6]:
cite_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0,pub.1011627231,['pub.1000621744' 'pub.1000722723' 'pub.100103...,"[{'id': 'pub.1115166073', 'year': 2016}\n {'id...",10.1152/ajpendo.90306.2008,0193-1849,1522-1555,article,2008-05-20,"{'first_level': {'codes': array(['31', '42'], ...","{'times_cited': 100, 'recent_citations': 6, 'f...",['grid.6451.6'],['ur.07527627403.94' 'ur.01276472107.59'],"{'id': 'jour.1327387', 'title': 'AJP Endocrino...",['Israel'],"{'id': None, 'score': None}",{'preferred': 'Transcriptional regulation of t...,{'preferred': 'The insulin-responsive glucose ...,[]
1,1,pub.1011182182,['pub.1013726283' 'pub.1018900088' 'pub.102673...,"[{'id': 'pub.1102285413', 'year': 2010}\n {'id...",10.1353/apa.0.0007,0360-5949,1533-0699,article,2008-03-01,"{'first_level': {'codes': array(['43', '44', '...","{'times_cited': 7, 'recent_citations': 2, 'fie...",['grid.266515.3'],['ur.011637600145.05'],"{'id': 'jour.1143161', 'title': 'Transactions ...",['United States'],"{'id': '56626804', 'score': 9}",{'preferred': 'Genus quid est?: Roman Scholars...,{'preferred': 'From at least as early as Varro...,[]


In [38]:
cite_df.columns = ["index", "publication_id", "reference_ids", "citing_ids", "doi", "issn", "eissn", "type", "date", "category_for", "times_cited", "research_org_cities", "ur_id", "source", "research_org_country_names", "altmetrics", "title", "abstract", "concepts"]

In [44]:
filtered_df = cite_df[(cite_df.iloc[:, 5].isin(ojs_issns.keys())) | (cite_df.iloc[:, 6].isin(ojs_issns.keys()))]
filtered_df.count()

index                         1026862
publication_id                1026862
reference_ids                 1026862
citing_ids                    1026862
doi                           1021679
issn                           903907
eissn                          955421
type                          1026862
date                          1026328
category_for                   780600
times_cited                   1026853
research_org_cities           1026862
ur_id                         1026862
source                        1026862
research_org_country_names    1026862
altmetrics                    1026862
title                         1026862
abstract                       916881
concepts                      1026862
dtype: int64

In [52]:
cite_df.count()

index                         15109997
publication_id                15109997
reference_ids                 15109997
citing_ids                    15109997
doi                           15048232
issn                          13234995
eissn                         14430559
type                          15109997
date                          15106839
category_for                  14476049
times_cited                   15109966
research_org_cities           15109997
ur_id                         15109997
source                        15056836
research_org_country_names    15109997
altmetrics                    15109997
title                         15109997
abstract                      14071618
concepts                      15109997
dtype: int64

In [51]:
filtered_df.count() / cite_df.count()

index                         0.067959
publication_id                0.067959
reference_ids                 0.067959
citing_ids                    0.067959
doi                           0.067894
issn                          0.068297
eissn                         0.066208
type                          0.067959
date                          0.067938
category_for                  0.053924
times_cited                   0.067959
research_org_cities           0.067959
ur_id                         0.067959
source                        0.068199
research_org_country_names    0.067959
altmetrics                    0.067959
title                         0.067959
abstract                      0.065158
concepts                      0.067959
dtype: float64

In [45]:
disciplines = filtered_df["category_for"].apply(get_primary_discipline)
countries = filtered_df["research_org_country_names"].apply(get_country)

In [46]:
copied_df = filtered_df.copy()
copied_df["primary_discipline"] = disciplines
copied_df["primary_country"] = countries
copied_df.to_csv("data/filtered_citations.csv", index=False)

In [48]:
copied_df["primary_discipline"].value_counts().sum()

780600

In [49]:
copied_df["primary_discipline"].value_counts()

primary_discipline
Biomedical and Clinical Sciences              142482
Biological Sciences                            97659
Education                                      67178
Agricultural, Veterinary and Food Sciences     60717
Commerce, Management, Tourism and Services     56452
Health Sciences                                46691
Human Society                                  42591
Language, Communication and Culture            32139
Engineering                                    31361
Earth Sciences                                 28515
Mathematical Sciences                          24728
Information and Computing Sciences             20644
Chemical Sciences                              19546
Philosophy and Religious Studies               18545
Physical Sciences                              16437
Creative Arts and Writing                      16260
History, Heritage and Archaeology              11936
Built Environment and Design                   10997
Economics                  

In [50]:
copied_df["primary_discipline"].value_counts() / copied_df["primary_discipline"].value_counts().sum()

primary_discipline
Biomedical and Clinical Sciences              0.182529
Biological Sciences                           0.125108
Education                                     0.086059
Agricultural, Veterinary and Food Sciences    0.077782
Commerce, Management, Tourism and Services    0.072319
Health Sciences                               0.059814
Human Society                                 0.054562
Language, Communication and Culture           0.041172
Engineering                                   0.040176
Earth Sciences                                0.036530
Mathematical Sciences                         0.031678
Information and Computing Sciences            0.026446
Chemical Sciences                             0.025040
Philosophy and Religious Studies              0.023757
Physical Sciences                             0.021057
Creative Arts and Writing                     0.020830
History, Heritage and Archaeology             0.015291
Built Environment and Design                  

Articles with references:

In [59]:
sum(copied_df["reference_ids"] != '[]')

1026862

In [60]:
copied_df["reference_ids"].count()

1026862

In [61]:
sum(copied_df["reference_ids"] != '[]') / copied_df["reference_ids"].count()

1.0

In [62]:
copied_df[copied_df["reference_ids"] != '[]']["primary_discipline"].value_counts() / copied_df[copied_df["reference_ids"] != '[]']["primary_discipline"].count()

primary_discipline
Biomedical and Clinical Sciences              0.182529
Biological Sciences                           0.125108
Education                                     0.086059
Agricultural, Veterinary and Food Sciences    0.077782
Commerce, Management, Tourism and Services    0.072319
Health Sciences                               0.059814
Human Society                                 0.054562
Language, Communication and Culture           0.041172
Engineering                                   0.040176
Earth Sciences                                0.036530
Mathematical Sciences                         0.031678
Information and Computing Sciences            0.026446
Chemical Sciences                             0.025040
Philosophy and Religious Studies              0.023757
Physical Sciences                             0.021057
Creative Arts and Writing                     0.020830
History, Heritage and Archaeology             0.015291
Built Environment and Design                  

In [63]:
copied_df[copied_df["reference_ids"] != '[]']["primary_discipline"].count()

780600

In [64]:
copied_df["primary_country"].value_counts() / copied_df["primary_country"].count()

primary_country
Indonesia                           0.168843
United States                       0.155495
Brazil                              0.150568
United Kingdom                      0.044686
Spain                               0.031880
                                      ...   
Cook Islands                        0.000003
Cayman Islands                      0.000003
Anguilla                            0.000001
Cocos Islands                       0.000001
Saint Vincent and the Grenadines    0.000001
Name: count, Length: 226, dtype: float64

In [65]:
copied_df["primary_country"].count()

753209

In [68]:
sum(copied_df["citing_ids"] != '[]') / copied_df["citing_ids"].count()

0.5735639258245022

In [69]:
sum(copied_df["citing_ids"] != '[]')

588971

Null DOI:

In [70]:
copied_df["doi"].notnull().sum()

1021679

In [71]:
copied_df["doi"].notnull().sum() / copied_df["index"].count()

0.9949525836967382

In [72]:
copied_df[copied_df["doi"].notnull()]["primary_discipline"].value_counts() / copied_df[copied_df["doi"].notnull()]["primary_discipline"].count()

primary_discipline
Biomedical and Clinical Sciences              0.180027
Biological Sciences                           0.125763
Education                                     0.086613
Agricultural, Veterinary and Food Sciences    0.078218
Commerce, Management, Tourism and Services    0.072783
Health Sciences                               0.059694
Human Society                                 0.054894
Language, Communication and Culture           0.041436
Engineering                                   0.040428
Earth Sciences                                0.036767
Mathematical Sciences                         0.031880
Chemical Sciences                             0.025188
Information and Computing Sciences            0.024642
Philosophy and Religious Studies              0.023908
Physical Sciences                             0.021194
Creative Arts and Writing                     0.020960
History, Heritage and Archaeology             0.015390
Built Environment and Design                  

In [73]:
copied_df[copied_df["doi"].notnull()]["primary_discipline"].count()

775555

In [74]:
copied_df[copied_df["doi"].notnull()]["primary_country"].value_counts() / copied_df[copied_df["doi"].notnull()]["primary_country"].count()

primary_country
Indonesia                           0.169783
United States                       0.155091
Brazil                              0.151342
United Kingdom                      0.044826
Spain                               0.031988
                                      ...   
Antarctica                          0.000003
Cook Islands                        0.000003
Anguilla                            0.000001
Cocos Islands                       0.000001
Saint Vincent and the Grenadines    0.000001
Name: count, Length: 226, dtype: float64

In [75]:
copied_df[copied_df["doi"].notnull()]["primary_country"].count()

749001

## references_of_all_pubs.csv

### second pass (languages)

In [23]:
filtered_df = pd.read_csv("data/filtered_references.csv")

With references:

In [24]:
languages = filtered_df[filtered_df["reference_ids"] != '[]'].apply(get_language, axis=1)

In [25]:
languages.count()

520794

In [26]:
languages.value_counts() / languages.count()

en    0.849793
es    0.049787
pt    0.040772
id    0.033278
fr    0.005240
ru    0.004485
uk    0.003654
de    0.002865
pl    0.001684
it    0.001463
lt    0.001292
hu    0.001194
tr    0.000960
ar    0.000753
cs    0.000541
ca    0.000442
da    0.000298
no    0.000246
sv    0.000232
vi    0.000200
af    0.000175
sh    0.000115
zh    0.000096
ms    0.000060
nl    0.000058
uz    0.000044
hr    0.000044
sk    0.000042
sl    0.000027
ka    0.000017
sr    0.000017
el    0.000015
et    0.000015
ja    0.000015
lv    0.000012
fi    0.000012
ro    0.000012
mn    0.000010
ky    0.000006
gl    0.000006
nn    0.000006
la    0.000004
bg    0.000004
os    0.000004
fa    0.000002
ne    0.000002
ko    0.000002
Name: count, dtype: float64

With DOIs:

In [27]:
languages = filtered_df[filtered_df["doi"].notnull()].apply(get_language, axis=1)

In [28]:
languages.count()

1004614

In [30]:
languages.value_counts() / languages.count()

en     6.793624e-01
id     1.114279e-01
es     9.044767e-02
pt     8.681643e-02
uk     6.227267e-03
           ...     
ba     9.954072e-07
ko     9.954072e-07
ast    9.954072e-07
kk     9.954072e-07
sw     9.954072e-07
Name: count, Length: 65, dtype: float64

### first pass (disciplines and countries)

In [23]:
filtered_df = pd.read_csv("data/filtered_references.csv")

In [36]:
cites = [eval(s)["times_cited"] for s in filtered_df["times_cited"] if isinstance(s, str)]

In [37]:
len(cites) / filtered_df.count()

index                         0.999989
publication_id                0.999989
reference_ids                 0.999989
citing_ids                    0.999989
doi                           1.026331
issn                          1.122841
eissn                         1.089701
type                          0.999989
date                          1.000208
category_for                  1.503855
times_cited                   1.000000
research_org_cities           0.999989
ur_id                         0.999989
source                        0.999989
research_org_country_names    0.999989
altmetrics                    0.999989
title                         0.999989
abstract                      1.219175
concepts                      0.999989
idk                           0.999989
primary_discipline            1.503855
dtype: float64

In [35]:
len(cites)

1255148

In [4]:
with open("data/ojs_issns.json", "r") as infile:
    ojs_issns = json.load(infile)
ojs_issns = {k: 1 for k in ojs_issns}

In [5]:
#ref_df = pd.read_csv("data/references_of_all_pubs.csv", header=None)
#ref_df.columns = ["index", "publication_id", "reference_ids", "citing_ids", "doi", "issn", "eissn", "type", "date", "category_for", "times_cited", "research_org_cities", "ur_id", "source", "research_org_country_names", "altmetrics", "title", "abstract", "concepts", "idk"]

In [6]:
#filtered_df = ref_df[(ref_df.iloc[:, 5].isin(ojs_issns.keys())) | (ref_df.iloc[:, 6].isin(ojs_issns.keys()))]
#filtered_df.count()
#filtered_df.to_csv("data/filtered_references.csv", index=False)
#disciplines = filtered_df["category_for"].apply(get_primary_discipline)
#copied_df = filtered_df.copy()
#copied_df["primary_discipline"] = disciplines
#copied_df.to_csv("data/filtered_references.csv", index=False)

In [15]:
filtered_df["primary_discipline"].value_counts()

primary_discipline
Biomedical and Clinical Sciences              145569
Biological Sciences                            94986
Education                                      75953
Commerce, Management, Tourism and Services     67879
Agricultural, Veterinary and Food Sciences     62793
Health Sciences                                48614
Human Society                                  48483
Language, Communication and Culture            37811
Engineering                                    33152
Philosophy and Religious Studies               28095
Earth Sciences                                 27967
Information and Computing Sciences             23869
Mathematical Sciences                          23156
Chemical Sciences                              19034
Creative Arts and Writing                      16388
History, Heritage and Archaeology              13789
Physical Sciences                              13742
Economics                                      12634
Built Environment and Desig

In [14]:
filtered_df["primary_discipline"].value_counts() / filtered_df["primary_discipline"].count()

primary_discipline
Biomedical and Clinical Sciences              0.174240
Biological Sciences                           0.113694
Education                                     0.090913
Commerce, Management, Tourism and Services    0.081248
Agricultural, Veterinary and Food Sciences    0.075161
Health Sciences                               0.058189
Human Society                                 0.058032
Language, Communication and Culture           0.045258
Engineering                                   0.039682
Philosophy and Religious Studies              0.033629
Earth Sciences                                0.033475
Information and Computing Sciences            0.028570
Mathematical Sciences                         0.027717
Chemical Sciences                             0.022783
Creative Arts and Writing                     0.019616
History, Heritage and Archaeology             0.016505
Physical Sciences                             0.016449
Economics                                     

In [16]:
countries = filtered_df["research_org_country_names"].apply(get_country)

In [17]:
countries.value_counts() / countries.count()

research_org_country_names
Brazil                            0.158008
Indonesia                         0.151695
United States                     0.148292
United Kingdom                    0.047197
Spain                             0.040669
                                    ...   
Mayotte                           0.000002
Cayman Islands                    0.000002
Sao Tome and Principe             0.000002
British Indian Ocean Territory    0.000002
Saint Martin                      0.000002
Name: count, Length: 229, dtype: float64

In [18]:
countries.value_counts()

research_org_country_names
Brazil                            103362
Indonesia                          99232
United States                      97006
United Kingdom                     30874
Spain                              26604
                                   ...  
Mayotte                                1
Cayman Islands                         1
Sao Tome and Principe                  1
British Indian Ocean Territory         1
Saint Martin                           1
Name: count, Length: 229, dtype: int64

Null references:

In [19]:
sum(filtered_df["reference_ids"] != '[]')

577821

In [20]:
sum(filtered_df["reference_ids"] != '[]') / filtered_df["reference_ids"].count()

0.45989880683758233

In [35]:
filtered_df[filtered_df["reference_ids"] != '[]']["primary_discipline"].value_counts() / filtered_df[filtered_df["reference_ids"] != '[]']["primary_discipline"].count()

primary_discipline
Biomedical and Clinical Sciences              0.191364
Biological Sciences                           0.149586
Agricultural, Veterinary and Food Sciences    0.077772
Education                                     0.067693
Health Sciences                               0.065798
Commerce, Management, Tourism and Services    0.064097
Human Society                                 0.047718
Engineering                                   0.045062
Earth Sciences                                0.041530
Mathematical Sciences                         0.036555
Language, Communication and Culture           0.033553
Chemical Sciences                             0.031497
Physical Sciences                             0.025524
Information and Computing Sciences            0.025064
Philosophy and Religious Studies              0.017986
Psychology                                    0.013662
Economics                                     0.013307
Built Environment and Design                  

In [36]:
filtered_df[filtered_df["reference_ids"] != '[]']["primary_discipline"].count()

484997

In [27]:
countries = filtered_df[filtered_df["reference_ids"] != '[]']["research_org_country_names"].apply(get_country)

In [29]:
countries.count()

456772

In [28]:
countries.value_counts() / countries.count()

research_org_country_names
United States     0.189920
Brazil            0.151802
Indonesia         0.095336
United Kingdom    0.059500
Spain             0.029260
                    ...   
Vatican           0.000002
Aland Islands     0.000002
Cayman Islands    0.000002
Norfolk Island    0.000002
Sint Maarten      0.000002
Name: count, Length: 224, dtype: float64

With DOIs:

In [30]:
filtered_df["doi"].notnull().sum()

1224162

In [31]:
filtered_df["doi"].notnull().sum() / filtered_df["index"].count()

0.974333994742158

In [37]:
filtered_df[filtered_df["doi"].notnull()]["primary_discipline"].value_counts() / filtered_df[filtered_df["doi"].notnull()]["primary_discipline"].count()

primary_discipline
Biomedical and Clinical Sciences              0.157354
Biological Sciences                           0.112666
Education                                     0.094284
Commerce, Management, Tourism and Services    0.084153
Agricultural, Veterinary and Food Sciences    0.076088
Human Society                                 0.060056
Health Sciences                               0.057318
Language, Communication and Culture           0.046952
Engineering                                   0.041083
Philosophy and Religious Studies              0.034717
Earth Sciences                                0.034709
Mathematical Sciences                         0.028748
Information and Computing Sciences            0.027131
Chemical Sciences                             0.023504
Creative Arts and Writing                     0.020329
History, Heritage and Archaeology             0.017069
Physical Sciences                             0.016845
Economics                                     

In [38]:
filtered_df[filtered_df["doi"].notnull()]["primary_discipline"].count()

805103

In [39]:
countries = filtered_df[filtered_df["doi"].notnull()]["research_org_country_names"].apply(get_country)

In [40]:
countries.count()

633585

In [41]:
countries.value_counts() / countries.count()

research_org_country_names
Brazil                            0.162136
Indonesia                         0.156233
United States                     0.150627
United Kingdom                    0.048314
Spain                             0.041391
                                    ...   
Norfolk Island                    0.000002
Mayotte                           0.000002
Sao Tome and Principe             0.000002
British Indian Ocean Territory    0.000002
research_org_country_names        0.000002
Name: count, Length: 228, dtype: float64