# CORD-19-collect-scopus-data

This Jupyter notebook collects additional publication metadata from Scopus in order to extend the CORD-19 dataset:
https://datadryad.org/stash/dataset/doi:10.5061/dryad.vmcvdncs0

First, relevant packages must be imported to the Notebook.

In [1]:
import numpy as np
import pandas as pd
import csv
import ast
import collections
import matplotlib.pyplot as plt
import Levenshtein as lev
from fuzzywuzzy import fuzz 
import datetime
import matplotlib.pyplot as plt
import re
from urllib.parse import urlparse
from collections import Counter

from elsapy.elsclient import ElsClient
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch

import time # for sleep
from pybtex.database import parse_file, BibliographyData, Entry
import json
from elsapy.elsclient import ElsClient
from elsapy.elsdoc import AbsDoc
from elsapy.elssearch import ElsSearch

In [2]:
# Read the CORD-19 software-mentions table that supplies the DOIs to look up.
cord19_csv_path = '../data/cord-19/CORD19_software_mentions.csv'
CORD19_CSV = pd.read_csv(cord19_csv_path)

In [3]:
# Number of rows in the 'doi' column (missing values included).
CORD19_CSV['doi'].shape[0]

77448

In [4]:
# Pull out the DOI column as its own Series and display it.
doi = CORD19_CSV.loc[:, 'doi']
doi

0                                 NaN
1          10.1016/j.regg.2021.01.002
2           10.1016/j.rec.2020.08.002
3        10.1016/j.vetmic.2006.11.026
4                   10.3390/v12080849
                     ...             
77443      10.1007/s11229-020-02869-9
77444                             NaN
77445     10.1101/2020.05.13.20100206
77446      10.1007/s42991-020-00052-8
77447     10.1101/2020.09.14.20194670
Name: doi, Length: 77448, dtype: object

In [5]:
# value_counts() drops NaN entries (pandas default dropna=True) and collapses duplicate
# DOIs: the index of the result holds each distinct DOI once, the values are its counts.
# NOTE(review): a later cell resumes collection by integer position into this index --
# confirm that the ordering is reproducible between kernel sessions before relying on it.
doi_counted = doi.value_counts()
doi_counted

10.31729/jnma.5498               2
10.1016/j.dsx.2020.04.012        2
10.1021/acsinfecdis.6b00006      1
10.1093/jamia/ocaa105            1
10.1111/jocd.13688               1
                                ..
10.1038/srep26311                1
10.1016/j.resplu.2021.100088     1
10.1016/j.mbs.2020.108499        1
10.1016/s0022-1759(03)00244-8    1
10.1016/0167-5877(95)00498-x     1
Name: doi, Length: 74302, dtype: int64

In [6]:
def AffiliationsFromScopusByDOI(client, doi):
    """Obtain additional paper information from Scopus by DOI.

    Runs a Scopus search for ``DOI(<doi>)``, takes the Scopus id of the first
    hit and reads the corresponding abstract document.

    Args:
        client: Client object handed to the elsapy search/read calls.
        doi: DOI string of the paper to look up.

    Returns:
        The ``data`` attribute of the abstract document that was read, or
        ``None`` when ``doi`` is not a non-empty string, the search yields no
        usable hit, or reading/writing the document raises an error.

    Raises:
        Whatever ``ElsSearch.execute`` raises; a failing search request is not
        swallowed, so the caller notices that the lookup did not happen.
        ``KeyboardInterrupt``/``SystemExit`` always propagate.
    """
    # Missing DOIs (e.g. NaN floats) cannot be concatenated into a query.
    if not isinstance(doi, str) or not doi.strip():
        return None

    # NOTE(review): DOIs that contain parentheses are inserted unquoted here --
    # confirm that Scopus parses such queries as intended.
    doc_srch = ElsSearch("DOI("+doi+")", 'scopus')
    doc_srch.execute(client, get_all = True)

    # "No usable hit" is an expected outcome, so only the lookup errors that
    # signal it are handled here.
    try:
        scopus_id = doc_srch.results[0]["dc:identifier"].split(":")[1]
    except (IndexError, KeyError, TypeError, AttributeError):
        return None

    # Best effort for the document itself: report the problem and return None,
    # but catch Exception (not a bare except) so Ctrl-C still stops a long run.
    try:
        scp_doc = AbsDoc(scp_id = scopus_id)
        if scp_doc.read(client):
            # presumably persists the record through elsapy -- TODO confirm target location
            scp_doc.write()
        else:
            print ("Read document failed.")
        return scp_doc.data
    except Exception as exc:
        print ("Scopus lookup failed for " + doi + ": " + repr(exc))
        return None

In [7]:
## Load configuration
# config.json must provide an "apikey" entry; the context manager closes the
# file even when parsing fails.
with open("config.json", encoding="utf-8") as con_file:
    config = json.load(con_file)
## Initialize client
client = ElsClient(config['apikey'])
# Shape of the underlying REST call (never paste a real key into the notebook):
# https://api.elsevier.com/content/search/scopus?query=DOI(10.1109/MCOM.2016.7509373)&apiKey=<YOUR_API_KEY>

In [8]:
# Load the results collected in earlier sessions; the number of rows is used
# further down as the position at which collection resumes.
# NOTE: unpickling can execute arbitrary code -- only load a pickle you created yourself.
df_current_extra_info = pd.read_pickle('extra_info_CS5099.pkl')
df_current_extra_info

Unnamed: 0,affiliation,coredata
0,"[{'affiliation-city': None, 'affilname': 'Pata...","{'srctype': 'j', 'prism:issueIdentifier': '230..."
1,"[{'affiliation-city': 'New Delhi', 'affilname'...","{'srctype': 'j', 'eid': '2-s2.0-85083171050', ..."
2,,
3,"[{'affiliation-city': 'London', 'affilname': '...","{'srctype': 'j', 'eid': '2-s2.0-79953057246', ..."
4,"[{'affiliation-city': 'Baoding', 'affilname': ...","{'srctype': 'j', 'prism:issueIdentifier': '7',..."
...,...,...
17267,,
17268,,
17269,"[{'affiliation-city': 'Dunedin', 'affilname': ...","{'srctype': 'j', 'eid': '2-s2.0-85105184346', ..."
17270,,


In [9]:
# Row count of the already collected records.
len_df_current_extra_info = df_current_extra_info.shape[0]
len_df_current_extra_info

17272

In [10]:
%%time
i = len_df_current_extra_info
dict_new_extra_info = dict()
len_dois = len(doi_counted)
while i < len_dois:
    print("Position: " + str(i) + " -> " +  doi_counted.index[i])
    dict_new_extra_info[i] = AffiliationsFromScopusByDOI(client, doi_counted.index[i])
    i = i + 1 

Position: 17272 -> 10.1186/1758-2652-7-2-71
Position: 17273 -> 10.3389/fmed.2020.594623
Position: 17274 -> 10.1016/j.pnpbp.2021.110254
Position: 17275 -> 10.1016/j.stlm.2020.100001
Position: 17276 -> 10.1136/bmjopen-2020-042647
Position: 17277 -> 10.1155/2020/1025913
Position: 17278 -> 10.1101/2020.07.03.185850
Position: 17279 -> 10.1016/j.virol.2014.07.020
Position: 17280 -> 10.1002/pa.2266
Position: 17281 -> 10.3390/ijerph17072365
Position: 17282 -> 10.1016/j.drudis.2020.10.002
Position: 17283 -> 10.3390/ijerph17238999
Position: 17284 -> 10.1101/2020.12.16.20248358
Position: 17285 -> 10.3389/fpsyg.2020.570567
Position: 17286 -> 10.1007/s00705-006-0750-y
Position: 17287 -> 10.1177/1757975920977837
Position: 17288 -> 10.1034/j.1399-3038.2003.00064.x
Position: 17289 -> 10.3390/microorganisms8060850
Position: 17290 -> 10.1101/2020.09.24.20197632
Position: 17291 -> 10.3390/v4102137
Position: 17292 -> 10.1016/j.ijnurstu.2020.103644
Position: 17293 -> 10.1101/2020.04.01.019877
Position: 172

Position: 17453 -> 10.3343/alm.2017.37.5.408
Position: 17454 -> 10.1101/2020.05.06.074039
Position: 17455 -> 10.1101/2020.08.23.20177501
Position: 17456 -> 10.1101/2020.09.30.20204719
Position: 17457 -> 10.1055/s-0040-1716334
Position: 17458 -> 10.1016/j.vaccine.2016.11.064
Position: 17459 -> 10.2147/ott.s227915
Position: 17460 -> 10.1038/cr.2015.113
Position: 17461 -> 10.1097/md.0000000000023261
Position: 17462 -> 10.1186/s12913-021-06081-4
Position: 17463 -> 10.7759/cureus.10906
Position: 17464 -> 10.1016/j.tcm.2020.05.008
Position: 17465 -> 10.1007/978-3-030-50420-5_6
Position: 17466 -> 10.1007/978-3-030-51310-8_22
Position: 17467 -> 10.7717/peerj.6748
Position: 17468 -> 10.1016/j.heliyon.2020.e05410
Position: 17469 -> 10.1007/s40520-020-01755-1
Position: 17470 -> 10.1093/cid/cis629
Position: 17471 -> 10.4252/wjsc.v12.i8.814
Position: 17472 -> 10.1016/j.rtbm.2020.100570
Position: 17473 -> 10.1186/s12985-020-01452-5
Position: 17474 -> 10.5455/javar.2019.f324
Position: 17475 -> 10.101

Position: 17636 -> 10.1007/s10072-021-05068-7
Position: 17637 -> 10.1101/2020.09.16.20195693
Position: 17638 -> 10.1164/rccm.202007-2934ed
Position: 17639 -> 10.1039/c2ib20204b
Position: 17640 -> 10.1002/jmv.26330
Position: 17641 -> 10.1200/op.20.00210
Position: 17642 -> 10.1016/j.exer.2020.108273
Position: 17643 -> 10.1101/352898
Position: 17644 -> 10.1186/s41205-020-00064-7
Position: 17645 -> 10.1186/s13063-020-04467-z
Position: 17646 -> 10.7554/elife.37663
Position: 17647 -> 10.1136/bmjgh-2020-004085
Position: 17648 -> 10.1007/s00068-020-01383-4
Position: 17649 -> 10.1007/s11262-017-1526-z
Position: 17650 -> 10.1007/s11419-020-00567-2
Position: 17651 -> 10.1097/qai.0000000000002540
Position: 17652 -> 10.1016/j.promfg.2020.05.139
Position: 17653 -> 10.1111/spol.12670
Position: 17654 -> 10.1111/1747-0080.12619
Position: 17655 -> 10.1007/s00092-020-4141-6
Position: 17656 -> 10.1016/j.vetmic.2007.03.016
Position: 17657 -> 10.1186/s12889-020-09590-9
Position: 17658 -> 10.1136/bmjopen-202

Position: 17818 -> 10.1371/journal.pone.0245556
Position: 17819 -> 10.1038/srep37725
Position: 17820 -> 10.1007/s00259-020-04994-6
Position: 17821 -> 10.1371/journal.pone.0242903
Position: 17822 -> 10.3390/genes11080942
Position: 17823 -> 10.1101/2020.12.09.418541
Position: 17824 -> 10.1038/s41401-020-0435-1
Position: 17825 -> 10.1371/journal.pone.0027915
Position: 17826 -> 10.4161/idp.27454
Position: 17827 -> 10.1016/j.dld.2020.05.023
Position: 17828 -> 10.1101/2020.12.01.20241067
Position: 17829 -> 10.1371/journal.pone.0028001
Position: 17830 -> 10.1292/jvms.14-0207
Position: 17831 -> 10.7759/cureus.11967
Position: 17832 -> 10.3389/fpsyg.2020.567101
Position: 17833 -> 10.1007/s00108-019-0612-9
Position: 17834 -> 10.2478/jvetres-2020-0050
Position: 17835 -> 10.1101/2020.09.09.287987
Position: 17836 -> 10.1109/jsen.2018.2829084
Position: 17837 -> 10.1038/s41598-020-63696-z
Position: 17838 -> 10.1016/j.xphs.2020.09.025
Position: 17839 -> 10.18632/aging.103255
Position: 17840 -> 10.1007/

Position: 18002 -> 10.1007/s40140-020-00424-3
Position: 18003 -> 10.1186/s40779-017-0144-3
Position: 18004 -> 10.1016/j.vetmic.2010.02.027
Position: 18005 -> 10.1093/ofid/ofaa468
Position: 18006 -> 10.1101/2020.06.07.138800
Position: 18007 -> 10.1016/j.ssci.2020.105058
Position: 18008 -> 10.1101/2020.09.28.314625
Position: 18009 -> 10.1016/j.prevetmed.2015.10.020
Position: 18010 -> 10.1111/ajt.16424
Position: 18011 -> 10.1101/2020.08.26.269183
Position: 18012 -> 10.1186/s13071-018-2697-8
Position: 18013 -> 10.1371/journal.pone.0237548
Position: 18014 -> 10.3201/eid1009.040058
Position: 18015 -> 10.1101/2020.09.22.20199406
Position: 18016 -> 10.1093/cid/ciaa1802
Position: 18017 -> 10.1016/j.eml.2020.100924
Position: 18018 -> 10.1016/j.jneb.2020.07.009
Position: 18019 -> 10.1101/2020.05.05.079194
Position: 18020 -> 10.1016/j.aprim.2020.12.006
Position: 18021 -> 10.1098/rsif.2011.0850
Position: 18022 -> 10.1016/j.ab.2005.06.037
Position: 18023 -> 10.1007/978-3-319-71321-2_89
Position: 180

Position: 18184 -> 10.1016/j.virusres.2004.11.004
Position: 18185 -> 10.1007/s40804-020-00195-8
Position: 18186 -> 10.1128/jvi.02595-13
Position: 18187 -> 10.1016/j.antiviral.2014.02.002
Position: 18188 -> 10.1017/s0950268819000062
Position: 18189 -> 10.3390/vaccines8020251
Position: 18190 -> 10.3402/ehtj.v6i0.19955
Position: 18191 -> 10.1007/s11262-006-0036-1
Position: 18192 -> 10.5414/alx02197e
Position: 18193 -> 10.1371/journal.pntd.0000530
Position: 18194 -> 10.1016/j.ajp.2020.102367
Position: 18195 -> 10.1007/s10654-014-9919-y
Position: 18196 -> 10.31557/apjcp.2019.20.3.757
Position: 18197 -> 10.3390/ijerph17197223
Position: 18198 -> 10.3390/antibiotics9050220
Position: 18199 -> 10.1007/s13193-020-01270-9
Position: 18200 -> 10.1016/j.amjsurg.2020.12.046
Position: 18201 -> 10.1093/bib/bbaa141
Position: 18202 -> 10.1007/s11135-021-01095-w
Position: 18203 -> 10.1007/s00737-020-01077-1
Position: 18204 -> 10.1016/s1875-5364(19)30021-4
Position: 18205 -> 10.3889/oamjms.2018.332
Position

Position: 18367 -> 10.1007/s11524-020-00465-3
Position: 18368 -> 10.7759/cureus.7737
Position: 18369 -> 10.1016/j.socscimed.2020.113473
Position: 18370 -> 10.1038/cdd.2016.3
Position: 18371 -> 10.1016/j.jstrokecerebrovasdis.2020.105182
Position: 18372 -> 10.1016/j.bbrc.2020.05.028
Position: 18373 -> 10.1007/s00210-008-0275-x
Position: 18374 -> 10.1016/j.epidem.2016.01.001
Position: 18375 -> 10.1097/mat.0000000000001191
Position: 18376 -> 10.3390/jof6040286
Position: 18377 -> 10.4142/jvs.2020.21.e50
Position: 18378 -> 10.1016/j.jviromet.2017.06.007
Position: 18379 -> 10.1097/txd.0000000000001054
Position: 18380 -> 10.1002/jcb.21332
Position: 18381 -> 10.1186/1743-422x-5-40
Position: 18382 -> 10.1080/14760584.2019.1604231
Position: 18383 -> 10.3389/fmed.2020.00486
Position: 18384 -> 10.1038/s41394-019-0196-6
Position: 18385 -> 10.1080/07391102.2020.1834457
Position: 18386 -> 10.1101/2020.07.25.20151225
Position: 18387 -> 10.1186/s12903-020-01274-5
Position: 18388 -> 10.1007/s43545-020-00

KeyboardInterrupt: 

In [11]:
# Inspect the raw lookup results; keys are positions in doi_counted, values are
# the returned Scopus records (None where the lookup returned nothing).
dict_new_extra_info

{17272: None,
 17273: {'affiliation': [{'affiliation-city': 'Vienna',
    'affilname': 'Vienna Institute of Demography',
    'affiliation-country': 'Austria'},
   {'affiliation-city': 'Vienna',
    'affilname': 'Klinik Landstrasse',
    'affiliation-country': 'Austria'}],
  'coredata': {'srctype': 'j',
   'eid': '2-s2.0-85098052064',
   'prism:coverDate': '2020-12-08',
   'prism:aggregationType': 'Journal',
   'prism:url': 'https://api.elsevier.com/content/abstract/scopus_id/85098052064',
   'subtypeDescription': 'Article',
   'dc:creator': {'author': [{'ce:given-name': 'Claudia',
      'preferred-name': {'ce:given-name': 'Claudia',
       'ce:initials': 'C.',
       'ce:surname': 'Stöllberger',
       'ce:indexed-name': 'Stöllberger C.'},
      '@seq': '1',
      'ce:initials': 'C.',
      '@_fa': 'true',
      'affiliation': {'@id': '125141646',
       '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/125141646'},
      'ce:surname': 'Stöllberger',
      '@auid': 

In [12]:
# Turn the position-keyed results into a DataFrame: each position becomes a
# column and the fields of the records become the rows.
df_new_extra_info = pd.DataFrame(dict_new_extra_info)

In [13]:
# Display the wide frame (one column per collected position).
df_new_extra_info

Unnamed: 0,17272,17273,17274,17275,17276,17277,17278,17279,17280,17281,...,18466,18467,18468,18469,18470,18471,18472,18473,18474,18475
affiliation,,"[{'affiliation-city': 'Vienna', 'affilname': '...","{'affiliation-city': 'Izmir', 'affilname': 'Iz...",,"[{'affiliation-city': 'Singapore City', 'affil...","[{'affiliation-city': 'Mohali', 'affilname': '...",,"[{'affiliation-city': 'Summerland', 'affilname...","{'affiliation-city': 'Mahendergarh', 'affilnam...","[{'affiliation-city': 'Taipei', 'affilname': '...",...,"[{'affiliation-city': 'Beijing', 'affilname': ...","[{'affiliation-city': 'Budapest', 'affilname':...","{'affiliation-city': 'Leeds', 'affilname': 'Un...","[{'affiliation-city': 'Seoul', 'affilname': 'S...","[{'affiliation-city': 'Pecs', 'affilname': 'Pé...","[{'affiliation-city': 'Rehovot', 'affilname': ...",,"[{'affiliation-city': 'Rockville', 'affilname'...",,"[{'affiliation-city': 'San Francisco', 'affiln..."
coredata,,"{'srctype': 'j', 'eid': '2-s2.0-85098052064', ...","{'srctype': 'j', 'eid': '2-s2.0-85100059646', ...",,"{'srctype': 'j', 'prism:issueIdentifier': '12'...","{'srctype': 'j', 'eid': '2-s2.0-85096210115', ...",,"{'srctype': 'j', 'eid': '2-s2.0-84906233835', ...","{'srctype': 'j', 'prism:issueIdentifier': '2',...","{'srctype': 'j', 'prism:issueIdentifier': '7',...",...,"{'srctype': 'j', 'prism:issueIdentifier': '1',...","{'srctype': 'j', 'prism:issueIdentifier': '1',...","{'srctype': 'j', 'prism:issueIdentifier': '11'...","{'srctype': 'j', 'eid': '2-s2.0-85086878323', ...","{'srctype': 'j', 'eid': '2-s2.0-85097520338', ...","{'srctype': 'j', 'eid': '2-s2.0-85100044416', ...","{'srctype': 'j', 'eid': '2-s2.0-33645723465', ...","{'srctype': 'j', 'eid': '2-s2.0-85096082670', ...",,"{'srctype': 'j', 'prism:issueIdentifier': 'AUG..."


In [14]:
# Flip rows and columns so that every collected position becomes one row.
df_new_extra_transposed = df_new_extra_info.transpose()
df_new_extra_transposed

Unnamed: 0,affiliation,coredata
17272,,
17273,"[{'affiliation-city': 'Vienna', 'affilname': '...","{'srctype': 'j', 'eid': '2-s2.0-85098052064', ..."
17274,"{'affiliation-city': 'Izmir', 'affilname': 'Iz...","{'srctype': 'j', 'eid': '2-s2.0-85100059646', ..."
17275,,
17276,"[{'affiliation-city': 'Singapore City', 'affil...","{'srctype': 'j', 'prism:issueIdentifier': '12'..."
...,...,...
18471,"[{'affiliation-city': 'Rehovot', 'affilname': ...","{'srctype': 'j', 'eid': '2-s2.0-85100044416', ..."
18472,,"{'srctype': 'j', 'eid': '2-s2.0-33645723465', ..."
18473,"[{'affiliation-city': 'Rockville', 'affilname'...","{'srctype': 'j', 'eid': '2-s2.0-85096082670', ..."
18474,,


In [15]:
# Append the newly collected rows to the stored ones and renumber the index from 0.
df_combined_extra_info = pd.concat(
    objs=[df_current_extra_info, df_new_extra_transposed],
    ignore_index=True,
)

In [16]:
# Persist the combined records. This overwrites the same file that was loaded
# as df_current_extra_info, so the next session resumes after these rows.
df_combined_extra_info.to_pickle('extra_info_CS5099.pkl')

In [17]:
# Look up a single, hand-picked DOI to inspect the structure of one record.
sample_doi = '10.1086/605034'
entry = AffiliationsFromScopusByDOI(client, sample_doi)
entry

{'affiliation': [{'affiliation-city': 'Chiba',
   'affilname': 'Chiba University',
   'affiliation-country': 'Japan'},
  {'affiliation-city': 'Hanoi',
   'affilname': 'National Hospital of Pediatrics Hanoi',
   'affiliation-country': 'Viet Nam'},
  {'affiliation-city': 'Tokyo',
   'affilname': 'National Institute of Infectious Diseases',
   'affiliation-country': 'Japan'},
  {'affiliation-city': 'Tokyo',
   'affilname': 'National Center for Global Health and Medicine',
   'affiliation-country': 'Japan'},
  {'affiliation-city': 'Miyazaki',
   'affilname': 'University of Miyazaki Faculty of Medicine',
   'affiliation-country': 'Japan'},
  {'affiliation-city': 'Tokyo',
   'affilname': 'Tokai University',
   'affiliation-country': 'Japan'}],
 'coredata': {'srctype': 'j',
  'prism:issueIdentifier': '4',
  'eid': '2-s2.0-69149099607',
  'pubmed-id': '19591579',
  'prism:coverDate': '2009-08-15',
  'prism:aggregationType': 'Journal',
  'prism:url': 'https://api.elsevier.com/content/abstract/s

In [18]:
# Pretty-print the sample record as JSON, keeping non-ASCII characters readable.
entry_json = json.dumps(entry, indent=4, ensure_ascii=False)
print(entry_json)

{
    "affiliation": [
        {
            "affiliation-city": "Chiba",
            "affilname": "Chiba University",
            "affiliation-country": "Japan"
        },
        {
            "affiliation-city": "Hanoi",
            "affilname": "National Hospital of Pediatrics Hanoi",
            "affiliation-country": "Viet Nam"
        },
        {
            "affiliation-city": "Tokyo",
            "affilname": "National Institute of Infectious Diseases",
            "affiliation-country": "Japan"
        },
        {
            "affiliation-city": "Tokyo",
            "affilname": "National Center for Global Health and Medicine",
            "affiliation-country": "Japan"
        },
        {
            "affiliation-city": "Miyazaki",
            "affilname": "University of Miyazaki Faculty of Medicine",
            "affiliation-country": "Japan"
        },
        {
            "affiliation-city": "Tokyo",
            "affilname": "Tokai University",
            "affiliatio

In [19]:
# Tabulate the affiliations of the sample record, one row per affiliation.
# 'affiliation' is usually a list of dicts, but the collected data also contains
# records where it is a single bare dict; wrap that case in a list, because a
# dict of scalars cannot be turned into a DataFrame without an index.
affiliations = entry['affiliation']
if isinstance(affiliations, dict):
    affiliations = [affiliations]
df_test = pd.DataFrame.from_dict(affiliations, orient='columns')
df_test

Unnamed: 0,affiliation-city,affilname,affiliation-country
0,Chiba,Chiba University,Japan
1,Hanoi,National Hospital of Pediatrics Hanoi,Viet Nam
2,Tokyo,National Institute of Infectious Diseases,Japan
3,Tokyo,National Center for Global Health and Medicine,Japan
4,Miyazaki,University of Miyazaki Faculty of Medicine,Japan
5,Tokyo,Tokai University,Japan


In [20]:
# Show the 'coredata' part of the sample record.
entry['coredata']

{'srctype': 'j',
 'prism:issueIdentifier': '4',
 'eid': '2-s2.0-69149099607',
 'pubmed-id': '19591579',
 'prism:coverDate': '2009-08-15',
 'prism:aggregationType': 'Journal',
 'prism:url': 'https://api.elsevier.com/content/abstract/scopus_id/69149099607',
 'subtypeDescription': 'Article',
 'dc:creator': {'author': [{'ce:given-name': 'Shoji',
    'preferred-name': {'ce:given-name': 'Shoji',
     'ce:initials': 'S.',
     'ce:surname': 'Kawachi',
     'ce:indexed-name': 'Kawachi S.'},
    '@seq': '1',
    'ce:initials': 'S.',
    '@_fa': 'true',
    'affiliation': {'@id': '60022886',
     '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60022886'},
    'ce:surname': 'Kawachi',
    '@auid': '7006325573',
    'author-url': 'https://api.elsevier.com/content/author/author_id/7006325573',
    'ce:indexed-name': 'Kawachi S.'}]},
 'link': [{'@_fa': 'true',
   '@rel': 'self',
   '@href': 'https://api.elsevier.com/content/abstract/scopus_id/69149099607'},
  {'@_fa': 'true',
 

In [21]:
## ScienceDirect (full-text) document example using DOI
# Read one full-text document by DOI; on success print its title and call write().
doi_doc = FullDoc(doi = '10.1016/S1525-1578(10)60571-5')
read_succeeded = doi_doc.read(client)
if not read_succeeded:
    print ("Read document failed.")
else:
    print ("doi_doc.title: ", doi_doc.title)
    doi_doc.write()
doi_doc

doi_doc.title:  Sensitive Sequencing Method for KRAS Mutation Detection by Pyrosequencing 


<elsapy.elsdoc.FullDoc at 0x209c7355c40>