# CORD-19-collect-scopus-data

In general, this jupyter notebook is designated to collect additional data via scopus to enbroaden the CORD19 dataset: 
https://datadryad.org/stash/dataset/doi:10.5061/dryad.vmcvdncs0

First, relevant packages must be imported to the Notebook.

In [1]:
import numpy as np
import pandas as pd
import csv
import ast
import collections
import matplotlib.pyplot as plt
import Levenshtein as lev
from fuzzywuzzy import fuzz 
import datetime
import matplotlib.pyplot as plt
import re
from urllib.parse import urlparse
from collections import Counter

from elsapy.elsclient import ElsClient
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch

import time # for sleep
from pybtex.database import parse_file, BibliographyData, Entry
import json
from elsapy.elsclient import ElsClient
from elsapy.elsdoc import AbsDoc
from elsapy.elssearch import ElsSearch

In [2]:
CORD19_CSV = pd.read_csv('../data/cord-19/CORD19_software_mentions.csv')

In [3]:
len(CORD19_CSV['doi'])

77448

In [4]:
doi = CORD19_CSV['doi']
doi

0                                 NaN
1          10.1016/j.regg.2021.01.002
2           10.1016/j.rec.2020.08.002
3        10.1016/j.vetmic.2006.11.026
4                   10.3390/v12080849
                     ...             
77443      10.1007/s11229-020-02869-9
77444                             NaN
77445     10.1101/2020.05.13.20100206
77446      10.1007/s42991-020-00052-8
77447     10.1101/2020.09.14.20194670
Name: doi, Length: 77448, dtype: object

In [5]:
#delete Nans
doi_counted = doi.value_counts()
doi_counted

10.31729/jnma.5498            2
10.1016/j.dsx.2020.04.012     2
10.1007/s13384-020-00416-6    1
10.1016/j.jcv.2018.04.012     1
10.1007/s11071-020-05854-6    1
                             ..
10.14336/ad.2017.0701         1
10.1093/aje/kwy239            1
10.1186/s12879-016-1525-z     1
10.1002/med.21664             1
10.1007/s10554-020-02127-6    1
Name: doi, Length: 74302, dtype: int64

In [6]:
def AffiliationsFromScopusByDOI(client, doi):
    """obtain addiotional paper information from scopus by doi
    """
    doc_srch = ElsSearch("DOI("+doi+")",'scopus')
    doc_srch.execute(client, get_all = True)
    #print ("doc_srch has", len(doc_srch.results), "results.")
    #print(doc_srch.results)
    try:
        scopus_id=doc_srch.results[0]["dc:identifier"].split(":")[1]
        scp_doc = AbsDoc(scp_id = scopus_id)
        if scp_doc.read(client):
            # print ("scp_doc.title: ", scp_doc.title)
            scp_doc.write()   
        else:
            print ("Read document failed.")
        # print(scp_doc.data["affiliation"])
        return scp_doc.data
    except:
        return None

In [7]:
## Load configuration
con_file = open("config.json")
config = json.load(con_file)
con_file.close()
## Initialize client
client = ElsClient(config['apikey'])
# https://api.elsevier.com/content/search/scopus?query=DOI(10.1109/MCOM.2016.7509373)&apiKey=6d485ef1fe1408712f37e8a783a285a4

In [8]:
df_current_extra_info = pd.read_pickle('extra_info_CS5099.pkl')
df_current_extra_info

Unnamed: 0,affiliation,coredata
0,"[{'affiliation-city': None, 'affilname': 'Pata...","{'srctype': 'j', 'prism:issueIdentifier': '230..."
1,"[{'affiliation-city': 'New Delhi', 'affilname'...","{'srctype': 'j', 'eid': '2-s2.0-85083171050', ..."
2,,
3,"[{'affiliation-city': 'London', 'affilname': '...","{'srctype': 'j', 'eid': '2-s2.0-79953057246', ..."
4,"[{'affiliation-city': 'Baoding', 'affilname': ...","{'srctype': 'j', 'prism:issueIdentifier': '7',..."
...,...,...
10853,,
10854,"[{'affiliation-city': 'Beer Yacov', 'affilname...","{'srctype': 'j', 'eid': '2-s2.0-85096945420', ..."
10855,"[{'affiliation-city': 'Kampala', 'affilname': ...","{'srctype': 'j', 'eid': '2-s2.0-85089620904', ..."
10856,"[{'affiliation-city': 'Naples', 'affilname': '...","{'srctype': 'j', 'eid': '2-s2.0-85097658009', ..."


In [9]:
len_df_current_extra_info = len(df_current_extra_info)
len_df_current_extra_info

10858

In [10]:
%%time
i = len_df_current_extra_info
dict_new_extra_info = dict()
len_dois = len(doi_counted)
while i < len_dois:
    print("Position: " + str(i) + " -> " +  doi_counted.index[i])
    dict_new_extra_info[i] = AffiliationsFromScopusByDOI(client, doi_counted.index[i])
    i = i + 1 

Position: 10858 -> 10.1016/j.psychres.2020.113570
Position: 10859 -> 10.1073/pnas.2009412117
Position: 10860 -> 10.3390/ijerph15112369
Position: 10861 -> 10.3390/v12080891
Position: 10862 -> 10.1002/pbc.28790
Position: 10863 -> 10.1016/j.virusres.2008.05.008
Position: 10864 -> 10.1186/s12879-015-1271-7
Position: 10865 -> 10.1155/2014/967946
Position: 10866 -> 10.1590/0074-02760200328
Position: 10867 -> 10.1016/j.tim.2020.10.008
Position: 10868 -> 10.1111/j.1440-1843.2010.01864.x
Position: 10869 -> 10.1016/j.jmb.2020.166748
Position: 10870 -> 10.1007/s10447-020-09411-z
Position: 10871 -> 10.1111/j.1096-0031.2012.00401.x
Position: 10872 -> 10.1002/ehf2.12916
Position: 10873 -> 10.3390/s20113089
Position: 10874 -> 10.1016/j.jpubeco.2020.104296
Position: 10875 -> 10.3389/fpsyg.2020.614770
Position: 10876 -> 10.1186/s13104-016-2040-4
Position: 10877 -> 10.1007/s35658-020-0277-0
Position: 10878 -> 10.3389/fneur.2020.581394
Position: 10879 -> 10.1016/j.eclinm.2020.100683
Position: 10880 -> 10

Position: 11040 -> 10.1111/resp.12905
Position: 11041 -> 10.1007/s00251-010-0498-0
Position: 11042 -> 10.1101/2020.09.01.20186304
Position: 11043 -> 10.3389/fgene.2020.00750
Position: 11044 -> 10.1183/13993003.02554-2020
Position: 11045 -> 10.1093/pubmed/fdaa096
Position: 11046 -> 10.1016/j.ijid.2020.12.021
Position: 11047 -> 10.3390/v12050560
Position: 11048 -> 10.1007/s12498-020-1254-5
Position: 11049 -> 10.1101/2020.10.21.20217380
Position: 11050 -> 10.1007/978-3-030-51054-1_3
Position: 11051 -> 10.1007/s12250-019-00157-6
Position: 11052 -> 10.1371/journal.pone.0237548
Position: 11053 -> 10.1007/s00134-020-06312-y
Position: 11054 -> 10.1007/978-981-10-5451-8_2
Position: 11055 -> 10.1016/j.chemolab.2020.104172
Position: 11056 -> 10.1186/s40635-020-00334-y
Position: 11057 -> 10.1186/s12909-020-02291-2
Position: 11058 -> 10.1038/s41598-020-77698-4
Position: 11059 -> 10.1016/j.resuscitation.2020.09.013
Position: 11060 -> 10.1111/ajt.13897
Position: 11061 -> 10.1007/s12564-020-09658-6
Po

Position: 11221 -> 10.1017/s0950268820001065
Position: 11222 -> 10.24171/j.phrp.2020.11.5.02
Position: 11223 -> 10.1007/s11126-020-09808-4
Position: 11224 -> 10.1136/bmjgh-2020-003276
Position: 11225 -> 10.3389/fmicb.2019.00596
Position: 11226 -> 10.1016/j.virol.2005.06.026
Position: 11227 -> 10.1016/j.jogc.2020.08.008
Position: 11228 -> 10.1371/journal.pone.0244968
Position: 11229 -> 10.7554/elife.58537
Position: 11230 -> 10.1140/epjp/s13360-020-00895-7
Position: 11231 -> 10.1038/s41401-019-0288-7
Position: 11232 -> 10.3389/fpubh.2020.592878
Position: 11233 -> 10.15585/mmwr.mm6934e1
Position: 11234 -> 10.2147/cia.s260098
Position: 11235 -> 10.1186/s12896-018-0483-5
Position: 11236 -> 10.1186/s13059-020-02191-0
Position: 11237 -> 10.1021/acsmedchemlett.0c00410
Position: 11238 -> 10.1016/j.tmaid.2020.101651
Position: 11239 -> 10.1016/j.ajp.2020.102085
Position: 11240 -> 10.1186/s12889-020-10057-0
Position: 11241 -> 10.1101/2020.05.18.20105783
Position: 11242 -> 10.1101/2021.01.26.428302

Position: 11401 -> 10.1101/2020.08.09.20170985
Position: 11402 -> 10.1371/journal.ppat.0030109
Position: 11403 -> 10.1073/pnas.2014297117
Position: 11404 -> 10.1007/s11414-019-09654-8
Position: 11405 -> 10.26434/chemrxiv.12682316
Position: 11406 -> 10.1186/1746-6148-9-184
Position: 11407 -> 10.1093/labmed/lmaa104
Position: 11408 -> 10.1186/s13223-020-00466-6
Position: 11409 -> 10.3390/ijerph17238987
Position: 11410 -> 10.1186/s12992-019-0449-y
Position: 11411 -> 10.1101/2020.07.09.20149997
Position: 11412 -> 10.1186/s43044-020-00105-w
Position: 11413 -> 10.1186/s13063-020-04915-w
Position: 11414 -> 10.1513/annalsats.202007-799rl
Position: 11415 -> 10.1038/s41598-019-51947-7
Position: 11416 -> 10.1111/j.1472-765x.2011.03107.x
Position: 11417 -> 10.15585/mmwr.mm6923e2
Position: 11418 -> 10.1016/s2352-4642(20)30304-7
Position: 11419 -> 10.1186/s12960-020-00545-0
Position: 11420 -> 10.1186/s12871-020-01162-z
Position: 11421 -> 10.1101/2020.06.11.146522
Position: 11422 -> 10.1093/ecam/nem15

Position: 11584 -> 10.1186/s12909-020-02356-2
Position: 11585 -> 10.1016/j.wdp.2020.100245
Position: 11586 -> 10.3389/fmicb.2018.03174
Position: 11587 -> 10.1038/s41422-020-00392-7
Position: 11588 -> 10.1186/s13054-020-2834-6
Position: 11589 -> 10.1134/s1024856020060184
Position: 11590 -> 10.1016/j.anai.2021.01.017
Position: 11591 -> 10.1016/j.aej.2021.01.043
Position: 11592 -> 10.4274/balkanmedj.galenos.2020.2020.7.52
Position: 11593 -> 10.3390/ijerph17249531
Position: 11594 -> 10.1038/srep23774
Position: 11595 -> 10.1101/2020.11.21.20236018
Position: 11596 -> 10.1101/2020.10.21.347690
Position: 11597 -> 10.1093/cid/ciu647
Position: 11598 -> 10.1016/j.scs.2020.102577
Position: 11599 -> 10.1111/bjh.16985
Position: 11600 -> 10.1016/j.gloepi.2020.100035
Position: 11601 -> 10.1016/j.molstruc.2021.129979
Position: 11602 -> 10.2147/jaa.s279061
Position: 11603 -> 10.1128/jvi.02160-17
Position: 11604 -> 10.1186/s12874-020-01110-y
Position: 11605 -> 10.1007/978-3-030-48077-6_16
Position: 11606

ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))

In [11]:
dict_new_extra_info

{10858: {'affiliation': {'affiliation-city': 'Nashville',
   'affilname': 'Vanderbilt University',
   'affiliation-country': 'United States'},
  'coredata': {'srctype': 'j',
   'eid': '2-s2.0-85096520842',
   'pubmed-id': '33234326',
   'prism:coverDate': '2021-01-01',
   'prism:aggregationType': 'Journal',
   'prism:url': 'https://api.elsevier.com/content/abstract/scopus_id/85096520842',
   'subtypeDescription': 'Article',
   'dc:creator': {'author': [{'ce:given-name': 'Hyeon-Seung',
      'preferred-name': {'ce:given-name': 'Hyeon Seung',
       'ce:initials': 'H.S.',
       'ce:surname': 'Lee',
       'ce:indexed-name': 'Lee H.S.'},
      '@seq': '1',
      'ce:initials': 'H.-S.',
      '@_fa': 'true',
      'affiliation': {'@id': '60003915',
       '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60003915'},
      'ce:degrees': 'M.Sc.',
      'ce:surname': 'Lee',
      '@auid': '57190622037',
      'author-url': 'https://api.elsevier.com/content/author/author_i

In [12]:
df_new_extra_info = pd.DataFrame(dict_new_extra_info)

In [13]:
df_new_extra_info

Unnamed: 0,10858,10859,10860,10861,10862,10863,10864,10865,10866,10867,...,11654,11655,11656,11657,11658,11659,11660,11661,11662,11663
affiliation,"{'affiliation-city': 'Nashville', 'affilname':...","[{'affiliation-city': 'Davis', 'affilname': 'U...","[{'affiliation-city': 'Seoul', 'affilname': 'K...","[{'affiliation-city': 'Maisons-Alfort', 'affil...","[{'affiliation-city': 'Southampton', 'affilnam...","[{'affiliation-city': 'Geelong', 'affilname': ...","[{'affiliation-city': 'Marcy-l'Etoile', 'affil...","[{'affiliation-city': 'Khulna', 'affilname': '...",,"[{'affiliation-city': 'Vienna', 'affilname': '...",...,,"[{'affiliation-city': 'Madrid', 'affilname': '...",,"[{'affiliation-city': 'Wuhan', 'affilname': 'K...","{'affiliation-city': 'Lyon', 'affilname': 'Aus...",,"{'affiliation-city': 'Regensburg', 'affilname'...","{'affiliation-city': 'Safat', 'affilname': 'Al...","[{'affiliation-city': 'Atlanta', 'affilname': ...","[{'affiliation-city': 'Rio de Janeiro', 'affil..."
coredata,"{'srctype': 'j', 'eid': '2-s2.0-85096520842', ...","{'srctype': 'j', 'eid': '2-s2.0-85089741555', ...","{'srctype': 'j', 'prism:issueIdentifier': '11'...","{'srctype': 'j', 'prism:issueIdentifier': '8',...","{'srctype': 'j', 'prism:issueIdentifier': '2',...","{'srctype': 'j', 'eid': '2-s2.0-46249107600', ...","{'srctype': 'j', 'prism:issueIdentifier': '1',...","{'srctype': 'j', 'eid': '2-s2.0-84920505456', ...",,"{'srctype': 'j', 'eid': '2-s2.0-85097878477', ...",...,,"{'srctype': 'j', 'eid': '2-s2.0-85085638496', ...",,"{'srctype': 'j', 'prism:issueIdentifier': '1',...","{'srctype': 'j', 'eid': '2-s2.0-85088572802', ...",,"{'srctype': 'j', 'eid': '2-s2.0-85097251977', ...","{'srctype': 'j', 'prism:issueIdentifier': '1',...","{'srctype': 'j', 'prism:issueIdentifier': '1',...","{'srctype': 'j', 'eid': '2-s2.0-85083879171', ..."


In [14]:
df_new_extra_transposed = df_new_extra_info.T
df_new_extra_transposed

Unnamed: 0,affiliation,coredata
10858,"{'affiliation-city': 'Nashville', 'affilname':...","{'srctype': 'j', 'eid': '2-s2.0-85096520842', ..."
10859,"[{'affiliation-city': 'Davis', 'affilname': 'U...","{'srctype': 'j', 'eid': '2-s2.0-85089741555', ..."
10860,"[{'affiliation-city': 'Seoul', 'affilname': 'K...","{'srctype': 'j', 'prism:issueIdentifier': '11'..."
10861,"[{'affiliation-city': 'Maisons-Alfort', 'affil...","{'srctype': 'j', 'prism:issueIdentifier': '8',..."
10862,"[{'affiliation-city': 'Southampton', 'affilnam...","{'srctype': 'j', 'prism:issueIdentifier': '2',..."
...,...,...
11659,,
11660,"{'affiliation-city': 'Regensburg', 'affilname'...","{'srctype': 'j', 'eid': '2-s2.0-85097251977', ..."
11661,"{'affiliation-city': 'Safat', 'affilname': 'Al...","{'srctype': 'j', 'prism:issueIdentifier': '1',..."
11662,"[{'affiliation-city': 'Atlanta', 'affilname': ...","{'srctype': 'j', 'prism:issueIdentifier': '1',..."


In [15]:
df_combined_extra_info = pd.concat([df_current_extra_info, df_new_extra_transposed],ignore_index=True)

In [16]:
df_combined_extra_info.to_pickle('extra_info_CS5099.pkl')

In [17]:
entry = AffiliationsFromScopusByDOI(client, '10.1086/605034')
entry

{'affiliation': [{'affiliation-city': 'Chiba',
   'affilname': 'Chiba University',
   'affiliation-country': 'Japan'},
  {'affiliation-city': 'Hanoi',
   'affilname': 'National Hospital of Pediatrics Hanoi',
   'affiliation-country': 'Viet Nam'},
  {'affiliation-city': 'Tokyo',
   'affilname': 'National Institute of Infectious Diseases',
   'affiliation-country': 'Japan'},
  {'affiliation-city': 'Tokyo',
   'affilname': 'National Center for Global Health and Medicine',
   'affiliation-country': 'Japan'},
  {'affiliation-city': 'Miyazaki',
   'affilname': 'University of Miyazaki Faculty of Medicine',
   'affiliation-country': 'Japan'},
  {'affiliation-city': 'Tokyo',
   'affilname': 'Tokai University',
   'affiliation-country': 'Japan'}],
 'coredata': {'srctype': 'j',
  'prism:issueIdentifier': '4',
  'eid': '2-s2.0-69149099607',
  'pubmed-id': '19591579',
  'prism:coverDate': '2009-08-15',
  'prism:aggregationType': 'Journal',
  'prism:url': 'https://api.elsevier.com/content/abstract/s

In [18]:
print (json.dumps(entry, ensure_ascii=False, indent=4))

{
    "affiliation": [
        {
            "affiliation-city": "Chiba",
            "affilname": "Chiba University",
            "affiliation-country": "Japan"
        },
        {
            "affiliation-city": "Hanoi",
            "affilname": "National Hospital of Pediatrics Hanoi",
            "affiliation-country": "Viet Nam"
        },
        {
            "affiliation-city": "Tokyo",
            "affilname": "National Institute of Infectious Diseases",
            "affiliation-country": "Japan"
        },
        {
            "affiliation-city": "Tokyo",
            "affilname": "National Center for Global Health and Medicine",
            "affiliation-country": "Japan"
        },
        {
            "affiliation-city": "Miyazaki",
            "affilname": "University of Miyazaki Faculty of Medicine",
            "affiliation-country": "Japan"
        },
        {
            "affiliation-city": "Tokyo",
            "affilname": "Tokai University",
            "affiliatio

In [19]:
df_test = pd.DataFrame.from_dict(entry['affiliation'], orient='columns')
df_test

Unnamed: 0,affiliation-city,affilname,affiliation-country
0,Chiba,Chiba University,Japan
1,Hanoi,National Hospital of Pediatrics Hanoi,Viet Nam
2,Tokyo,National Institute of Infectious Diseases,Japan
3,Tokyo,National Center for Global Health and Medicine,Japan
4,Miyazaki,University of Miyazaki Faculty of Medicine,Japan
5,Tokyo,Tokai University,Japan


In [20]:
entry['coredata']

{'srctype': 'j',
 'prism:issueIdentifier': '4',
 'eid': '2-s2.0-69149099607',
 'pubmed-id': '19591579',
 'prism:coverDate': '2009-08-15',
 'prism:aggregationType': 'Journal',
 'prism:url': 'https://api.elsevier.com/content/abstract/scopus_id/69149099607',
 'subtypeDescription': 'Article',
 'dc:creator': {'author': [{'ce:given-name': 'Shoji',
    'preferred-name': {'ce:given-name': 'Shoji',
     'ce:initials': 'S.',
     'ce:surname': 'Kawachi',
     'ce:indexed-name': 'Kawachi S.'},
    '@seq': '1',
    'ce:initials': 'S.',
    '@_fa': 'true',
    'affiliation': {'@id': '60022886',
     '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60022886'},
    'ce:surname': 'Kawachi',
    '@auid': '7006325573',
    'author-url': 'https://api.elsevier.com/content/author/author_id/7006325573',
    'ce:indexed-name': 'Kawachi S.'}]},
 'link': [{'@_fa': 'true',
   '@rel': 'self',
   '@href': 'https://api.elsevier.com/content/abstract/scopus_id/69149099607'},
  {'@_fa': 'true',
 

In [21]:
## ScienceDirect (full-text) document example using DOI
doi_doc = FullDoc(doi = '10.1016/S1525-1578(10)60571-5')
if doi_doc.read(client):
    print ("doi_doc.title: ", doi_doc.title)
    doi_doc.write()   
else:
    print ("Read document failed.")
doi_doc

doi_doc.title:  Sensitive Sequencing Method for KRAS Mutation Detection by Pyrosequencing 


<elsapy.elsdoc.FullDoc at 0x1d3f9a32d00>