# CORD-19-collect-scopus-data

This Jupyter notebook collects additional data from Scopus to extend the CORD-19 dataset:
https://datadryad.org/stash/dataset/doi:10.5061/dryad.vmcvdncs0

First, relevant packages must be imported to the Notebook.

In [1]:
import numpy as np
import pandas as pd
import csv
import ast
import collections
import matplotlib.pyplot as plt
import Levenshtein as lev
from fuzzywuzzy import fuzz 
import datetime
import matplotlib.pyplot as plt
import re
from urllib.parse import urlparse
from collections import Counter

from elsapy.elsclient import ElsClient
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch

import time # for sleep
from pybtex.database import parse_file, BibliographyData, Entry
import json
from elsapy.elsclient import ElsClient
from elsapy.elsdoc import AbsDoc
from elsapy.elssearch import ElsSearch

In [2]:
# Read the CORD-19 software-mentions table; later cells use its 'doi' column.
CORD19_CSV = pd.read_csv(
    filepath_or_buffer='../data/cord-19/CORD19_software_mentions.csv',
)

In [3]:
# Number of rows in the 'doi' column (missing values included).
CORD19_CSV['doi'].shape[0]

77448

In [4]:
# Keep the DOI column on its own; it still contains NaN for rows without a DOI.
doi = CORD19_CSV.loc[:, 'doi']
doi

0                                 NaN
1          10.1016/j.regg.2021.01.002
2           10.1016/j.rec.2020.08.002
3        10.1016/j.vetmic.2006.11.026
4                   10.3390/v12080849
                     ...             
77443      10.1007/s11229-020-02869-9
77444                             NaN
77445     10.1101/2020.05.13.20100206
77446      10.1007/s42991-020-00052-8
77447     10.1101/2020.09.14.20194670
Name: doi, Length: 77448, dtype: object

In [5]:
# Count the occurrences of every distinct DOI. NaN entries are left out
# (dropna=True is the default and is only spelled out here), so the index of
# the result holds each DOI exactly once. Later cells address DOIs by their
# position in this index.
doi_counted = doi.value_counts(dropna=True)
doi_counted

10.1016/j.dsx.2020.04.012       2
10.31729/jnma.5498              2
10.1055/s-0040-1702014          1
10.1016/j.bbrc.2008.09.125      1
10.1002/jmv.25743               1
                               ..
10.1101/2020.06.22.20137257     1
10.1136/bmj.b4571               1
10.1371/journal.pbio.2006926    1
10.1155/2020/8821077            1
10.1186/s13071-018-2740-9       1
Name: doi, Length: 74302, dtype: int64

In [6]:
def AffiliationsFromScopusByDOI(client, doi):
    """Obtain additional paper information from Scopus for a single DOI.

    Runs the Scopus search ``DOI(<doi>)``, takes the first hit, derives the
    Scopus id from the second ``:``-separated segment of its
    ``dc:identifier`` field and reads the matching abstract document.

    Parameters
    ----------
    client : ElsClient
        Client used to execute the search and to read the document.
    doi : str
        DOI to look up; it is concatenated into the query string.

    Returns
    -------
    object or None
        ``scp_doc.data`` of the abstract document, or ``None`` when handling
        the search result raises an ordinary exception (for example when the
        first result carries no ``dc:identifier``).

    Notes
    -----
    Errors raised by the search request itself are not caught here.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed,
    so a long-running collection loop can still be stopped by the user.
    """
    doc_srch = ElsSearch("DOI("+doi+")",'scopus')
    doc_srch.execute(client, get_all = True)
    try:
        # The identifier is split on ':' and the second segment is used as the id.
        scopus_id = doc_srch.results[0]["dc:identifier"].split(":")[1]
        scp_doc = AbsDoc(scp_id = scopus_id)
        if scp_doc.read(client):
            scp_doc.write()
        else:
            print ("Read document failed.")
        return scp_doc.data
    except Exception:
        # Best effort: a DOI that cannot be resolved yields None instead of
        # aborting the whole collection run. Only ordinary exceptions are
        # absorbed; interrupts and interpreter exit still propagate.
        return None

In [7]:
## Load configuration
# The Scopus API key is read from config.json under the 'apikey' entry.
# The context manager closes the file even if parsing the JSON fails.
with open("config.json") as con_file:
    config = json.load(con_file)
## Initialize client
client = ElsClient(config['apikey'])
# Example of the underlying search request (API key redacted on purpose;
# never paste a real key into the notebook):
# https://api.elsevier.com/content/search/scopus?query=DOI(10.1109/MCOM.2016.7509373)&apiKey=<YOUR_API_KEY>

In [8]:
# Restore the Scopus data collected so far. Both pickle files are the ones
# written by the final cell of this notebook.
# NOTE(review): read_pickle must only be used on trusted files.
read_affiliation = pd.read_pickle('extra_info_affiliation_CS5099.pkl')
read_coredata = pd.read_pickle('extra_info_coredata_CS5099.pkl')

# Start from the affiliation series (its index becomes the frame index) and
# attach the coredata series, aligned on that index, as a second column.
df_current_extra_info = (
    read_affiliation
    .to_frame(name='affiliation')
    .assign(coredata=read_coredata)
)
df_current_extra_info

Unnamed: 0,affiliation,coredata
0,"[{'affiliation-city': None, 'affilname': 'Pata...","{'srctype': 'j', 'prism:issueIdentifier': '230..."
1,"[{'affiliation-city': 'New Delhi', 'affilname'...","{'srctype': 'j', 'eid': '2-s2.0-85083171050', ..."
2,,
3,"[{'affiliation-city': 'London', 'affilname': '...","{'srctype': 'j', 'eid': '2-s2.0-79953057246', ..."
4,"[{'affiliation-city': 'Baoding', 'affilname': ...","{'srctype': 'j', 'prism:issueIdentifier': '7',..."
...,...,...
74297,"[{'affiliation-city': 'Wageningen', 'affilname...","{'srctype': 'j', 'prism:issueIdentifier': '4',..."
74298,"[{'affiliation-city': 'London', 'affilname': '...","{'srctype': 'j', 'prism:issueIdentifier': '12 ..."
74299,"{'affiliation-city': 'Adelaide', 'affilname': ...","{'srctype': 'j', 'eid': '2-s2.0-85091772561', ..."
74300,"[{'affiliation-city': 'San Diego', 'affilname'...","{'srctype': 'j', 'prism:issueIdentifier': '6',..."


In [9]:
# df_current_extra_info = pd.read_pickle('extra_info_CS5099.pkl')
# df_current_extra_info

In [10]:
# Number of rows already stored; the collection loop resumes at this position.
len_df_current_extra_info = df_current_extra_info.shape[0]
len_df_current_extra_info

74302

In [11]:
%%time
i = len_df_current_extra_info
dict_new_extra_info = dict()
len_dois = len(doi_counted)
while i < len_dois:
    print("Position: " + str(i) + " -> " +  doi_counted.index[i])
    dict_new_extra_info[i] = AffiliationsFromScopusByDOI(client, doi_counted.index[i])
    i = i + 1 

Wall time: 0 ns


In [12]:
# dict_new_extra_info_saver = dict()
# i = len_df_current_extra_info
# while i < 47683:
#     #print("Position: " + str(i) + " -> " +  doi_counted.index[i])
#     dict_new_extra_info_saver[i] = dict_new_extra_info[i]
#     i = i + 1 

# dict_new_extra_info = dict_new_extra_info_saver

In [13]:
# Show the newly fetched records, keyed by position in doi_counted
# (an empty dict means there was nothing left to fetch).
dict_new_extra_info

{}

In [14]:
# Turn the fetched records into a frame; with the default orientation every
# key of the dict (a DOI position) becomes one column.
df_new_extra_info = pd.DataFrame.from_dict(dict_new_extra_info)
df_new_extra_info

In [15]:
# Swap rows and columns so that each fetched record occupies one row.
df_new_extra_transposed = df_new_extra_info.transpose()
df_new_extra_transposed

In [16]:
# Append the newly fetched rows below the stored ones; ignore_index=True
# renumbers the rows of the combined frame from 0.
df_combined_extra_info = pd.concat(
    (df_current_extra_info, df_new_extra_transposed),
    axis=0,
    ignore_index=True,
)

In [17]:
# df_combined_extra_info.to_pickle('extra_info_CS5099.pkl')
# df_combined_extra_info

In [18]:
# Extract the affiliation column so it can be stored on its own.
ser_affiliation = df_combined_extra_info.loc[:, 'affiliation']
ser_affiliation

0        [{'affiliation-city': None, 'affilname': 'Pata...
1        [{'affiliation-city': 'New Delhi', 'affilname'...
2                                                     None
3        [{'affiliation-city': 'London', 'affilname': '...
4        [{'affiliation-city': 'Baoding', 'affilname': ...
                               ...                        
74297    [{'affiliation-city': 'Wageningen', 'affilname...
74298    [{'affiliation-city': 'London', 'affilname': '...
74299    {'affiliation-city': 'Adelaide', 'affilname': ...
74300    [{'affiliation-city': 'San Diego', 'affilname'...
74301    [{'affiliation-city': 'Taipei', 'affilname': '...
Name: affiliation, Length: 74302, dtype: object

In [19]:
# Extract the coredata column so it can be stored on its own.
ser_coredata = df_combined_extra_info.loc[:, 'coredata']
ser_coredata

0        {'srctype': 'j', 'prism:issueIdentifier': '230...
1        {'srctype': 'j', 'eid': '2-s2.0-85083171050', ...
2                                                     None
3        {'srctype': 'j', 'eid': '2-s2.0-79953057246', ...
4        {'srctype': 'j', 'prism:issueIdentifier': '7',...
                               ...                        
74297    {'srctype': 'j', 'prism:issueIdentifier': '4',...
74298    {'srctype': 'j', 'prism:issueIdentifier': '12 ...
74299    {'srctype': 'j', 'eid': '2-s2.0-85091772561', ...
74300    {'srctype': 'j', 'prism:issueIdentifier': '6',...
74301    {'srctype': 'j', 'prism:issueIdentifier': '5',...
Name: coredata, Length: 74302, dtype: object

In [20]:
# Persist both columns. These are the same files the notebook reads back in
# when it restores the stored data, so this overwrites the previous state.
pd.to_pickle(ser_affiliation, 'extra_info_affiliation_CS5099.pkl')
pd.to_pickle(ser_coredata, 'extra_info_coredata_CS5099.pkl')