In [1]:
import numpy as np
import pandas as pd
import os
from os.path import join

import json
import requests
import xmltodict
import re
import time
import random

In [2]:
# Relative path of the data directory: the PMCID list is read from here and
# the citation output file is written here.
dataDir = '../data'

# Get Citations for articles in the COVID open-access pubmed database

**Note:** The citation list must be retrieved for ALL articles every time a new batch of articles is added, because the number of citations each article has continues to grow over time.

Use the list of PMCIDs from [here](https://www.ncbi.nlm.nih.gov/pmc/?term=2019-nCoV+OR+2019nCoV+OR+COVID-19+OR+SARS-CoV-2+OR+((wuhan+AND+coronavirus)+AND+2019%2F12%5BPDAT%5D%3A2030%5BPDAT%5D)%20AND%20%22open%20access%22%5BFilter%5D)

To create a list of IDs for all articles, go to the bottom of that link, click "Send to:", choose "File" and select "Format: PMCID list". Download, and move to `data/` within this repository, and rename to `covidPMCIDs.csv`


In [3]:
# Load every PMCID from the downloaded list (one ID per line)
pmcidPath = join(dataDir, 'covidPMCIDs.csv')
with open(pmcidPath, mode='r') as idFile:
    PMCIDs = idFile.read().splitlines()

In [4]:
# Display the first three IDs as a quick sanity check of the load
PMCIDs[:3]

['PMC7405836', 'PMC7373339', 'PMC7759095']

In [5]:
# Read the NCBI API key from a local text file (kept out of the notebook so
# the key itself is never committed).
with open('NCBI_API_key.txt', 'r') as f:
    # strip() rather than rstrip('\n'): a key file saved with Windows line
    # endings or stray spaces would otherwise leave '\r' / whitespace in the
    # key, which is later pasted verbatim into the request URL.
    NCBI_API_KEY = f.read().strip()

In [10]:
# Tools for parsing the citation dictionary returned by the API
def getSrcPMCID(citeDict):
    """Return the 'PMC'-prefixed ID of the source article of a link set.

    The numeric ID is read from citeDict['IdList']['Id'].
    """
    numericID = citeDict['IdList']['Id']
    return f'PMC{numericID}'

def getCiteIDs(citeDict):
    """Return the list of linked article IDs found in a link-set dictionary.

    The IDs are read from citeDict['LinkSetDb']['Link'] and returned exactly
    as stored there (no 'PMC' prefix is added). An empty list is returned
    when either the 'LinkSetDb' or the nested 'Link' key is absent.
    """
    if 'LinkSetDb' not in citeDict or 'Link' not in citeDict['LinkSetDb']:
        return []

    links = citeDict['LinkSetDb']['Link']

    # 'Link' holds a list when there are several links, but a single
    # dictionary when there is only one
    if isinstance(links, list):
        return [link['Id'] for link in links]
    return [links['Id']]
    
    

In [19]:
def getCitations(IDs, citedBy=True):
    """
    Look up citation links for a batch of PMCIDs through the NCBI E-utilities
    elink endpoint.

    For each PMCID in the given list, look up either
    1) the articleIDs that cite this id (citedBy=True), or
    2) the articleIDs that this id cites (citedBy=False)

    The API lets you request multiple IDs at once, which is why this function
    takes a list of IDs.

    Parameters
    ----------
    IDs : list of str
        PMCIDs including their 3-character 'PMC' prefix (e.g. 'PMC7405836');
        the prefix is stripped before the IDs are placed in the request URL.
    citedBy : bool, default True
        Selects the link name: 'pmc_pmc_citedby' when True, 'pmc_pmc_cites'
        when False.

    Returns
    -------
    list of dict
        One dictionary per LinkSet in the response.
        If citedBy is True each dictionary has the keys
            - "PMCID": the source article
            - "nCitedBy": number of linked articles
            - "nCitedBy_covid": number of linked articles that are in PMCIDs
            - "citedBy": list of linked PMCIDs
            - "citedBy_covid": the linked PMCIDs that are in PMCIDs
        otherwise the keys are "PMCID", "nCitations", "nCitations_covid",
        "citations" and "citations_covid", with the same meanings.
        An empty list is returned when IDs is empty or when the HTTP status
        code is not 200.

    Notes
    -----
    Relies on the module-level names PMCIDs and NCBI_API_KEY. Exceptions
    raised by the HTTP request or by XML parsing are not caught here.
    """
    # Nothing to request; this also keeps the failure message below from
    # indexing into an empty list.
    if not IDs:
        return []

    ### set the API tool
    eutils_tool = 'pmc_pmc_citedby' if citedBy else 'pmc_pmc_cites'

    ### format URL
    baseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    toolName = "covidcitations"
    email = "jeff.macinnes@duke.edu"
    params = f"?dbfrom=pubmed&linkname={eutils_tool}&tool={toolName}&email={email}&api_key={NCBI_API_KEY}"
    # x[3:] drops the leading 'PMC' so only the numeric part is sent
    requestedIDs = ''.join('&id={}'.format(x[3:]) for x in IDs)
    URL = baseURL + params + requestedIDs

    ### retrieve data
    page = requests.get(URL, timeout=10)
    if page.status_code != 200:
        print('request failed, {} IDs, {} (first), {} (last)'.format(len(IDs), IDs[0], IDs[-1]))
        return []
    parsed = xmltodict.parse(page.content)

    ### get relevant bit from the parsed xml
    # A lone LinkSet element is parsed as a dictionary rather than a list.
    # Normalise on the parsed type itself instead of on len(IDs), so the
    # result is handled correctly whatever the response contains.
    linkSets = parsed['eLinkResult']['LinkSet']
    if not isinstance(linkSets, list):
        linkSets = [linkSets]

    # Build the lookup set once per call rather than once per LinkSet
    covidIDs = set(PMCIDs)

    citationSets = []
    for linkSet in linkSets:
        srcID = getSrcPMCID(linkSet)
        citeIDs = ['PMC{}'.format(x) for x in getCiteIDs(linkSet)]

        # Linked articles that are also in the COVID PMCID list. dict.fromkeys
        # removes duplicates while keeping the response order, so the stored
        # list is reproducible from run to run.
        citeIDs_covid = list(dict.fromkeys(x for x in citeIDs if x in covidIDs))

        # format the results based on whether citations or cited-by was requested
        if citedBy:
            citationSets.append({
                "PMCID": srcID,
                "nCitedBy": len(citeIDs),
                "nCitedBy_covid": len(citeIDs_covid),
                "citedBy": citeIDs,
                "citedBy_covid": citeIDs_covid
            })
        else:
            citationSets.append({
                "PMCID": srcID,
                "nCitations": len(citeIDs),
                # count only the citations inside the COVID list (this
                # previously repeated the total count)
                "nCitations_covid": len(citeIDs_covid),
                "citations": citeIDs,
                "citations_covid": citeIDs_covid
            })

    return citationSets

In [16]:
def chunkIDs(IDs, chunkLen):
    """Yield consecutive slices of IDs, each holding at most chunkLen items.

    The last slice is shorter when len(IDs) is not a multiple of chunkLen.
    """
    total = len(IDs)
    for start in range(0, total, chunkLen):
        stop = start + chunkLen
        yield IDs[start:stop]
    

In [23]:
# Smoke test: fetch both link directions for the first two IDs and merge
# the two result dictionaries for each article
IDs = PMCIDs[:2]

combined = []

citations = getCitations(IDs, citedBy=False)
citedBys = getCitations(IDs, citedBy=True)

# pair the "cites" and "cited-by" dictionaries position by position
for i, thisCitations in enumerate(citations):
    thisCitedBy = citedBys[i]
    combined.append({**thisCitations, **thisCitedBy})

## Retrieve new citations

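Because citation counts keep growing, the output file is rebuilt from scratch on every run rather than updated incrementally.

For each PMCID, get both the list of articles it cites and the list of articles that cite it, and append one JSON record per article to the output file.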

In [26]:
# Path of the output file, which is written with one JSON record per line
citeFile = join(dataDir, 'articleCitations.json')

In [30]:
# overwrite old output file
with open(citeFile, 'w') as f:
    f.write('')

# split IDs into groups of certain length to batch multiple at once
print('{} PMCIDs'.format(len(PMCIDs)))
ID_chunkList = list(chunkIDs(PMCIDs, 100))
print("collecting {} sets of IDs".format(len(ID_chunkList)))

# 1-based numbers of the chunks that could not be fully retrieved, so they
# can be identified and re-run afterwards
failedChunks = []

for chunkNum, ID_chunk in enumerate(ID_chunkList, start=1):

    # progress
    print(chunkNum, end=', ')

    # Reset the per-chunk results so that a failed request can never fall
    # through to the previous chunk's data (which would write duplicate
    # records) or to an undefined name on the first chunk.
    theseCitations = []
    theseCitedBy = []

    # collect citations data
    try:
        theseCitations = getCitations(ID_chunk, citedBy=False)
    except Exception as e:
        print("Error: {}".format(e))
    time.sleep(random.choice([.1, .11, .12, .13]))

    # collect citedBy citations
    try:
        theseCitedBy = getCitations(ID_chunk, citedBy=True)
    except Exception as e:
        print("Error: {}".format(e))
    time.sleep(random.choice([.1, .11, .12, .13]))

    # Both directions are needed to build a complete record; skip the chunk
    # (and remember it) if either request failed or came back empty.
    if not theseCitations or not theseCitedBy:
        failedChunks.append(chunkNum)
        continue

    # combine the results, pairing the two dictionaries by PMCID rather than
    # by list position so a missing or reordered entry cannot mismatch articles
    citedByLookup = {entry['PMCID']: entry for entry in theseCitedBy}
    combined = []
    for thisCitations in theseCitations:
        thisCitedBy = citedByLookup.get(thisCitations['PMCID'])
        if thisCitedBy is None:
            print("no cited-by record for {}".format(thisCitations['PMCID']))
            continue
        combined.append({**thisCitations, **thisCitedBy})

    # store the results, one JSON record per line
    with open(citeFile, 'a', encoding='utf-8') as f:
        for ID_citations in combined:
            json.dump(ID_citations, f, ensure_ascii=False)
            f.write('\n')

if failedChunks:
    print('\n{} chunk(s) failed: {}'.format(len(failedChunks), failedChunks))



90032 PMCIDs
collecting 901 sets of IDs
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214

### Misc

In [105]:
# Scratch cell: issue a raw elink "cited by" request and inspect the parsed
# response. Use a single-element list (e.g. ['PMC7662698'] or [PMCIDs[0]])
# instead to exercise the case where only one LinkSet comes back.
IDs = PMCIDs[:2]

# ------------

### format URL
baseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
toolName = "covidcitations"
email = "jeff.macinnes@duke.edu"
params = "?dbfrom=pubmed&linkname=pmc_pmc_citedby&tool={}&email={}".format(toolName, email)
# x[3:] drops the leading 'PMC' so only the numeric part is sent
requestedIDs = "{}".format(''.join(['&id={}'.format(x[3:]) for x in IDs]))
URL = baseURL + params + requestedIDs

### retrieve data
page = requests.get(URL, timeout=10)
parsed = xmltodict.parse(page.content)

### get relevant bit from the parsed xml
# make sure linkSets is a list (a lone LinkSet is parsed as a dictionary)
linkSets = parsed['eLinkResult']['LinkSet']
if not isinstance(linkSets, list):
    linkSets = [linkSets]

citationSets = []
print(linkSets)
print(len(linkSets))
for linkSet in linkSets:
    srcID = getSrcPMCID(linkSet)
    # getCiteIDs parses an already-fetched LinkSet; getCitations (called here
    # previously) expects a list of PMCIDs and would issue a new request
    citations = getCiteIDs(linkSet)
    citationSets.append({
        "PMCID": srcID,
        "nCitations": len(citations),
        "citations": citations
    })

#citationSets

[OrderedDict([('DbFrom', 'pubmed'), ('IdList', OrderedDict([('Id', '7384689')])), ('LinkSetDb', OrderedDict([('DbTo', 'pmc'), ('LinkName', 'pmc_pmc_citedby'), ('Link', [OrderedDict([('Id', '7730003')]), OrderedDict([('Id', '7721080')]), OrderedDict([('Id', '7709804')]), OrderedDict([('Id', '7689662')]), OrderedDict([('Id', '7682380')]), OrderedDict([('Id', '7679797')]), OrderedDict([('Id', '7679035')]), OrderedDict([('Id', '7663373')]), OrderedDict([('Id', '7648466')]), OrderedDict([('Id', '7584839')]), OrderedDict([('Id', '7567238')]), OrderedDict([('Id', '7556821')]), OrderedDict([('Id', '7544522')]), OrderedDict([('Id', '7536342')]), OrderedDict([('Id', '7521469')]), OrderedDict([('Id', '7472808')]), OrderedDict([('Id', '7456291')]), OrderedDict([('Id', '7456282')]), OrderedDict([('Id', '7452828')]), OrderedDict([('Id', '7418728')]), OrderedDict([('Id', '7384692')])])]))]), OrderedDict([('DbFrom', 'pubmed'), ('IdList', OrderedDict([('Id', '7373339')])), ('LinkSetDb', OrderedDict([('