# A shift of editorial focus towards dissemination of information on the SARS-CoV-2 pandemic #

This notebook contains the scripts necessary to reproducibly create the data analyzed in Gray et al. 2020.

## Sections:

1. Obtain DOIs of papers pertaining to the COVID-19 pandemic.
2. Construct the three cohorts of version 1 (v1) preprint abstracts: 
    * The x months leading up to the beginning of the pandemic<sup>*</sup>
    * The SARS-CoV-2 COVID-19 articles
    * Articles deposited as preprints and published in the course of the pandemic
3. 

In [2]:
import json
import os

# Upper bound on attempts for each retried HTTP/API request in the cells below.
MAX_RETRIES = 5

# API key passed to the NCBI E-utilities client and efetch URL further down.
# The environment variable takes precedence so the key need not live in the notebook.
# SECURITY NOTE(review): the literal fallback is a credential committed to source;
# it is kept only so existing runs keep working -- rotate it and drop the fallback.
PUBMED_API_KEY = os.environ.get("PUBMED_API_KEY", "cca9551545bbf746ce6248903f91daed6b08")

# data.json is written by the export cell as UTF-8 with ensure_ascii=False, so it
# must be read back as UTF-8 regardless of the platform's default encoding.
with open('data.json', 'r', encoding='utf-8') as f:
    dataWithPublished = json.load(f)

# metadata.json is written with json.dumps' default ensure_ascii=True (pure ASCII).
with open('metadata.json', 'r') as f:
    metadata = json.load(f)



In [3]:
# Use the bioRxiv API to obtain DOIs of papers pertaining to the COVID-19 pandemic

from tqdm import tqdm
from time import sleep
import requests

covid_collection_base = 'https://api.biorxiv.org/covid19/%d'

covid_preprints = []

# Probe the first page to learn the collection size ('total') and page size ('count').
# This still retries until it succeeds, but pauses between attempts and only catches
# Exception so that a keyboard interrupt can stop the cell.
while True:
    try:
        lengthInfo = json.loads(requests.get(covid_collection_base % 0, timeout=100).text)
        totalDocs = lengthInfo['messages'][0]['total']
        stepInterval = lengthInfo['messages'][0]['count']
        break
    except Exception as e:
        print('error fetching the COVID-19 collection size, retrying: %s' % e)
        sleep(1)

# Walk the collection one page at a time; each page gets up to MAX_RETRIES attempts.
for i in tqdm(range(0, totalDocs, stepInterval)):
    retries = 0
    while retries < MAX_RETRIES:
        try:
            response = json.loads(requests.get(covid_collection_base % i, timeout=100).text)
            # Build the page fully before extending so a malformed record cannot
            # leave a half-added page behind that a retry would then duplicate.
            pagePreprints = [{'server': datum['rel_site'], 'DOI': datum['rel_doi']} for datum in response['collection']]
            covid_preprints.extend(pagePreprints)
            break
        except Exception:
            retries += 1
            sleep(1)
    else:
        # All attempts failed: report the gap instead of dropping the page silently.
        print('giving up on COVID-19 collection page at offset %d' % i)
print('\n%s COVID-19 SARS-CoV-2 DOIs found' % len(covid_preprints))

100%|██████████| 297/297 [01:51<00:00,  2.67it/s]
8907 COVID-19 SARS-CoV-2 DOIs found



In [18]:
# Get information for all articles in bioRxiv and medRxiv
from datetime import datetime
from time import sleep

_DETAILS_URL = 'http://api.biorxiv.org/details/%s/%s/%s/%s'


def _fetchJSON(url):
    """GET `url` (100 s timeout) and return the decoded JSON body."""
    return json.loads(requests.get(url, timeout=100).text)


def _firstArticleDate(server):
    """Return the 'date' of the first record of details/<server>/1 as a date object."""
    firstRecord = _fetchJSON('http://api.biorxiv.org/details/%s/1' % server)['collection'][0]
    return datetime.strptime(firstRecord['date'], '%Y-%m-%d').date()


def _fetchAllDetails(server, startDate, endDate, lengthResponse):
    """Return the concatenated 'collection' records of every page for `server`.

    Pages are addressed by a cursor stepping from 0 to lengthResponse['total'] in
    increments of lengthResponse['count']. Each page gets up to MAX_RETRIES
    attempts; a page that never succeeds is reported and skipped rather than
    re-appending whatever the previous successful response contained.
    """
    records = []
    for cursor in tqdm(range(0, lengthResponse['total'], lengthResponse['count'])):
        for _ in range(MAX_RETRIES):
            try:
                page = _fetchJSON(_DETAILS_URL % (server, startDate, endDate, cursor))['collection']
                break
            except Exception:
                sleep(1)
        else:
            print('giving up on %s page at cursor %d' % (server, cursor))
            continue
        records += page
    return records


# Evaluate "today" once so every page of the crawl uses the same date interval.
today = datetime.now().date()

# Probe both servers for their first article date and for the total/page-size
# message of the full date range. Retries until success, pausing between attempts.
while True:
    try:
        date_1st_article_mRxiv = _firstArticleDate('medrxiv')
        date_1st_article_bRxiv = _firstArticleDate('biorxiv')
        medRxivLengthResponse = _fetchJSON(_DETAILS_URL % ('medrxiv', date_1st_article_mRxiv, today, 0))['messages'][0]
        bioRxivLengthResponse = _fetchJSON(_DETAILS_URL % ('biorxiv', date_1st_article_bRxiv, today, 0))['messages'][0]
        break
    except Exception as e:
        print('error probing the bioRxiv/medRxiv details API, retrying: %s' % e)
        sleep(1)

# bioRxiv records first, then medRxiv (same order as before).
data = []
data += _fetchAllDetails('biorxiv', date_1st_article_bRxiv, today, bioRxivLengthResponse)
data += _fetchAllDetails('medrxiv', date_1st_article_mRxiv, today, medRxivLengthResponse)

# Computing derived preprint properties
for entry in tqdm(data):
    if 'authors' in entry:
        # 'authors' is a ';'-separated string; ignore a single trailing ';' when counting.
        authors = entry['authors']
        if authors.endswith(';'):
            authors = authors[:-1]
        entry['pp_num_authors'] = len(authors.split(';'))


100%|██████████| 1339/1339 [56:05<00:00,  2.51s/it]
100%|██████████| 146/146 [01:15<00:00,  1.95it/s]
100%|██████████| 148402/148402 [00:01<00:00, 111648.03it/s]


In [19]:
# Keep only the records whose 'published' field is present and is not the
# placeholder string 'NA'.
updatedDataWithPublished = list(
    filter(lambda record: record.get('published', 'NA') != 'NA', data)
)

In [33]:
# Merge freshly pulled records into dataWithPublished: a record is appended when
# its DOI is not indexed yet, or when its version is newer than every version
# already held for that DOI.
#
# A DOI -> highest-known-version index replaces the two full scans of
# dataWithPublished that were previously made for every incoming record.
latestPulledVersionByDOI = {}
for known in dataWithPublished:
    knownVersion = int(known['version'])
    if knownVersion > latestPulledVersionByDOI.get(known['doi'], knownVersion - 1):
        latestPulledVersionByDOI[known['doi']] = knownVersion

for entry in tqdm(updatedDataWithPublished):
    latestPulledVersion = latestPulledVersionByDOI.get(entry['doi'])
    if latestPulledVersion is None or int(entry['version']) > latestPulledVersion:
        dataWithPublished.append(entry)
        # Keep the index in step with the list so later records for this DOI
        # are compared against the record that was just appended.
        latestPulledVersionByDOI[entry['doi']] = int(entry['version'])

100%|██████████| 63955/63955 [25:39<00:00, 41.55it/s]


In [43]:
# Get data from pubmed for peer-reviewed titles and abstracts
from eutils import Client

ec = Client(api_key=PUBMED_API_KEY)

# Cache of PubMed lookups keyed by the *published* DOI, which is the key the
# lookup loop below uses. (Seeding it under the preprint DOI meant the seeded
# values could never be found by that loop.)
doisProcessed = {}
for entry in tqdm(dataWithPublished):
    if entry.get('pmid') is not None and 'published' in entry:
        doisProcessed[entry['published']] = {
            'pmid': entry['pmid'],
            'pmcid': entry.get('pmcid'),  # tolerate records that carry a pmid but no pmcid key
        }


# Gather all possible PMIDs from pubmed
for entry in tqdm(dataWithPublished):
    if 'pmid' in entry and entry['pmid'] is not None:
        continue
    entry['pmid'] = None
    entry['pmcid'] = None
    doi = entry['published']
    retries = 0

    # Reuse an earlier result for the same published DOI (e.g. another version
    # of the same preprint) instead of querying PubMed again.
    if doi in doisProcessed:
        entry['pmid'] = doisProcessed[doi]['pmid']
        entry['pmcid'] = doisProcessed[doi]['pmcid']
        continue
    doisProcessed[doi] = {'pmid': None, 'pmcid': None}
    while retries < MAX_RETRIES:
        try:
            esr = ec.esearch(db='pubmed', term='%s[Location ID]' % doi)
            # Only an unambiguous single hit is accepted.
            if len(esr.ids) == 1:
                sleep(0.5)
                paset = ec.efetch(db='pubmed', id=esr.ids)
                confirmed = False
                pmcid = None
                # Confirm the hit by comparing the DOI on the fetched record.
                for pa in paset:
                    if pa.doi == doi:
                        confirmed = True
                        pmcid = pa.pmc
                if confirmed:
                    entry['pmid'] = esr.ids[0]
                    entry['pmcid'] = pmcid
                    doisProcessed[doi]['pmid'] = esr.ids[0]
                    doisProcessed[doi]['pmcid'] = pmcid
            break
        except Exception:
            # Exception (not a bare except) so a keyboard interrupt still stops the cell.
            print("error, retrying for doi: %s" % doi)
            retries += 1

print("\n\nFound %d entries with a PMID" % sum(entry.get('pmid') is not None for entry in dataWithPublished))
print("Found %d entries with a PMCID" % sum(entry.get('pmcid') is not None for entry in dataWithPublished))

100%|██████████| 63954/63954 [00:00<00:00, 914570.18it/s]
100%|██████████| 63954/63954 [42:12<00:00, 25.26it/s]

Found 58687 entries with a PMID
Found 45402 entires with a PMCID



In [44]:
def safeAccessInternalElement(element, elementsInPath):
    """Walk down an element tree one child tag at a time.

    Starting at `element`, calls `.find(tag)` for each tag in `elementsInPath`
    in order and descends into the result.

    Returns the element reached at the end of the path, `element` itself when
    the path is empty, or None as soon as any step finds no matching child.
    """
    currElement = element
    for e in elementsInPath:
        currElement = currElement.find(e)
        # Identity test: `find` returns None when there is no match, and a
        # childless Element is falsy, so a truthiness check would be wrong.
        if currElement is None:
            return None
    return currElement

In [45]:
def safeGetFromJSON(jsonToSearch, pathToSearch):
    """Follow a sequence of keys through nested containers.

    Starting at `jsonToSearch`, each element of `pathToSearch` is looked up in
    the current value with `in` and then indexed with `[]`.

    Returns the value reached at the end of the path, `jsonToSearch` itself for
    an empty path, or None when a key is absent. None is also returned when an
    intermediate value cannot be searched or indexed (for example a JSON null,
    number or string part-way down the path) instead of raising.
    """
    head = jsonToSearch
    for el in pathToSearch:
        try:
            if el not in head:
                return None
            head = head[el]
        except (TypeError, KeyError, IndexError):
            # The current value is not a container that supports this lookup.
            return None
    return head

In [46]:
def recursiveGetTextFromNode(node, nodeToCheckForLabel = "nonsense", labelName="nonsense"):
    """Flatten the text of `node` and everything below it into one string.

    For every element yielded by `node.iter()`, in order, the following are
    concatenated:
      * the value of attribute `labelName` followed by a space, when the
        element's tag equals `nodeToCheckForLabel` and the value is non-empty;
      * the element's `text`, when non-empty;
      * the element's `tail` followed by a space, when non-empty.

    Newline characters are then removed and surrounding whitespace stripped.
    """
    pieces = []
    for descendant in node.iter():
        label = None
        if descendant.tag == nodeToCheckForLabel:
            label = descendant.attrib.get(labelName)
        if label:
            pieces.append(label + " ")
        if descendant.text:
            pieces.append(descendant.text)
        if descendant.tail:
            pieces.append(descendant.tail + " ")
    return "".join(pieces).replace('\n', '').strip()

In [47]:
from xml.etree import ElementTree

# For every entry that has a PMID but no PubMed-derived fields yet, fetch the
# PubMed XML record and copy title, abstract, journal name, author count and
# publication-history dates onto the entry.
for entry in tqdm(dataWithPublished):
    pmid = entry['pmid']
    if pmid is None:
        continue
    if 'pub_journal_name' in entry or 'pub_date_accepted' in entry:
        continue # Already obtained results on pubmed for this entry
    retries = 0
    # Bounded retries: a record that fails deterministically (e.g. an unparsable
    # date) previously kept this loop spinning forever.
    while retries < MAX_RETRIES:
        try:
            root = ElementTree.fromstring(requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml&api_key=%s' % (pmid, PUBMED_API_KEY), timeout=100).content)
            articlePath = ['PubmedArticle', 'MedlineCitation', 'Article']
            title = safeAccessInternalElement(root, articlePath + ['ArticleTitle'])
            abstract = safeAccessInternalElement(root, articlePath + ['Abstract'])
            journalName = safeAccessInternalElement(root, articlePath + ['Journal', 'Title'])
            authorList = safeAccessInternalElement(root, articlePath + ['AuthorList'])
            pubHistory = safeAccessInternalElement(root, ['PubmedArticle', 'PubmedData', 'History'])

            # Collect into a scratch dict and apply it only once everything has
            # parsed, so a failure part-way cannot leave a half-populated entry
            # that the "already obtained" check above would skip on the next run.
            pubFields = {}
            if pubHistory is not None:
                for dateEntry in pubHistory.findall('PubMedPubDate'):
                    status = dateEntry.attrib['PubStatus'] if 'PubStatus' in dateEntry.attrib else 'undefined'
                    dayElem = dateEntry.find('Day')
                    pubFields['pub_date_%s' % status] = datetime(
                        year=int(dateEntry.find('Year').text),
                        month=int(dateEntry.find('Month').text),
                        day=(int(dayElem.text) if dayElem is not None else 1),  # missing day -> 1st of the month
                    )
            if journalName is not None:
                pubFields['pub_journal_name'] = recursiveGetTextFromNode(journalName)
            if authorList is not None:
                pubFields['pub_num_authors'] = len(authorList.findall('Author'))
            if title is not None:
                pubFields['pub_title'] = recursiveGetTextFromNode(title)
            if abstract is not None:
                # NOTE(review): sections are concatenated without a separator, so the
                # last word of one section runs into the label of the next. Left as is
                # to stay consistent with records already stored in data.json.
                pubFields['pub_abstract'] = ""
                for abstractTextElem in abstract.findall('AbstractText'):
                    pubFields['pub_abstract'] += recursiveGetTextFromNode(abstractTextElem, "AbstractText", "Label")
            entry.update(pubFields)
            break
        except Exception as e:
            retries += 1
            print('error here, sleeping')
            print(e)
            sleep(1)
    else:
        print('giving up on pmid %s after %d attempts' % (pmid, MAX_RETRIES))


100%|██████████| 63954/63954 [00:13<00:00, 4844.79it/s]


In [55]:
# Gather engagement metrics from the preprint servers
from bs4 import BeautifulSoup
from datetime import timezone

# The month bucket is identified by the timestamp of 00:00 UTC on the 1st of the
# current (local-clock) month -- the same value the previous expression produced.
_now = datetime.now()
currentMonth = _now.month
currentMonthStartTimestamp = datetime(_now.year, _now.month, 1, tzinfo=timezone.utc).timestamp()
currentMonthKey = str(int(currentMonthStartTimestamp))

# metadata['engagement'][<month key>] lists the DOIs already scraped this month.
# It stays a list (it is serialized to JSON); a parallel set gives O(1) lookups.
doisScrapedThisMonth = metadata.setdefault('engagement', {}).setdefault(currentMonthKey, [])
doisScrapedThisMonthSet = set(doisScrapedThisMonth)

# Cache of metrics by preprint DOI, seeded from entries that already carry metrics.
# NOTE(review): because every entry with existing metrics is seeded here, such an
# entry is served from this cache below and never re-scraped in a later month --
# confirm whether a monthly refresh was intended.
doisProcessed = {}
for entry in tqdm(dataWithPublished):
    if 'engagementMetrics' in entry:
        doisProcessed[entry['doi']] = entry['engagementMetrics']

statsTableSelector = '#highwire-highwire-stats-filter-form > div > table'

for entry in tqdm(dataWithPublished):
    preprintDOI = entry['doi']
    if preprintDOI in doisScrapedThisMonthSet:
        continue
    retries = 0
    entry['engagementMetrics'] = {}
    if preprintDOI in doisProcessed:
        entry['engagementMetrics'] = doisProcessed[preprintDOI]
        continue
    while retries < MAX_RETRIES:
        try:
            # Resolve the DOI to the article page; anything but 200 ends the attempt.
            r = requests.get('https://www.doi.org/%s' % preprintDOI, timeout=100)
            if r.status_code != 200:
                break
            resolvedURL = r.url
            metricsRes = requests.get('%s.article-metrics' % resolvedURL, timeout=100)
            metricSoup = BeautifulSoup(metricsRes.content, 'html.parser')
            # The first header column is skipped; every other column is one metric,
            # summed over all table rows.
            numMetrics = len(metricSoup.select('%s > thead > tr > th' % statsTableSelector)) - 1
            # Start from scratch on every attempt so a failed attempt cannot leave
            # partial totals on the entry.
            metrics = {}
            for i in range(2, numMetrics + 2):
                sectionTitle = metricSoup.select('%s > thead > tr > th:nth-child(%d)' % (statsTableSelector, i))[0].text
                values = metricSoup.select('%s > tbody > tr > td:nth-child(%d)' % (statsTableSelector, i))
                metrics[sectionTitle] = sum(int(val.text.replace(',', '')) for val in values)
            entry['engagementMetrics'] = metrics
            doisProcessed[preprintDOI] = metrics
            doisScrapedThisMonth.append(preprintDOI)
            doisScrapedThisMonthSet.add(preprintDOI)
            break
        except Exception as e:
            print(e)
            print('error here, retrying')
            retries += 1


with open('metadata.json', 'w') as f:
    f.write(json.dumps(metadata, indent=4))






100%|██████████| 63954/63954 [00:00<00:00, 1175929.78it/s]
100%|██████████| 63954/63954 [01:27<00:00, 731.35it/s]


In [56]:
# Gather engagement tweet metrics from Altmetric data
import csv

# Index the first entry for each DOI once, instead of scanning the whole of
# dataWithPublished for every CSV row. setdefault keeps the first occurrence,
# matching the earlier "first match wins" behaviour.
firstEntryByDOI = {}
for candidate in dataWithPublished:
    firstEntryByDOI.setdefault(candidate['doi'], candidate)

with open('altmetricData.csv', encoding='mac_roman') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in tqdm(reader):
        if 'DOI' in row:
            entry = firstEntryByDOI.get(row['DOI'])
            if entry is not None:
                # The whole CSV row (a column -> value mapping) is stored on the entry.
                entry['altmetricData'] = row




48272it [11:23, 70.58it/s]


In [61]:
# Export the gathered data to the file system
import time

# Serialize before opening the file: opening with 'w' truncates immediately, so a
# serialization error raised after that point would wipe data.json, which is also
# the input this notebook loads at the top.
# default=str stringifies values json cannot encode natively (the datetime
# objects stored under the pub_date_* keys).
serializedData = json.dumps(dataWithPublished, ensure_ascii=False, indent=4, default=str)
with open('data.json', 'w', encoding='utf-8') as f:
    f.write(serializedData)

metadata['lastUpdated'] = time.strftime("%m/%d/%Y")
serializedMetadata = json.dumps(metadata, indent=4)
with open('metadata.json', 'w') as f:
    f.write(serializedMetadata)

In [62]:
# Get version 1 abstract

# Abstracts of the records whose 'version' is the string '1'.
v1_pp_abstracts = []
for datum in dataWithPublished:
    if datum['version'] == '1':
        v1_pp_abstracts.append(datum['abstract'])

# 'published' values of every record, excluding the placeholder 'NA'.
published = []
for datum in dataWithPublished:
    if datum['published'] != 'NA':
        published.append(datum['published'])

In [65]:
import pandas as pd
# Load the merged records into a DataFrame (one row per record) and print its
# column list, non-null counts and dtypes as a quick completeness check.
a = pd.DataFrame.from_records(dataWithPublished)
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63954 entries, 0 to 63953
Data columns (total 30 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   doi                               63954 non-null  object 
 1   title                             63954 non-null  object 
 2   authors                           63954 non-null  object 
 3   author_corresponding              63954 non-null  object 
 4   author_corresponding_institution  63954 non-null  object 
 5   date                              63954 non-null  object 
 6   version                           63954 non-null  object 
 7   type                              63954 non-null  object 
 8   license                           63954 non-null  object 
 9   category                          63954 non-null  object 
 10  abstract                          63954 non-null  object 
 11  published                         63954 non-null  object 
 12  serv

In [66]:
# Spot check: display the first collected 'published' value.
published[0]

'10.1371/journal.pone.0106541'