In [1]:
import numpy as np
import pandas as pd
import os
from os.path import join

import matplotlib.pyplot as plt

In [17]:
import requests

In [21]:
import xmltodict
import json

In [2]:
dataDir = '../data'

# PMC open access articles

PubMed Central is providing a free open access repository of research articles related to COVID-19. To see that list, see: https://www.ncbi.nlm.nih.gov/pmc/?term=2019-nCoV+OR+2019nCoV+OR+COVID-19+OR+SARS-CoV-2+OR+((wuhan+AND+coronavirus)+AND+2019%2F12%5BPDAT%5D%3A2030%5BPDAT%5D)%20AND%20%22open%20access%22%5BFilter%5D

To create a list of IDs for all articles, go to the bottom of that link, click "Send to:", choose "File" and select "Format: PMCID list". Download the file and move to this repo


In [15]:
# get a list of all PMC IDs
with open(join(dataDir, 'PMCID_list.csv'), 'r') as f:
    pmcIDs = f.read().splitlines()

In [16]:
pmcIDs[:3]

['PMC7359905', 'PMC7359914', 'PMC7568028']

In [174]:
# field-specific functions to retrieve info from the metadata dict
def getJournalTitle(md):
    return md['journal-meta']['journal-title-group']['journal-title']

def getDOI(md):
    artIdList = md['article-meta']['article-id']
    return [x for x in artIdList if x['@pub-id-type'] == 'doi'][0]['#text']

def getPMID(md):
    artIdList = md['article-meta']['article-id']
    return [x for x in artIdList if x['@pub-id-type'] == 'pmid'][0]['#text']

def getArticleTitle(md):
    titleGroup = d['article-meta']['title-group']
    return ': '.join([titleGroup[key] for key in titleGroup.keys()])

def getPubDates(md):
    dateGroup = md['article-meta']['pub-date']
    dates = []
    for date in dateGroup:
        if 'day' in date.keys():
            day = date['day']
        else:
            day = '01'
        dates.append({
            'pubType': date['@pub-type'],
            'date': '-'.join([date['month'].zfill(2), day, date['year']])
        })
    return dates

def getAuthors(md):
    contribGroup = raw['article-meta']['contrib-group']

    # parse affiliation locations
    affs = []
    for aff in contribGroup['aff']:
        affs.append({
            'id': aff['@id'],
            'address': aff['#text']
        })
    
    # parse authors
    authors = []
    for author in contribGroup['contrib']:
        authors.append({
            'surname': author['name']['surname'],
            'givenNames': ' '.join([author['name'][x] for x in author['name'].keys() if x == 'given-names']),
            'aff': author['xref']['@rid']
        })
    
    return {'authors': authors, 'affiliations': affs} 

# helper funcs to parse the metadata
def getField(md, field):
    # get the requested field from the metadata
    fieldFuncs = {
        'title': getArticleTitle,
        'authors': getAuthors,
        'pubDates': getPubDates,
        'pmid': getPMID,
        'doi': getDOI,
        'journalTitle': getJournalTitle
    }
    try:
        return fieldFuncs[field](md)
    except:
        return None

def getArticleMeta(PMCID):
    """
    Return the structured article metadata associated with the given PMCID
    """
    ID = PMCID[3:]
    URL = "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:{}&metadataPrefix=pmc".format(ID)
    page = requests.get(URL)
    if page.status_code != 200:
        return None
    
    # parse XML to convert XML response to metadata dict
    parsed = xmltodict.parse(page.content)
    articleRaw = parsed['OAI-PMH']['GetRecord']['record']['metadata']['article']['front']
    
    # create a dict of all relevent bits
    articleMeta = {}
    articleMeta['PMCID'] = PMCID
    for field in ['title', 'authors', 'pubDates', 'pmid', 'doi', 'journalTitle']:
        articleMeta[field] = getField(articleRaw, field)
    
    return articleMeta



In [175]:
d = getArticleMeta(pmcIDs[0])

In [176]:
d

{'PMCID': 'PMC7359905',
 'title': None,
 'authors': {'authors': [{'surname': 'Buonsenso',
    'givenNames': 'Danilo',
    'aff': 'aff1'},
   {'surname': 'Onesimo', 'givenNames': 'Roberta', 'aff': 'aff1'},
   {'surname': 'Valentini', 'givenNames': 'Piero', 'aff': 'aff1'},
   {'surname': 'Chiaretti', 'givenNames': 'Antonio', 'aff': 'aff1'},
   {'surname': 'Gatto', 'givenNames': 'Antonio', 'aff': 'aff1'},
   {'surname': 'Attinà', 'givenNames': 'Giorgio', 'aff': 'aff1'},
   {'surname': 'Conti', 'givenNames': 'Giorgio', 'aff': 'aff2'},
   {'surname': 'Vento', 'givenNames': 'Giovanni', 'aff': 'aff1'},
   {'surname': 'Cambieri', 'givenNames': 'Andrea', 'aff': 'aff3'},
   {'surname': 'Mercuri', 'givenNames': 'Eugenio', 'aff': 'aff1'},
   {'surname': 'Zampino', 'givenNames': 'Giuseppe', 'aff': 'aff1'}],
  'affiliations': [{'id': 'aff1',
    'address': 'From the Department of Woman and Child Health and Public Health'},
   {'id': 'aff2',
    'address': 'Pediatric Intensive Care Unit, Department o

In [157]:
pmcIDs[1]

'PMC7359914'

# Misc

In [25]:
jd = json.dumps(d)

In [33]:
d['OAI-PMH']['GetRecord']['record'].keys()

odict_keys(['header', 'metadata'])