In [5]:
import numpy as np
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import re
import pickle

import pprint

from Bio import Entrez

In [6]:
# Read the Entrez API key from a text file that lives one directory above the notebook.
with open("../API_ignore.txt", "r") as f:
    lines = f.read()

# The file content is split on ":" and the second field, with surrounding
# whitespace removed, is used as the key.
# NOTE(review): presumably the file looks like "<label>: <key>" — confirm.
entrez_api_key = str.strip(lines.split(":")[1])

## Step 1

First, we use `esearch` to query PubMed for all reviews and systematic reviews on a specific topic that have free full text; the goal is to collect the PMIDs of these papers.

In [7]:
# Endpoint of the E-utilities esearch service; the trailing "?" starts the query string.
esearch_base_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"

# Query-string parameters for esearch, in the order they will appear in the URL.
# The "{}" inside `term` is a placeholder for the search topic and is filled in
# later with str.format().
review_pmids_query_dict = dict(
    db="pubmed",
    sort="relevance",
    retmax="10",
    term="{}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])",
    api_key=entrez_api_key,
)

In [8]:
# Render every parameter as "key=value"; values are used verbatim (no URL-encoding is applied).
joined_terms = ["=".join(pair) for pair in review_pmids_query_dict.items()]

In [9]:
# Full esearch URL template: the endpoint followed by the "&"-separated parameters.
get_review_pmids_query = "".join([esearch_base_query, "&".join(joined_terms)])

In [10]:
# Show the assembled query for inspection, but mask the API key so that the
# secret does not end up stored in the notebook's saved output.
# (The guard avoids str.replace("") semantics if the key happens to be empty.)
get_review_pmids_query.replace(entrez_api_key, "<API_KEY>") if entrez_api_key else get_review_pmids_query

'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&sort=relevance&retmax=10&term={}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])&api_key=<REDACTED>'

In [23]:
#"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=asthma&Review[ptyp]"
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&sort=relevance&retmax=100&term=atrial+fibrillation+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])

#get_review_pmids_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])&api_key={}"

In [9]:
# Search topic for the first query; the words are joined with "+" because the
# value is inserted verbatim into the URL's `term` parameter.
first_search = "+".join(("atrial", "fibrillation"))

In [11]:
# Search topic for the second query; the words are joined with "+" because the
# value is inserted verbatim into the URL's `term` parameter.
second_search = "+".join(("lewy", "body", "dementia"))

In [8]:
# Send the esearch query for the first topic. The response body is XML, which a
# later cell parses into an element tree.
# The URL template contains a single "{}" placeholder (the search term); the API
# key is already embedded in the template, so it is not passed to format().
r = requests.get(get_review_pmids_query.format(first_search))

In [12]:
# Send the esearch query for the second topic. The response body is XML, which a
# later cell parses into an element tree.
# The URL template contains a single "{}" placeholder (the search term); the API
# key is already embedded in the template, so it is not passed to format().
r = requests.get(get_review_pmids_query.format(second_search))

In [13]:
# Parse the XML payload of the esearch response held in `r`.
root = ET.fromstring(r.content)
tree = ET.ElementTree(root)

# Collect the text of every <Id> element found below the root; these are the
# IDs returned by the search.
pmids = root.findall('.//Id')
pmid_list = [id_node.text for id_node in pmids]

## Step 2

Now that we have the PMIDs for the review papers returned by esearch (at most `retmax` of them, which is set to 10 above), we have to convert the PMIDs into PMCIDs. To do this, we use the ID converter provided by the NCBI, as outlined here: https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/

In [14]:
# Each PMID is converted into a PMCID with the NCBI ID converter. The JSON that
# comes back has a 'records' key holding a list of dictionaries:
#   * if the record contains 'errmsg', the conversion request failed;
#   * otherwise, if the record contains 'pmcid', that value is kept.
# The PMCID is what we later use to fetch the XML of the full paper from PMC.

# URL template for the ID converter; "{}" is the placeholder for the PMID.
convert_PMID_query = (
    "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
    "?tool=review_assistant"
    "&email=jl56923@gmail.com"
    "&ids={}"
    "&format=json"
)

In [15]:
# PMCIDs of the papers for which the converter returned one.
pmcid_list = []

for pmid in pmid_list:
    r = requests.get(convert_PMID_query.format(pmid))
    result = r.json()
    records_dict = result['records'][0]

    # A record carrying 'errmsg' means the conversion failed: the paper has no
    # PMCID, so its full text cannot be fetched. Skip it.
    if 'errmsg' in records_dict:
        continue

    if 'pmcid' in records_dict:
        pmcid_list.append(records_dict['pmcid'])

## Step 3

Now that we have the list of pmcids, we can use efetch to get the xml of these papers.

In [16]:
# URL template for fetching a paper's XML from the `pmc` database with efetch;
# "{}" is the placeholder for the PMCID.
get_pmc_xml_query = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    "?db=pmc"
    "&id={}"
    "&tool=review_assistant"
    "&email=jl56923@gmail.com"
)

In [17]:
# Display the PMCIDs gathered by the PMID-to-PMCID conversion loop.
pmcid_list

['PMC6088773',
 'PMC4892610',
 'PMC5840831',
 'PMC5390937',
 'PMC4610749',
 'PMC4275567',
 'PMC5873980',
 'PMC5724510',
 'PMC5912679']

Finding the abstract node and the body node, then joining together the paragraphs under each of them, works relatively well. Rather than only inspecting the text here, we'll write the abstract and body texts to files. We'll also define a helper function that takes an XML node, finds all of its paragraphs, joins them together, and returns a clean string.

In [23]:
def get_paragraphs_as_clean_string(xml_node):
    """Join the text of every <p> element under ``xml_node`` into one clean string.

    Processing steps:
      1. Every <table-wrap> descendant is emptied so that table content is not
         picked up. This modifies ``xml_node`` in place.
      2. The text fragments of each <p> descendant are joined with spaces.
      3. Anything enclosed in square brackets (these tend to be citation
         markers) is removed.
      4. Runs of whitespace are collapsed to a single space and the ends are
         trimmed.

    Args:
        xml_node: an ``xml.etree.ElementTree.Element``, or ``None``.

    Returns:
        str: the cleaned text; an empty string when ``xml_node`` is ``None`` or
        contains no <p> descendants.
    """
    if xml_node is None:
        return ""

    # findall() returns an empty list when there are no tables, so no separate
    # existence check is needed. (Truth-testing the result of find() is
    # unreliable: an Element with no children is falsy.)
    for table in xml_node.findall(".//table-wrap"):
        # Element.clear() also resets the tail, i.e. the text that FOLLOWS the
        # table inside its parent; keep it so surrounding prose is not lost.
        tail = table.tail
        table.clear()
        table.tail = tail

    paragraph_texts = [
        " ".join(paragraph.itertext()) for paragraph in xml_node.findall(".//p")
    ]
    clean_string = " ".join(paragraph_texts)

    # Drop anything inside square brackets, since those tend to be citations.
    clean_string = re.sub(r'\[.*?]', "", clean_string)
    clean_string = re.sub(r'(\s)+', " ", clean_string)

    # Trim last: removing a leading/trailing citation can leave a stray space.
    return clean_string.strip()

In [24]:
def get_article_text_exclude_after_conclusion(body_node):
    """Return the text of the article body up to, but excluding, the conclusion.

    All <sec> descendants of ``body_node`` are visited in document order. The
    first section whose title contains "conclusion" (case-insensitive) marks the
    cut-off: that section and every section after it are left out. If no such
    section exists, all sections are used. If the body has no <sec> elements at
    all, the paragraphs of the body itself are used.

    A section nested inside a section that was already emitted is skipped, so
    its paragraphs are not included a second time. Note that a parent section is
    emitted in full, including any nested subsection.

    Args:
        body_node: the <body> ``Element`` of the article.

    Returns:
        str: the cleaned text of the selected sections, separated by single
        spaces.
    """
    # Use the parameter, not a module-level variable that happens to be named `body`.
    sections = body_node.findall(".//sec")

    if not sections:
        # No section structure: fall back to the paragraphs directly in the body.
        return get_paragraphs_as_clean_string(body_node)

    conclusion_index = len(sections)

    for index, section in enumerate(sections):
        section_title = section.find(".//title")
        if section_title is None:
            continue
        # itertext() also covers titles whose text sits inside inline markup
        # (e.g. <title><italic>Conclusions</italic></title>), where .text is None.
        title_text = "".join(section_title.itertext())
        if "conclusion" in title_text.lower():
            conclusion_index = index
            break

    section_texts = []
    already_covered = set()

    for section in sections[:conclusion_index]:
        if section in already_covered:
            continue
        # iter("sec") yields the section itself and every nested <sec>; their
        # paragraphs are all picked up by the call below.
        already_covered.update(section.iter("sec"))
        section_text = get_paragraphs_as_clean_string(section)
        if section_text:
            section_texts.append(section_text)

    # Join with a space so the last word of one section does not run into the
    # first word of the next.
    return " ".join(section_texts)

In [30]:
# Fetch the XML of every paper in `pmcid_list`, extract the fields of interest
# and pickle one dictionary per paper into the `documents/` directory.
# `j` counts the papers saved so far, so output files are numbered consecutively
# even when some PMCIDs have to be skipped.
j = 0
for i, pmcid in enumerate(pmcid_list):

    r = requests.get(get_pmc_xml_query.format(pmcid))

    tree = ET.ElementTree(ET.fromstring(r.content))
    root = tree.getroot()

    # Check whether the response actually contains an article body; if not,
    # skip this paper. The test is spelled out explicitly (missing OR without
    # child elements) instead of relying on Element truth-testing, which means
    # "has children" and is deprecated.
    body = root.find(".//body")
    if body is None or len(body) == 0:
        print(f"Unable to get text for {pmcid}.")
        continue

    article_text = get_article_text_exclude_after_conclusion(body)

    # The first article-title in the document should be the title of the paper.
    # itertext() keeps titles that contain inline markup (where .text is None).
    title_element = root.find(".//article-title")
    title = "".join(title_element.itertext()) if title_element is not None else ""

    keyword_elements = root.findall(".//kwd")
    keywords = ["".join(keyword.itertext()).lower() for keyword in keyword_elements]

    # Not every article has an abstract; store an empty string in that case.
    abstract = root.find(".//abstract")
    abstract_text = get_paragraphs_as_clean_string(abstract) if abstract is not None else ""

    # Each tuple is (id text, value of the element's first attribute), or None
    # when the element has no attributes.
    # NOTE(review): the first attribute is presumably `pub-id-type` — confirm.
    citations = root.findall(".//pub-id")
    citation_tuples = [
        (citation.text, next(iter(citation.attrib.values()), None))
        for citation in citations
    ]

    paper_dict = {
        "pmcid": pmcid,
        "title": title,
        "abstract_text": abstract_text,
        "article_text": article_text,
        "citation_tuples": citation_tuples,
        # Keywords were extracted above but previously never stored.
        "keywords": keywords
    }

    with open(f"documents/lbd_paper{j+1}_dict.pkl", "wb") as f:
        pickle.dump(paper_dict, f)

    print(f"Successfully pickled dictionary for {pmcid} as lbd_paper{j+1}_dict.pkl.")

    j += 1
        
#     with open(f"documents/af_paper{i+1}_body.txt", "w") as f:
#         f.write(article_text)

Successfully pickled dictionary for PMC6088773 as lbd_paper1_dict.pkl.
Successfully pickled dictionary for PMC4892610 as lbd_paper2_dict.pkl.
Successfully pickled dictionary for PMC5840831 as lbd_paper3_dict.pkl.
Unable to get text for PMC5390937.
Unable to get text for PMC4610749.
Successfully pickled dictionary for PMC4275567 as lbd_paper4_dict.pkl.
Unable to get text for PMC5873980.
Successfully pickled dictionary for PMC5724510 as lbd_paper5_dict.pkl.
Unable to get text for PMC5912679.
