In [257]:
import numpy as np
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import re
import pickle

import pprint

from Bio import Entrez

In [258]:
# Read the NCBI E-utilities API key from a text file one directory above the working directory.
# The key is whatever sits between the first and second ":" in that file, with surrounding
# whitespace removed (presumably a single "label: key" line — TODO confirm the file format).
with open("../API_ignore.txt", "r") as f:
    lines = f.read()

entrez_api_key = lines.split(":")[1].strip()

## Step 1

First, we use esearch to send a query for all reviews & systematic reviews that have free full text for a specific topic; we want to get the PMIDs of these papers.

In [284]:
# Base URL of the esearch endpoint; the query-string parameters are appended to it below.
esearch_base_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"

# Query-string parameters for esearch. All values are strings because they are concatenated
# directly into the URL in the next cells.
review_pmids_query_dict = {
    "db": "pubmed",
    "sort": "relevance",
    "retmax": '100',
    # "{}" is a placeholder for the topic and is filled in later with str.format(). The rest of the
    # term is already URL-encoded by hand ("+" and "%20" for spaces) and is sent as written.
    "term": '{}+AND+(Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb]+AND+"last%205%20years"[PDat]',
    "api_key": entrez_api_key
}

In [285]:
# Turn every parameter into a "key=value" string; the "+" concatenation requires all values to be str.
joined_terms = [k+"="+v for k, v in review_pmids_query_dict.items()] 

In [286]:
# Full esearch URL template; it still contains the "{}" topic placeholder from the "term" parameter.
get_review_pmids_query = esearch_base_query + "&".join(joined_terms)

In [287]:
# Topic to search for. Words are joined with "+" because the string is inserted into the URL as-is.
entity_query = "atrial+fibrillation"

In [288]:
# File-name friendly version of the topic ("+" replaced by "_"); used for the pickle file name at the end.
noun_for_filename = entity_query.replace("+", "_")

In [289]:
# Preview the final esearch URL. The API key is masked so that the secret is not stored in the
# notebook's saved output. (The template has a single "{}" placeholder, so only the topic is passed.)
preview_url = get_review_pmids_query.format(entity_query)
if entrez_api_key:
    # Guard against an empty key: str.replace("", ...) would insert the mask between every character.
    preview_url = preview_url.replace(entrez_api_key, "<API_KEY_REDACTED>")
preview_url

'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&sort=relevance&retmax=100&term=atrial+fibrillation+AND+(Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb]+AND+"last%205%20years"[PDat]&api_key=<API_KEY_REDACTED>'

In [290]:
# Once we send this query to pubmed using eutils, we get back an xml object which we can store in a tree.
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
r = requests.get(get_review_pmids_query.format(entity_query), timeout=30)
# Stop here on an HTTP error instead of trying to parse an error page as search results.
r.raise_for_status()

In [291]:
# Parse the esearch response and keep both the root element and a tree wrapper around it.
root = ET.fromstring(r.content)
tree = ET.ElementTree(root)

# Every <Id> element found anywhere below the root is collected; its text goes into pmid_list.
pmids = root.findall('.//Id')
pmid_list = [id_element.text for id_element in pmids]

## Step 2

Now that we have the PMIDs of the review papers returned by esearch (at most 100, the value of `retmax`), we have to convert the PMIDs into PMCIDs. To do this, we use the ID converter provided by the NCBI, as outlined here: https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/

In [292]:
# Let's convert each PMID into a PMCID. The JSON returned by this request is expected to have a key
# 'records' holding a list of dictionaries. If the first dictionary has a key 'errmsg', the conversion
# failed. Otherwise, if it has a key 'pmcid', we grab that value and store it. We'll use that PMCID to
# query PMC to fetch the xml of the full paper.
# The "{}" placeholder in the URL template below is filled with a PMID via str.format().

convert_PMID_query = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=review_assistant&email=jl56923@gmail.com&ids={}&format=json"

In [293]:
# Convert every PMID to a PMCID, one request per PMID, and keep the ones that have a PMCID.
pmcid_list = []

for pmid in log_progress(pmid_list, every=1):
    try:
        # The timeout keeps one stalled request from hanging the whole loop.
        r = requests.get(convert_PMID_query.format(pmid), timeout=30)
        r.raise_for_status()
        result = r.json()
    except (requests.RequestException, ValueError) as error:
        # Network problem, HTTP error or a non-JSON body: skip this PMID but say so.
        print(f"Unable to convert PMID {pmid}: {error}")
        continue

    # An error payload may come back without a usable 'records' list; treat that as "no PMCID".
    records = result.get('records') if isinstance(result, dict) else None
    if not records:
        continue
    records_dict = records[0]

    # If there is an error message in the records dictionary that gets returned with the result, then this
    # paper does not have a PMCID and we are not going to be able to get the full text of this paper.
    if 'errmsg' in records_dict:
        continue
    if 'pmcid' in records_dict:
        pmcid_list.append(records_dict['pmcid'])

VBox(children=(HTML(value=''), IntProgress(value=0)))

## Step 3

Now that we have the list of pmcids, we can use efetch to get the xml of these papers.

In [294]:
# efetch URL template for the "pmc" database; the "{}" placeholder is filled with a PMCID via str.format().
get_pmc_xml_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={}&tool=review_assistant&email=jl56923@gmail.com"

Okay, so first finding the abstract and then the body nodes, and then going through each of those and joining together the paragraphs seems to work relatively well. We'll go ahead and write the abstract and body texts to files instead. We'll also define a function that can take an XML node, look for all the paragraphs, join them together and return a clean string.

In [295]:
def _iter_text_outside_tables(element):
    """Yield the text fragments of ``element`` in document order, skipping ``table-wrap`` subtrees."""
    if element.text:
        yield element.text
    for child in element:
        # Comments and processing instructions have a non-string tag and carry no prose.
        if isinstance(child.tag, str) and child.tag != "table-wrap":
            yield from _iter_text_outside_tables(child)
        # The tail is text of the surrounding element, so it is kept even when the child is skipped.
        if child.tail:
            yield child.tail


def _iter_paragraph_texts(node):
    """Yield one string per outermost ``p`` descendant of ``node`` that is not inside a ``table-wrap``."""
    for child in node:
        if child.tag == "table-wrap":
            continue
        if child.tag == "p":
            # Everything inside this paragraph (including nested paragraphs, e.g. list items)
            # is collected here once, so we do not descend looking for further <p> elements.
            yield " ".join(_iter_text_outside_tables(child))
        else:
            yield from _iter_paragraph_texts(child)


def get_paragraphs_as_clean_string(xml_node):
    """Join the text of all paragraphs below ``xml_node`` into one cleaned-up string.

    Content of ``table-wrap`` elements is left out, anything inside square brackets
    (which tends to be citations) is removed, and runs of whitespace are collapsed
    to a single space.

    Parameters
    ----------
    xml_node : xml.etree.ElementTree.Element or None
        Node whose ``p`` descendants should be collected. ``None`` (e.g. the result of a
        failed ``find``) is accepted.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``xml_node`` is ``None`` or contains no paragraphs.

    Notes
    -----
    * The node is tested with ``is None`` rather than by truth value: the truth value of
      an Element depends on its number of children and testing it is deprecated.
    * The input tree is not modified. Tables are skipped while walking instead of being
      cleared, so text that follows a table inside a paragraph is kept.
    * Text fragments are joined with a space, so inline markup introduces a space
      between the fragments around it.
    """
    if xml_node is None:
        return ""

    clean_string = " ".join(_iter_paragraph_texts(xml_node))

    # We'll get rid of anything inside of square brackets, since those tend to be the citations.
    clean_string = re.sub(r'\[.*?]', "", clean_string)
    clean_string = re.sub(r'(\s)+', " ", clean_string)

    # Removing a leading/trailing citation can leave a blank at the edge, so strip last.
    return clean_string.strip()

In [296]:
def get_article_text_exclude_after_conclusion(body_node):
    """Return the article text that precedes the first conclusion/discussion section.

    All ``sec`` descendants of ``body_node`` are visited in document order. The first
    section whose title contains "conclusion" or "discussion" (case-insensitive) and
    every section after it are dropped; the remaining sections are converted to text
    with ``get_paragraphs_as_clean_string`` and joined with a single space.

    Parameters
    ----------
    body_node : xml.etree.ElementTree.Element
        The ``body`` element of the article.

    Returns
    -------
    str
        The concatenated section texts; ``""`` if there is no section before the cut-off.

    Notes
    -----
    * The title element is compared against ``None`` explicitly. An Element without
      children is falsy, and a plain-text ``<title>`` has no children, so a truth-value
      test would skip the title check.
    * The whole title text is used (``itertext``), so titles wrapped in inline markup
      are recognised as well.
    * NOTE(review): ``.//sec`` also matches nested sections, so paragraphs of a
      sub-section are collected with their parent section and again on their own.
      Left as is here; confirm whether downstream code depends on it.
    """
    stop_markers = ("conclusion", "discussion")
    sections = body_node.findall(".//sec")

    conclusion_index = len(sections)

    for index, section in enumerate(sections):
        section_title = section.find(".//title")
        if section_title is None:
            continue
        title_text = "".join(section_title.itertext()).lower()
        if any(marker in title_text for marker in stop_markers):
            conclusion_index = index
            break

    section_texts = (
        get_paragraphs_as_clean_string(section)
        for section in sections[:conclusion_index]
    )
    # Join with a space so the last word of one section does not run into the first word of the next.
    return " ".join(text for text in section_texts if text)

In [297]:
# pmcid_list[4]

In [298]:
# r = requests.get(get_pmc_xml_query.format(pmcid_list[4]))

# tree = ET.ElementTree(ET.fromstring(r.content))
# root = tree.getroot()

In [299]:
# def depth_iter(element, tag=None):
#     stack = []
#     stack.append(iter([element]))
#     while stack:
#         e = next(stack[-1], None)
#         if e == None:
#             stack.pop()
#         else:
#             stack.append(iter(e))
#             if tag == None or e.tag == tag:
#                 yield (e, stack.__len__()-1)

In [300]:
# def perf_func(elem, func, level=0):
#     func(elem,level)
#     for child in elem.getchildren():
#         perf_func(child, func, level+1)

# def print_level(elem,level):
#     print('-'*level+elem.tag)

In [301]:
# subject_test = root.findall(".//subject")

In [302]:
# subj_group_test = root.findall(".//subj-group")

In [303]:
# perf_func(root, print_level)

In [304]:
# keywords

In [305]:
# Download the full-text XML of every PMCID and turn each usable article into a dictionary with
# its id, title, keywords, abstract text, article text (up to the conclusion) and cited ids.
entity_dictionary_list = []

for pmcid in log_progress(pmcid_list, every=1):

    try:
        # The timeout keeps one stalled download from hanging the whole harvest.
        r = requests.get(get_pmc_xml_query.format(pmcid), timeout=60)
        root = ET.fromstring(r.content)
    except (requests.RequestException, ET.ParseError) as error:
        # Network failure or a response that is not well-formed XML: skip this paper.
        print(f"Unable to get text for {pmcid}: {error}")
        continue
    tree = ET.ElementTree(root)

    # Check to see if you can even get the XML from PMC; if not, then skip the paper. If you can, then
    # go ahead and build the dictionary. The body is tested explicitly for "missing or without children"
    # instead of relying on the (deprecated) truth value of an Element.
    body = root.find(".//body")
    if body is None or len(body) == 0:
        print(f"Unable to get text for {pmcid}.")
        continue
    article_text = get_article_text_exclude_after_conclusion(body)

    # We'll find the first article-title in the text, which should be the title of the paper.
    # itertext() keeps titles intact when they contain inline markup, and a missing title
    # gives "" instead of raising AttributeError.
    title_element = root.find(".//article-title")
    if title_element is None:
        title = ""
    else:
        title = " ".join("".join(title_element.itertext()).split())

    # Lower-cased keyword texts; papers without <kwd> elements simply get an empty list.
    # (An earlier idea, kept for reference: fall back to childless <subject> elements.)
    keywords = [keyword.text.lower() for keyword in root.findall(".//kwd") if keyword.text]

    abstract = root.find(".//abstract")
    abstract_text = get_paragraphs_as_clean_string(abstract)

    # Each cited id is stored together with its type. The "pub-id-type" attribute is preferred;
    # otherwise the first attribute value is used, and None when the element has no attributes.
    citation_tuples = [
        (
            citation.text,
            citation.get("pub-id-type") or next(iter(citation.attrib.values()), None),
        )
        for citation in root.findall(".//pub-id")
    ]

    paper_dict = {
        "pmcid": pmcid,
        "title": title,
        "keywords": keywords,
        "abstract_text": abstract_text,
        "article_text": article_text,
        "citation_tuples": citation_tuples
    }

    entity_dictionary_list.append(paper_dict)

    print(f"Successfully appended dictionary for {pmcid} to entity_dictionary_list.")

VBox(children=(HTML(value=''), IntProgress(value=0, max=64)))

Successfully appended dictionary for PMC4952027 to entity_dictionary_list.
Unable to get text for PMC5942796.
Successfully appended dictionary for PMC5598874 to entity_dictionary_list.
Successfully appended dictionary for PMC4766963 to entity_dictionary_list.
Successfully appended dictionary for PMC5658096 to entity_dictionary_list.
Successfully appended dictionary for PMC5560908 to entity_dictionary_list.
Successfully appended dictionary for PMC5089715 to entity_dictionary_list.
Unable to get text for PMC5071280.
Successfully appended dictionary for PMC5079045 to entity_dictionary_list.
Unable to get text for PMC5427484.
Successfully appended dictionary for PMC5543536 to entity_dictionary_list.
Successfully appended dictionary for PMC5340010 to entity_dictionary_list.
Unable to get text for PMC5122472.
Successfully appended dictionary for PMC5442605 to entity_dictionary_list.
Successfully appended dictionary for PMC5380695 to entity_dictionary_list.
Successfully appended dictionary fo

In [306]:
# Quick sanity check: list the titles of all papers that were collected above.
[entity_dictionary['title'] for entity_dictionary in entity_dictionary_list]

['Accuracy of methods for detecting an irregular pulse and suspected atrial fibrillation: A systematic review and meta-analysis',
 'Atrial fibrillation and the risk for myocardial infarction, all-cause mortality and heart failure: A systematic review and meta-analysis',
 'European Primary Care Cardiovascular Society (EPCCS) consensus guidance on stroke prevention in atrial fibrillation (SPAF) in primary care',
 'The effects of rhythm control strategies versus rate control strategies for atrial fibrillation and atrial flutter: A systematic review with meta-analysis and Trial Sequential Analysis',
 'Atrial fibrillation and hyperthyroidism: A literature review',
 'Patient-Reported Outcomes for Quality of Life Assessment in Atrial Fibrillation: A Systematic Review of Measurement Properties',
 'Relation of Body Mass Index With Adverse Outcomes Among Patients With Atrial Fibrillation: A Meta‐Analysis and Systematic Review',
 'Measuring the effect of nurse practitioner (NP)-led care on health

Now we'll go ahead and pickle this entity dictionary list, which we can load in our other notebooks, and we also don't have to keep track of how many paper dictionaries we got, since when we load the list in the other notebook, we can just iterate over the list or ask the list how long it is.

In [307]:
# Write the whole list of paper dictionaries to one pickle file named after the topic.
# open() does not create directories, so "documents/" must already exist in the working directory.
with open(f"documents/{noun_for_filename}_review_paper_dictionary_list.pkl", "wb") as picklefile:
    pickle.dump(entity_dictionary_list, picklefile)

In [308]:
# Display the file-name stem that was used for the pickle file above.
noun_for_filename

'atrial_fibrillation'

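## Progress tracker

`log_progress` is used by the loops in Steps 2 and 3 above, so on a fresh kernel this cell has to be run before them (its lower execution count shows it was run first here).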

In [255]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` unchanged while showing a progress widget.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over. If it has no ``len()`` and ``size`` is not given, it is
        treated as an open-ended iterator and ``every`` must be supplied.
    every : int, optional
        Refresh the widget on the first item and then on every ``every``-th item.
        When omitted and the size is known: 1 for up to 200 items, otherwise
        ``int(size / 200)``.
    size : int, optional
        Total number of items; defaults to ``len(sequence)`` when available.
    name : str
        Label shown in front of the counter.

    Yields
    ------
    The items of ``sequence``, in order.

    Raises
    ------
    AssertionError
        If the size is unknown and ``every`` was not given.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Determine the total; something without len() counts as an open-ended iterator.
    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    # Known size: a bar that fills up. Unknown size: a full bar in "info" style.
    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    def refresh(count):
        # Update the label (and, for a known size, the bar) to reflect `count` items.
        if is_iterator:
            label.value = '{name}: {index} / ?'.format(name=name, index=count)
            return
        progress.value = count
        label.value = u'{name}: {index} / {size}'.format(name=name, index=count, size=size)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                refresh(index)
            yield record
    except BaseException:
        # Anything raised while iterating (including the generator being closed early)
        # turns the bar red before being propagated.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))