In [2]:
import numpy as np
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import re

import pprint

from Bio import Entrez

In [3]:
# Read the Entrez API key from a file that lives one directory above the notebook, so the
# key itself never appears in the notebook source.
with open("../API_ignore.txt", "r") as f:
    lines = f.read()

# Keep the text between the first and second ":" of the file contents, minus surrounding
# whitespace. Assumes the file looks like "<label>: <key>" -- TODO confirm against the file.
entrez_api_key = lines.split(":")[1].strip()

## Step 1

First, we use esearch to send a query for all reviews & systematic reviews that have free full text for a specific topic; we want to get the PMIDs of these papers.

In [108]:
# esearch endpoint; the query string is appended directly after the trailing "?".
esearch_base_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    
# Query parameters for the review search. The values are kept as strings because they are
# concatenated into "key=value" pairs when the URL is assembled.
review_pmids_query_dict = {
    "db": "pubmed",
    "sort": "relevance",
    "retmax": '10',
    # "{}" is a str.format placeholder; it is filled with the search topic when the request is sent.
    "term": "{}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])",
    "api_key": entrez_api_key
}

In [109]:
# Build the "key=value" pairs of the query string. An f-string is used instead of
# `k + "=" + v` so that a non-string value (e.g. retmax given as an int) does not raise TypeError.
joined_terms = [f"{key}={value}" for key, value in review_pmids_query_dict.items()]

In [113]:
get_review_pmids_query = esearch_base_query + "&".join(joined_terms)

In [104]:
review_pmids_query_dict

{'sort': 'relevance',
 'retmax': 100,
 'term': '{}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])',
 'api_key': '<REDACTED>'}

In [23]:
#"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=asthma&Review[ptyp]"
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&sort=relevance&retmax=100&term=atrial+fibrillation+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])

# NOTE(review): this assignment replaces the `get_review_pmids_query` that was assembled from
# `review_pmids_query_dict` earlier in the notebook, so when the cells run top to bottom the
# request goes out without the `sort` and `retmax` parameters -- confirm which definition is intended.
# The two "{}" placeholders are filled with the search term and the API key, in that order.
get_review_pmids_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])&api_key={}"

In [114]:
first_search = "atrial+fibrillation"

In [115]:
# Once we send this query to pubmed using eutils, we get back an xml object which we can store in a tree.
r = requests.get(get_review_pmids_query.format(first_search, entrez_api_key))
# Fail here on an HTTP error (bad key, rate limit, ...) instead of handing an error page
# to the XML parser in the next cell.
r.raise_for_status()

In [116]:
# Parse the esearch response. `root` is the parsed document element and `tree` wraps it.
root = ET.fromstring(r.content)
tree = ET.ElementTree(root)

# Collect every <Id> element of the response and keep just the text of each one.
pmids = root.findall('.//Id')
pmid_list = [id_element.text for id_element in pmids]

## Step 2

Now that we have the PMIDs for the 20 review papers returned by esearch, we have to convert the PMIDs into PMCIDs. In order to convert the PMIDs to PMCIDs, we have to use the ID converter provided by the NCBI, as outlined here: https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/

In [118]:
# Let's convert each PMID into a PMCID. The JSON that is returned from this request always has a key 'records'.
# Check the dictionary inside of 'records'; if there is a key called 'errmsg', then you know that
# the convert request failed. Otherwise, check to see if the dictionary inside of records has a key called 
# 'pmcid'. If it does, grab the value of the key 'pmcid' and store it. We'll use that PMCID to query PMC to
# fetch the xml of the full paper.

convert_PMID_query = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=review_assistant&email=jl56923@gmail.com&ids={}&format=json"

In [119]:
pmcid_list = []

for pmid in pmid_list:
    r = requests.get(convert_PMID_query.format(pmid))
    result = r.json()

    # Do not assume the response carries a non-empty 'records' list: skip this PMID instead of
    # aborting the whole loop with a KeyError/IndexError when it does not.
    records = result.get('records') or []
    if not records:
        continue
    records_dict = records[0]

    # If there is an error message in the record, then this paper does not have a PMCID and we
    # are not going to be able to get its full text. Otherwise keep the PMCID when one is present.
    if 'errmsg' not in records_dict and 'pmcid' in records_dict:
        pmcid_list.append(records_dict['pmcid'])

## Step 3

Now that we have the list of PMCIDs, we can use efetch to get the XML of these papers.

In [93]:
get_pmc_xml_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={}&tool=review_assistant&email=jl56923@gmail.com"

In [421]:
pmcid_list

['PMC5598874',
 'PMC5752005',
 'PMC5658096',
 'PMC5560908',
 'PMC5089715',
 'PMC5071280',
 'PMC5079045',
 'PMC4952027',
 'PMC5543536',
 'PMC5340010',
 'PMC5442605',
 'PMC5380695',
 'PMC5286679',
 'PMC4766963',
 'PMC5942796',
 'PMC5321114',
 'PMC5382469',
 'PMC5500874',
 'PMC5465041',
 'PMC4599513',
 'PMC5346472',
 'PMC4937957',
 'PMC5585859',
 'PMC5586302',
 'PMC4819630',
 'PMC5046840',
 'PMC4547665',
 'PMC4547682',
 'PMC5427484',
 'PMC5622555',
 'PMC5843263',
 'PMC5704695',
 'PMC5122472',
 'PMC5345987',
 'PMC5108192',
 'PMC4189345',
 'PMC5726608',
 'PMC4472367',
 'PMC4547664',
 'PMC4547683',
 'PMC4110594',
 'PMC4732179',
 'PMC5482349',
 'PMC5656712',
 'PMC4630199',
 'PMC6059525',
 'PMC4764082',
 'PMC4642960',
 'PMC4788372',
 'PMC5588987',
 'PMC5933600',
 'PMC4724415',
 'PMC4246362',
 'PMC4814009',
 'PMC5457732',
 'PMC5487882',
 'PMC5382449',
 'PMC5403606',
 'PMC4329775',
 'PMC4957677',
 'PMC4051329',
 'PMC4731871',
 'PMC4547684',
 'PMC5234257',
 'PMC6009792',
 'PMC5331111']

In [422]:
r = requests.get(get_pmc_xml_query.format(pmcid_list[0]))

In [423]:
tree = ET.ElementTree(ET.fromstring(r.content))
root = tree.getroot()

In [424]:
ET.dump(tree)

<pmc-articleset xmlns:ns0="http://www.w3.org/1999/xlink"><article article-type="review-article">
  
  <front>
    <journal-meta>
      <journal-id journal-id-type="nlm-ta">Eur J Prev Cardiol</journal-id>
      <journal-id journal-id-type="iso-abbrev">Eur J Prev Cardiol</journal-id>
      <journal-id journal-id-type="publisher-id">CPR</journal-id>
      <journal-id journal-id-type="hwp">spcpr</journal-id>
      <journal-title-group>
        <journal-title>European Journal of Preventive Cardiology</journal-title>
      </journal-title-group>
      <issn pub-type="ppub">2047-4873</issn>
      <issn pub-type="epub">2047-4881</issn>
      <publisher>
        <publisher-name>SAGE Publications</publisher-name>
        <publisher-loc>Sage UK: London, England</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="pmid">28617620</article-id>
      <article-id pub-id-type="pmc">5598874</article-id>
      <article-id pub-id-type="doi">10.1177/20474

<table-wrap id="table2-2047487317715769" orientation="portrait" position="float"><label>Table 2.</label><caption><p>Inclusion criteria and results in the 15 studies included in the meta-analysis.</p></caption><alternatives><graphic specific-use="table2-2047487317715769" ns0:href="10.1177_2047487317715769-table2" /><table frame="hsides" rules="groups"><thead align="left" valign="top"><tr><th colspan="1" rowspan="1">Study</th><th colspan="1" rowspan="1">Inclusion criteria and design</th><th colspan="1" rowspan="1">Estimates (95% CI) % AF patients treated with OACs</th></tr></thead><tbody align="left" valign="top"><tr><td colspan="1" rowspan="1">Otterstad et al.<sup><xref ref-type="bibr" rid="bibr27-2047487317715769">27</xref></sup></td><td colspan="1" rowspan="1">In 7665 patients with stable CHD the prognostic impact of baseline and new onset AF was assessed in comparison with non-AF patients in a randomised trial (ACTION)</td><td colspan="1" rowspan="1">HR (MI; HF; mortality): Prevalent

</th><th colspan="1" rowspan="1">Adj. <italic>R</italic><sup>2</sup> (%)<sup><xref ref-type="table-fn" rid="table-fn9-2047487317715769">b</xref></sup></th></tr></thead><tbody align="left" valign="top"><tr><td colspan="1" rowspan="1">Endpoint myocardial infarction</td></tr><tr><td colspan="1" rowspan="1"> Covariates</td><td colspan="1" rowspan="1" /><td colspan="1" rowspan="1" /><td colspan="1" rowspan="1" /><td colspan="1" rowspan="1" /><td colspan="1" rowspan="1" /><td colspan="1" rowspan="1" /><td colspan="1" rowspan="1" /><td colspan="1" rowspan="1" /></tr><tr><td colspan="1" rowspan="1">  None</td><td colspan="1" rowspan="1">16</td><td colspan="1" rowspan="1" /><td colspan="1" rowspan="1">0.4296</td><td colspan="1" rowspan="1">0.1133</td><td colspan="1" rowspan="1">3.79</td><td colspan="1" rowspan="1">0.002</td><td colspan="1" rowspan="1">0.1395</td><td colspan="1" rowspan="1">−</td></tr><tr><td colspan="1" rowspan="1">  Coronary heart disease at baseline</td><td colspan="1" rowspa

        <mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Moher</surname><given-names>D</given-names></name><name><surname>Liberati</surname><given-names>A</given-names></name><name><surname>Tetzlaff</surname><given-names>J</given-names></name><etal>et al.</etal></person-group>
<article-title>Preferred reporting items for systematic reviews and meta-analyses: the PRISMA statement</article-title>. <source>J Clin Epidemiol</source>
<year>2009</year>; <volume>62</volume>: <fpage>1006</fpage>–<lpage>1012</lpage>.<pub-id pub-id-type="pmid">19631508</pub-id></mixed-citation>
      </ref>
      <ref id="bibr7-2047487317715769">
        <label>7</label>
        <mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aune</surname><given-names>E</given-names></name><name><surname>Endresen</surname><given-names>K</given-names></name><name><surname>Fox</surname><given-names>KA</given-names></name><etal>et 

In [434]:
# Now, let's see if we can remove all of the tables from this tree.

# ElementTree reference: https://docs.python.org/3.3/library/xml.etree.elementtree.html

# Walk over every <table-wrap> anywhere below the root, print its tag and attributes for
# inspection, and then empty it in place.
for table in root.findall('.//table-wrap'):
    print(table.tag)
    print(table.attrib)
    print("-----\n")
    # Element.clear() removes all sub-elements, but it also resets the element's attributes,
    # text and tail. The tail is the text that follows the table inside its parent, so that
    # text is dropped along with the table content.
    table.clear()
    # Element.remove() only removes a direct child of the element it is called on. The tables
    # found by './/table-wrap' are nested deeper than root's own children in this document,
    # which is presumably why the call below was abandoned.
    #root.remove(table)

table-wrap
{'id': 'table1-2047487317715769', 'orientation': 'portrait', 'position': 'float'}
-----

table-wrap
{'id': 'table2-2047487317715769', 'orientation': 'portrait', 'position': 'float'}
-----

table-wrap
{'id': 'table3-2047487317715769', 'orientation': 'portrait', 'position': 'float'}
-----

table-wrap
{'id': 'table4-2047487317715769', 'orientation': 'portrait', 'position': 'float'}
-----



In [435]:
ET.dump(tree)

<pmc-articleset xmlns:ns0="http://www.w3.org/1999/xlink"><article article-type="review-article">
  
  <front>
    <journal-meta>
      <journal-id journal-id-type="nlm-ta">Eur J Prev Cardiol</journal-id>
      <journal-id journal-id-type="iso-abbrev">Eur J Prev Cardiol</journal-id>
      <journal-id journal-id-type="publisher-id">CPR</journal-id>
      <journal-id journal-id-type="hwp">spcpr</journal-id>
      <journal-title-group>
        <journal-title>European Journal of Preventive Cardiology</journal-title>
      </journal-title-group>
      <issn pub-type="ppub">2047-4873</issn>
      <issn pub-type="epub">2047-4881</issn>
      <publisher>
        <publisher-name>SAGE Publications</publisher-name>
        <publisher-loc>Sage UK: London, England</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="pmid">28617620</article-id>
      <article-id pub-id-type="pmc">5598874</article-id>
      <article-id pub-id-type="doi">10.1177/20474

<year>2011</year>; <volume>343</volume>: <fpage>d4002</fpage>–<lpage>d4002</lpage>.<pub-id pub-id-type="pmid">21784880</pub-id></mixed-citation>
      </ref>
      <ref id="bibr19-2047487317715769">
        <label>19</label>
        <mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rohla</surname><given-names>M</given-names></name><name><surname>Vennekate</surname><given-names>CK</given-names></name><name><surname>Tentzeris</surname><given-names>I</given-names></name><etal>et al.</etal></person-group>
<article-title>Long-term mortality of patients with atrial fibrillation undergoing percutaneous coronary intervention with stent implantation for acute and stable coronary artery disease</article-title>. <source>Int J Cardiol</source>
<year>2015</year>; <volume>184</volume>: <fpage>108</fpage>–<lpage>114</lpage>.<pub-id pub-id-type="pmid">25700281</pub-id></mixed-citation>
      </ref>
      <ref id="bibr20-2047487317715769">
        <label

In [432]:
parent_map = {c:p for p in tree.iter() for c in p}

In [433]:
parent_map

{<Element 'article' at 0x10c77e548>: <Element 'pmc-articleset' at 0x10c8d4ae8>,
 <Element 'front' at 0x10c918228>: <Element 'article' at 0x10c77e548>,
 <Element 'body' at 0x10a251d18>: <Element 'article' at 0x10c77e548>,
 <Element 'back' at 0x10c98c908>: <Element 'article' at 0x10c77e548>,
 <Element 'journal-meta' at 0x10c7258b8>: <Element 'front' at 0x10c918228>,
 <Element 'article-meta' at 0x10c914548>: <Element 'front' at 0x10c918228>,
 <Element 'journal-id' at 0x10c722278>: <Element 'journal-meta' at 0x10c7258b8>,
 <Element 'journal-id' at 0x10c7224a8>: <Element 'journal-meta' at 0x10c7258b8>,
 <Element 'journal-id' at 0x10c6f23b8>: <Element 'journal-meta' at 0x10c7258b8>,
 <Element 'journal-id' at 0x10c6f22c8>: <Element 'journal-meta' at 0x10c7258b8>,
 <Element 'journal-title-group' at 0x10c6cef48>: <Element 'journal-meta' at 0x10c7258b8>,
 <Element 'issn' at 0x10c6ced18>: <Element 'journal-meta' at 0x10c7258b8>,
 <Element 'issn' at 0x10c6ceea8>: <Element 'journal-meta' at 0x10c72

In [367]:
title = root.find('.//article-title')

In [368]:
title.text

'Decision-Making in Clinical Practice: Oral Anticoagulant Therapy in Patients with Non-valvular Atrial Fibrillation and a Single Additional Stroke Risk Factor'

In [340]:
root = tree.getroot()

In [371]:
sections = root.findall(".//sec")

In [372]:
for section in sections:
    print(section.tag)
    print(section.attrib)
    print(list(section))
    section_title = section.find(".//title")
    print(section_title.text)
    print("----\n")

sec
{'id': 'Sec1', 'sec-type': 'introduction'}
[<Element 'title' at 0x10c9b2958>, <Element 'p' at 0x10c9b2cc8>, <Element 'p' at 0x10c732a98>, <Element 'p' at 0x10c732ae8>, <Element 'p' at 0x10c732ef8>, <Element 'p' at 0x10c9b6188>]
Introduction
----

sec
{'id': 'Sec2'}
[<Element 'title' at 0x10c9b6d18>, <Element 'p' at 0x10c9b6048>, <Element 'p' at 0x10c6dc188>, <Element 'p' at 0x10c7644f8>, <Element 'p' at 0x10c764188>, <Element 'p' at 0x10c764228>]
Stroke Risk Stratification and Thromboprophylaxis in Patients with Atrial Fibrillation
----

sec
{'id': 'Sec3'}
[<Element 'title' at 0x10c761868>, <Element 'p' at 0x10c7619f8>, <Element 'p' at 0x10c7614a8>, <Element 'p' at 0x10c719db8>, <Element 'p' at 0x10c719048>, <Element 'p' at 0x10c7829a8>, <Element 'p' at 0x10c688b88>, <Element 'p' at 0x10c688638>, <Element 'p' at 0x10c7c7db8>]
“Real-World” Rates of Stroke in Non-anticoagulated Patients with Atrial Fibrillation and One Additional Stroke Risk Factor
----

sec
{'id': 'Sec4'}
[<Element 

In [381]:
# let's get the number of the section that has a title that contains 'Conclusion', because we want to ignore
# all the sections after that.
# Default to len(sections) so that an article without a conclusion section keeps all of its
# sections; with a default of 0, sections[:conclusion_index] would be empty.
conclusion_index = len(sections)

for index, section in enumerate(sections):
    section_title = section.find(".//title")
    # A <sec> can lack a <title>, and a <title> can have no direct text; treat both as an
    # empty title instead of raising AttributeError.
    title_text = ((section_title.text if section_title is not None else None) or "").lower()
    if "conclusion" in title_text:
        print(index)
        conclusion_index = index
    print(title_text)
    print("----\n")

introduction
----

stroke risk stratification and thromboprophylaxis in patients with atrial fibrillation
----

“real-world” rates of stroke in non-anticoagulated patients with atrial fibrillation and one additional stroke risk factor
----

effectiveness of oac in af patients with a single additional stroke risk factor in “real-world” observational studies
----

patient values and preferences
----

5
conclusion
----

disclosures
----

compliance with ethics guidelines
----

data availability
----

open access
----



In [401]:
article_text = ""
section_text = ""

# Concatenate the paragraph text of every section that comes before the conclusion.
for section in sections[:conclusion_index]:
    section_paragraphs = section.findall(".//p")

    for paragraph in section_paragraphs:
        section_text += " ".join(paragraph.itertext())
        section_text += " "

    section_text = section_text.strip()
    # Put a space between consecutive sections; without it the last word of one section is
    # glued to the first word of the next.
    if article_text and section_text:
        article_text += " "
    article_text += section_text
    section_text = ""

In [402]:
# Let's also get rid of all of the references that are in square brackets, if there are any.
article_text = re.sub(r' \[.*?]', "", article_text)
article_text = re.sub(r'(\s)+', " ", article_text)

In [403]:
article_text

'Atrial fibrillation (AF) currently affects at least 33.5 million adults in the world population, not including subclinical or undiagnosed AF cases, and the global prevalence of documented AF is probably underestimated because of limited data outside Europe and North America. Recent population-based studies and stroke registries consistently report a substantial AF-attributable risk of stroke, particularly in the elderly. Approximately 1 in 3–4 patients presenting with an ischemic stroke will also have AF (either already known or first diagnosed at the time of acute stroke, or documented during the post-stroke monitoring). In comparison to strokes from other causes, AF-related strokes are more often fatal or associated with greater permanent neurological deficit, but can be effectively prevented using oral anticoagulant therapy (OAC) with well-controlled vitamin K antagonists (VKAs) or non-vitamin K antagonist oral anticoagulants (NOACs) apixaban, rivaroxaban, dabigatran, or edoxaban. 

In [279]:
abstract = root.find(".//abstract")

In [286]:
abstract_paragraphs = abstract.findall(".//p")

In [293]:
# Print each abstract paragraph on its own for inspection, and gather the same strings
# so they can be joined into one space-separated abstract afterwards.
paragraph_texts = []

for paragraph in abstract_paragraphs:
    paragraph_text = " ".join(paragraph.itertext())
    paragraph_texts.append(paragraph_text)
    print(paragraph_text)
    print("-----\n")

abstract_text = " ".join(paragraph_texts).strip()

In contemporary atrial fibrillation trials most deaths are cardiac related, whereas stroke and bleeding represent only a small subset of deaths. We aimed to evaluate the long-term risk of cardiac events and all-cause mortality in individuals with atrial fibrillation compared to no atrial fibrillation.
-----

A systematic review and meta-analysis of studies published between 1 January 2006 and 21 October 2016.
-----

Four databases were searched. Studies had follow-up of at least 500 stable patients for either cardiac endpoints or all-cause mortality for 12 months or longer. Publication bias was evaluated and random effects models were used to synthesise the results. Heterogeneity between studies was examined by subgroup and meta-regression analyses.
-----

A total of 15 cohort studies was included. Analyses indicated that atrial fibrillation was associated with an increased risk of myocardial infarction (relative risk (RR) 1.54, 95% confidence interval (CI) 1.26–1.85), all-cause mortal

In [294]:
abstract_text

'In contemporary atrial fibrillation trials most deaths are cardiac related, whereas stroke and bleeding represent only a small subset of deaths. We aimed to evaluate the long-term risk of cardiac events and all-cause mortality in individuals with atrial fibrillation compared to no atrial fibrillation. A systematic review and meta-analysis of studies published between 1 January 2006 and 21 October 2016. Four databases were searched. Studies had follow-up of at least 500 stable patients for either cardiac endpoints or all-cause mortality for 12 months or longer. Publication bias was evaluated and random effects models were used to synthesise the results. Heterogeneity between studies was examined by subgroup and meta-regression analyses. A total of 15 cohort studies was included. Analyses indicated that atrial fibrillation was associated with an increased risk of myocardial infarction (relative risk (RR) 1.54, 95% confidence interval (CI) 1.26–1.85), all-cause mortality (RR 1.95, 95% CI

Okay, so first finding the abstract and then the body nodes, and then going through each of those and joining together the paragraphs seems to work relatively well. We'll go ahead and write the abstract and body texts to files instead. We'll also define a function that can take an XML node, look for all the paragraphs, join them together and return a clean string.

In [436]:
def get_paragraphs_as_clean_string(xml_node):
    """Return the text of all paragraphs below ``xml_node`` as one cleaned-up string.

    The text of the ``<p>`` descendants is joined with single spaces. Everything inside a
    ``<table-wrap>`` element is left out, bracketed spans such as ``[12]`` (which tend to be
    citations) are removed, and runs of whitespace are collapsed to one space.

    The tree is only read, never modified. Tables are skipped during traversal instead of
    being emptied with ``Element.clear()``, which would also discard the text that follows
    a table inside its parent (the element's tail). A ``<p>`` nested inside another ``<p>``
    contributes its text once, as part of the outer paragraph.

    Parameters
    ----------
    xml_node : xml.etree.ElementTree.Element or None
        Node to extract paragraphs from. ``None`` (e.g. the result of a failed ``find``)
        yields an empty string.

    Returns
    -------
    str
        The cleaned paragraph text; ``""`` when there are no paragraphs.
    """
    if xml_node is None:
        return ""

    def text_pieces(element):
        # Same traversal as Element.itertext(), except that <table-wrap> subtrees are skipped.
        # The tail of a skipped table is still yielded: it is text of the enclosing element.
        if not isinstance(element.tag, str):
            return
        if element.text:
            yield element.text
        for child in element:
            if child.tag != "table-wrap":
                yield from text_pieces(child)
            if child.tail:
                yield child.tail

    def outermost_paragraphs(element):
        # Yield <p> descendants in document order. Do not look for further paragraphs inside
        # a <p> that was already yielded (its text includes them) or inside a table.
        for child in element:
            if child.tag == "table-wrap":
                continue
            if child.tag == "p":
                yield child
            else:
                yield from outermost_paragraphs(child)

    clean_string = " ".join(
        " ".join(text_pieces(paragraph)) for paragraph in outermost_paragraphs(xml_node)
    )

    # We'll get rid of anything inside of square brackets, since those tend to be the citations.
    clean_string = re.sub(r'\[.*?]', "", clean_string)
    clean_string = re.sub(r'(\s)+', " ", clean_string)

    # Strip last: removing a leading or trailing citation can leave a stray space behind.
    return clean_string.strip()

In [437]:
def get_article_text_exclude_after_conclusion(body_node):
    """Return the text of the sections of ``body_node`` that precede its conclusion.

    All ``<sec>`` descendants are taken in document order. The first one whose title
    contains "conclusion" (case-insensitive) and every section after it are dropped; if no
    such section exists, all sections are kept. The text of each remaining section is
    extracted with ``get_paragraphs_as_clean_string`` and the results are joined with a space.

    Parameters
    ----------
    body_node : xml.etree.ElementTree.Element or None
        The article ``<body>`` element. ``None`` yields an empty string.

    Returns
    -------
    str
        The concatenated section text. When the body has no ``<sec>`` elements at all, the
        paragraphs found directly under ``body_node`` are returned instead.
    """
    if body_node is None:
        return ""

    # Use the parameter here; reading the notebook-level `body` variable would silently
    # process whatever article was loaded last instead of the node that was passed in.
    sections = body_node.findall(".//sec")

    if not sections:
        return get_paragraphs_as_clean_string(body_node)

    conclusion_index = len(sections)

    for index, section in enumerate(sections):
        section_title = section.find(".//title")
        # A section may have no <title>, or a title without direct text.
        title_text = (section_title.text if section_title is not None else None) or ""
        if "conclusion" in title_text.lower():
            conclusion_index = index
            break

    section_texts = []
    already_covered = set()

    for section in sections[:conclusion_index]:
        # './/sec' also returns nested sections, and the text of a parent section already
        # includes the paragraphs of its sub-sections, so skip those to avoid duplicates.
        if section in already_covered:
            continue
        already_covered.update(section.iter("sec"))

        section_text = get_paragraphs_as_clean_string(section)
        if section_text:
            section_texts.append(section_text)

    # Join with a space so the last word of one section is not glued to the next section.
    return " ".join(section_texts)

In [438]:
# Fetch the first five articles (fewer if the PMCID list is shorter) and write the abstract
# and the body text of each one to its own file under documents/.
for i, pmcid in enumerate(pmcid_list[:5]):
    r = requests.get(get_pmc_xml_query.format(pmcid))
    # Stop on an HTTP error instead of trying to parse an error page as XML.
    r.raise_for_status()

    tree = ET.ElementTree(ET.fromstring(r.content))
    root = tree.getroot()

    # find() returns None when the record has no <abstract>; write an empty file in that case
    # so the paper numbering stays aligned with pmcid_list.
    abstract = root.find(".//abstract")
    abstract_text = get_paragraphs_as_clean_string(abstract) if abstract is not None else ""

    # The extracted text contains non-ASCII characters (en dashes, curly quotes), so the
    # encoding is given explicitly and does not depend on the platform default.
    with open(f"documents/af_paper{i+1}_abstract.txt", "w", encoding="utf-8") as f:
        f.write(abstract_text)

    # Same guard for records that come back without a <body>.
    body = root.find(".//body")
    article_text = get_article_text_exclude_after_conclusion(body) if body is not None else ""

    with open(f"documents/af_paper{i+1}_body.txt", "w", encoding="utf-8") as f:
        f.write(article_text)