In [1]:
import numpy as np
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import re
import pickle

import pprint

from Bio import Entrez

In [2]:
with open("../API_ignore.txt", "r") as f:
    lines = f.read()

entrez_api_key = lines.split(":")[1].strip()

## Step 1

First, we use esearch to send a query for all reviews & systematic reviews that have free full text for a specific topic; we want to get the PMIDs of these papers.

In [3]:
esearch_base_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    
review_pmids_query_dict = {
    "db": "pubmed",
    "sort": "relevance",
    "retmax": '10',
    "term": "{}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])",
    "api_key": entrez_api_key
}

In [4]:
joined_terms = [k+"="+v for k, v in review_pmids_query_dict.items()] 

In [5]:
get_review_pmids_query = esearch_base_query + "&".join(joined_terms)

In [6]:
get_review_pmids_query

'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&sort=relevance&retmax=10&term={}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])&api_key=b0b12c603fda132e7f526bd128008cf75a08'

In [104]:
review_pmids_query_dict

{'sort': 'relevance',
 'retmax': 100,
 'term': '{}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])',
 'api_key': 'b0b12c603fda132e7f526bd128008cf75a08'}

In [23]:
#"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=asthma&Review[ptyp]"
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&sort=relevance&retmax=100&term=atrial+fibrillation+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])

#get_review_pmids_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={}+AND+((Review[ptyp]+OR+systematic[sb])+AND+free+full+text[sb])&api_key={}"

In [9]:
first_search = "atrial+fibrillation"

In [10]:
second_search = "lewy+body+dementia"

In [8]:
# Once we send this query to pubmed using eutils, we get back an xml object which we can store in a tree.
r = requests.get(get_review_pmids_query.format(first_search, entrez_api_key))

In [11]:
# Once we send this query to pubmed using eutils, we get back an xml object which we can store in a tree.
r = requests.get(get_review_pmids_query.format(second_search, entrez_api_key))

In [12]:
tree = ET.ElementTree(ET.fromstring(r.content))
root = tree.getroot()

pmids = root.findall('.//Id')

pmid_list = [pmid.text for pmid in pmids]

## Step 2

Now that we have the PMIDs for the 20 review papers returned by esearch, we have to convert the PMIDs into PMCIDs. In order to convert the PMIDs to PMCIDs, we have to use the ID converter provided by the NCBI, as outlined here: https://www.ncbi.nlm.nih.gov/pmc/tools/id-converter-api/

In [13]:
# Let's convert each PMID into a PMCID. The JSON that is returned from this request always has a key 'records'.
# Check the dictionary inside of 'records'; if there is a key called 'errmsg', then you know that
# the convert request failed. Otherwise, check to see if the dictionary inside of records has a key called 
# 'pmcid'. If it does, grab the value of the key 'pmcid' and store it. We'll use that PMCID to query PMC to
# fetch the xml of the full paper.

convert_PMID_query = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=review_assistant&email=jl56923@gmail.com&ids={}&format=json"

In [14]:
pmcid_list = []

for pmid in pmid_list:
    r = requests.get(convert_PMID_query.format(pmid))
    result = r.json()
    records_dict = result['records'][0]
    # If there is an error message in the records dictionary that gets returned with the result, then this
    # paper does not have a PMCID and we are not going to be able to get the full text of this paper.
    if 'errmsg' in records_dict:
        pass
    else:
        if 'pmcid' in records_dict:
            pmcid_list.append(records_dict['pmcid'])

In [15]:
pmcid_list

['PMC6088773',
 'PMC4892610',
 'PMC5840831',
 'PMC5390937',
 'PMC4610749',
 'PMC4275567',
 'PMC5873980',
 'PMC5724510',
 'PMC5912679']

## Step 3

Now that we have the list of pmcids, we can use efetch to get the xml of these papers.

In [16]:
get_pmc_xml_query = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={}&tool=review_assistant&email=jl56923@gmail.com"

In [17]:
pmcid_list

['PMC6088773',
 'PMC4892610',
 'PMC5840831',
 'PMC5390937',
 'PMC4610749',
 'PMC4275567',
 'PMC5873980',
 'PMC5724510',
 'PMC5912679']

In [18]:
r = requests.get(get_pmc_xml_query.format(pmcid_list[0]))

In [19]:
tree = ET.ElementTree(ET.fromstring(r.content))
root = tree.getroot()

In [21]:
title = root.find('.//article-title')

In [22]:
title.text

'Non-pharmacological interventions for Lewy body dementia: a systematic review'

In [23]:
sections = root.findall(".//sec")

In [24]:
for section in sections:
    print(section.tag)
    print(section.attrib)
    print(list(section))
    section_title = section.find(".//title")
    print(section_title.text)
    print("----\n")

sec
{'sec-type': 'intro', 'id': 'sec1'}
[<Element 'title' at 0x113b21318>, <Element 'p' at 0x113b21368>, <Element 'p' at 0x113b21818>, <Element 'p' at 0x113b243b8>]
Introduction
----

sec
{'sec-type': 'methods', 'id': 'sec2'}
[<Element 'title' at 0x113b24728>, <Element 'p' at 0x113b24778>, <Element 'sec' at 0x113b247c8>, <Element 'sec' at 0x113b248b8>, <Element 'sec' at 0x113b24ea8>, <Element 'sec' at 0x113b24f98>, <Element 'sec' at 0x113b2b138>, <Element 'sec' at 0x113b2b368>]
Method
----

sec
{'id': 'sec2-1'}
[<Element 'title' at 0x113b24818>, <Element 'p' at 0x113b24868>]
Eligibility criteria
----

sec
{'id': 'sec2-2'}
[<Element 'title' at 0x113b24908>, <Element 'p' at 0x113b24958>, <Element 'p' at 0x113b24e08>, <Element 'p' at 0x113b24e58>]
Search strategy
----

sec
{'id': 'sec2-3'}
[<Element 'title' at 0x113b24ef8>, <Element 'p' at 0x113b24f48>]
Study selection
----

sec
{'id': 'sec2-4'}
[<Element 'title' at 0x113b2b048>, <Element 'p' at 0x113b2b098>, <Element 'p' at 0x113b2b0e8>]

In [26]:
# let's get the number of the section that has a title that contains 'Conclusion', because we want to ignore
# all the sections after that.
conclusion_index = len(sections)

for index, section in enumerate(sections):
    section_title = section.find(".//title")
    if "conclusion" in section_title.text.lower():
        print(index)
        conclusion_index = index
    print(section_title.text.lower())
    print("----\n")

introduction
----

method
----

eligibility criteria
----

search strategy
----

study selection
----

data extraction
----

quality assessment
----

data synthesis
----

results
----

search results
----

quality assessment
----

participants
----

interventions
----

effectiveness of interventions
----

discussion
----



In [27]:
article_text = ""
section_text = ""

for section in sections[:conclusion_index]:
    section_paragraphs = section.findall(".//p")
    
    for paragraph in section_paragraphs:
        section_text += " ".join(paragraph.itertext())
        section_text += " "
        
    article_text += section_text.strip()
    section_text = ""

In [28]:
# Let's also get rid of all of the references that are in square brackets, if there are any.
article_text = re.sub(r' \[.*?]', "", article_text)
article_text = re.sub(r'(\s)+', " ", article_text)

In [29]:
article_text

"Lewy body dementia is a common neurodegenerative disease in older people (Walker et al. 2015 ). It is responsible for 5–25% of diagnosed cases of dementia (Vann Jones & O'Brien, 2014 ), giving it a likely prevalence of around 1% in people over 65 years old (Ballard et al. 2013 ). The disease is characterised by fluctuations in attention and alertness, recurrent visual hallucinations, and parkinsonian motor features (McKeith et al. 2005 ). It is broadly considered to consist of two related disorders – dementia with Lewy bodies and Parkinson's disease dementia – that are distinguished by the relative timing of when cognitive and motor symptoms appear. Dementia with Lewy bodies is diagnosed if dementia develops either prior to parkinsonian motor symptoms or within 1 year of their onset. By contrast, Parkinson's disease dementia is diagnosed if dementia develops after 1 year of parkinsonian motor symptoms. The two disorders share a common underlying pathophysiology, but likely vary in the

In [30]:
abstract = root.find(".//abstract")

In [31]:
abstract_paragraphs = abstract.findall(".//p")

In [32]:
abstract_text = ""

for paragraph in abstract_paragraphs:
        abstract_text += " ".join(paragraph.itertext())
        abstract_text += " "
        print(" ".join(paragraph.itertext()))
        print("-----\n")

abstract_text = abstract_text.strip()

Lewy body dementia (consisting of dementia with Lewy bodies and Parkinson's disease dementia) is a common neurodegenerative disease characterised by visual hallucinations, fluctuating attention, motor disturbances, falls, and sensitivity to antipsychotics. This combination of features presents challenges for pharmacological management. Given this, we sought to review evidence for non-pharmacological interventions with patients with Lewy body dementia and their carers. Bibliographic databases were searched using a wide range of search terms and no restrictions were placed on study design, language, or clinical setting. Two reviewers independently assessed papers for inclusion, rated study quality, and extracted data. The search identified 21 studies including two randomised controlled trials with available subgroup data, seven case series, and 12 case studies. Most studies reported beneficial effects of the interventions used, though the only sizeable study was on dysphagia, showing a b

Okay, so first finding the abstract and then the body nodes, and then going through each of those and joining together the paragraphs seems to work relatively well. We'll go ahead and write the abstract and body texts to files instead. We'll also define a function that can take an XML node, look for all the paragraphs, join them together and return a clean string.

In [33]:
def get_paragraphs_as_clean_string(xml_node):
    # first, we'll clear all the children elements of the table-wrap tags,
    # which will get rid of all the content that was in the tables.
    for table in xml_node.findall(".//table-wrap"):
        table.clear()
    
    node_paragraphs = xml_node.findall(".//p")

    clean_string = ""

    for paragraph in node_paragraphs:
        clean_string += " ".join(paragraph.itertext())
        clean_string += " "
        
    clean_string = clean_string.strip()
    
    # We'll get rid of anything inside of square brackets, since those tend to be the citations.
    clean_string = re.sub(r'\[.*?]', "", clean_string)
    clean_string = re.sub(r'(\s)+', " ", clean_string)
    
    return clean_string

In [34]:
def get_article_text_exclude_after_conclusion(body_node):
    sections = body.findall(".//sec")
    
    conclusion_index = len(sections)
    
    for index, section in enumerate(sections):
        section_title = section.find(".//title")
        if "conclusion" in section_title.text.lower():
            conclusion_index = index
            break
    
    article_text = ""
    
    for section in sections[:conclusion_index]:
        article_text += get_paragraphs_as_clean_string(section)
    
    return article_text

In [43]:
pmcid_list

['PMC6088773',
 'PMC4892610',
 'PMC5840831',
 'PMC5390937',
 'PMC4610749',
 'PMC4275567',
 'PMC5873980',
 'PMC5724510',
 'PMC5912679']

In [44]:
for i in range(len(pmcid_list)):
    r = requests.get(get_pmc_xml_query.format(pmcid_list[i]))

    tree = ET.ElementTree(ET.fromstring(r.content))
    root = tree.getroot()
    
    
    
    pmcid = pmcid_list[i]
    
    # We'll find the first article-title in the text, which should be the title of the paper.
    title = root.find(".//article-title").text
    
    keyword_elements = root.findall(".//kwd")
    keywords = [keyword.text.lower() for keyword in keyword_elements]
    
    abstract = root.find(".//abstract")
    abstract_text = get_paragraphs_as_clean_string(abstract)
        
    # Check to see if you can even get the XML from PMC; if not, then pass. If you can, then you can go ahead
    # and continue making the dictionary and pickling it.
    body = root.find(".//body")
    if body:
        article_text = get_article_text_exclude_after_conclusion(body)
    else:
        print(f"Unable to get text for {pmcid_list[i]}.")
        pass
    
    
    citations = root.findall(".//pub-id")
    citation_tuples = [(citation.text, list(citation.attrib.values())[0]) for citation in citations]
    
    paper_dict = {
        "pmcid": pmcid,
        "title": title,
        "abstract_text": abstract_text,
        "article_text": article_text,
        "citation_tuples": citation_tuples
    }
    
    with open(f"documents/lbd_paper{i+1}_dict.pkl", "wb") as f:
        pickle.dump(paper_dict, f)
        
#     with open(f"documents/af_paper{i+1}_body.txt", "w") as f:
#         f.write(article_text)

AttributeError: 'NoneType' object has no attribute 'findall'

In [477]:
for i in range(5):
    r = requests.get(get_pmc_xml_query.format(pmcid_list[i]))

    tree = ET.ElementTree(ET.fromstring(r.content))
    root = tree.getroot()
    
    pmcid = pmcid_list[i]
    
    # We'll find the first article-title in the text, which should be the title of the paper.
    title = root.find(".//article-title").text
    
    keyword_elements = root.findall(".//kwd")
    keywords = [keyword.text.lower() for keyword in keyword_elements]
    
    abstract = root.find(".//abstract")
    abstract_text = get_paragraphs_as_clean_string(abstract)
    
#     with open(f"documents/af_paper{i+1}_abstract.txt", "w") as f:
#         f.write(abstract_text)
    
    body = root.find(".//body")
    article_text = get_article_text_exclude_after_conclusion(body)
    
    citations = root.findall(".//pub-id")
    citation_tuples = [(citation.text, list(citation.attrib.values())[0]) for citation in citations]
    
    paper_dict = {
        "pmcid": pmcid,
        "title": title,
        "abstract_text": abstract_text,
        "article_text": article_text,
        "citation_tuples": citation_tuples
    }
    
    with open(f"documents/af_paper{i+1}_dict.pkl", "wb") as f:
        pickle.dump(paper_dict, f)
        
#     with open(f"documents/af_paper{i+1}_body.txt", "w") as f:
#         f.write(article_text)