In [1]:
import os
import xml.etree.ElementTree as ET

import pandas as pd
from Bio import Entrez

In [2]:
# Article title used as the search term for the exploratory PubMed lookup.
# The leading and trailing newline of the original literal are preserved.
example = (
    '\n'
    'Surface modification of pralidoxime chloride-loaded solid lipid nanoparticles '
    'for enhanced brain reactivation of organophosphorus-inhibited AChE: '
    'Pharmacokinetics in rat'
    '\n'
)

In [3]:

# Exploratory lookup: search PubMed for `example`, then download the XML
# records for every ID the search returned.
Entrez.email = 'itin0003@student.monash.edu'
# difference between pubmed and pmc is that pubmed is abstracts only
# xml return
handle = Entrez.esearch(db = 'pubmed',term = example)
try:
    record = Entrez.read(handle)
finally:
    # release the connection even if the response cannot be parsed
    handle.close()
fetch_handle = Entrez.efetch(db = 'pubmed', id = record['IdList'])
try:
    xml_raw = fetch_handle.read()
finally:
    # the fetch handle was previously left open
    fetch_handle.close()
xml_string = xml_raw.decode('utf-8')


# Alternative (disabled): plain-text return - fetch the abstract as text instead of XML
# handle = Entrez.esearch(db = 'pubmed',term = example)
# record = Entrez.read(handle)
# handle.close()
# fetch_handle = Entrez.efetch(db = 'pubmed', rettype = 'abstract',id = record['IdList'], retmode = 'text')
# temp = fetch_handle.read()


In [16]:
import pandas as pd
from Bio import Entrez
import xml.etree.ElementTree as ET



class PubmedQuery:
    """Look up a free-text query on PubMed and extract text from the top hit.

    The search is limited to one ID (``retmax = 1``); the record for that ID
    is downloaded as XML and the text of every element with a given tag
    (``AbstractText`` by default) is returned.

    Attributes:
        query: search term sent to ``Entrez.esearch``.
        email: contact address assigned to ``Entrez.email``.
        id_list: IDs returned by the last search (empty before any search).
        xml_string: XML of the last fetch ('' before any fetch or when the
            search returned no IDs).
    """

    def __init__(self, query, email = 'itin0003@student.monash.edu'):
        # query is the triple
        self.query = query
        self.email = email
        # Defined up front so the object is usable before any network call.
        self.id_list = []
        self.xml_string = ''

    def define_email(self):
        """Set ``Entrez.email`` to this instance's email."""
        Entrez.email = self.email

    def get_ids(self):
        """Search PubMed for ``self.query`` and store the IDs in ``self.id_list``.

        At most one ID is requested.
        """
        self.define_email()
        handle = Entrez.esearch(db = 'pubmed', term = self.query, retmax = 1)
        try:
            record = Entrez.read(handle)
        finally:
            # close the handle even if parsing the response raises
            handle.close()
        self.id_list = record['IdList']

    def get_xml(self):
        """Fetch the records for ``self.id_list`` and store them in ``self.xml_string``.

        When the search produced no IDs nothing is fetched and
        ``self.xml_string`` is set to the empty string.
        """
        if not self.id_list:
            self.xml_string = ''
            return
        fetch_handle = Entrez.efetch(db = 'pubmed', id = self.id_list)
        try:
            xml_raw = fetch_handle.read()
        finally:
            # the handle used to be left open
            fetch_handle.close()
        # Accept a text payload as well as the bytes the original code assumed.
        if isinstance(xml_raw, bytes):
            xml_raw = xml_raw.decode('utf-8')
        self.xml_string = xml_raw

    def find_nodes_by_name(self, node, target_name):
        """Return ``node`` and/or its descendants whose tag equals ``target_name``.

        Matches are listed in document order (parent before children).
        """
        return [element for element in node.iter() if element.tag == target_name]

    def pipeline(self):
        """Run the search and the fetch so that ``self.xml_string`` is ready."""
        self.get_ids()
        self.get_xml()

    def get_node_text(self, target_name = 'AbstractText'):
        """Run the query and return the text of every ``target_name`` element.

        Args:
            target_name: tag of the XML elements to extract.

        Returns:
            A list with one string per matching element; an empty list when
            nothing was fetched or no element matches.
        """
        self.pipeline()
        if not self.xml_string:
            return []
        root = ET.fromstring(self.xml_string)
        # itertext() joins the text of nested inline elements too, whereas
        # .text alone stops at the first child element.
        return [
            ''.join(node.itertext())
            for node in self.find_nodes_by_name(root, target_name)
        ]



In [17]:
# Try the class on a sample search phrase; `texts` receives the abstract fragments
text = 'cancer treatment'
pq = PubmedQuery(text)
texts = pq.get_node_text()


In [11]:
# one abstract fragment per line
print(*texts, sep='\n')

Deprescribing, i.e. the suspension of drugs whose existing or potential harms outweigh the benefits in the context of care for the individual patient, is an increasingly frequently encountered topic in various congresses today. This issue becomes predominant especially in patients with chronic pathologies with a life expectancy of less than a year, in whom the goal of the treatments passes from healing to caring. Currently there are few validated deprescribing tools, one of these is certainly the STOPPFrail, currently available in its second version. Therefore, we decided to provide for the translation into Italian, to make the description for the elderly patient with limited life expectation more applicable.
For the translation, we used the method expressed by the European organisation for research and treatment of cancer (Eortc), using forward-backward translation and a Pilot Testing to verify the clarity and comprehensibility of the translation itself.
We interviewed 15 experts, of 

In [18]:
class PubmedQuery1:
    """Fetch the top PubMed hit for a query and return its abstract text.

    Args:
        query: search term sent to ``Entrez.esearch``.
        api_key: optional NCBI API key. When given it is assigned to
            ``Entrez.api_key`` before searching; when ``None`` the current
            ``Entrez.api_key`` is left untouched.
        email: contact address assigned to ``Entrez.email``.

    Attributes:
        id_list: IDs returned by the last search.
        xml_string: XML of the last fetch ("" when nothing was fetched).
    """

    def __init__(self, query, api_key=None, email='itin0003@student.monash.edu'):
        self.query = query
        self.email = email
        self.id_list = []
        self.xml_string = ""
        self.api_key = api_key

    def _define_email(self):
        """Set ``Entrez.email`` to this instance's email."""
        Entrez.email = self.email

    def _define_api_key(self):
        """Set ``Entrez.api_key`` when this instance was given a key."""
        # Skip None so a key configured elsewhere is not wiped out.
        if self.api_key is not None:
            Entrez.api_key = self.api_key

    def _get_ids(self):
        """Search PubMed (at most one hit) and store the IDs in ``self.id_list``."""
        self._define_email()
        # The key was stored but never applied before; apply it here.
        self._define_api_key()
        handle = Entrez.esearch(db='pubmed', term=self.query, retmax=1)
        try:
            record = Entrez.read(handle)
        finally:
            # close the handle even if parsing the response raises
            handle.close()
        self.id_list = record['IdList']

    def _get_xml(self):
        """Fetch the records for ``self.id_list`` into ``self.xml_string``.

        Nothing is fetched when ``self.id_list`` is empty; ``self.xml_string``
        is then reset to "".
        """
        if not self.id_list:
            self.xml_string = ""
            return
        fetch_handle = Entrez.efetch(db='pubmed', id=self.id_list)
        try:
            xml_raw = fetch_handle.read()
        finally:
            # the handle used to be left open
            fetch_handle.close()
        # Accept a text payload as well as the bytes the original code assumed.
        if isinstance(xml_raw, bytes):
            xml_raw = xml_raw.decode('utf-8')
        self.xml_string = xml_raw

    def _find_nodes_by_name(self, node, target_name):
        """Return ``node`` and/or descendants tagged ``target_name``, in document order."""
        return [element for element in node.iter() if element.tag == target_name]

    def _parse_abstract_text(self, node):
        """Join all text inside ``node`` (nested elements included) and strip it."""
        return ''.join(node.itertext()).strip()

    def get_abstracts(self, target_name='AbstractText'):
        """Run the search and fetch, then return the text of matching elements.

        Args:
            target_name: tag of the XML elements to extract.

        Returns:
            A list with one stripped string per matching element; an empty
            list when the search found nothing or no element matches.
        """
        self._get_ids()
        self._get_xml()
        if not self.xml_string:
            return []
        root = ET.fromstring(self.xml_string)
        found_nodes = self._find_nodes_by_name(root, target_name)
        return [self._parse_abstract_text(node) for node in found_nodes]

In [19]:
# Run the refactored class on the same sample phrase.
# The API key is read from the environment instead of being written into the
# notebook; None (variable unset) is passed through when no key is configured.
text = 'cancer treatment'
pq = PubmedQuery1(query = text, api_key = os.environ.get('NCBI_API_KEY'))
texts = pq.get_abstracts()

In [15]:
# Display the current value of `texts` (list of abstract fragments)
texts

['Deprescribing, i.e. the suspension of drugs whose existing or potential harms outweigh the benefits in the context of care for the individual patient, is an increasingly frequently encountered topic in various congresses today. This issue becomes predominant especially in patients with chronic pathologies with a life expectancy of less than a year, in whom the goal of the treatments passes from healing to caring. Currently there are few validated deprescribing tools, one of these is certainly the STOPPFrail, currently available in its second version. Therefore, we decided to provide for the translation into Italian, to make the description for the elderly patient with limited life expectation more applicable.',
 'For the translation, we used the method expressed by the European organisation for research and treatment of cancer (Eortc), using forward-backward translation and a Pilot Testing to verify the clarity and comprehensibility of the translation itself.',
 'We interviewed 15 ex

In [46]:
# First abstract fragment only.
# NOTE(review): execution counts in this notebook are out of order, and the saved
# output below does not look like a 'cancer treatment' result - presumably it
# came from a different query; confirm by re-running from a fresh kernel.
texts[0]

'The nanotechnological approach is an innovative strategy of high potential to achieve reactivation of organophosphorus-inhibited acetylcholinesterase in central nervous system. It was previously shown that pralidoxime chloride-loaded solid lipid nanoparticles (2-PAM-SLNs) are able to protect the brain against pesticide (paraoxon) central toxicity. In the present work, we increased brain AChE reactivation efficacy by PEGylation of 2-PAM-SLNs using PEG-lipid N-(carbonyl-methoxypolyethylene glycol-2000)-1,2-distearoyl-sn-glycero-3-phosphoethanolamine, sodium salt) (DSPE-PEG'

In [41]:
def get_ids(query):
    """Search PubMed for ``query`` and return the matching IDs.

    Args:
        query: search term passed to ``Entrez.esearch``.

    Returns:
        The ``IdList`` entry of the parsed search result.
    """
    # maximum of 5 IDs returned
    handle = Entrez.esearch(db = 'pubmed', term = query, retmax = 5)
    try:
        record = Entrez.read(handle)
    finally:
        # close the handle even if parsing the response raises
        handle.close()
    return record['IdList']

# TODO: an unfinished `def get_` stub stood here; it was a syntax error that
# stopped the whole cell from running, so it is left out until it is written.

def find_nodes_by_name(node, target_name):
    """Collect ``node`` and every descendant whose tag equals ``target_name``.

    The result is in document order: a parent comes before its children and
    siblings keep their original order.
    """
    matches = []
    pending = [node]
    while pending:
        current = pending.pop()
        if current.tag == target_name:
            matches.append(current)
        # push children reversed so the first child is visited next
        pending.extend(reversed(list(current)))
    return matches

def get_node_text(xml_text, target_name = 'AbstractText'):
    """Return the text of every ``target_name`` element in an XML document.

    Args:
        xml_text: the XML document, as UTF-8 encoded bytes or as a string.
        target_name: tag of the elements to extract.

    Returns:
        A list with one string per matching element, in document order;
        empty when no element matches.
    """
    # Only bytes need decoding; a str is parsed as is.
    if isinstance(xml_text, (bytes, bytearray)):
        xml_text = xml_text.decode('utf-8')
    root = ET.fromstring(xml_text)
    # itertext() also picks up text inside nested inline elements, whereas
    # .text stops at the first child element.
    return [
        ''.join(node.itertext())
        for node in root.iter()
        if node.tag == target_name
    ]





In [42]:
# Parse the fetched XML and print the text of every AbstractText element.
# NOTE(review): `temp` is not assigned by any live code in the visible part of
# this notebook (only by a commented-out snippet); this cell assumes it holds
# PubMed efetch XML - confirm where it should come from.
root = ET.fromstring(temp)
target_name = 'AbstractText'
found_nodes = find_nodes_by_name(root, target_name)

# The dangling, body-less `for node in nodes:` loop (a syntax error over an
# undefined name) has been dropped.
if found_nodes:
    for i, node in enumerate(found_nodes, start=1):
        # itertext() keeps text that follows nested inline elements
        text = ''.join(node.itertext())
        print(f"Text of {target_name} {i}: {text}")
else:
    print(f"No {target_name} found.")

Text of AbstractText 1: Cytomegalovirus (CMV) causes serious infection in individuals with deficient T cell immunity. In acquired immunodeficiency syndrome, the retina is a major site of progressive infection, despite the availability of therapy that targets CMV. The administration of highly active antiretroviral therapy to suppress human immunodeficiency virus frequently results in resolution of CMV retinitis, but this may be complicated by ocular inflammation termed "immune recovery uveitis" (IRU). To provide insight into the pathogenesis of IRU, the phenotype and specificity of intraocular T cells in a single patient were analyzed. The T cell infiltrate consisted of a diverse population of CD8(+) CMV-specific T cells, but only a minority of these T cells recognized the CMV phosphoprotein 65 and immediate early protein 1, which have been considered major targets of the host response. These results imply that reconstitution of CMV-specific T cells plays a role in IRU and suggest that 

In [26]:
# Decode only while `text` still holds raw bytes, so that running this cell a
# second time does not fail on an already-decoded string.
if isinstance(text, bytes):
    text = text.decode('utf-8')

In [28]:
# Dump the decoded response to inspect its structure
# (presumably the raw PubMed XML, judging by the saved output - confirm)
print(text)

<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">
<PubmedArticleSet>
<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM"><PMID Version="1">12195359</PMID><DateCompleted><Year>2002</Year><Month>09</Month><Day>24</Day></DateCompleted><DateRevised><Year>2016</Year><Month>04</Month><Day>22</Day></DateRevised><Article PubModel="Print-Electronic"><Journal><ISSN IssnType="Print">0022-1899</ISSN><JournalIssue CitedMedium="Print"><Volume>186</Volume><Issue>5</Issue><PubDate><Year>2002</Year><Month>Sep</Month><Day>01</Day></PubDate></JournalIssue><Title>The Journal of infectious diseases</Title><ISOAbbreviation>J Infect Dis</ISOAbbreviation></Journal><ArticleTitle>Association between immune recovery uveitis and a diverse intraocular cytomegalovirus-specific cytotoxic T cell response.</ArticleTitle><Pagination><StartPage>701</StartPage><EndPage>705</EndPage><MedlinePgn>70

In [32]:
def find_node_by_name(node, target_name):
    """Return the first element tagged ``target_name``, or None if there is none.

    ``node`` itself is checked first, then its descendants in document order.
    """
    candidates = (element for element in node.iter() if element.tag == target_name)
    # the generator is lazy, so the walk stops at the first match
    return next(candidates, None)

# Parse `text` and report the first element carrying the wanted tag
target_name = 'AbstractText'
root = ET.fromstring(text)
found_node = find_node_by_name(root, target_name)

if found_node is None:
    print(f"{target_name} not found.")
else:
    res = found_node.text
    print(f"Text of the {target_name}: {res}")

Text of the AbstractText: Cytomegalovirus (CMV) causes serious infection in individuals with deficient T cell immunity. In acquired immunodeficiency syndrome, the retina is a major site of progressive infection, despite the availability of therapy that targets CMV. The administration of highly active antiretroviral therapy to suppress human immunodeficiency virus frequently results in resolution of CMV retinitis, but this may be complicated by ocular inflammation termed "immune recovery uveitis" (IRU). To provide insight into the pathogenesis of IRU, the phenotype and specificity of intraocular T cells in a single patient were analyzed. The T cell infiltrate consisted of a diverse population of CD8(+) CMV-specific T cells, but only a minority of these T cells recognized the CMV phosphoprotein 65 and immediate early protein 1, which have been considered major targets of the host response. These results imply that reconstitution of CMV-specific T cells plays a role in IRU and suggest tha