In [3]:
import os
import xml.etree.ElementTree as ET
import json

# Path to your directory containing XML files
DIR_PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx"
OUTPUT_FILE = "papers.json"

# Function to extract all text within an element, including text in child elements
def extract_text(element):
    return ''.join(element.itertext()).strip()

# Function to extract title, abstract, and PMID from an XML file
def extract_article_info(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Extract PMID
        pmid_element = root.find('.//article-id[@pub-id-type="pmid"]')
        pmid = pmid_element.text.strip() if pmid_element is not None else None

        if pmid is None:
            print(f"Skipping article with no PMID: {file_path}")
            return None

        # Extract title
        title_element = root.find('.//article-title')
        title = extract_text(title_element) if title_element is not None else 'No title found'

        # Extract abstract
        abstract_element = root.find('.//abstract')
        abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

        return {pmid: {"title": title, "abstract": abstract}}
    except ET.ParseError:
        print(f"Skipping invalid XML file: {file_path}")
        return None

# Collect all articles' information
articles = {}

# Iterate through all XML files in the directory and extract information
for file_name in os.listdir(DIR_PATH):
    if file_name.endswith('.xml'):
        file_path = os.path.join(DIR_PATH, file_name)
        article_info = extract_article_info(file_path)
        if article_info:  # Only add if article_info is not None
            articles.update(article_info)

# Write the collected data to a JSON file
with open(OUTPUT_FILE, 'w') as json_file:
    json.dump(articles, json_file, indent=4)

print(f"Extracted data has been written to {OUTPUT_FILE}")


Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1802624.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1697742.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1942176.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1802625.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1876605.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1942173.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1978239.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1942172.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1933172.xml
Skipping article with no PMID: /Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1876610.xml
Skipping article with no PMID: /Users/ja

In [7]:
with open("papers.json", 'r') as json_file:
    loaded_articles = json.load(json_file)


In [8]:
num_elements = len(loaded_articles)
print(f"Number of elements in {OUTPUT_FILE}: {num_elements}")


Number of elements in paper.json: 1621


In [39]:
import os
import xml.etree.ElementTree as ET
import json

# Path to your directory containing XML files
DIR_PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx"
OUTPUT_FILE = "papers.json"

# Function to extract all text within an element, including text in child elements
def extract_text(element):
    return ''.join(element.itertext()).strip()

# Function to extract title and abstract from an XML file
def extract_article_info(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Extract title
        title_element = root.find('.//article-title')
        title = extract_text(title_element) if title_element is not None else 'No title found'

        # Extract abstract
        abstract_element = root.find('.//abstract')
        abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

        return {"title": title, "abstract": abstract}
    except ET.ParseError:
        print(f"Skipping invalid XML file: {file_path}")
        return None

# Collect all articles' information
articles = []

# Iterate through all XML files in the directory and extract information
for file_name in os.listdir(DIR_PATH):
    if file_name.endswith('.xml'):
        file_path = os.path.join(DIR_PATH, file_name)
        article_info = extract_article_info(file_path)
        if article_info:  # Only append if article_info is not None
            articles.append(article_info)

# Write the collected data to a JSON file
with open(OUTPUT_FILE, 'w') as json_file:
    json.dump(articles, json_file, indent=4)

print(f"Extracted data has been written to {OUTPUT_FILE}")


Extracted data has been written to papers.json


In [4]:
import json

# Path to the JSON file
OUTPUT_FILE = "papers.json"

# Load the JSON file
with open(OUTPUT_FILE, 'r') as file:
    papers = json.load(file)

# Get the number of elements
num_elements = len(papers)

# Get an example of a title and abstract
example_paper = papers[0] if papers else None

print(f"Number of elements in paper.json: {num_elements}")
if example_paper:
    print("Example paper:")
    print(f"Title: {example_paper['title']}")
    print(f"Abstract: {example_paper['abstract']}")
else:
    print("No papers found.")


KeyError: 0

In [37]:
papers[1]

{'title': 'Expression of RAB4B, a protein governing endocytic recycling, is co-regulated with MHC class II genes',
 'abstract': 'The small GTPase RAB4 regulates endocytic recycling, a process that contributes to Major Histocompatibility Complex (MHC)-mediated antigen presentation by specialized antigen presenting cells (APC) of the immune system. The gene encoding the RAB4B isoform of RAB4 was singled out by two complementary genome-wide screens. One of these consisted of a computer scan to identify genes containing characteristic MHC class II-related regulatory sequences. The second was the use of chromatin immunoprecipitation coupled to microarrays (ChIP-on-chip) to identify novel targets of a transcriptional co-activator called the MHC class II transactivator (CIITA). We show that the RAB4B gene is regulated by a typical MHC class II-like enhancer that is controlled directly by both CIITA and the multiprotein transcription factor complex known as the MHC class II enhanceosome. RAB4B

In [30]:
import os
import xml.etree.ElementTree as ET
import json

# Path to your directory containing XML files
DIR_PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx"
OUTPUT_FILE = "papers.json"

# Function to extract all text within an element, including text in child elements
def extract_text(element):
    return ''.join(element.itertext()).strip()

# Function to extract title and abstract from an XML file
def extract_article_info(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Extract title
        title_element = root.find('.//article-title')
        title = extract_text(title_element) if title_element is not None else 'No title found'

        # Extract abstract
        abstract_element = root.find('.//abstract')
        abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

        return {"title": title, "abstract": abstract}
    except ET.ParseError:
        return {"title": "Invalid XML", "abstract": "Invalid XML"}

# Collect all articles' information
articles = []

# Iterate through all XML files in the directory and extract information
for file_name in os.listdir(DIR_PATH):
    if file_name.endswith('.xml'):
        file_path = os.path.join(DIR_PATH, file_name)
        article_info = extract_article_info(file_path)
        articles.append(article_info)

# Write the collected data to a JSON file
with open(OUTPUT_FILE, 'w') as json_file:
    json.dump(articles, json_file, indent=4)

print(f"Extracted data has been written to {OUTPUT_FILE}")


Extracted data has been written to papers.json


In [34]:
import xml.etree.ElementTree as ET

# Path to your XML file
PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1998891.xml"

# Load and parse the XML file
tree = ET.parse(PATH)
root = tree.getroot()

# Function to extract all text within an element, including text in child elements
def extract_text(element):
    return ''.join(element.itertext()).strip()

# Extract specific information
title_element = root.find('.//article-title')
title = extract_text(title_element) if title_element is not None else 'No title found'

abstract_element = root.find('.//abstract')
abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

print(f"Title: {title}")
print(f"Abstract: {abstract}")


Title: The power of comparative and developmental studies for mouse models of Down syndrome
Abstract: Since the genetic basis for Down syndrome (DS) was described, understanding the causative relationship between genes at dosage imbalance and phenotypes associated with DS has been a principal goal of researchers studying trisomy 21 (Ts21). Though inferences to the gene-phenotype relationship in humans have been made, evidence linking a specific gene or region to a particular congenital phenotype has been limited. To further understand the genetic basis for DS phenotypes, mouse models with three copies of human chromosome 21 (Hsa21) orthologs have been developed. Mouse models offer access to every tissue at each stage of development, opportunity to manipulate genetic content, and ability to precisely quantify phenotypes. Numerous approaches to recreate trisomic composition and analyze phenotypes similar to DS have resulted in diverse trisomic mouse models. A murine intraspecies comparat

In [15]:
import json

# Path to the JSON file
OUTPUT_FILE = "paper.json"

# Load the JSON file
with open(OUTPUT_FILE, 'r') as file:
    papers = json.load(file)

# Get the number of elements
num_elements = len(papers)

# Get an example of a title and abstract
example_paper = papers[0] if papers else None

print(f"Number of elements in paper.json: {num_elements}")
if example_paper:
    print("Example paper:")
    print(f"Title: {example_paper['title']}")
    print(f"Abstract: {example_paper['abstract']}")
else:
    print("No papers found.")


Number of elements in paper.json: 1657
Example paper:
Title: Open Channel Block by Ca
Abstract: The light-activated channels of


In [14]:
import os
import xml.etree.ElementTree as ET
import json

# Path to your directory containing XML files
DIR_PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx"
OUTPUT_FILE = "paper.json"

# Function to extract text from all <p> (paragraph) elements nested within a given XML element
def extract_text(element):
    abstract_parts = []
    for section in element.findall('.//p'):
        if section.text:
            abstract_parts.append(section.text.strip())
    return ' '.join(abstract_parts) if abstract_parts else 'No abstract found'

# Function to extract title and abstract from an XML file
def extract_article_info(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Extract title
        title_element = root.find('.//article-title')
        title = title_element.text.strip() if title_element is not None and title_element.text is not None else 'No title found'

        # Extract abstract
        abstract_element = root.find('.//abstract')
        abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

        return {"title": title, "abstract": abstract}
    except ET.ParseError:
        return {"title": "Invalid XML", "abstract": "Invalid XML"}

# Collect all articles' information
articles = []

# Iterate through all XML files in the directory and extract information
for file_name in os.listdir(DIR_PATH):
    if file_name.endswith('.xml'):
        file_path = os.path.join(DIR_PATH, file_name)
        article_info = extract_article_info(file_path)
        articles.append(article_info)

# Write the collected data to a JSON file
with open(OUTPUT_FILE, 'w') as json_file:
    json.dump(articles, json_file, indent=4)

print(f"Extracted data has been written to {OUTPUT_FILE}")


Extracted data has been written to paper.json


In [13]:
import os
import xml.etree.ElementTree as ET
import json

# Path to your directory containing XML files
DIR_PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx"
OUTPUT_FILE = "papers.json"

# Function to extract text from all <p> (paragraph) elements nested within a given XML element
def extract_text(element):
    abstract_parts = []
    for section in element.findall('.//p'):
        if section.text:
            abstract_parts.append(section.text.strip())
    return ' '.join(abstract_parts) if abstract_parts else 'No abstract found'

# Function to extract title and abstract from an XML file
def extract_article_info(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Extract title
        title_element = root.find('.//article-title')
        title = title_element.text.strip() if title_element is not None else 'No title found'

        # Extract abstract
        abstract_element = root.find('.//abstract')
        abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

        return {"title": title, "abstract": abstract}
    except ET.ParseError:
        return {"title": "Invalid XML", "abstract": "Invalid XML"}

# Collect all articles' information
articles = []

# Iterate through all XML files in the directory and extract information
for file_name in os.listdir(DIR_PATH):
    if file_name.endswith('.xml'):
        file_path = os.path.join(DIR_PATH, file_name)
        article_info = extract_article_info(file_path)
        articles.append(article_info)

# Write the collected data to a JSON file
with open(OUTPUT_FILE, 'w') as json_file:
    json.dump(articles, json_file, indent=4)

print(f"Extracted data has been written to {OUTPUT_FILE}")


AttributeError: 'NoneType' object has no attribute 'strip'

In [12]:
import os
import xml.etree.ElementTree as ET

# Path to your directory containing XML files
DIR_PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx"

# Function to extract text from all <p> (paragraph) elements nested within a given XML element
def extract_text(element):
    abstract_parts = []
    for section in element.findall('.//p'):
        if section.text:
            abstract_parts.append(section.text.strip())
    return ' '.join(abstract_parts) if abstract_parts else 'No abstract found'

# Function to extract title and abstract from an XML file
def extract_article_info(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Extract title
        title_element = root.find('.//article-title')
        title = title_element.text.strip() if title_element is not None else 'No title found'

        # Extract abstract
        abstract_element = root.find('.//abstract')
        abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

        return title, abstract
    except ET.ParseError:
        return 'Invalid XML', 'Invalid XML'

# Iterate through all XML files in the directory and extract information
for file_name in os.listdir(DIR_PATH):
    if file_name.endswith('.xml'):
        file_path = os.path.join(DIR_PATH, file_name)
        title, abstract = extract_article_info(file_path)
        print(f"File: {file_name}")
        print(f"Title: {title}")
        print(f"Abstract: {abstract}\n")
    


File: PMC1999407.xml
Title: Open Channel Block by Ca
Abstract: The light-activated channels of



In [11]:
import xml.etree.ElementTree as ET

# Path to your XML file
PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1350947.xml"

# Load and parse the XML file
tree = ET.parse(PATH)
root = tree.getroot()

# Function to extracts text form all <p> (paragraph) elements
# nested withing a given XML element.
def extract_text(element):
    abstract_parts = []
    for section in element.findall('.//p'):
        if section.text:
            abstract_parts.append(section.text.strip())
    return ' '.join(abstract_parts) if abstract_parts else 'No abstract found'

# Extract specific information
title_element = root.find('.//article-title')
title = title_element.text.strip() if title_element is not None else 'No title found'

abstract_element = root.find('.//abstract')
abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

print(f"Title: {title}")
print(f"Abstract: {abstract}")


Title: A role for dual viral hits in causation of subacute sclerosing panencephalitis
Abstract: Subacute sclerosing panencephalitis (SSPE) is a progressive fatal neurodegenerative disease associated with persistent infection of the central nervous system (CNS) by measles virus (MV), biased hypermutations of the viral genome affecting primarily the matrix (M) gene with the conversion of U to C and A to G bases, high titers of antibodies to MV, and infiltration of B cells and T cells into the CNS. Neither the precipitating event nor biology underlying the MV infection is understood, nor is their any satisfactory treatment. We report the creation of a transgenic mouse model that mimics the cardinal features of SSPE. This was achieved by initially infecting mice expressing the MV receptor with lymphocytic choriomeningitis virus Cl 13, a virus that transiently suppressed their immune system. Infection by MV 10 days later resulted in persistent MV infection of neurons. Analysis of brains fro

In [6]:
import xml.etree.ElementTree as ET

# Path to your XML file
PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1193645.xml"

# Load and parse the XML file
tree = ET.parse(PATH)
root = tree.getroot()

# Extract specific information
# Use the appropriate tags based on the XML structure
title = root.find('.//article-title').text if root.find('.//article-title') is not None else 'No title found'
abstract = root.find('.//abstract').text if root.find('.//abstract') is not None else 'No abstract found'

print(f"Title: {title}")
print(f"Abstract: {abstract}")


Title: Impaired Development of CD4
Abstract: None


In [8]:
import xml.etree.ElementTree as ET

# Path to your XML file
PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1193645.xml"

# Load and parse the XML file
tree = ET.parse(PATH)
root = tree.getroot()

# Function to recursively find and print all tags and their text
def print_tags(element, level=0):
    indent = "  " * level
    print(f"{indent}{element.tag}: {element.text.strip() if element.text else ''}")
    for child in element:
        print_tags(child, level + 1)

# Print the structure to understand it better
print_tags(root)

# Function to extract the text content of the abstract
def extract_abstract(element):
    abstract_parts = []
    for section in element.findall('.//p'):
        if section.text:
            abstract_parts.append(section.text.strip())
    return ' '.join(abstract_parts) if abstract_parts else 'No abstract found'

# Extract specific information
title_element = root.find('.//article-title')
title = title_element.text.strip() if title_element is not None else 'No title found'

abstract_element = root.find('.//abstract')
abstract = extract_abstract(abstract_element) if abstract_element is not None else 'No abstract found'

print(f"Title: {title}")
print(f"Abstract: {abstract}")


article: 
  front: 
    journal-meta: 
      journal-id: J Exp Med
      journal-id: J. Exp. Med
      journal-title-group: 
        journal-title: The Journal of Experimental Medicine
      issn: 0022-1007
      issn: 1540-9538
      publisher: 
        publisher-name: The Rockefeller University Press
    article-meta: 
      article-id: 14699080
      article-id: PMC1193645
      article-id: 20020509
      article-id: 10.1084/jem.20020509
      article-categories: 
        subj-group: 
          subject: Article
      title-group: 
        article-title: Impaired Development of CD4
          sup: +
          sup: +
        subtitle: Increased Susceptibility to Autoimmune Disease
      contrib-group: 
        contrib: 
          name: 
            surname: Nishibori
            given-names: Takeaki
        contrib: 
          name: 
            surname: Tanabe
            given-names: Yoshinari
        contrib: 
          name: 
            surname: Su
            given-names: Leon
  

In [7]:
import xml.etree.ElementTree as ET

# Path to your XML file
PATH = "/Users/jacobhessels/KU/bachelor/data/PMC001xxxxxx/PMC1193645.xml"

# Load and parse the XML file
tree = ET.parse(PATH)
root = tree.getroot()

# Function to extract the text content of the abstract
def extract_abstract(element):
    abstract_parts = []
    for section in element.findall('.//abstract//p'):
        if section.text:
            abstract_parts.append(section.text.strip())
    return ' '.join(abstract_parts) if abstract_parts else 'No abstract found'

# Extract specific information
title = root.find('.//article-title').text if root.find('.//article-title') is not None else 'No title found'
abstract = extract_abstract(root.find('.//abstract')) if root.find('.//abstract') is not None else 'No abstract found'

print(f"Title: {title}")
print(f"Abstract: {abstract}")


Title: Impaired Development of CD4
Abstract: No abstract found


In [5]:
import xml.etree.ElementTree as ET

# Load and parse the XML file
tree = ET.parse(PATH)
root = tree.getroot()

# Function to recursively print all tags
def print_tags(element, level=0):
    indent = "  " * level
    print(f"{indent}{element.tag}")
    for child in element:
        print_tags(child, level + 1)

# Start printing from the root
print_tags(root)


article
  front
    journal-meta
      journal-id
      journal-id
      journal-title-group
        journal-title
      issn
      issn
      publisher
        publisher-name
    article-meta
      article-id
      article-id
      article-id
      article-id
      article-categories
        subj-group
          subject
      title-group
        article-title
          sup
          sup
        subtitle
      contrib-group
        contrib
          name
            surname
            given-names
        contrib
          name
            surname
            given-names
        contrib
          name
            surname
            given-names
        contrib
          name
            surname
            given-names
      aff
      author-notes
        fn
          p
            email
      pub-date
        day
        month
        year
      volume
      issue
      fpage
      lpage
      history
        date
          day
          month
          year
        date
          day


In [3]:
print(root)

<Element 'article' at 0x7f77bc62e7f0>
