In [3]:
import os
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm

# Input directory: files in it whose names end in ".xml" or ".nxml" are parsed.
DIR_PATH = "/Users/jacobhessels/KU/bachelor/src/comm/PMC011_600/refids/data"
# Output path: the collected article records are written here as a single JSON file.
OUTPUT_FILE = "/Users/jacobhessels/KU/bachelor/src/comm/PMC011_600/refids/papers.json"

# Function to gather the complete text content of an element and its descendants
def extract_text(element):
    """Return the text of *element* and all nested elements as one string.

    The text fragments are concatenated in document order and leading and
    trailing whitespace is removed from the combined result.
    """
    fragments = list(element.itertext())
    combined = ''.join(fragments)
    return combined.strip()

# Function to extract title, abstract, and PMID from an XML file
def extract_article_info(file_path):
    """Extract the PMID, title and abstract from one article XML file.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A single-entry dict ``{pmid: {"title": ..., "abstract": ...}}``, or
        ``None`` when the file is not well-formed XML or has no usable PMID
        (element missing, empty, or whitespace-only). A missing title or
        abstract is replaced by the placeholder ``'No title found'`` /
        ``'No abstract found'``.
    """
    # Keep the try body minimal: parsing is the step that raises ParseError.
    try:
        root = ET.parse(file_path).getroot()
    except ET.ParseError:
        # Skip files that are not well-formed XML.
        return None

    # Extract PMID
    pmid_element = root.find('.//article-id[@pub-id-type="pmid"]')
    if pmid_element is None:
        return None

    # ``.text`` is None for an empty element (e.g. <article-id .../>); treat
    # that, and whitespace-only text, like a missing PMID instead of raising
    # AttributeError or emitting an entry keyed by the empty string.
    pmid = (pmid_element.text or '').strip()
    if not pmid:
        return None

    # Extract title
    title_element = root.find('.//article-title')
    title = extract_text(title_element) if title_element is not None else 'No title found'

    # Extract abstract
    abstract_element = root.find('.//abstract')
    abstract = extract_text(abstract_element) if abstract_element is not None else 'No abstract found'

    return {pmid: {"title": title, "abstract": abstract}}

# Collect all articles' information, keyed by PMID
articles = {}

# Gather the XML files to process. os.listdir() returns names in arbitrary
# order, so sort them: this keeps the output order, and which file wins when
# two files carry the same PMID, reproducible from run to run.
xml_files = sorted(
    f for f in os.listdir(DIR_PATH) if f.endswith(('.xml', '.nxml'))
)

for file_name in tqdm(xml_files, desc="Processing XML files"):
    file_path = os.path.join(DIR_PATH, file_name)
    article_info = extract_article_info(file_path)
    if article_info:  # None means the file was skipped
        articles.update(article_info)

# Write the collected data to a JSON file; the encoding is stated explicitly
# so the result does not depend on the platform's default encoding.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as json_file:
    json.dump(articles, json_file, indent=4)

print(f"Extracted data has been written to {OUTPUT_FILE}")


Processing XML files:   0%|          | 0/3524 [00:00<?, ?it/s]

Processing XML files: 100%|██████████| 3524/3524 [00:15<00:00, 224.62it/s]


Extracted data has been written to /Users/jacobhessels/KU/bachelor/src/comm/PMC011_600/refids/papers.json
