This is Python code to strip all attributes from PMC XML files.

In [1]:
from bs4 import BeautifulSoup
import os
from copy import deepcopy

# Input directory of PMC XML files and the output directory that receives
# their attribute-free copies (both are passed to clean_xml_directory).
OLD_XML_DIR = 'abstract_and_results_xml_files'
NEW_XML_DIR = 'no_attributes_xml_files'

# Same input/output pairing for the case-study XML files.
RAW_CASE_STUDY_XML_DIR = 'case_study_xml_files'
NO_ATTR_CASE_STUDY_XML_DIR = 'no_attributes_case_study_xml_files'

In [2]:
def remove_html_body(soup):
    """Unwrap the ``html`` and ``body`` tags of a BeautifulSoup object in place.

    The wrappers are the ones the lxml parser adds around the parsed
    content. Unwrapping removes each tag but keeps its children. The same
    ``soup`` object is returned for convenience.
    """
    # Look both wrappers up front, before the tree is modified.
    wrappers = (soup.html, soup.body)

    for wrapper in wrappers:
        # A wrapper that is absent is simply skipped.
        if wrapper is None:
            continue
        wrapper.unwrap()

    return soup

In [3]:
def remove_all_attributes(soup):
    """Return a deep copy of ``soup`` in which every tag has no attributes.

    The ``soup`` passed in is left untouched; all changes are made on the
    copy that is returned.
    """
    stripped = deepcopy(soup)

    # find_all(True) matches every tag in the document.
    every_tag = stripped.find_all(True)
    for element in every_tag:
        # Replace the attribute mapping with an empty one.
        element.attrs = {}

    return stripped

In [5]:
def clean_xml_directory(from_directory, to_directory):
    """Write attribute-free copies of the XML files in a directory.

    Every file in ``from_directory`` whose name ends with ``.xml`` is parsed,
    has its ``html``/``body`` wrapper tags unwrapped and all tag attributes
    removed, and is then written to ``to_directory`` as ``PMC<id>.xml``.
    Other files are ignored.

    Args:
        from_directory: Directory containing the source ``.xml`` files.
        to_directory: Directory that receives the cleaned files. It is
            created (including parents) if it does not exist yet.

    Raises:
        IndexError, ValueError: If an ``.xml`` filename does not contain a
            ``C`` followed by an integer id before its first ``.``
            (for example ``PMC123.xml``).
    """
    # Make sure the destination exists; open(..., 'w') does not create
    # missing directories.
    os.makedirs(to_directory, exist_ok=True)

    for filename in os.listdir(from_directory):
        if not filename.endswith(".xml"):
            continue

        # Get the pmcid from the filename, e.g. "PMC123.xml" -> 123.
        # Going through int() means leading zeros are not preserved.
        pmcid = int(filename.split('.')[0].split('C')[1])

        # Read and write as UTF-8 explicitly so the result does not depend
        # on the platform's default encoding.
        source_path = os.path.join(from_directory, filename)
        with open(source_path, 'r', encoding='utf-8') as file:
            # NOTE(review): 'lxml' is BeautifulSoup's HTML parser; it is kept
            # so the output format stays the same as before.
            soup = BeautifulSoup(file.read(), 'lxml')

        # Drop the html/body wrappers added by the parser (modifies soup).
        remove_html_body(soup)

        # Remove all attributes from the tags
        new_soup = remove_all_attributes(soup)

        target_path = os.path.join(to_directory, f'PMC{pmcid}.xml')
        with open(target_path, 'w', encoding='utf-8') as file:
            file.write(str(new_soup))

In [12]:
# Write attribute-free copies of the .xml files in OLD_XML_DIR to NEW_XML_DIR.
clean_xml_directory(OLD_XML_DIR, NEW_XML_DIR)

In [6]:
# Same cleaning step for the case-study files: RAW_CASE_STUDY_XML_DIR in,
# NO_ATTR_CASE_STUDY_XML_DIR out.
clean_xml_directory(RAW_CASE_STUDY_XML_DIR, NO_ATTR_CASE_STUDY_XML_DIR)