In [2]:
import xml.etree.ElementTree as ET
import os

def extract_plaintext_from_pmc_xml(xml_file_path):
    """
    Parses a single PMC XML file and returns a dictionary with the
    extracted title and body text, including tables. Returns None on failure.
    """
    try:
        # 1. Parse the XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        # 2. Extract the article title
        title_element = root.find(".//article-title")
        title = "".join(title_element.itertext()).strip() if title_element is not None else "No Title Found"

        # 3. Extract the main body of the article, handling paragraphs and tables
        body_element = root.find(".//body")
        
        body_content_parts = []
        if body_element is not None:
            # We'll recursively walk the body to find relevant text
            for child in body_element.iter():
                # Process paragraphs
                if child.tag == 'p':
                    # Use itertext() to get all text including text from nested tags (like xref)
                    # then join it and strip whitespace.
                    paragraph_text = "".join(child.itertext()).strip()
                    if paragraph_text:
                        body_content_parts.append(paragraph_text)
                
                # Process tables
                elif child.tag == 'table-wrap':
                    table_text = extract_table_content(child)
                    if table_text:
                        body_content_parts.append(table_text)
        
        body_text = "\n\n".join(body_content_parts) if body_content_parts else "No Body Found"

        return {
            "title": title,
            "body": body_text
        }

    except ET.ParseError as e:
        print(f"Error parsing XML file {xml_file_path}: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred with {xml_file_path}: {e}")
        return None

def extract_table_content(table_wrap_element):
    """
    Extracts the caption, headers, and cell data from a <table-wrap> element
    and formats it into a readable plaintext string.
    """
    table_parts = []

    # Extract table label and caption
    caption_element = table_wrap_element.find(".//caption")
    caption_text = "".join(caption_element.itertext()).strip() if caption_element is not None else ""
    label_element = table_wrap_element.find(".//label")
    label = "".join(label_element.itertext()).strip() if label_element is not None else "Table"
    
    if caption_text:
        table_parts.append(f"{label}: {caption_text}")
    elif label:
        table_parts.append(f"{label}")

    # Extract table headers
    headers = []
    header_elements = table_wrap_element.findall(".//thead//th")
    for th in header_elements:
        header_text = "".join(th.itertext()).strip()
        headers.append(header_text)
    
    if headers:
        table_parts.append(" | ".join(headers))
        table_parts.append("-" * 50) # A separator line

    # Extract table rows
    rows = table_wrap_element.findall(".//tbody//tr")
    for row in rows:
        cells = []
        cell_elements = row.findall(".//td")
        for td in cell_elements:
            cell_text = "".join(td.itertext()).strip()
            # Clean up the text by removing reference tags like [<xref>]
            clean_text = ' '.join(cell_text.split()) # This will remove extra spaces
            cells.append(clean_text)
        
        # Only add the row if it's not empty
        if any(cells):
            table_parts.append(" | ".join(cells))
    
    return "\n".join(table_parts) if table_parts else ""


def process_all_xml_in_folder(input_folder, output_folder):
    """
    Iterates through all XML files in a folder, extracts plaintext from each,
    and saves the results to a new folder.
    """
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output directory: {output_folder}")
    
    # Check if the input folder exists
    if not os.path.exists(input_folder):
        print(f"Error: The specified input folder '{input_folder}' does not exist.")
        return
        
    xml_files = [f for f in os.listdir(input_folder) if f.endswith(".xml")]
    if not xml_files:
        print(f"No XML files found in the directory: {input_folder}")
        return
        
    print(f"Found {len(xml_files)} XML files to process.")

    for filename in xml_files:
        xml_path = os.path.join(input_folder, filename)
        
        print(f"Processing {filename}...")
        
        # Call the parsing function
        plaintext_data = extract_plaintext_from_pmc_xml(xml_path)
        
        if plaintext_data:
            # Assemble the final plaintext content
            plaintext_content = f"Title: {plaintext_data['title']}\n\n{plaintext_data['body']}"
            
            # Create a filename for the output file
            base_name = os.path.splitext(filename)[0]
            output_file_path = os.path.join(output_folder, f"{base_name}.txt")
            
            # Save the plaintext to a new file
            with open(output_file_path, "w", encoding="utf-8") as f:
                f.write(plaintext_content)
                
            print(f" -> Saved plaintext to {output_file_path}\n")
        else:
            print(f" -> Failed to process {filename}. Skipping.\n")


# --- Main Execution ---
if __name__ == "__main__":
    # --- Configuration ---
    # Path to the folder containing your downloaded XML files
    # This path is based on your request.
    input_folder = "Dataset/Dataset_PMC_Filtered_Reviews/"
    # Path to the folder where the plaintext files will be saved
    output_folder = "Dataset/PMC_Filtered_Reviews_plaintext_gemini"

    # Run the batch processing function
    process_all_xml_in_folder(input_folder, output_folder)

Found 603 XML files to process.
Processing PMC4916179.xml...
 -> Saved plaintext to Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC4916179.txt

Processing PMC10458802.xml...
 -> Saved plaintext to Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC10458802.txt

Processing PMC11892266.xml...
 -> Saved plaintext to Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC11892266.txt

Processing PMC8828558.xml...
 -> Saved plaintext to Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC8828558.txt

Processing PMC10968813.xml...
 -> Saved plaintext to Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC10968813.txt

Processing PMC11379892.xml...
 -> Saved plaintext to Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC11379892.txt

Processing PMC9092649.xml...
 -> Saved plaintext to Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC9092649.txt

Processing PMC6669641.xml...
 -> Saved plaintext to Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC6669641.txt

Processing PMC11171065.xml...
 -> Saved plaintex