We can use the provided stylesheets under ./support/stylesheets as interpreters to extract the source data xml files.

In [20]:
import pandas as pd
from lxml import etree
import os

def xslt_transform_to_csv(xslt_path, xml_path, csv_path):
  # Load the XSLT stylesheet
  with open(xslt_path, 'r') as f:
      xsl = etree.parse(f)

  # Load the XML document
  with open(xml_path, 'r') as f:
      xml = etree.parse(f)

  # Create an XSLT transformer
  transform = etree.XSLT(xsl)

  # Apply the transformation
  result = transform(xml)

  # Write the result to a file
  with open('temp.xml', 'wb') as f:
      f.write(etree.tostring(result))

  # Read the XML data into a DataFrame
  df = pd.read_xml('temp.xml')

  # Write the DataFrame to a CSV file
  df.to_csv(csv_path, index=False)

  # Delete the temporary XML file
  if os.path.exists('temp.xml'):
      os.remove('temp.xml')
  else:
      print("The file does not exist")


In [21]:
path_to_xslt_stylesheet = '../sourceandrenderingpipeline/support/stylesheets/extractsrtemplatesforpixelmed.xsl'
path_to_xml_document = '../sourceandrenderingpipeline/source/docbook/part03/part03.xml'
path_to_output_csv = '../P03_extract_templates.csv'

xslt_transform_to_csv(path_to_xslt_stylesheet, path_to_xml_document, path_to_output_csv)

ValueError: xpath does not return any nodes or attributes. Be sure to specify in `xpath` the parent nodes of children and attributes to parse. If document uses namespaces denoted with xmlns, be sure to define namespaces and use them in xpath.

In [42]:
from lxml import etree

path_to_xslt_stylesheet = '../sourceandrenderingpipeline/support/stylesheets/extractsrtemplatesforpixelmed.xsl'
path_to_xml_document = '../sourceandrenderingpipeline/source/docbook/part03/part03.xml'
path_to_output_csv = '../P03_extract_templates.csv'

with open(path_to_xslt_stylesheet, 'r') as f:
    xsl = etree.parse(f)

# Load the XML document
with open(path_to_xml_document, 'r') as f:
    xml = etree.parse(f)

# Create an XSLT transformer
transform = etree.XSLT(xsl)

# Apply the transformation
result = transform(xml)

# Write the result to a file
with open('temp.xml', 'wb') as f:
    f.write(etree.tostring(result))
    
# Load and parse the XML data from 'temp.xml'
with open('temp.xml', 'rb') as f:
    xml_content = f.read()

# Parse the XML content
xml_tree = etree.fromstring(xml_content)

# Print the entire XML content (formatted)
print(etree.tostring(xml_tree, pretty_print=True, encoding='utf-8').decode('utf-8'))

<definetemplates xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.pixelmed.com/namespaces/contextgroups" xsi:schemaLocation="http://www.pixelmed.com/namespaces/contextgroups http://www.pixelmed.com/schemas/contextgroups.xsd"/>



In [3]:
import pandas as pd
from bs4 import BeautifulSoup

def load_and_parse_xml(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return BeautifulSoup(content, 'lxml')

def extract_information_to_dataframe(soup):
    data = []
    for obj in soup.find_all('obj'):
        # Extract relevant details
        element_details = {
            'element': obj.get('element'),
            'href': obj.get('href'),
            'number': obj.get('number', 'N/A'),
            'targetptr': obj.get('targetptr'),
            'title': obj.find('ttl').text.strip() if obj.find('ttl') else 'No Title',
            'xref_text': obj.find('xreftext').text.strip() if obj.find('xreftext') else 'No Cross-Reference Text'
        }
        data.append(element_details)
    
    return pd.DataFrame(data)

In [7]:
file_path = '/Users/wooyeonpark/sourceandrenderingpipeline/output/html/targetdb/PS3_03_target.db'  # Update path
soup = load_and_parse_xml(file_path)
df = extract_information_to_dataframe(soup)

df.head(200)

Unnamed: 0,element,href,number,targetptr,title,xref_text
0,para,#para_80687a13-9d6e-433a-b7a4-36746d82979e,,para_80687a13-9d6e-433a-b7a4-36746d82979e,???TITLE???,
1,para,#para_ca3b4c38-80a2-41ab-8378-33410ef2664a,,para_ca3b4c38-80a2-41ab-8378-33410ef2664a,???TITLE???,"Section , “Notice and Disclaimer”"
2,para,#para_acac4b3b-0236-405c-9a2e-5de428a86acc,,para_acac4b3b-0236-405c-9a2e-5de428a86acc,???TITLE???,"Section , “Notice and Disclaimer”"
3,para,#para_8370e127-477e-40a5-92c9-1406636d5d41,,para_8370e127-477e-40a5-92c9-1406636d5d41,???TITLE???,"Section , “Notice and Disclaimer”"
4,para,#para_73660222-86aa-4ce9-9cd8-56c8aa07edff,,para_73660222-86aa-4ce9-9cd8-56c8aa07edff,???TITLE???,"Section , “Notice and Disclaimer”"
...,...,...,...,...,...,...
195,para,#para_0cb79023-b05b-4e3b-b523-ba3c801ef1ab,,para_0cb79023-b05b-4e3b-b523-ba3c801ef1ab,???TITLE???,Service Class
196,glossentry,#glossentry_ServiceClassUser,,glossentry_ServiceClassUser,Service Class User,Service Class User (SCU)
197,para,#para_441ca129-07e3-474e-aec7-6bdebcb3ba97,,para_441ca129-07e3-474e-aec7-6bdebcb3ba97,???TITLE???,Service Class User
198,glossentry,#glossentry_ServiceClassProvider,,glossentry_ServiceClassProvider,Service Class Provider,Service Class Provider (SCP)
