# 2. TEG XML Parser

> Converting XML data from factory and customer files to JSON

In [1]:
#| code-fold: true
from datetime import datetime

# Notebook build stamp. The date is spelled out as %Y-%m-%d instead of the
# C-library shorthand %F, which not every platform's strftime accepts.
datetime.now().strftime("created: %Y-%m-%d %H:%M")

'created: 2023-04-24 17:44'

In [2]:
#| hide
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [3]:
#| default_exp io

In [4]:
#| hide
from fastcore.test import *
from nbdev.showdoc import *

In [5]:
#| export
from collections import abc
from loguru import logger as log
import ast
import json
import os
import re
import sys
import xmltodict 

In [6]:
#| hide
# Drop the logger's existing sinks, then attach a single stdout sink that
# prints "timestamp|level|function|message" (millisecond timestamps), and emit
# one debug line to confirm the sink works.
log.remove()
log.add(sys.stdout, format="{time:YYYY-MM-DD@HH:mm:ss.SSS}|{level}|{function}|{message}")
log.debug("Logger started")

2023-04-24@17:44:54.636|DEBUG|<module>|Logger started


In [7]:
#| exports
class TEGXMLProcessor:
    """Pipeline for TEG6s XML processing.

    On construction the file at `filepath` is read as raw bytes into
    `self.contents` (its size goes to `self.raw_length`).  A file may hold
    more than one XML document back to back; `find_xml_offsets` locates each
    one so it can be sliced out and parsed separately.
    """

    ROOT_TAG = b"Cartridge"  # assumption that this will be true

    def __init__(
        self,
        filepath: str  # path to input XML file
    ):
        self.filepath = filepath
        self.read_xml()

    def read_xml(self):
        """Read raw contents of the file.

        Reading is best-effort: a failure is logged rather than raised.  In
        that case `contents` is left as empty bytes and `raw_length` as 0, so
        later calls on the instance still work instead of failing with
        AttributeError.
        """
        # Reset first so a failed (re-)read never leaves stale or missing state.
        self.contents = b""
        self.raw_length = 0
        try:
            with open(self.filepath, "rb") as f:
                self.contents = f.read()
        except Exception as e:
            log.error(f"unable to read file: {e}")
            return
        self.raw_length = len(self.contents)
        log.info(f"Read {self.raw_length:,d} bytes of raw XML data")

    def find_xml_offsets(self):
        """Scans for XML fragments.

        Returns a list of `(start, end)` byte offsets into `self.contents`,
        one per fragment: `start` is the position of an `<?xml` declaration
        and `end` is the position just past the matching closing root tag.
        Declarations and closing tags are paired in order of appearance.

        Returns None (after logging an error) when the number of declarations
        differs from the number of closing root tags.
        """
        # Raw bytes literal: "\?" is a regex escape, not a Python string escape.
        open_pattern = rb"<\?xml"
        # Match only the real closing root tag.  A bare "Cartridge>" would also
        # hit an attribute-less opening tag and any tag name that merely ends
        # in the root name.  Optional whitespace before ">" is legal XML.
        close_pattern = b"</" + re.escape(self.ROOT_TAG) + rb"\s*>"

        xml_open_bytes = [m.start() for m in re.finditer(open_pattern, self.contents)]
        xml_close_bytes = [m.end() for m in re.finditer(close_pattern, self.contents)]

        if len(xml_open_bytes) != len(xml_close_bytes):
            log.error("XML heads not equal XML tails")
            return None

        xml_offsets = list(zip(xml_open_bytes, xml_close_bytes))
        log.info(f"found {len(xml_offsets)} XML fragment(s)")
        return xml_offsets

## Factory Files

In [8]:
base_folder = "../../hae/random_factory_XML_48001_53286/"
# os.listdir returns entries in arbitrary order; sort so the same file is
# picked on every run and on every machine.
xml_file = os.path.join(base_folder, sorted(os.listdir(base_folder))[0])
tegxml = TEGXMLProcessor(xml_file)

# Check that the whole factory file parses as a single XML document.
try:
    d = xmltodict.parse(tegxml.contents)
    log.info("XML parsed successfully")
except Exception as e:
    # Report the cause instead of hiding it; catching Exception (not a bare
    # except) also lets KeyboardInterrupt through.
    log.error(f"XML parse failed: {e}")

2023-04-24@17:44:54.706|INFO|read_xml|Read 788,588 bytes of raw XML data
2023-04-24@17:44:54.851|INFO|<module>|XML parsed successfully


## Customer Files
There are two large XML fragments of similar structure.  All sections except `CartridgeResult` appear to be identical.

In [9]:
base_folder = "../../hae/T1-19B-Cartridges/"
# Take the first ".xml" entry of the folder.
# NOTE(review): os.listdir order is arbitrary, so "first" is not guaranteed to
# be the same file on another machine.
xml_names = [name for name in os.listdir(base_folder) if name.endswith(".xml")]
xml_file = os.path.join(base_folder, xml_names[0])

# Load the file and show the byte ranges of the XML fragments found in it.
tegxml = TEGXMLProcessor(xml_file)
tegxml.find_xml_offsets()

2023-04-24@17:44:54.880|INFO|read_xml|Read 219,361 bytes of raw XML data
2023-04-24@17:44:54.882|INFO|find_xml_offsets|found 2 XML fragment(s)


[(120, 11953), (11972, 219171)]

In [10]:
# Parse every fragment the scanner locates instead of pasting its offsets in by
# hand, so this cell keeps working when the input file changes.
# `or []` covers the None returned when the head/tail counts disagree.
root_key = TEGXMLProcessor.ROOT_TAG.decode("utf-8")
d = {
    i: xmltodict.parse(
        tegxml.contents[start:end],
        attr_prefix="",  # keep XML attributes under their bare names
        # TODO(review): `xml_unstringify` is not defined in the visible notebook;
        # re-enable once it exists.
        # postprocessor=xml_unstringify,
    )[root_key]
    for i, (start, end) in enumerate(tegxml.find_xml_offsets() or [])
}

In [11]:
# For each top-level key of the first fragment, show whether the second
# fragment holds an equal value under the same key.
for key, first_value in d[0].items():
    print(first_value == d[1][key], key)

True xmlns:xsi
True xmlns:xsd
True SchemaVersion
True AllowTemperatureChange
True BarcodeExpirationDate
True CalculatedExpirationDate
True CartridgeName
True SampleType
True TestType
True DisplayTemplate
True MicrofluidicsScriptName
True MicrofluidicsScriptVersion
True IsDiagnostics
True ExecutedTest
True CartridgeID
True CartridgeRevision
True ManufacturingData
True CartridgeTests
True ExtraCalculations
False CartridgeResult


In [12]:
#| hide
# Export the cells marked for export into the package modules.
import nbdev

nbdev.nbdev_export()