# 3. TEG XML Parser

> Converting XML data to JSON from factory and customer files

In [1]:
#| code-fold: true
from datetime import datetime; datetime.now().strftime("created: %F %H:%M")

'created: 2023-04-27 11:22'

## Generating insights from XML collected from different sources.

TLDR: we can pull data straight from SharePoint and process many files in minutes.  Scroll to <a href="#Insights">insights</a>.

In the first notebook, rationale is presented for processing XML files into JSON format.

The second notebook demostrated retrieval of XML files from SharePoint.

Here, we're bringing it together.

In [2]:
#| hide
%load_ext autoreload
%autoreload 2

In [3]:
#| default_exp io

In [4]:
#| hide
from fastcore.test import *
from nbdev.showdoc import *

In [5]:
#| export
from collections import abc
from dotenv import dotenv_values
from loguru import logger as log
from office365.runtime.auth.user_credential import UserCredential
from office365.runtime.client_request_exception import ClientRequestException
from office365.sharepoint.client_context import ClientContext as SharePointClientContext
import ast
import io
import json
import os
import re
import sys
import xmltodict 

In [6]:
#| hide
log.remove()
log.add(sys.stdout, format="{time:YYYY-MM-DD@HH:mm:ss.SSS}|{level}|{function}|{message}")
log.debug("Logger started")

2023-04-27@11:22:23.041|DEBUG|<module>|Logger started


In [7]:
#| exports
#| code-fold: true
class TEGXMLProcessor:
    """Pipeline for TEG6s XML processing."""

    ROOT_TAG = bytes("Cartridge", encoding="utf-8")  # assumption that this will be true

    @classmethod
    def from_local_file(
        cls,
        local_filepath: str  # path to input XML file
    ):
        """Read from a local XML file."""
        instance = cls()
        instance.local_filepath = local_filepath
        log.debug("reading from a local file...")

        try:
            with open(local_filepath, "rb") as f:
                instance.content = f.read()
            instance.raw_length = len(instance.content)
            log.info(f"Read {instance.raw_length:,d} bytes of raw XML data")
        except Exception as e:
            log.error(f"unable to read file: {e}")
        return instance

    @classmethod
    def from_sharepoint_guid(
        cls,
        ctx: SharePointClientContext,  # SharePoint connection object
        sharepoint_guid: str  # SharePoint globally unique ID
    ):
        """Read from SharePoint. Requires initialized ClientContext."""
        instance = cls()
        instance.sharepoint_guid = sharepoint_guid
        log.debug("reading from SharePoint...")

        try:
            with io.BytesIO() as bytestream:
                ctx.web.get_file_by_id(sharepoint_guid).download(bytestream).execute_query()
                instance.raw_length = bytestream.tell()
                log.info(f"read {instance.raw_length:,d} bytes from file {sharepoint_guid}")
                bytestream.seek(0)  # rewind IO buffer
                instance.content = bytestream.read()
        except Exception as e:
            log.error(f"unable to read SharePoint file: {e}")          
        return instance

    def find_xml_offsets(self):
        """Scans for XML fragments."""
        xml_open_bytes = [x.start() for x in re.finditer(b"<\?xml", self.content)]
        xml_close_bytes = [x.end() for x in re.finditer(self.ROOT_TAG+b">", self.content)]
        if len(xml_open_bytes) != len(xml_close_bytes):
            log.error("XML heads not equal XML tails")
        else:
            xml_offsets = [(start, end) for start, end in zip(xml_open_bytes, xml_close_bytes)]
            log.info(f"found {len(xml_offsets)} XML fragment(s)")
            return xml_offsets

    def parse(self):
        if self.content.startswith(b"<?xml"):
            _to_parse = self.content
        else:
            offsets = self.find_xml_offsets()
            start, end  = offsets[-1]  # this is an ASSUMPTION
            _to_parse = self.content[start:end]
        try:
            self.d = xmltodict.parse(
                _to_parse,
                attr_prefix="",
                postprocessor=xml_unstringify,
            )
            self.id = self.d["Cartridge"]["CartridgeResult"]["TestResultID"]
            log.info("XML parsed")
        except Exception as e:
            log.error(f"Nope: {e}")

    def pop_lists(self):
        """Recursively iterate through dictionary and pop all lists into a separate object."""
        self.lists = {}
        def walk_and_pop(nested_dict, path=tuple()):
            for key, value in nested_dict.copy().items():
                nested_path = path + (key,)
                if isinstance(value, list):  # is it a list? pop it
                    self.lists[nested_path] = nested_dict.pop(key)
                elif isinstance(value, abc.Mapping):  # https://stackoverflow.com/a/35691011/9511034
                    walk_and_pop(value, path=nested_path)  # is it a dict? go deeper
                else:
                    continue
        walk_and_pop(self.d)
        return

    def save_jsons(self):
        """Saves parsed XML data in JSON files."""
        metadata_filepath = self.filepath.rstrip(".xml") + "_metadata.json"
        self.save_metadata_json(output_filepath=metadata_filepath)
        
    def save_metadata_json(
        self,
        output_filepath: str  # path to output file
    ):
        """Saves metadata as a JSON file."""
        _output = json.dumps(self.d, indent=4)
        with open(output_filepath, "w") as f:
            f.write(_output)
        log.info(f"wrote {len(_output):,d} bytes to {output_filepath}")


def unstringify(value):
    """Converts stringified numbers to integers or floats."""
    try:
        value = ast.literal_eval(val)
    except:
        pass
    return value


def xml_unstringify(path, key, value):
    """Used as a callable for `xmldict.parse`.
    
    Casts numbers that appears as strings in XML back to integers or floats.
    """
    if value == "true":
        value = "True"
    if value == "false":
        value = "False"
    return key, unstringify(value)

## Factory Files

In [8]:
base_folder = "../../hae/random_factory_XML_48001_53286/"
xml_file = os.path.join(base_folder, os.listdir(base_folder)[0])
tegxml = TEGXMLProcessor.from_local_file(xml_file)
tegxml.parse()
print(tegxml.d['Cartridge'].keys())

2023-04-27@11:22:23.118|DEBUG|from_local_file|reading from a local file...
2023-04-27@11:22:23.127|INFO|from_local_file|Read 788,588 bytes of raw XML data
2023-04-27@11:22:23.324|INFO|parse|XML parsed
dict_keys(['xmlns:xsi', 'xmlns:xsd', 'SchemaVersion', 'AllowTemperatureChange', 'BarcodeExpirationDate', 'CalculatedExpirationDate', 'CartridgeName', 'SampleType', 'TestType', 'DisplayTemplate', 'MicrofluidicsScriptName', 'MicrofluidicsScriptVersion', 'IsDiagnostics', 'ExecutedTest', 'CartridgeID', 'CartridgeRevision', 'ManufacturingData', 'CartridgeTests', 'ExtraCalculations', 'CartridgeResult'])


## Customer Files

In [9]:
base_folder = "../../hae/T1-19B-Cartridges/"
xml_file = [f for f in os.listdir(base_folder) if f.endswith(".xml")][0]
xml_file = os.path.join(base_folder, xml_file)

tegxml = TEGXMLProcessor.from_local_file(xml_file)
tegxml.parse()
print(tegxml.d['Cartridge'].keys())

2023-04-27@11:22:23.366|DEBUG|from_local_file|reading from a local file...
2023-04-27@11:22:23.372|INFO|from_local_file|Read 219,361 bytes of raw XML data
2023-04-27@11:22:23.376|INFO|find_xml_offsets|found 2 XML fragment(s)
2023-04-27@11:22:23.427|INFO|parse|XML parsed
dict_keys(['xmlns:xsi', 'xmlns:xsd', 'SchemaVersion', 'AllowTemperatureChange', 'BarcodeExpirationDate', 'CalculatedExpirationDate', 'CartridgeName', 'SampleType', 'TestType', 'DisplayTemplate', 'MicrofluidicsScriptName', 'MicrofluidicsScriptVersion', 'IsDiagnostics', 'ExecutedTest', 'CartridgeID', 'CartridgeRevision', 'ManufacturingData', 'CartridgeTests', 'ExtraCalculations', 'CartridgeResult'])


## SharePoint Files

In [10]:
#| export
config = dotenv_values("../.env")
sharepoint_url = config["SHAREPOINT_URL"]
credentials = UserCredential(config["USERNAME"], config["PASSWORD"])
ctx = SharePointClientContext(sharepoint_url).with_credentials(credentials)

In [11]:
sharepoint_guid = "66e6a98a-3308-4a83-bc38-00818e953985"
tegxml = TEGXMLProcessor.from_sharepoint_guid(ctx, sharepoint_guid)
tegxml.parse()
print(tegxml.d['Cartridge'].keys())

2023-04-27@11:22:23.494|DEBUG|from_sharepoint_guid|reading from SharePoint...
2023-04-27@11:22:25.544|INFO|from_sharepoint_guid|read 1,342,694 bytes from file 66e6a98a-3308-4a83-bc38-00818e953985
2023-04-27@11:22:25.547|INFO|find_xml_offsets|found 2 XML fragment(s)
2023-04-27@11:22:25.879|INFO|parse|XML parsed
dict_keys(['xmlns:xsi', 'xmlns:xsd', 'SchemaVersion', 'AllowTemperatureChange', 'BarcodeExpirationDate', 'CalculatedExpirationDate', 'CartridgeName', 'SampleType', 'TestType', 'DisplayTemplate', 'MicrofluidicsScriptName', 'MicrofluidicsScriptVersion', 'IsDiagnostics', 'ExecutedTest', 'CartridgeID', 'CartridgeRevision', 'ManufacturingData', 'CartridgeTests', 'ExtraCalculations', 'CartridgeResult'])


### What are the two fragments?

There are two large XML fragments of similar structure. All sections except `CartridgeResult` appear to be identical. In `CartridgeResult`, the first section and the second section are the same, except the second section has one extra key `'SRs'`, which contains the experimental results.

Assuming this is the case for every file (**to be verified**), we will always use the second section.

In [12]:
xml_offsets = tegxml.find_xml_offsets()
xml_offsets

2023-04-27@11:22:25.906|INFO|find_xml_offsets|found 2 XML fragment(s)


[(120, 10673), (10692, 1340947)]

In [13]:
d = {}
for i, (x,y) in enumerate(xml_offsets):
    d[i] = xmltodict.parse(
        tegxml.content[x:y],
        attr_prefix="",
    )['Cartridge']

In [14]:
for k in d[0].keys():
    print(d[0][k] == d[1][k], k)

True xmlns:xsi
True xmlns:xsd
True SchemaVersion
True AllowTemperatureChange
True BarcodeExpirationDate
True CalculatedExpirationDate
True CartridgeName
True SampleType
True TestType
True DisplayTemplate
True MicrofluidicsScriptName
True MicrofluidicsScriptVersion
True IsDiagnostics
True ExecutedTest
True CartridgeID
True CartridgeRevision
True ManufacturingData
True CartridgeTests
True ExtraCalculations
False CartridgeResult


In [15]:
for k in d[0]['CartridgeResult'].keys():
    print(d[0]['CartridgeResult'][k] == d[1]['CartridgeResult'][k], k)

True TestResultID
True CompletionTime
True TestStartTime
True Status
True ErrorReason
True ResultFlags
True TestDurationSec
True PatientID
True UserName
True L1LastQCTestResultID
True L2LastQCTestResultID
True LastCalibrationID
True L1WetQCStatus
True L2WetQCStatus
True PPIDRequestResult
True InstrumentDetails
True TestInformation
False MicrofluidicsLog
False CartridgeResultEventLog
False InstrumentMonitoring
False ChannelResults


In [16]:
for k in d[0]['CartridgeResult']['ChannelResults']['ChannelResult'][0].keys():
    print(d[0]['CartridgeResult']['ChannelResults']['ChannelResult'][0][k] == \
          d[1]['CartridgeResult']['ChannelResults']['ChannelResult'][0][k], k)

True CN
True CST
True CalculationParameterType
True Stat
True TempC
True LowSetpointMaximumTemperature
True Flags
True CPs
True MA
False SRs
True ExtraChannelCalculationResults


## Extracting useful data

### Processing a SharePoint Folder
400 SharePoint files in 6 minutes

In [17]:
#| exports
#| code-fold: true
def get_sharepoint_ids(
    parent_folder_url: str,  # SharePoint URL (TODO: %20 URL formatting)
    ctx: SharePointClientContext,  # SharePoint connection object
):
    """Recursively traverse files and folders and retrieve SharePoint GUIDs."""

    sharepoint_folder = ctx.web.get_folder_by_server_relative_url(parent_folder_url).get().execute_query()

    def traverse_sharepoint_folder(parent_folder, file_dict):
        parent_folder.expand(["Files", "Folders"]).get().execute_query()
        for file in parent_folder.files:
            if file.name.endswith(".xml"):
                file_prefix = file.name.split('/')[-1].split('-')[0]
                if file_prefix in ["Cartridge_PAT","Cartridge_WET"]:
                  file_dict[file.serverRelativeUrl] = file.unique_id
        for folder in parent_folder.folders:
            traverse_sharepoint_folder(folder, file_dict=file_dict)       

    file_dict = {}
    traverse_sharepoint_folder(sharepoint_folder, file_dict)
    return file_dict

In [18]:
%%time
sharepoint_ids = get_sharepoint_ids(config["SAMPLE_DATA_FOLDER_1"], ctx)
len(sharepoint_ids)

CPU times: user 646 ms, sys: 16.5 ms, total: 662 ms
Wall time: 2.43 s


400

In [19]:
dest_folder = "../../hae/From_SharePoint/"
metadata_filepath = os.path.join(dest_folder, "T1-20D-101854_20230204_140216_metadata.json")

In [20]:
for guid in sharepoint_ids.values():
    tegxml = TEGXMLProcessor.from_sharepoint_guid(ctx, guid)
    tegxml.parse()
    tegxml.pop_lists()

    with open(metadata_filepath, "a") as f:
        f.write(json.dumps(tegxml.d, indent=4))

2023-04-27@11:22:28.825|DEBUG|from_sharepoint_guid|reading from SharePoint...
2023-04-27@11:22:29.110|INFO|from_sharepoint_guid|read 430,097 bytes from file 3082a42f-f450-455c-b9f8-00644b180c6e
2023-04-27@11:22:29.117|INFO|find_xml_offsets|found 2 XML fragment(s)
2023-04-27@11:22:29.253|INFO|parse|XML parsed
2023-04-27@11:22:29.260|DEBUG|from_sharepoint_guid|reading from SharePoint...
2023-04-27@11:22:29.580|INFO|from_sharepoint_guid|read 1,197,273 bytes from file 88cbb53f-0fb0-4dc5-9ea6-0094b2b1a0bb
2023-04-27@11:22:29.584|INFO|find_xml_offsets|found 2 XML fragment(s)
2023-04-27@11:22:29.943|INFO|parse|XML parsed
2023-04-27@11:22:29.951|DEBUG|from_sharepoint_guid|reading from SharePoint...
2023-04-27@11:22:30.301|INFO|from_sharepoint_guid|read 539,636 bytes from file 29380e25-bde5-40a6-8963-00d3b0a67116
2023-04-27@11:22:30.304|INFO|find_xml_offsets|found 2 XML fragment(s)
2023-04-27@11:22:30.457|INFO|parse|XML parsed
2023-04-27@11:22:30.470|DEBUG|from_sharepoint_guid|reading from Shar

In [21]:
# # for local folders:
# base_folder = "../../hae/T1-19B-Cartridges/"
# xml_files = [f for f in os.listdir(base_folder) if f.endswith(".xml")]

# for file in xml_files:
#     xml_file = os.path.join(base_folder, file)
#     tegxml = TEGXMLProcessor(xml_file)
#     tegxml.parse()
#     tegxml.pop_lists()
#     tegxml.save_jsons()

## Insights

In [22]:
import duckdb
import pandas as pd

In [23]:
pd.options.display.max_rows = 96
pd.options.display.max_colwidth = 256

In [24]:
con = duckdb.connect()

In [25]:
q = f"""
SELECT COUNT(DISTINCT Cartridge.CartridgeResult.TestResultID) AS run_count
FROM read_json_auto('{metadata_filepath}') metadata
"""
con.execute(q).df()

Unnamed: 0,run_count
0,400


In [30]:
q = f"""
SELECT
  Cartridge.CartridgeResult.InstrumentDetails.SerialNumber AS instrument_sn
  ,Cartridge.CartridgeResult.InstrumentDetails.InstrumentLocation AS instrument_location
  ,Cartridge.ManufacturingData.PrintedCartridgeLot AS printed_cartridge_lot
  ,Cartridge.CartridgeResult.ErrorReason AS error_reason
  ,COUNT(*) AS run_count
FROM read_json_auto('{metadata_filepath}') metadata
GROUP BY 1,2,3,4
ORDER BY 1,2,3
"""
df = con.execute(q).df()
df

Unnamed: 0,instrument_sn,instrument_location,printed_cartridge_lot,error_reason,run_count
0,T1-20D-101854,TML 2031,010622-1,,16
1,T1-20D-101854,TML 2031,010622-1,Aborted,8
2,T1-20D-101854,TML 2031,010622-1,ErrorDetectedInCartridgeOrChannel,1
3,T1-20D-101854,TML 2031,011222-1,,13
4,T1-20D-101854,TML 2031,021722-1,,2
5,T1-20D-101854,TML 2031,030322-4,,10
6,T1-20D-101854,TML 2031,030322-4,Aborted,11
7,T1-20D-101854,TML 2031,030322-4,ErrorDetectedInCartridgeOrChannel,1
8,T1-20D-101854,TML 2031,030522-1,Aborted,6
9,T1-20D-101854,TML 2031,030522-1,,14


In [31]:
pd.crosstab(df.printed_cartridge_lot, df.error_reason)

error_reason,Aborted,ErrorDetectedInCartridgeOrChannel,None,TimedOut
printed_cartridge_lot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
010622-1,1,1,1,0
011222-1,0,0,1,0
021722-1,0,0,1,0
030322-4,1,1,1,0
030522-1,1,0,1,0
031822-2,0,0,1,0
042021-1,0,0,1,0
050422-6,1,1,1,0
062222-9,1,1,1,0
062522-5,0,0,1,0


## Sandbox

In [32]:
# some other data elements of interest
# f"""
#   --,Cartridge.CartridgeResult.InstrumentDetails.InstrumentName AS instrument_name
#   --,Cartridge.TestType AS test_type
#   --,Cartridge.CartridgeResult.UserName AS user
#   --,YEAR(STRPTIME(Cartridge.CartridgeResult.InstrumentDetails.StartTime,'%Y-%m-%dT%H:%M:%S')) AS year
# """

In [33]:
#| hide
import nbdev; nbdev.nbdev_export()