### Camunda unique element tags

In [5]:
import re

def extract_unique_bpmn_tags(xml_text):
    """
    Collect the distinct element names that appear as ``<bpmn:NAME`` in a text.

    The text is scanned with a regular expression rather than parsed as XML.
    Only opening (or self-closing) tags are matched: a closing tag starts with
    ``</`` and therefore never matches, and ``bpmn:`` occurrences inside
    attribute values are ignored because they are not preceded by ``<``.

    :param xml_text: XML content as a string
    :return: Set of tag names with the ``bpmn:`` prefix stripped
    """
    # The capture group holds the name that follows the "bpmn:" prefix;
    # building a set drops the repeats.
    return {match.group(1) for match in re.finditer(r'<bpmn:(\w+)', xml_text)}

# Camunda
if __name__ == "__main__":
    # Sample XML input
    xml_input = """  <bpmn:process id="Process_1evc5zd" isExecutable="true">
    <bpmn:startEvent id="StartEvent_1" name="Start">
      <bpmn:outgoing>Flow_08gc8bu</bpmn:outgoing>
      <bpmn:outgoing>Flow_0p3omji</bpmn:outgoing>
      <bpmn:outgoing>Flow_1cokig2</bpmn:outgoing>
      <bpmn:outgoing>Flow_05d6qru</bpmn:outgoing>
    </bpmn:startEvent>
    <bpmn:endEvent id="Event_1o85fzr" name="Ciao 3">
      <bpmn:incoming>Flow_08gc8bu</bpmn:incoming>
    </bpmn:endEvent>
    <bpmn:exclusiveGateway id="Gateway_1l277fu" name="Ciao">
      <bpmn:incoming>Flow_0p3omji</bpmn:incoming>
    </bpmn:exclusiveGateway>
    <bpmn:task id="Activity_0cx4jw3" name="Task 1">
      <bpmn:incoming>Flow_1cokig2</bpmn:incoming>
    </bpmn:task>
    <bpmn:intermediateThrowEvent id="Event_19qi251" name="Ciao 2">
      <bpmn:incoming>Flow_05d6qru</bpmn:incoming>
    </bpmn:intermediateThrowEvent>
    <bpmn:exclusiveGateway id="Gateway_0w21oq5">
      <bpmn:outgoing>Flow_0m3ka4j</bpmn:outgoing>
    </bpmn:exclusiveGateway>
    <bpmn:parallelGateway id="Gateway_13hx8gz">
      <bpmn:incoming>Flow_0m3ka4j</bpmn:incoming>
      <bpmn:outgoing>Flow_1fzvkkr</bpmn:outgoing>
      <bpmn:outgoing>Flow_0ue8vxz</bpmn:outgoing>
    </bpmn:parallelGateway>
    <bpmn:eventBasedGateway id="Gateway_11i16eo">
      <bpmn:incoming>Flow_1fzvkkr</bpmn:incoming>
      <bpmn:outgoing>Flow_1trkccq</bpmn:outgoing>
      <bpmn:outgoing>Flow_0661cct</bpmn:outgoing>
      <bpmn:outgoing>Flow_02iop1d</bpmn:outgoing>
      <bpmn:outgoing>Flow_0kq3j4m</bpmn:outgoing>
      <bpmn:outgoing>Flow_06dhp9c</bpmn:outgoing>
    </bpmn:eventBasedGateway>
    <bpmn:receiveTask id="Activity_08x10bt">
      <bpmn:incoming>Flow_1trkccq</bpmn:incoming>
    </bpmn:receiveTask>
    <bpmn:intermediateCatchEvent id="Event_1fjxnv4">
      <bpmn:incoming>Flow_0661cct</bpmn:incoming>
      <bpmn:messageEventDefinition id="MessageEventDefinition_0n5gclf" />
    </bpmn:intermediateCatchEvent>
    <bpmn:intermediateCatchEvent id="Event_11kny6x">
      <bpmn:incoming>Flow_02iop1d</bpmn:incoming>
      <bpmn:timerEventDefinition id="TimerEventDefinition_0ysivg8" />
    </bpmn:intermediateCatchEvent>
    <bpmn:intermediateCatchEvent id="Event_1ewsie2">
      <bpmn:incoming>Flow_0kq3j4m</bpmn:incoming>
      <bpmn:conditionalEventDefinition id="ConditionalEventDefinition_0ankq3b">
        <bpmn:condition xsi:type="bpmn:tFormalExpression" />
      </bpmn:conditionalEventDefinition>
    </bpmn:intermediateCatchEvent>
    <bpmn:intermediateCatchEvent id="Event_1palb28">
      <bpmn:incoming>Flow_06dhp9c</bpmn:incoming>
      <bpmn:signalEventDefinition id="SignalEventDefinition_03w6gck" />
    </bpmn:intermediateCatchEvent>
    <bpmn:userTask id="Activity_0n50avy">
      <bpmn:incoming>Flow_0ue8vxz</bpmn:incoming>
      <bpmn:outgoing>Flow_0fg0v0h</bpmn:outgoing>
    </bpmn:userTask>
    <bpmn:serviceTask id="Activity_1r77er3">
      <bpmn:incoming>Flow_0fg0v0h</bpmn:incoming>
      <bpmn:outgoing>Flow_17wd2oo</bpmn:outgoing>
    </bpmn:serviceTask>
    <bpmn:businessRuleTask id="Activity_0j5g7ln">
      <bpmn:incoming>Flow_17wd2oo</bpmn:incoming>
      <bpmn:outgoing>Flow_19izr72</bpmn:outgoing>
    </bpmn:businessRuleTask>
    <bpmn:scriptTask id="Activity_11ipfw2">
      <bpmn:incoming>Flow_19izr72</bpmn:incoming>
      <bpmn:outgoing>Flow_1ychg9d</bpmn:outgoing>
    </bpmn:scriptTask>
    <bpmn:callActivity id="Activity_0eyiiz4">
      <bpmn:extensionElements>
        <zeebe:calledElement propagateAllChildVariables="false" />
      </bpmn:extensionElements>
      <bpmn:incoming>Flow_1ychg9d</bpmn:incoming>
      <bpmn:outgoing>Flow_03et73k</bpmn:outgoing>
    </bpmn:callActivity>
    <bpmn:intermediateThrowEvent id="Event_1gth92g">
      <bpmn:incoming>Flow_1xjhb81</bpmn:incoming>
      <bpmn:outgoing>Flow_0pobvme</bpmn:outgoing>
    </bpmn:intermediateThrowEvent>
    <bpmn:intermediateCatchEvent id="Event_0ku65jl">
      <bpmn:incoming>Flow_0pobvme</bpmn:incoming>
      <bpmn:outgoing>Flow_0erq7g0</bpmn:outgoing>
      <bpmn:dataOutputAssociation id="DataOutputAssociation_0w7fhjm">
        <bpmn:targetRef>DataStoreReference_0wkz1qq</bpmn:targetRef>
      </bpmn:dataOutputAssociation>
      <bpmn:dataOutputAssociation id="DataOutputAssociation_1ymhdta">
        <bpmn:targetRef>DataObjectReference_0kv2ofb</bpmn:targetRef>
      </bpmn:dataOutputAssociation>
      <bpmn:messageEventDefinition id="MessageEventDefinition_07sfbdp" />
    </bpmn:intermediateCatchEvent>
    <bpmn:intermediateThrowEvent id="Event_039xncd">
      <bpmn:incoming>Flow_0erq7g0</bpmn:incoming>
      <bpmn:outgoing>Flow_0zwnv7i</bpmn:outgoing>
      <bpmn:messageEventDefinition id="MessageEventDefinition_1p1lo99" />
    </bpmn:intermediateThrowEvent>
    <bpmn:intermediateCatchEvent id="Event_1xc35r0">
      <bpmn:incoming>Flow_0zwnv7i</bpmn:incoming>
      <bpmn:outgoing>Flow_1fpzox9</bpmn:outgoing>
      <bpmn:timerEventDefinition id="TimerEventDefinition_0af70pl" />
    </bpmn:intermediateCatchEvent>
    <bpmn:intermediateThrowEvent id="Event_0kzfual">
      <bpmn:incoming>Flow_1fpzox9</bpmn:incoming>
      <bpmn:outgoing>Flow_0a9bgtv</bpmn:outgoing>
      <bpmn:escalationEventDefinition id="EscalationEventDefinition_15pji6h" />
    </bpmn:intermediateThrowEvent>
    <bpmn:intermediateCatchEvent id="Event_0resyto">
      <bpmn:incoming>Flow_0a9bgtv</bpmn:incoming>
      <bpmn:conditionalEventDefinition id="ConditionalEventDefinition_1agaxl7">
        <bpmn:condition xsi:type="bpmn:tFormalExpression" />
      </bpmn:conditionalEventDefinition>
    </bpmn:intermediateCatchEvent>
    <bpmn:intermediateThrowEvent id="Event_0xuijo1">
      <bpmn:incoming>Flow_1h2xf8y</bpmn:incoming>
      <bpmn:linkEventDefinition id="LinkEventDefinition_0kf9lj2" name="" />
    </bpmn:intermediateThrowEvent>
    <bpmn:intermediateThrowEvent id="Event_1na5ehx">
      <bpmn:incoming>Flow_07ipzkp</bpmn:incoming>
      <bpmn:outgoing>Flow_0isq9j7</bpmn:outgoing>
      <bpmn:compensateEventDefinition id="CompensateEventDefinition_1hnq2vl" />
    </bpmn:intermediateThrowEvent>
    <bpmn:intermediateCatchEvent id="Event_0tnfnvr">
      <bpmn:incoming>Flow_0isq9j7</bpmn:incoming>
      <bpmn:outgoing>Flow_0jcmf9k</bpmn:outgoing>
      <bpmn:outgoing>Flow_0cdhbb9</bpmn:outgoing>
      <bpmn:outgoing>Flow_1n1psdh</bpmn:outgoing>
      <bpmn:outgoing>Flow_0bexoya</bpmn:outgoing>
      <bpmn:signalEventDefinition id="SignalEventDefinition_0yuwnpm" />
    </bpmn:intermediateCatchEvent>
    <bpmn:intermediateCatchEvent id="Event_1swi8du">
      <bpmn:outgoing>Flow_1h2xf8y</bpmn:outgoing>
      <bpmn:outgoing>Flow_07ipzkp</bpmn:outgoing>
      <bpmn:linkEventDefinition id="LinkEventDefinition_1p8yxu7" name="" />
    </bpmn:intermediateCatchEvent>
    <bpmn:intermediateThrowEvent id="Event_1cpua1x">
      <bpmn:incoming>Flow_0jcmf9k</bpmn:incoming>
      <bpmn:outgoing>Flow_1cpty0u</bpmn:outgoing>
      <bpmn:outgoing>Flow_0ubvzsp</bpmn:outgoing>
      <bpmn:outgoing>Flow_0xtttr7</bpmn:outgoing>
      <bpmn:outgoing>Flow_1vm4oob</bpmn:outgoing>
      <bpmn:signalEventDefinition id="SignalEventDefinition_1xfcky2" />
    </bpmn:intermediateThrowEvent>
    <bpmn:endEvent id="Event_1vfqdhx">
      <bpmn:incoming>Flow_1cpty0u</bpmn:incoming>
      <bpmn:messageEventDefinition id="MessageEventDefinition_0bafkhl" />
    </bpmn:endEvent>
    <bpmn:endEvent id="Event_0ie683y">
      <bpmn:incoming>Flow_0ubvzsp</bpmn:incoming>
      <bpmn:escalationEventDefinition id="EscalationEventDefinition_0s7bq1f" />
    </bpmn:endEvent>
    <bpmn:endEvent id="Event_0lm9595">
      <bpmn:incoming>Flow_0xtttr7</bpmn:incoming>
      <bpmn:errorEventDefinition id="ErrorEventDefinition_0i5p050" />
    </bpmn:endEvent>
    <bpmn:endEvent id="Event_0jx9tak">
      <bpmn:incoming>Flow_0cdhbb9</bpmn:incoming>
      <bpmn:compensateEventDefinition id="CompensateEventDefinition_1j61gvm" />
    </bpmn:endEvent>
    <bpmn:endEvent id="Event_1pnub3r">
      <bpmn:incoming>Flow_1n1psdh</bpmn:incoming>
      <bpmn:signalEventDefinition id="SignalEventDefinition_00zitdw" />
    </bpmn:endEvent>
    <bpmn:endEvent id="Event_1d60irg">
      <bpmn:incoming>Flow_0bexoya</bpmn:incoming>
      <bpmn:terminateEventDefinition id="TerminateEventDefinition_18e41ky" />
    </bpmn:endEvent>
    <bpmn:dataStoreReference id="DataStoreReference_1s0i6d9" />
    <bpmn:dataStoreReference id="DataStoreReference_0wkz1qq" />
    <bpmn:dataObjectReference id="DataObjectReference_0kv2ofb" dataObjectRef="DataObject_15fzkrc" />
    <bpmn:dataObject id="DataObject_15fzkrc" />
    <bpmn:transaction id="Activity_16714qd">
      <bpmn:incoming>Flow_03et73k</bpmn:incoming>
      <bpmn:outgoing>Flow_19kh9kp</bpmn:outgoing>
    </bpmn:transaction>
    <bpmn:subProcess id="Activity_1sdjo12" triggeredByEvent="true" />
    <bpmn:subProcess id="Activity_1emy2u0">
      <bpmn:incoming>Flow_19kh9kp</bpmn:incoming>
      <bpmn:outgoing>Flow_045o66u</bpmn:outgoing>
    </bpmn:subProcess>
    <bpmn:subProcess id="Activity_0fummc9">
      <bpmn:incoming>Flow_045o66u</bpmn:incoming>
      <bpmn:outgoing>Flow_1xjhb81</bpmn:outgoing>
    </bpmn:subProcess>
    <bpmn:boundaryEvent id="Event_0capipk" attachedToRef="Activity_0fummc9">
      <bpmn:messageEventDefinition id="MessageEventDefinition_009uyux" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_0hu1bjb" attachedToRef="Activity_0fummc9">
      <bpmn:timerEventDefinition id="TimerEventDefinition_15hdroj" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_0tk95s7" attachedToRef="Activity_0fummc9">
      <bpmn:escalationEventDefinition id="EscalationEventDefinition_1mrvo70" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_18cp971" attachedToRef="Activity_0fummc9">
      <bpmn:conditionalEventDefinition id="ConditionalEventDefinition_0ssupfs">
        <bpmn:condition xsi:type="bpmn:tFormalExpression" />
      </bpmn:conditionalEventDefinition>
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_13ozsfm" attachedToRef="Activity_0fummc9">
      <bpmn:errorEventDefinition id="ErrorEventDefinition_15jict2" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_0sf2set" attachedToRef="Activity_0fummc9" />
    <bpmn:boundaryEvent id="Event_0novvbl" attachedToRef="Activity_0fummc9" />
    <bpmn:boundaryEvent id="Event_1n8q94b" attachedToRef="Activity_0fummc9">
      <bpmn:signalEventDefinition id="SignalEventDefinition_1s2qf8u" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_1vr9am4" attachedToRef="Activity_0fummc9">
      <bpmn:compensateEventDefinition id="CompensateEventDefinition_14j9fa4" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_0k0jf4p" cancelActivity="false" attachedToRef="Activity_0fummc9">
      <bpmn:messageEventDefinition id="MessageEventDefinition_0mmzmyc" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_1hbx51s" cancelActivity="false" attachedToRef="Activity_0fummc9">
      <bpmn:timerEventDefinition id="TimerEventDefinition_1duoj7n" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_18h7b6w" cancelActivity="false" attachedToRef="Activity_0fummc9">
      <bpmn:escalationEventDefinition id="EscalationEventDefinition_0fnhunl" />
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_029ept6" cancelActivity="false" attachedToRef="Activity_0fummc9">
      <bpmn:conditionalEventDefinition id="ConditionalEventDefinition_0fmctrh">
        <bpmn:condition xsi:type="bpmn:tFormalExpression" />
      </bpmn:conditionalEventDefinition>
    </bpmn:boundaryEvent>
    <bpmn:boundaryEvent id="Event_1euytx9" cancelActivity="false" attachedToRef="Activity_0fummc9">
      <bpmn:dataOutputAssociation id="DataOutputAssociation_01uz99k">
        <bpmn:targetRef>DataStoreReference_1s0i6d9</bpmn:targetRef>
      </bpmn:dataOutputAssociation>
      <bpmn:signalEventDefinition id="SignalEventDefinition_0n9n301" />
    </bpmn:boundaryEvent>
    <bpmn:sequenceFlow id="Flow_08gc8bu" sourceRef="StartEvent_1" targetRef="Event_1o85fzr" />
    <bpmn:sequenceFlow id="Flow_0p3omji" sourceRef="StartEvent_1" targetRef="Gateway_1l277fu" />
    <bpmn:sequenceFlow id="Flow_1cokig2" sourceRef="StartEvent_1" targetRef="Activity_0cx4jw3" />
    <bpmn:sequenceFlow id="Flow_05d6qru" sourceRef="StartEvent_1" targetRef="Event_19qi251" />
    <bpmn:sequenceFlow id="Flow_0m3ka4j" sourceRef="Gateway_0w21oq5" targetRef="Gateway_13hx8gz" />
    <bpmn:sequenceFlow id="Flow_1fzvkkr" sourceRef="Gateway_13hx8gz" targetRef="Gateway_11i16eo" />
    <bpmn:sequenceFlow id="Flow_0ue8vxz" sourceRef="Gateway_13hx8gz" targetRef="Activity_0n50avy" />
    <bpmn:sequenceFlow id="Flow_1trkccq" sourceRef="Gateway_11i16eo" targetRef="Activity_08x10bt" />
    <bpmn:sequenceFlow id="Flow_0661cct" sourceRef="Gateway_11i16eo" targetRef="Event_1fjxnv4" />
    <bpmn:sequenceFlow id="Flow_02iop1d" sourceRef="Gateway_11i16eo" targetRef="Event_11kny6x" />
    <bpmn:sequenceFlow id="Flow_0kq3j4m" sourceRef="Gateway_11i16eo" targetRef="Event_1ewsie2" />
    <bpmn:sequenceFlow id="Flow_06dhp9c" sourceRef="Gateway_11i16eo" targetRef="Event_1palb28" />
    <bpmn:sequenceFlow id="Flow_0fg0v0h" sourceRef="Activity_0n50avy" targetRef="Activity_1r77er3" />
    <bpmn:sequenceFlow id="Flow_17wd2oo" sourceRef="Activity_1r77er3" targetRef="Activity_0j5g7ln" />
    <bpmn:sequenceFlow id="Flow_19izr72" sourceRef="Activity_0j5g7ln" targetRef="Activity_11ipfw2" />
    <bpmn:sequenceFlow id="Flow_1ychg9d" sourceRef="Activity_11ipfw2" targetRef="Activity_0eyiiz4" />
    <bpmn:sequenceFlow id="Flow_03et73k" sourceRef="Activity_0eyiiz4" targetRef="Activity_16714qd" />
    <bpmn:sequenceFlow id="Flow_1xjhb81" sourceRef="Activity_0fummc9" targetRef="Event_1gth92g" />
    <bpmn:sequenceFlow id="Flow_0pobvme" sourceRef="Event_1gth92g" targetRef="Event_0ku65jl" />
    <bpmn:sequenceFlow id="Flow_0erq7g0" sourceRef="Event_0ku65jl" targetRef="Event_039xncd" />
    <bpmn:sequenceFlow id="Flow_0zwnv7i" sourceRef="Event_039xncd" targetRef="Event_1xc35r0" />
    <bpmn:sequenceFlow id="Flow_1fpzox9" sourceRef="Event_1xc35r0" targetRef="Event_0kzfual" />
    <bpmn:sequenceFlow id="Flow_0a9bgtv" sourceRef="Event_0kzfual" targetRef="Event_0resyto" />
    <bpmn:sequenceFlow id="Flow_1h2xf8y" sourceRef="Event_1swi8du" targetRef="Event_0xuijo1" />
    <bpmn:sequenceFlow id="Flow_07ipzkp" sourceRef="Event_1swi8du" targetRef="Event_1na5ehx" />
    <bpmn:sequenceFlow id="Flow_0isq9j7" sourceRef="Event_1na5ehx" targetRef="Event_0tnfnvr" />
    <bpmn:sequenceFlow id="Flow_0jcmf9k" sourceRef="Event_0tnfnvr" targetRef="Event_1cpua1x" />
    <bpmn:sequenceFlow id="Flow_0cdhbb9" sourceRef="Event_0tnfnvr" targetRef="Event_0jx9tak" />
    <bpmn:sequenceFlow id="Flow_1n1psdh" sourceRef="Event_0tnfnvr" targetRef="Event_1pnub3r" />
    <bpmn:sequenceFlow id="Flow_0bexoya" sourceRef="Event_0tnfnvr" targetRef="Event_1d60irg" />
    <bpmn:sequenceFlow id="Flow_1cpty0u" sourceRef="Event_1cpua1x" targetRef="Event_1vfqdhx" />
    <bpmn:sequenceFlow id="Flow_0ubvzsp" sourceRef="Event_1cpua1x" targetRef="Event_0ie683y" />
    <bpmn:sequenceFlow id="Flow_0xtttr7" sourceRef="Event_1cpua1x" targetRef="Event_0lm9595" />
    <bpmn:sequenceFlow id="Flow_1vm4oob" sourceRef="Event_1cpua1x" targetRef="Event_1yonqbw" />
    <bpmn:sequenceFlow id="Flow_19kh9kp" sourceRef="Activity_16714qd" targetRef="Activity_1emy2u0" />
    <bpmn:sequenceFlow id="Flow_045o66u" sourceRef="Activity_1emy2u0" targetRef="Activity_0fummc9" />
    <bpmn:endEvent id="Event_1yonqbw">
      <bpmn:incoming>Flow_1vm4oob</bpmn:incoming>
    </bpmn:endEvent>
    <bpmn:association id="Association_0mggxhv" associationDirection="None" sourceRef="StartEvent_1" targetRef="TextAnnotation_0b9e6xq" />
  </bpmn:process>"""
    
    # Collect the distinct element names that follow "<bpmn:" in the sample above
    unique_tags = extract_unique_bpmn_tags(xml_input)
    # Show the result; it is a set, so the printed order is not guaranteed
    print("Unique tags:", unique_tags)


Unique tags: {'scriptTask', 'businessRuleTask', 'dataOutputAssociation', 'errorEventDefinition', 'incoming', 'intermediateThrowEvent', 'compensateEventDefinition', 'serviceTask', 'process', 'subProcess', 'messageEventDefinition', 'receiveTask', 'extensionElements', 'linkEventDefinition', 'transaction', 'startEvent', 'condition', 'dataObjectReference', 'escalationEventDefinition', 'association', 'eventBasedGateway', 'outgoing', 'conditionalEventDefinition', 'targetRef', 'callActivity', 'timerEventDefinition', 'task', 'sequenceFlow', 'signalEventDefinition', 'exclusiveGateway', 'endEvent', 'userTask', 'boundaryEvent', 'parallelGateway', 'intermediateCatchEvent', 'dataStoreReference', 'terminateEventDefinition', 'dataObject'}


### BPMN Designer unique element tags

In [6]:
import re

def extract_unique_bpmn_types(xml_text):
    """
    Collect the distinct type names used in ``xsi:type="bpmn:NAME"`` attributes.

    The text is scanned with a regular expression rather than parsed as XML,
    so every literal occurrence of the attribute pattern is considered.

    :param xml_text: XML content as a string
    :return: Set of the NAME parts captured from ``xsi:type="bpmn:NAME"``
    """
    # The capture group holds the type name between "bpmn:" and the closing
    # quote; building a set drops the repeats.
    return {match.group(1) for match in re.finditer(r'xsi:type="bpmn:(\w+)"', xml_text)}

# BPMN DESIGNER
if __name__ == "__main__":
    # Sample XML input
    xml_input = """<?xml version="1.0" encoding="ASCII"?>
<bpmn:Definitions xmi:version="2.0" xmlns:xmi="http://www.omg.org/XMI" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:bpmn="http://www.omg.org/spec/BPMN/20100524/MODEL-XMI" id="_n78V0Y7lEe-v648egk3nxQ">
  <rootElements xsi:type="bpmn:Process" id="_n78V0I7lEe-v648egk3nxQ">
    <flowElements xsi:type="bpmn:StartEvent" id="_sJFEoI7lEe-v648egk3nxQ" name="Start Event" outgoing="_aRDdsI7mEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:MessageEventDefinition" id="_O8rUgI7oEe-v648egk3nxQ"/>
      <eventDefinitions xsi:type="bpmn:TimerEventDefinition" id="_P8ZsII7oEe-v648egk3nxQ"/>
      <eventDefinitions xsi:type="bpmn:ConditionalEventDefinition" id="_Qm-6MI7oEe-v648egk3nxQ"/>
      <eventDefinitions xsi:type="bpmn:SignalEventDefinition" id="_Qzo2II7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:EndEvent" id="_te_HoI7lEe-v648egk3nxQ" name="End Event" outgoing="_JSnp0I7qEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:TerminateEventDefinition" id="_c0c8oI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:IntermediateCatchEvent" id="_u-vDII7lEe-v648egk3nxQ" name="Intermediate Catch"/>
    <flowElements xsi:type="bpmn:IntermediateThrowEvent" id="_wYY9sI7lEe-v648egk3nxQ" name="Intermediate Throw" outgoing="_IoV9wI7qEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:Task" id="_zFHCoI7lEe-v648egk3nxQ" name="Task 1" incoming="_aRDdsI7mEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:BusinessRuleTask" id="_0F-CsI7lEe-v648egk3nxQ" name="Business Rule Task"/>
    <flowElements xsi:type="bpmn:ManualTask" id="_110PsI7lEe-v648egk3nxQ" name="Manual Task"/>
    <flowElements xsi:type="bpmn:ReceiveTask" id="_2yJ8II7lEe-v648egk3nxQ" name="Receive Task"/>
    <flowElements xsi:type="bpmn:ScriptTask" id="_5rs8II7lEe-v648egk3nxQ" name="Script Task"/>
    <flowElements xsi:type="bpmn:SendTask" id="_66u7EI7lEe-v648egk3nxQ" name="Send Task"/>
    <flowElements xsi:type="bpmn:ServiceTask" id="_72BeoI7lEe-v648egk3nxQ" name="Service Task" incoming="_sZdtMI7pEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:UserTask" id="_9IWCII7lEe-v648egk3nxQ" name="User Task"/>
    <flowElements xsi:type="bpmn:SubProcess" id="_-ZGfYI7lEe-v648egk3nxQ" name="New Sub Process">
      <extensionValues id="_uWN1II7mEe-v648egk3nxQ">
        <value xsi:type="bpmn:TextAnnotation" id="_uWN1IY7mEe-v648egk3nxQ" text="New Tex"/>
      </extensionValues>
      <flowElements xsi:type="bpmn:Task" id="_BUIGEI7mEe-v648egk3nxQ" name="Task Into SubProcess"/>
    </flowElements>
    <flowElements xsi:type="bpmn:AdHocSubProcess" id="__KankI7lEe-v648egk3nxQ" name="New Ad-Hoc Sub Process">
      <flowElements xsi:type="bpmn:Task" id="_Gh7AoI7mEe-v648egk3nxQ" name="Task into Ad-Hoc SubProcess"/>
    </flowElements>
    <flowElements xsi:type="bpmn:CallActivity" id="_KCWacI7mEe-v648egk3nxQ" name="Call Activity" calledElementRef="_n78V0I7lEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_Q7k8EI7mEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_RiuboI7mEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_TKj8II7mEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_XGtooI7mEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:SequenceFlow" id="_aRDdsI7mEe-v648egk3nxQ" targetRef="_zFHCoI7lEe-v648egk3nxQ" sourceRef="_sJFEoI7lEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:Task" id="_29cwsI7mEe-v648egk3nxQ" name="Task 2"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_Iin3oI7nEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_Ix30kI7nEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_LBUHoI7nEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_JR6xII7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_JeRyII7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_JsPVoI7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:StartEvent" id="_SC5ekI7oEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:TimerEventDefinition" id="_Sl7TII7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:StartEvent" id="_TEv9II7oEe-v648egk3nxQ" outgoing="_sZdtMI7pEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:MessageEventDefinition" id="_TaCQoI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:StartEvent" id="_T4ZnoI7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:IntermediateCatchEvent" id="_VrJXoI7oEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:MessageEventDefinition" id="_WD8SII7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:IntermediateCatchEvent" id="_WYxSoI7oEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:TimerEventDefinition" id="_WuSPoI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:IntermediateCatchEvent" id="_XNsvgI7oEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:ConditionalEventDefinition" id="_YuB50I7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:IntermediateCatchEvent" id="_ZNVsAI7oEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:LinkEventDefinition" id="_ZpeOYI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:IntermediateCatchEvent" id="_aBynwI7oEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:SignalEventDefinition" id="_bi51MI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:IntermediateCatchEvent" id="_cEnbII7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:EndEvent" id="_ddzboI7oEe-v648egk3nxQ" incoming="_JSnp0I7qEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:EscalationEventDefinition" id="_d3XyMI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:EndEvent" id="_ePjBoI7oEe-v648egk3nxQ" incoming="_IoV9wI7qEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:ErrorEventDefinition" id="_ezCJMI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:EndEvent" id="_fYj4II7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:IntermediateThrowEvent" id="_jACVII7oEe-v648egk3nxQ">
      <eventDefinitions xsi:type="bpmn:MessageEventDefinition" id="_jTnYcI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:ParallelGateway" id="_kY17II7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:ExclusiveGateway" id="_k6a-MI7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:InclusiveGateway" id="_lNiHcI7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:ComplexGateway" id="_levMMI7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:EventBasedGateway" id="_lvb6oI7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:Task" id="_nCEAII7oEe-v648egk3nxQ" name="Task 3">
      <loopCharacteristics xsi:type="bpmn:StandardLoopCharacteristics" id="_to-CsI7oEe-v648egk3nxQ"/>
    </flowElements>
    <flowElements xsi:type="bpmn:CallActivity" id="_0ioCII7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:CallActivity" id="_0wGdcI7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:Task" id="_X2mGEI7pEe-v648egk3nxQ" name="Task 4"/>
    <flowElements xsi:type="bpmn:SendTask" id="_dhJ1II7pEe-v648egk3nxQ" name="Send Task 2"/>
    <flowElements xsi:type="bpmn:SequenceFlow" id="_sZdtMI7pEe-v648egk3nxQ" targetRef="_72BeoI7lEe-v648egk3nxQ" sourceRef="_TEv9II7oEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:ReceiveTask" id="_x4hW4I7pEe-v648egk3nxQ" name="Receive Task"/>
    <flowElements xsi:type="bpmn:SendTask" id="_6OimoI7pEe-v648egk3nxQ" name="Send Task"/>
    <flowElements xsi:type="bpmn:SequenceFlow" id="_IoV9wI7qEe-v648egk3nxQ" targetRef="_ePjBoI7oEe-v648egk3nxQ" sourceRef="_wYY9sI7lEe-v648egk3nxQ"/>
    <flowElements xsi:type="bpmn:SequenceFlow" id="_JSnp0I7qEe-v648egk3nxQ" targetRef="_ddzboI7oEe-v648egk3nxQ" sourceRef="_te_HoI7lEe-v648egk3nxQ"/>
    <laneSets id="_pqAaII7lEe-v648egk3nxQ" name="General">
      <lanes id="_JB-GEI7nEe-v648egk3nxQ" name="New Lane" flowNodeRefs="_kY17II7oEe-v648egk3nxQ _k6a-MI7oEe-v648egk3nxQ _lNiHcI7oEe-v648egk3nxQ _levMMI7oEe-v648egk3nxQ _lvb6oI7oEe-v648egk3nxQ">
        <extensionValues id="_D_2IoI7qEe-v648egk3nxQ">
          <value xsi:type="bpmn:TextAnnotation" id="_D_2IoY7qEe-v648egk3nxQ" text="New Data Exchange"/>
        </extensionValues>
      </lanes>
      <lanes id="_pqAaIY7lEe-v648egk3nxQ" flowNodeRefs="_sJFEoI7lEe-v648egk3nxQ _te_HoI7lEe-v648egk3nxQ _u-vDII7lEe-v648egk3nxQ _wYY9sI7lEe-v648egk3nxQ _zFHCoI7lEe-v648egk3nxQ _0F-CsI7lEe-v648egk3nxQ _110PsI7lEe-v648egk3nxQ _2yJ8II7lEe-v648egk3nxQ _5rs8II7lEe-v648egk3nxQ _66u7EI7lEe-v648egk3nxQ _72BeoI7lEe-v648egk3nxQ _9IWCII7lEe-v648egk3nxQ _-ZGfYI7lEe-v648egk3nxQ __KankI7lEe-v648egk3nxQ _KCWacI7mEe-v648egk3nxQ _Q7k8EI7mEe-v648egk3nxQ _RiuboI7mEe-v648egk3nxQ _XGtooI7mEe-v648egk3nxQ _SC5ekI7oEe-v648egk3nxQ _T4ZnoI7oEe-v648egk3nxQ _VrJXoI7oEe-v648egk3nxQ _WYxSoI7oEe-v648egk3nxQ _XNsvgI7oEe-v648egk3nxQ _ZNVsAI7oEe-v648egk3nxQ _aBynwI7oEe-v648egk3nxQ _cEnbII7oEe-v648egk3nxQ _ddzboI7oEe-v648egk3nxQ _ePjBoI7oEe-v648egk3nxQ _fYj4II7oEe-v648egk3nxQ _jACVII7oEe-v648egk3nxQ _x4hW4I7pEe-v648egk3nxQ _6OimoI7pEe-v648egk3nxQ _TEv9II7oEe-v648egk3nxQ">
        <extensionValues id="_x0nOoI7mEe-v648egk3nxQ">
          <value xsi:type="bpmn:Group" id="_x0nOoY7mEe-v648egk3nxQ"/>
        </extensionValues>
        <extensionValues id="_M-RRoI7pEe-v648egk3nxQ">
          <value xsi:type="bpmn:DataStore" id="_M-RRoY7pEe-v648egk3nxQ" name="Data Storage"/>
        </extensionValues>
        <extensionValues id="_PUfiII7pEe-v648egk3nxQ">
          <value xsi:type="bpmn:DataObject" id="_PUgJMI7pEe-v648egk3nxQ" name="Data Object"/>
        </extensionValues>
        <partitionElement xsi:type="bpmn:InputOutputSpecification" id="_Qdg_sI7pEe-v648egk3nxQ">
          <dataInputs id="_Qdg_sY7pEe-v648egk3nxQ" name="Data Input"/>
          <dataOutputs id="_RlRR8I7pEe-v648egk3nxQ" name="Data Output"/>
        </partitionElement>
      </lanes>
      <lanes id="_pByuII7mEe-v648egk3nxQ" name="New Lane" flowNodeRefs="_29cwsI7mEe-v648egk3nxQ">
        <extensionValues id="_rb-4oI7mEe-v648egk3nxQ">
          <value xsi:type="bpmn:Group" id="_rb-4oY7mEe-v648egk3nxQ"/>
        </extensionValues>
      </lanes>
    </laneSets>
    <laneSets id="_pqAaII7lEe-v648egk3nxQ" name="NEW">
      <lanes id="_J0LwsI7oEe-v648egk3nxQ" name="New Lane" flowNodeRefs="_nCEAII7oEe-v648egk3nxQ _X2mGEI7pEe-v648egk3nxQ _dhJ1II7pEe-v648egk3nxQ"/>
    </laneSets>
    <artifacts xsi:type="bpmn:Association" id="_NhX-sI7pEe-v648egk3nxQ" sourceRef="_M-RRoY7pEe-v648egk3nxQ" targetRef="_2yJ8II7lEe-v648egk3nxQ"/>
    <artifacts xsi:type="bpmn:Association" id="_G-SPsI7qEe-v648egk3nxQ" sourceRef="_Qdg_sY7pEe-v648egk3nxQ" targetRef="_2yJ8II7lEe-v648egk3nxQ"/>
    <artifacts xsi:type="bpmn:Association" id="_HsuDAI7qEe-v648egk3nxQ" sourceRef="_RlRR8I7pEe-v648egk3nxQ" targetRef="_u-vDII7lEe-v648egk3nxQ"/>
  </rootElements>
</bpmn:Definitions>"""
    
    # Collect the distinct type names used in xsi:type="bpmn:..." attributes of the sample above
    unique_types = extract_unique_bpmn_types(xml_input)
    # Show the result; it is a set, so the printed order is not guaranteed
    print("Unique types:", unique_types)


Unique types: {'AdHocSubProcess', 'TerminateEventDefinition', 'DataStore', 'Task', 'ExclusiveGateway', 'EventBasedGateway', 'CallActivity', 'InputOutputSpecification', 'SequenceFlow', 'ReceiveTask', 'TimerEventDefinition', 'IntermediateThrowEvent', 'ConditionalEventDefinition', 'SubProcess', 'Process', 'ComplexGateway', 'ScriptTask', 'Association', 'UserTask', 'StandardLoopCharacteristics', 'MessageEventDefinition', 'ServiceTask', 'ErrorEventDefinition', 'IntermediateCatchEvent', 'ManualTask', 'DataObject', 'SignalEventDefinition', 'StartEvent', 'EscalationEventDefinition', 'InclusiveGateway', 'EndEvent', 'Group', 'TextAnnotation', 'LinkEventDefinition', 'ParallelGateway', 'SendTask', 'BusinessRuleTask'}


### HD dataset BPMN Element occurrences

In [1]:
import os
from collections import Counter
from xml.etree import ElementTree as ET
import pandas as pd

def count_tags_in_bpmn_process(file_path):
    """
    Count how often each element tag occurs inside the BPMN processes of a file.

    Every ``process`` element in the BPMN MODEL namespace that is a descendant
    of the document root is visited, and each element within it (the
    ``process`` element itself included) is tallied under its fully qualified
    ``{namespace}name`` tag as reported by ElementTree.

    :param file_path: Path of the file to parse
    :return: ``Counter`` mapping qualified tag -> number of occurrences; empty
             when the file is not well-formed XML or contains no such process
    """
    # Local import so the function does not depend on the cell's import list.
    import logging

    namespaces = {'bpmn': 'http://www.omg.org/spec/BPMN/20100524/MODEL'}
    tag_counter = Counter()

    try:
        root = ET.parse(file_path).getroot()
    except ET.ParseError as exc:
        # Still best-effort (the file is skipped), but no longer silent:
        # an empty result is otherwise indistinguishable from "no process".
        logging.getLogger(__name__).warning(
            "Skipping %s: not well-formed XML (%s)", file_path, exc
        )
        return tag_counter

    for process in root.findall(".//bpmn:process", namespaces):
        # iter() yields the process element first, then all its descendants.
        tag_counter.update(elem.tag for elem in process.iter())
    return tag_counter

def analyze_bpmn_tags_in_folders_and_save(folder_path):
    """
    Write one tag-occurrence CSV per folder that contains ``.bpmn`` files.

    ``folder_path`` is walked recursively. For every directory holding at
    least one ``*.bpmn`` file, the tags of each file are counted with
    ``count_tags_in_bpmn_process`` and written to ``<folder>/<folder>.csv``
    with the columns ``File``, ``Tag`` and ``Occurrences``, sorted by file
    name and then by descending occurrence count. The path of each CSV is
    printed after it is written.

    :param folder_path: Root folder to scan
    :return: None
    """
    # Loop-invariant values, defined once instead of once per tag.
    namespace_to_remove = "{http://www.omg.org/spec/BPMN/20100524/MODEL}"
    columns = ["File", "Tag", "Occurrences"]

    # Traverse the folder and its subfolders
    for root_dir, _dirs, files in os.walk(folder_path):
        bpmn_files = [file for file in files if file.endswith(".bpmn")]
        if not bpmn_files:
            continue

        # One row per (file, tag) pair, with the BPMN namespace stripped
        # from the tag for readability.
        file_tag_data = []
        for file in bpmn_files:
            tag_counts = count_tags_in_bpmn_process(os.path.join(root_dir, file))
            for tag, count in tag_counts.items():
                clean_tag = tag.replace(namespace_to_remove, "")
                file_tag_data.append({"File": file, "Tag": clean_tag, "Occurrences": count})

        # Explicit columns keep the frame sortable when no file produced any
        # tag; without them the empty frame has no "File" column and
        # sort_values raises KeyError.
        file_tag_df = pd.DataFrame(file_tag_data, columns=columns)
        file_tag_df = file_tag_df.sort_values(by=["File", "Occurrences"], ascending=[True, False])

        # abspath normalises the path first, so a trailing separator or "."
        # does not produce an empty or dot-only CSV name.
        folder_name = os.path.basename(os.path.abspath(root_dir))
        output_csv = os.path.join(root_dir, f"{folder_name}.csv")
        file_tag_df.to_csv(output_csv, index=False)
        print(f"Results saved to {output_csv}")

# Example usage (runs as soon as this cell is executed):
# point folder_path at the root folder to analyze. The folder is walked
# recursively and a CSV is written into every subfolder that contains .bpmn files.
folder_path = r"C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations"  # Replace with the path to your folder containing .bpmn files
analyze_bpmn_tags_in_folders_and_save(folder_path)

Results saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex00\ex00.csv
Results saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex01\ex01.csv
Results saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex02\ex02.csv
Results saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex03\ex03.csv
Results saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex04\ex04.csv
Results saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIG

In [7]:
# With Categories
import os
from collections import Counter
from xml.etree import ElementTree as ET
import pandas as pd

def categorize_tag(tag):
    """
    Map a BPMN tag name to the category it is reported under.

    The lookup is case-insensitive. A tag that belongs to none of the known
    groups is reported as "Uncategorized".

    :param tag: Tag name without its namespace prefix (e.g. "startEvent")
    :return: One of "Flow Objects", "Connecting Objects", "Swim Lanes",
             "Artifacts" or "Uncategorized"
    """
    # (category label, lower-cased member tags), checked in this order.
    category_table = (
        ("Flow Objects", (
            'scripttask', 'businessruletask', 'intermediatethrowevent', 'servicetask',
            'subprocess', 'receivetask', 'transaction', 'startevent', 'association',
            'eventbasedgateway', 'callactivity', 'task', 'exclusivegateway', 'endevent',
            'usertask', 'boundaryevent', 'parallelgateway', 'intermediatecatchevent',
        )),
        ("Connecting Objects", (
            'incoming', 'outgoing', 'targetref', 'sourceref', 'sequenceflow',
        )),
        ("Swim Lanes", ('laneset', 'lane')),
        ("Artifacts", ('dataobject', 'datastore')),
    )

    lowered = tag.lower()
    for label, members in category_table:
        if lowered in members:
            return label
    return "Uncategorized"

def count_tags_in_bpmn_process(file_path):
    """
    Count every element found inside the <bpmn:process> elements of a file.

    Each <bpmn:process> element itself is included in the count. Tag names
    are kept in ElementTree's "{namespace}local" form.

    :param file_path: Path of the BPMN/XML file to read
    :return: Counter mapping qualified tag name to number of occurrences;
             empty when the file is not well-formed XML
    """
    bpmn_ns = {'bpmn': 'http://www.omg.org/spec/BPMN/20100524/MODEL'}

    try:
        document_root = ET.parse(file_path).getroot()
    except ET.ParseError:
        # Non-XML or invalid files are deliberately ignored.
        return Counter()

    tally = Counter()
    for process_node in document_root.findall(".//bpmn:process", bpmn_ns):
        # iter() yields the process element first, then all its descendants.
        tally.update(node.tag for node in process_node.iter())
    return tally

def analyze_bpmn_tags_in_folders_and_save(folder_path):
    """
    Count BPMN tags per file and write one categorized CSV per folder.

    Walks ``folder_path`` recursively. For every folder that directly
    contains ``.bpmn`` files, the tags of each file are counted with
    ``count_tags_in_bpmn_process``, labelled with ``categorize_tag`` and
    written to ``<folder name>.csv`` inside that same folder. The table has
    the columns File, Tag, Occurrences and Category.

    Folders whose BPMN files yield no tags at all (for example files that
    are not valid XML or contain no <bpmn:process>) are reported and skipped
    instead of failing while sorting an empty table.

    :param folder_path: Root folder to scan
    :return: None
    """
    # Namespace prefix stripped from the tag names for better readability.
    namespace_to_remove = "{http://www.omg.org/spec/BPMN/20100524/MODEL}"

    for root_dir, dirs, files in os.walk(folder_path):
        bpmn_files = [file for file in files if file.endswith(".bpmn")]
        if not bpmn_files:
            continue

        # One row per (file, tag) pair.
        file_tag_data = []
        for file in bpmn_files:
            tag_counts = count_tags_in_bpmn_process(os.path.join(root_dir, file))
            for tag, count in tag_counts.items():
                clean_tag = tag.replace(namespace_to_remove, "")
                file_tag_data.append({
                    "File": file,
                    "Tag": clean_tag,
                    "Occurrences": count,
                    "Category": categorize_tag(clean_tag),
                })

        if not file_tag_data:
            # An empty DataFrame has no "File" column, so sort_values would
            # raise KeyError; there is nothing to save for this folder anyway.
            print(f"No BPMN tags found in {root_dir}; skipping.")
            continue

        file_tag_df = pd.DataFrame(file_tag_data)
        print(file_tag_df)
        file_tag_df = file_tag_df.sort_values(
            by=["File", "Occurrences", "Category"],
            ascending=[True, False, False],
        )

        # Use the folder name as the CSV name; normpath drops a trailing
        # separator that would otherwise make basename() return "".
        folder_name = os.path.basename(os.path.normpath(root_dir))
        output_csv = os.path.join(root_dir, f"{folder_name}.csv")
        file_tag_df.to_csv(output_csv, index=False)
        print(f"Results saved to {output_csv}")

# Example usage:
# Set folder_path to the root folder to analyze. Every folder below it that
# directly contains .bpmn files gets its own "<folder name>.csv".
folder_path = r"C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations"  # Replace with the path to your folder containing BPMN files
analyze_bpmn_tags_in_folders_and_save(folder_path)

                     File               Tag  Occurrences            Category
0    ex00_writer0001.bpmn           process            1       Uncategorized
1    ex00_writer0001.bpmn              task            6        Flow Objects
2    ex00_writer0001.bpmn          incoming           13  Connecting Objects
3    ex00_writer0001.bpmn          outgoing           13  Connecting Objects
4    ex00_writer0001.bpmn  exclusiveGateway            1        Flow Objects
..                    ...               ...          ...                 ...
601  ex00_writer0105.bpmn  exclusiveGateway            1        Flow Objects
602  ex00_writer0105.bpmn   parallelGateway            2        Flow Objects
603  ex00_writer0105.bpmn        startEvent            1        Flow Objects
604  ex00_writer0105.bpmn          endEvent            2        Flow Objects
605  ex00_writer0105.bpmn      sequenceFlow           13  Connecting Objects

[606 rows x 4 columns]
Results saved to C:\Users\vitto\Desktop\github\BP-MA

                      File                               Tag  Occurrences  \
0     ex07_writer0000.bpmn                           process            1   
1     ex07_writer0000.bpmn                           laneSet            1   
2     ex07_writer0000.bpmn                              lane            2   
3     ex07_writer0000.bpmn                       flowNodeRef           14   
4     ex07_writer0000.bpmn                        startEvent            2   
...                    ...                               ...          ...   
1739  ex07_writer0106.bpmn                      sequenceFlow           20   
1740  ex07_writer0106.bpmn                        subProcess            1   
1741  ex07_writer0106.bpmn  multiInstanceLoopCharacteristics            1   
1742  ex07_writer0106.bpmn                 eventBasedGateway            1   
1743  ex07_writer0106.bpmn              timerEventDefinition            1   

                Category  
0          Uncategorized  
1             Swim La

In [6]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis, shapiro

def calculate_extended_statistics(data):
    """
    Compute descriptive statistics for a numeric pandas Series.

    ``N`` is the total number of entries (missing ones included) and
    ``Missing`` the number of NaN entries. The standard error, skewness,
    kurtosis and Shapiro-Wilk test are based on the non-missing values only,
    in line with the pandas statistics (mean, std, ...) which skip NaN.

    Skewness, kurtosis and Shapiro-Wilk are only computed when there are more
    than two non-missing values and the data is not constant; otherwise they
    are ``None``.

    :param data: pandas Series of numeric observations
    :return: dict mapping statistic name to its value, in reporting order
    """
    # scipy's skew/kurtosis/shapiro do not skip NaN by default, so they are
    # given the non-missing observations explicitly.
    valid = data.dropna()
    n_valid = len(valid)
    modes = data.mode()
    q25 = data.quantile(0.25)
    q75 = data.quantile(0.75)

    stats = {}
    stats["N"] = len(data)
    stats["Missing"] = data.isnull().sum()
    stats["Mean"] = data.mean()
    # std() ignores NaN, so the matching sample size is the non-missing count.
    stats["SE"] = data.std() / (n_valid ** 0.5) if n_valid > 0 else None
    # Compare with None explicitly: an SE of exactly 0 (constant data) is a
    # valid value and must still produce a (zero-width) interval.
    has_se = stats["SE"] is not None
    stats["95% Confidence Interval Lower"] = stats["Mean"] - 1.96 * stats["SE"] if has_se else None
    stats["95% Confidence Interval Upper"] = stats["Mean"] + 1.96 * stats["SE"] if has_se else None
    stats["Median"] = data.median()
    stats["Mode"] = modes.iloc[0] if not modes.empty else None
    stats["Sum"] = data.sum()
    stats["SD"] = data.std()
    stats["Variance"] = data.var()
    stats["IQR"] = q75 - q25
    stats["Range"] = data.max() - data.min()
    stats["Minimum"] = data.min()
    stats["Maximum"] = data.max()

    # Shape statistics and the normality test need variability and at least
    # three observations.
    has_spread = stats["Range"] > 0 and n_valid > 2

    if has_spread:
        stats["Skewness"] = skew(valid)
        stats["Kurtosis"] = kurtosis(valid)
    else:
        stats["Skewness"] = None
        stats["Kurtosis"] = None

    stats["Shapiro-Wilk W"] = None
    stats["Shapiro-Wilk p value"] = None
    if has_spread:
        try:
            shapiro_test = shapiro(valid)
        except Exception:
            # Best effort: leave both values as None when the test cannot run.
            pass
        else:
            stats["Shapiro-Wilk W"] = shapiro_test.statistic
            stats["Shapiro-Wilk p value"] = shapiro_test.pvalue

    stats["25th"] = q25
    stats["50th"] = data.quantile(0.50)
    stats["75th"] = q75
    stats["90th"] = data.quantile(0.90)
    stats["95th"] = data.quantile(0.95)
    stats["99th"] = data.quantile(0.99)
    return stats

def _save_tag_boxplot(data, title, plot_path):
    """Draw a per-tag boxplot of the Occurrences column and save it to plot_path."""
    # Create the figure ourselves and hand its axes to pandas. A bare
    # plt.figure() followed by DataFrame.boxplot(by=...) leaves an empty
    # figure behind, because pandas then draws on a new figure of its own.
    fig, ax = plt.subplots(figsize=(12, 6))
    data.boxplot(column="Occurrences", by="Tag", grid=False, ax=ax)
    ax.set_title(title)
    fig.suptitle("")  # Remove the "Boxplot grouped by ..." title added by pandas
    ax.set_xlabel("Tag")
    ax.set_ylabel("Occurrences")
    ax.tick_params(axis="x", labelrotation=90)
    fig.tight_layout()
    fig.savefig(plot_path)
    plt.close(fig)


def generate_boxplots_from_csv(folder_path):
    """
    Build per-tag statistics and boxplots from the tag-occurrence CSV files.

    Walks ``folder_path`` recursively. In every folder, all CSV files that
    have both a ``Tag`` and an ``Occurrences`` column are combined; CSV files
    lacking either column are ignored. For each such folder two files are
    written next to the CSVs:

    * ``<folder name>_tag_statistics.csv`` - one row of extended statistics
      per tag (see ``calculate_extended_statistics``)
    * ``<folder name>_tag_occurrences_boxplot.png`` - boxplot per tag

    Finally, when any data was found, a boxplot over the data of all folders
    is saved as ``aggregated_tag_occurrences_boxplot.png`` in ``folder_path``.

    :param folder_path: Root folder to scan
    :return: None
    """
    total_data = []  # Combined data of every folder, for the aggregated plot

    for root_dir, dirs, files in os.walk(folder_path):
        csv_files = [file for file in files if file.endswith(".csv")]
        if not csv_files:
            continue

        folder_name = os.path.basename(root_dir)
        all_data = []
        for csv_file in csv_files:
            df = pd.read_csv(os.path.join(root_dir, csv_file))
            if "Tag" in df.columns and "Occurrences" in df.columns:
                all_data.append(df[["Tag", "Occurrences"]])

        if not all_data:
            continue

        combined_data = pd.concat(all_data)
        total_data.append(combined_data)

        statistics = []
        for tag, group in combined_data.groupby("Tag")["Occurrences"]:
            stats = calculate_extended_statistics(group)
            stats["Tag"] = tag
            statistics.append(stats)

        stats_csv_path = os.path.join(root_dir, f"{folder_name}_tag_statistics.csv")
        pd.DataFrame(statistics).to_csv(stats_csv_path, index=False)
        print(f"Statistics saved to {stats_csv_path}")

        plot_path = os.path.join(root_dir, f"{folder_name}_tag_occurrences_boxplot.png")
        _save_tag_boxplot(
            combined_data,
            f"Occurrences of Tags in Folder: {folder_name}",
            plot_path,
        )
        print(f"Boxplot saved to {plot_path}")

    # Generate the aggregated boxplot over all folders
    if total_data:
        aggregated_boxplot_path = os.path.join(folder_path, "aggregated_tag_occurrences_boxplot.png")
        _save_tag_boxplot(
            pd.concat(total_data),
            "Aggregated Occurrences of Tags Across All Files",
            aggregated_boxplot_path,
        )
        print(f"Aggregated boxplot saved to {aggregated_boxplot_path}")

# Example usage:
# The folder is scanned recursively for CSV files that have "Tag" and
# "Occurrences" columns; statistics and boxplots are written next to them.
folder_path = r"C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations"
generate_boxplots_from_csv(folder_path)

Statistics saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex00\ex00_tag_statistics.csv
Boxplot saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex00\ex00_tag_occurrences_boxplot.png
Statistics saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex01\ex01_tag_statistics.csv
Boxplot saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex01\ex01_tag_occurrences_boxplot.png
Statistics saved to C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations\ex02\ex02_tag_statistics.csv
Boxplot saved to C:\Users\vitto\D

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

### BPMN Designer Element occurrences

In [11]:
import os
import xml.etree.ElementTree as ET
from collections import Counter
import pandas as pd

def count_bpmn_occurrences(file_path):
    """
    Count the BPMN elements of a file that stores them under <rootElements>.

    Within every <rootElements> element the following is counted:

    * each <flowElements> whose xsi:type starts with "bpmn:", under the part
      of the type after the last ":"
    * each <flowElements> carrying an "outgoing", "incoming", "sourceRef" or
      "targetRef" attribute (one count per element and attribute)
    * the <laneSets> and <lanes> elements (these two keys are always present
      once a <rootElements> was seen, possibly with a count of 0)
    * each <value> whose xsi:type starts with "bpmn:", named like above

    :param file_path: Path of the file to read
    :return: Counter mapping name to occurrences; empty when the file cannot
             be parsed as XML (the error is printed)
    """
    xsi_type_key = "{http://www.w3.org/2001/XMLSchema-instance}type"
    ns_map = {'bpmn': 'http://www.omg.org/spec/BPMN/20100524/MODEL-XMI'}
    counted_attributes = ("outgoing", "incoming", "sourceRef", "targetRef")
    tally = Counter()

    try:
        document_root = ET.parse(file_path).getroot()
    except ET.ParseError as e:
        print(f"Error parsing file {file_path}: {e}")
        return tally

    for container in document_root.findall(".//rootElements", ns_map):
        for flow in container.findall(".//flowElements", ns_map):
            declared_type = flow.attrib.get(xsi_type_key, "")
            # A value starting with "bpmn::" also starts with "bpmn:", so a
            # single prefix test covers both spellings.
            if declared_type.startswith("bpmn:"):
                tally[declared_type.split(":")[-1]] += 1

            for attribute_name in counted_attributes:
                if attribute_name in flow.attrib:
                    tally[attribute_name] += 1

        tally["laneSets"] += len(container.findall(".//laneSets", ns_map))
        tally["lanes"] += len(container.findall(".//lanes", ns_map))

        # <value xsi:type="bpmn::DataObject"> and similar elements
        for value_node in container.findall(".//value", ns_map):
            declared_type = value_node.attrib.get(xsi_type_key, "")
            if declared_type.startswith("bpmn:"):
                tally[declared_type.split(":")[-1]] += 1

    return tally

def analyze_bpmn_files_in_folders(folder_path):
    """
    Count BPMN elements for every .bpmn file below a folder.

    Walks ``folder_path`` recursively and runs ``count_bpmn_occurrences`` on
    each ``.bpmn`` file. Every file contributes its own rows; results are no
    longer keyed by folder name, which silently discarded all but the last
    file of a folder (and of same-named folders).

    The ``File`` column holds the name of the folder containing the file.
    When a folder holds more than one ``.bpmn`` file, the file name is
    appended (``<folder>/<file>``) so the rows stay distinguishable.

    :param folder_path: Root folder to scan
    :return: DataFrame with the columns File, Tag and Occurrences, sorted by
             File (ascending) and Occurrences (descending); an empty
             DataFrame with those columns when nothing was found
    """
    all_results = []

    for root_dir, sub_dirs, files in os.walk(folder_path):
        bpmn_files = [file for file in files if file.endswith(".bpmn")]
        folder_name = os.path.basename(root_dir)

        for file in bpmn_files:
            # Keep the plain folder name for the usual one-file-per-folder
            # layout; disambiguate only when several files share a folder.
            if len(bpmn_files) == 1:
                label = folder_name
            else:
                label = os.path.join(folder_name, file)

            counts = count_bpmn_occurrences(os.path.join(root_dir, file))
            all_results.extend(
                {"File": label, "Tag": tag, "Occurrences": count}
                for tag, count in counts.items()
            )

    if not all_results:  # If no results, return an empty DataFrame
        print("No data found. Returning an empty DataFrame.")
        return pd.DataFrame(columns=["File", "Tag", "Occurrences"])

    # Rows were built with fixed keys, so both sort columns always exist here.
    results_df = pd.DataFrame(all_results)
    return results_df.sort_values(by=["File", "Occurrences"], ascending=[True, False])

# Example usage:
# Set folder_path to the root folder containing the BPMN files to analyze.
folder_path = r"C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD-LLM-GPT4"
results_df = analyze_bpmn_files_in_folders(folder_path)

# Save the results to a CSV file (relative path, so it lands in the current
# working directory); nothing is written when no data was found.
if not results_df.empty:
    results_df.to_csv("bpmn_designer_tag_occurrences.csv", index=False)
    print("Results saved to bpmn_designer_tag_occurrences.csv")
else:
    print("No BPMN files processed.")

Results saved to bpmn_designer_tag_occurrences.csv


In [8]:
import os
import xml.etree.ElementTree as ET
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis, shapiro

# Function to count BPMN occurrences with tag aggregation
def count_bpmn_occurrences(file_path):
    """
    Count the BPMN elements of a file, merging specific task types.

    Within every <rootElements> element the following is counted:

    * each <flowElements> whose xsi:type starts with "bpmn:", under the part
      of the type after the last ":"
    * each <flowElements> carrying an "outgoing", "incoming", "sourceRef" or
      "targetRef" attribute (one count per element and attribute)
    * the <laneSets> and <lanes> elements (these two keys are always present
      once a <rootElements> was seen, possibly with a count of 0)
    * each <value> whose xsi:type starts with "bpmn:", named like above

    The type names UserTask, ServiceTask, ReceiveTask and SendTask are all
    counted under the single name "task".

    :param file_path: Path of the file to read
    :return: Counter mapping name to occurrences; empty when the file cannot
             be parsed as XML (the error is printed)
    """
    xsi_type_key = "{http://www.w3.org/2001/XMLSchema-instance}type"
    ns_map = {'bpmn': 'http://www.omg.org/spec/BPMN/20100524/MODEL-XMI'}
    counted_attributes = ("outgoing", "incoming", "sourceRef", "targetRef")
    merged_into_task = ("UserTask", "ServiceTask", "ReceiveTask", "SendTask")
    tally = Counter()

    def resolve_name(declared_type):
        # Part after the last ":" ("bpmn:X" and "bpmn::X" both give "X"),
        # with the specific task types folded into the generic "task".
        name = declared_type.split(":")[-1]
        return "task" if name in merged_into_task else name

    try:
        document_root = ET.parse(file_path).getroot()
    except ET.ParseError as e:
        print(f"Error parsing file {file_path}: {e}")
        return tally

    for container in document_root.findall(".//rootElements", ns_map):
        for flow in container.findall(".//flowElements", ns_map):
            declared_type = flow.attrib.get(xsi_type_key, "")
            # A value starting with "bpmn::" also starts with "bpmn:", so a
            # single prefix test covers both spellings.
            if declared_type.startswith("bpmn:"):
                tally[resolve_name(declared_type)] += 1

            for attribute_name in counted_attributes:
                if attribute_name in flow.attrib:
                    tally[attribute_name] += 1

        tally["laneSets"] += len(container.findall(".//laneSets", ns_map))
        tally["lanes"] += len(container.findall(".//lanes", ns_map))

        # <value xsi:type="bpmn::DataObject"> and similar elements
        for value_node in container.findall(".//value", ns_map):
            declared_type = value_node.attrib.get(xsi_type_key, "")
            if declared_type.startswith("bpmn:"):
                tally[resolve_name(declared_type)] += 1

    return tally

# Function to calculate extended statistics
def calculate_extended_statistics(data):
    """
    Compute descriptive statistics for a numeric pandas Series.

    ``N`` is the total number of entries (missing ones included) and
    ``Missing`` the number of NaN entries. The standard error, skewness,
    kurtosis and Shapiro-Wilk test are based on the non-missing values only,
    in line with the pandas statistics (mean, std, ...) which skip NaN.

    Skewness, kurtosis and Shapiro-Wilk are only computed when there are more
    than two non-missing values and the data is not constant; otherwise they
    are ``None``.

    :param data: pandas Series of numeric observations
    :return: dict mapping statistic name to its value, in reporting order
    """
    # scipy's skew/kurtosis/shapiro do not skip NaN by default, so they are
    # given the non-missing observations explicitly.
    valid = data.dropna()
    n_valid = len(valid)
    modes = data.mode()
    q25 = data.quantile(0.25)
    q75 = data.quantile(0.75)

    stats = {}
    stats["N"] = len(data)
    stats["Missing"] = data.isnull().sum()
    stats["Mean"] = data.mean()
    # std() ignores NaN, so the matching sample size is the non-missing count.
    stats["SE"] = data.std() / (n_valid ** 0.5) if n_valid > 0 else None
    # Compare with None explicitly: an SE of exactly 0 (constant data) is a
    # valid value and must still produce a (zero-width) interval.
    has_se = stats["SE"] is not None
    stats["95% Confidence Interval Lower"] = stats["Mean"] - 1.96 * stats["SE"] if has_se else None
    stats["95% Confidence Interval Upper"] = stats["Mean"] + 1.96 * stats["SE"] if has_se else None
    stats["Median"] = data.median()
    stats["Mode"] = modes.iloc[0] if not modes.empty else None
    stats["Sum"] = data.sum()
    stats["SD"] = data.std()
    stats["Variance"] = data.var()
    stats["IQR"] = q75 - q25
    stats["Range"] = data.max() - data.min()
    stats["Minimum"] = data.min()
    stats["Maximum"] = data.max()

    # Shape statistics and the normality test need variability and at least
    # three observations.
    has_spread = stats["Range"] > 0 and n_valid > 2

    if has_spread:
        stats["Skewness"] = skew(valid)
        stats["Kurtosis"] = kurtosis(valid)
    else:
        stats["Skewness"] = None
        stats["Kurtosis"] = None

    stats["Shapiro-Wilk W"] = None
    stats["Shapiro-Wilk p value"] = None
    if has_spread:
        try:
            shapiro_test = shapiro(valid)
        except Exception:
            # Best effort: leave both values as None when the test cannot run.
            pass
        else:
            stats["Shapiro-Wilk W"] = shapiro_test.statistic
            stats["Shapiro-Wilk p value"] = shapiro_test.pvalue

    stats["25th"] = q25
    stats["50th"] = data.quantile(0.50)
    stats["75th"] = q75
    stats["90th"] = data.quantile(0.90)
    stats["95th"] = data.quantile(0.95)
    stats["99th"] = data.quantile(0.99)
    return stats

# Function to analyze BPMN files
def analyze_bpmn_files_in_folders(folder_path):
    """
    Collect tag occurrences of all .bpmn files and summarize them per tag.

    Walks ``folder_path`` recursively and runs ``count_bpmn_occurrences`` on
    each ``.bpmn`` file; every (file, tag) count becomes one observation.
    When data was found, two files are written to the current working
    directory:

    * ``bpmn_aggregate_statistics.csv`` - one row of extended statistics per
      tag (see ``calculate_extended_statistics``)
    * ``bpmn_aggregate_boxplot.png`` - boxplot of the occurrences per tag

    :param folder_path: Root folder to scan
    :return: DataFrame with the columns Tag and Occurrences (one row per
             file and tag); an empty DataFrame with those columns when
             nothing was found
    """
    total_data = []  # One record per (file, tag) observation

    for root_dir, sub_dirs, files in os.walk(folder_path):
        for file in files:
            if not file.endswith(".bpmn"):
                continue
            counts = count_bpmn_occurrences(os.path.join(root_dir, file))
            total_data.extend(
                {"Tag": tag, "Occurrences": count} for tag, count in counts.items()
            )

    if not total_data:  # If no data, return empty DataFrame
        print("No data found. Returning an empty DataFrame.")
        return pd.DataFrame(columns=["Tag", "Occurrences"])

    results_df = pd.DataFrame(total_data)

    # Calculate aggregate statistics, one row per tag
    statistics = []
    for tag, group in results_df.groupby("Tag")["Occurrences"]:
        stats = calculate_extended_statistics(group)
        stats["Tag"] = tag
        statistics.append(stats)

    pd.DataFrame(statistics).to_csv("bpmn_aggregate_statistics.csv", index=False)
    print("Aggregate statistics saved to bpmn_aggregate_statistics.csv")

    # Create the figure ourselves and hand its axes to pandas. A bare
    # plt.figure() followed by DataFrame.boxplot(by=...) leaves an empty
    # figure behind, because pandas then draws on a new figure of its own.
    fig, ax = plt.subplots(figsize=(12, 6))
    results_df.boxplot(column="Occurrences", by="Tag", grid=False, ax=ax)
    ax.set_title("Occurrences of Tags Across All BPMN Files (Aggregated)")
    fig.suptitle("")  # Remove the "Boxplot grouped by ..." title added by pandas
    ax.set_xlabel("Tag")
    ax.set_ylabel("Occurrences")
    ax.tick_params(axis="x", labelrotation=90)
    fig.tight_layout()

    boxplot_path = "bpmn_aggregate_boxplot.png"
    fig.savefig(boxplot_path)
    plt.close(fig)
    print(f"Aggregate boxplot saved to {boxplot_path}")

    return results_df

# Example usage
# Set folder_path to the root folder containing the BPMN files to analyze.
folder_path = r"C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD-LLM-GPT4"
results_df = analyze_bpmn_files_in_folders(folder_path)

# Save the per-file tag counts (columns Tag and Occurrences) to a CSV file in
# the current working directory; nothing is written when no data was found.
if not results_df.empty:
    results_df.to_csv("bpmn_designer_tag_occurrences.csv", index=False)
    print("Results saved to bpmn_designer_tag_occurrences.csv")
else:
    print("No BPMN files processed.")

Aggregate statistics saved to bpmn_aggregate_statistics.csv
Aggregate boxplot saved to bpmn_aggregate_boxplot.png
Results saved to bpmn_designer_tag_occurrences.csv


<Figure size 1200x600 with 0 Axes>