# Leveraging Neo4j for Service Identification and Microservices Partitioning in Business Process Systems

# 1. Install Required Libraries

In [117]:
!pip install neo4j torch



# 2. Import Libraries

Import all necessary libraries for XML parsing, Neo4j interaction, GPU detection, and concurrent processing.

In [2]:
# Import Libraries
import xml.etree.ElementTree as ET
from neo4j import GraphDatabase
import os
import re
import html
import torch 
from concurrent.futures import ThreadPoolExecutor
import uuid

# 3. Check CUDA Availability

Detect whether CUDA (GPU) is available on your system. This information will be printed at the beginning of the notebook.

In [3]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

CUDA available: True
CUDA version: 11.8
Using device: cuda


In [4]:
# Function to check CUDA availability
def check_cuda():
    if torch.cuda.is_available():
        print("CUDA is available. GPU will be used if applicable.")
        print(f"Device Name: {torch.cuda.get_device_name(0)}")
    else:
        print("CUDA is not available. Using CPU.")

# Execute CUDA check
check_cuda()

CUDA is available. GPU will be used if applicable.
Device Name: NVIDIA GeForce RTX 3060 Laptop GPU


# 4. Define Connection to Neo4j

In [5]:
# Neo4j connection details
uri = "bolt://localhost:7687"
username = "neo4j"
password = "170202Kcf"

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

In [6]:
def test_connection():
    try:
        with driver.session(database="erpbpmn") as session:
            result = session.run("RETURN 1 AS test")
            for record in result:
                print(f"Connection successful, test query result: {record['test']}")
    except Exception as e:
        print(f"Failed to connect to Neo4j: {e}")

test_connection()

Connection successful, test query result: 1


# 5. Define Functions to Create Nodes and Relationships

In [7]:
# Define functions to create nodes and relationships
def create_node(tx, label, properties):
    color_map = {
        'Task': '#ADD8E6',
        'StartEvent': '#90EE90',
        'EndEvent': '#FFB6C1',
        'Gateway': '#FFFF00' 
    }
    color = color_map.get(label, '#D3D3D3')

    query = (
        f"MERGE (n:{label} {{id: $properties.id}}) "
        "SET n += $properties, n.color = $color "
        "RETURN n"
    )
    result = tx.run(query, properties=properties, color=color)
    return result.single()[0]

def create_relationship_with_id(tx, source_id, target_id, rel_type, properties):
    rel_color_map = {
        'SEQUENCE_FLOW': '#A9A9A9',
        'XOR_SPLIT': '#FF69B4',
        'XOR_JOIN': '#4169E1',
        'OR_SPLIT': '#FFD700',
        'OR_JOIN': '#00CED1'
    }
    color = rel_color_map.get(rel_type, '#696969')

    query = (
        f"MATCH (a {{id: $source_id}}), (b {{id: $target_id}}) "
        f"MERGE (a)-[r:{rel_type} {{id: $properties.id}}]->(b) "
        "SET r += $properties, r.color = $color "
        "RETURN r"
    )
    result = tx.run(query, source_id=source_id, target_id=target_id, properties=properties, color=color)
    record = result.single()
    if record:
        return record[0]
    else:
        print(f"Warning: Could not create relationship {rel_type} between {source_id} and {target_id}. One of the nodes may not exist.")
        return None

# 6. Parse BPMN XML Files and Load into Neo4j

In [8]:
# Define functions to parse BPMN XML files
def clean_name(name):
    name = re.sub('<[^<]+?>', '', name)
    name = html.unescape(name)
    return name.strip()

In [9]:
def parse_drawio_bpmn_xml(file_path, level, module, activity=None):
    tree = ET.parse(file_path)
    root = tree.getroot()

    elements = {
        'Task': [],
        'StartEvent': [],
        'EndEvent': [],
        'Gateway': []
    }
    flows = []
    
    for cell in root.findall('.//mxCell'):
        style = cell.get('style', '').lower()
        cell_id = cell.get('id')
        value = clean_name(cell.get('value', ''))
        if not value:
            value = f"Unnamed_{cell_id}"
        
        if 'shape=mxgraph.bpmn.task' in style:
            elements['Task'].append({'id': f"{level}_{cell_id}", 'name': value, 'level': level, 'module': module, 'activity': activity})
        elif 'shape=mxgraph.bpmn.event' in style:
            if 'outline=end' in style:
                elements['EndEvent'].append({'id': f"{level}_{cell_id}", 'name': value, 'type': 'EndEvent', 'level': level, 'module': module, 'activity': activity})
            else:
                elements['StartEvent'].append({'id': f"{level}_{cell_id}", 'name': value, 'type': 'StartEvent', 'level': level, 'module': module, 'activity': activity})
        elif 'shape=mxgraph.bpmn.gateway' in style:
            gateway_kind = 'XOR' if 'exclusive' in style else 'OR'
            elements['Gateway'].append({'id': f"{level}_{cell_id}", 'name': value, 'gateway_kind': gateway_kind, 'level': level, 'module': module, 'activity': activity})
        elif cell.get('edge') == '1':
            flows.append({
                'id': f"{level}_{cell_id}",
                'sourceRef': f"{level}_{cell.get('source')}",
                'targetRef': f"{level}_{cell.get('target')}",
                'name': value,
                'level': level,
                'module': module,
                'activity': activity
            })

    return elements, flows

In [10]:
# Helper function to determine the label of an element by its ID
def get_element_by_id(elements, element_id):
    for element_type, element_list in elements.items():
        for element in element_list:
            if element['id'] == element_id:
                return element_type, element
    return None, None

In [11]:
# Main function to process all BPMN files with parallel processing
def process_gateways(elements, flows, session, level, module, activity):
    gateways = elements.get('Gateway', [])
    new_relationships = []
    flows_to_remove = []
    
    for gateway in gateways:
        gateway_id = gateway['id']
        gateway_kind = gateway['gateway_kind']  # XOR or OR

        # Find incoming and outgoing flows for the gateway
        incoming_flows = [flow for flow in flows if flow['targetRef'] == gateway_id]
        outgoing_flows = [flow for flow in flows if flow['sourceRef'] == gateway_id]
        
        if len(incoming_flows) == 1 and len(outgoing_flows) > 1:
            # Split gateway
            incoming_flow = incoming_flows[0]
            for out_flow in outgoing_flows:
                rel_type = f"{gateway_kind}_SPLIT"
                new_relationships.append((incoming_flow['sourceRef'], out_flow['targetRef'], rel_type, out_flow['id']))
                flows_to_remove.extend([incoming_flow['id'], out_flow['id']])
        elif len(incoming_flows) > 1 and len(outgoing_flows) == 1:
            # Join gateway
            outgoing_flow = outgoing_flows[0]
            for in_flow in incoming_flows:
                rel_type = f"{gateway_kind}_JOIN"
                new_relationships.append((in_flow['sourceRef'], outgoing_flow['targetRef'], rel_type, in_flow['id']))
                flows_to_remove.extend([in_flow['id'], outgoing_flow['id']])
        else:
            # Handle other gateway types if necessary
            continue

    # Remove flows that pass through gateways
    flows[:] = [flow for flow in flows if flow['id'] not in flows_to_remove]
    
    # Create new relationships
    for source, target, rel_type, flow_id in new_relationships:
        rel_properties = {'id': flow_id, 'name': rel_type, 'level': level, 'module': module, 'activity': activity}
        session.write_transaction(create_relationship_with_id, source, target, rel_type, rel_properties)
        print(f"Created {rel_type} relationship from {source} to {target}")

In [13]:
def process_bpmn_file(session, filename, file_path, level, module, activity):
    print(f"\nProcessing {filename} at Level {level}...")
    elements, flows = parse_drawio_bpmn_xml(file_path, level, module, activity)

    print(f"Parsed {sum(len(v) for v in elements.values())} elements and {len(flows)} flows from {filename}")

    # Handle 'Unnamed' nodes based on level
    if level == 1:
        # Rename unnamed StartEvent and EndEvent
        for event_type in ['StartEvent', 'EndEvent']:
            for event in elements[event_type]:
                if event['name'].startswith('Unnamed'):
                    if event_type == 'StartEvent':
                        event['name'] = 'Start'
                    elif event_type == 'EndEvent':
                        event['name'] = 'End'
                    print(f"Renamed {event_type} {event['id']} to {event['name']}")
    elif level in (2, 3):
        # Identify unnamed nodes
        unnamed_node_ids = []
        for element_type, element_list in elements.items():
            for element in element_list:
                if element['name'].startswith('Unnamed'):
                    unnamed_node_ids.append(element['id'])
        
        if unnamed_node_ids:
            print(f"Identified {len(unnamed_node_ids)} unnamed nodes to remove at Level {level}")
        
        # For each unnamed node, adjust flows
        for unnamed_id in unnamed_node_ids:
            # Find incoming and outgoing flows
            incoming_flows = [flow for flow in flows if flow['targetRef'] == unnamed_id]
            outgoing_flows = [flow for flow in flows if flow['sourceRef'] == unnamed_id]
            
            # Create new SEQUENCE_FLOWs bypassing the unnamed node
            for in_flow in incoming_flows:
                for out_flow in outgoing_flows:
                    new_rel_type = 'SEQUENCE_FLOW'
                    new_rel_properties = {
                        'id': f"{level}_{uuid.uuid4()}",
                        'name': new_rel_type,
                        'level': level,
                        'module': module,
                        'activity': activity
                    }
                    session.write_transaction(create_relationship_with_id, in_flow['sourceRef'], out_flow['targetRef'], new_rel_type, new_rel_properties)
                    print(f"Created {new_rel_type} relationship from {in_flow['sourceRef']} to {out_flow['targetRef']} bypassing {unnamed_id}")
            
            # Remove flows connected to the unnamed node
            flows[:] = [flow for flow in flows if flow['id'] not in [in_flow['id'] for in_flow in incoming_flows] and flow['id'] not in [out_flow['id'] for out_flow in outgoing_flows]]
            
            # Remove the unnamed node from elements
            for element_type, element_list in elements.items():
                elements[element_type] = [element for element in element_list if element['id'] != unnamed_id]
            print(f"Removed unnamed node {unnamed_id} and its associated flows")

    # Create nodes
    for element_type, element_list in elements.items():
        for element in element_list:
            session.write_transaction(create_node, element_type, element)
            print(f"Created node: {element_type} with ID {element['id']} and name {element['name']}")

    # Process gateways: create direct relationships and remove gateway flows
    process_gateways(elements, flows, session, level, module, activity)

    # Create remaining sequence flows as SEQUENCE_FLOW relationships
    for flow in flows:
        source_id = flow['sourceRef']
        target_id = flow['targetRef']
        rel_properties = {'id': flow['id'], 'name': 'SEQUENCE_FLOW', 'level': level, 'module': module, 'activity': activity}
        
        rel_created = session.write_transaction(
            create_relationship_with_id,
            source_id, target_id, 'SEQUENCE_FLOW', rel_properties
        )
        if rel_created:
            print(f"Created SEQUENCE_FLOW relationship from {source_id} to {target_id}")
        else:
            print(f"Failed to create SEQUENCE_FLOW relationship from {source_id} to {target_id}")

    # Connect Start Event to the first task if level is 0
    if level == 0:
        start_event = next((e for e in elements['StartEvent']), None)
        first_task = next((e for e in elements['Task']), None)
        if start_event and first_task:
            rel_properties = {'id': f"{level}_start_to_first", 'name': 'SEQUENCE_FLOW', 'level': level, 'module': module, 'activity': activity}
            session.write_transaction(
                create_relationship_with_id,
                start_event['id'], first_task['id'], 'SEQUENCE_FLOW', rel_properties
            )
            print(f"Connected Start Event to first task in Level 0")

# 7. Main Execution

In [14]:
def verify_data_import():
    with driver.session(database="erpbpmn") as session:
        result = session.run("MATCH (n) RETURN labels(n) AS Label, count(n) AS Count ORDER BY Count DESC")
        print("Node counts by label:")
        for record in result:
            print(f"{record['Label']}: {record['Count']}")

        result = session.run("MATCH ()-[r]->() RETURN type(r) AS RelationType, count(r) AS Count ORDER BY Count DESC")
        print("\nRelationship counts by type:")
        for record in result:
            print(f"{record['RelationType']}: {record['Count']}")

        result = session.run("MATCH (n) RETURN n.level AS Level, count(n) AS Count ORDER BY Level")
        print("\nNode counts by level:")
        for record in result:
            print(f"Level {record['Level']}: {record['Count']} nodes")

In [15]:
# Run the processing function with parallelization
def main():
    bpmn_dir = './assets'  # Directory containing BPMN XML files
    filenames = [f for f in os.listdir(bpmn_dir) if f.endswith('.xml')]

    with driver.session(database="erpbpmn") as session:
        for filename in filenames:
            file_path = os.path.join(bpmn_dir, filename)
            if filename == "BPMN Level 0.xml":
                level = 0
                module = "ERP"
                activity = None
            else:
                match = re.match(r'BPMN\s+(.+)\s+Level\s+(\d+)(?:\s+-\s+(.+))?\.xml', filename)
                if match:
                    module = match.group(1)
                    level = int(match.group(2))
                    activity = match.group(3) if match.group(3) else None
                else:
                    print(f"Skipping file {filename} as it doesn't match the expected naming pattern.")
                    continue
            
            process_bpmn_file(session, filename, file_path, level, module, activity)

# 9. Execute Main Function

In [16]:
def run():
    main()
    verify_data_import()
    driver.close()

if __name__ == "__main__":
    run()


Processing BPMN Account Payable Level 1.xml at Level 1...
Parsed 3 elements and 2 flows from BPMN Account Payable Level 1.xml
Renamed StartEvent 1_4 to Start
Renamed EndEvent 1_8 to End


  session.write_transaction(create_node, element_type, element)


Created node: Task with ID 1_6 and name Account Payable
Created node: StartEvent with ID 1_4 and name Start
Created node: EndEvent with ID 1_8 and name End


  rel_created = session.write_transaction(


Created SEQUENCE_FLOW relationship from 1_6 to 1_8
Created SEQUENCE_FLOW relationship from 1_4 to 1_6

Processing BPMN Account Payable Level 2.xml at Level 2...
Parsed 34 elements and 38 flows from BPMN Account Payable Level 2.xml
Identified 9 unnamed nodes to remove at Level 2
Created SEQUENCE_FLOW relationship from 2_80 to 2_16 bypassing 2_13
Created SEQUENCE_FLOW relationship from 2_80 to 2_25 bypassing 2_13
Removed unnamed node 2_13 and its associated flows
Removed unnamed node 2_16 and its associated flows
Removed unnamed node 2_17 and its associated flows
Removed unnamed node 2_19 and its associated flows
Created SEQUENCE_FLOW relationship from 2_80 to 2_44 bypassing 2_41
Created SEQUENCE_FLOW relationship from 2_80 to 2_45 bypassing 2_41
Removed unnamed node 2_41 and its associated flows
Removed unnamed node 2_44 and its associated flows
Created SEQUENCE_FLOW relationship from 2_54 to 2_77 bypassing 2_59
Removed unnamed node 2_59 and its associated flows
Created SEQUENCE_FLOW re

  session.write_transaction(create_relationship_with_id, in_flow['sourceRef'], out_flow['targetRef'], new_rel_type, new_rel_properties)


Created node: Task with ID 2_67 and name Processing Purchase DP Return
Created node: Task with ID 2_71 and name Creating the Purchase DP Invoice
Created node: Task with ID 2_75 and name Validating Invoice DP Details
Created node: Task with ID 2_77 and name Creating the Purchase Invoice
Created node: StartEvent with ID 2_79 and name Start
Created node: EndEvent with ID 2_23 and name End
Created node: Gateway with ID 2_54 and name Goods Accepted?
Created node: Gateway with ID 2_66 and name Goods Accepted?
Created SEQUENCE_FLOW relationship from 2_64 to 2_32
Created SEQUENCE_FLOW relationship from 2_67 to 2_31
Created SEQUENCE_FLOW relationship from 2_56 to 2_25
Created SEQUENCE_FLOW relationship from 2_42 to 2_27
Failed to create SEQUENCE_FLOW relationship from 2_None to 2_29
Created SEQUENCE_FLOW relationship from 2_25 to 2_26
Created SEQUENCE_FLOW relationship from 2_29 to 2_30
Created SEQUENCE_FLOW relationship from 2_26 to 2_30
Created SEQUENCE_FLOW relationship from 2_33 to 2_30
Cre

  session.write_transaction(create_relationship_with_id, source, target, rel_type, rel_properties)


Created XOR_SPLIT relationship from 0_70 to 0_41
Created XOR_SPLIT relationship from 0_70 to 0_36
Created XOR_SPLIT relationship from 0_70 to 0_44
Created XOR_SPLIT relationship from 0_70 to 0_45
Created XOR_SPLIT relationship from 0_70 to 0_62
Created XOR_SPLIT relationship from 0_70 to 0_63
Created XOR_SPLIT relationship from 0_70 to 0_48
Created XOR_SPLIT relationship from 0_70 to 0_55
Created XOR_SPLIT relationship from 0_70 to 0_59
Created OR_JOIN relationship from 0_59 to 0_82
Created OR_JOIN relationship from 0_63 to 0_82
Created OR_JOIN relationship from 0_48 to 0_82
Created OR_JOIN relationship from 0_36 to 0_82
Created OR_JOIN relationship from 0_24 to 0_82
Created OR_JOIN relationship from 0_None to 0_82
Created OR_JOIN relationship from 0_23 to 0_82
Created OR_JOIN relationship from 0_29 to 0_82
Created OR_JOIN relationship from 0_31 to 0_82
Created OR_JOIN relationship from 0_33 to 0_82
Created OR_JOIN relationship from 0_41 to 0_82
Created OR_JOIN relationship from 0_55 t

  session.write_transaction(
