# Leveraging Neo4j for Service Identification and Microservices Partitioning in Business Process Systems

# 1. Install Required Libraries

In [88]:
!pip install neo4j torch



# 2. Import Libraries

Import all necessary libraries for XML parsing, Neo4j interaction, GPU detection, and concurrent processing.

In [89]:
# Import Libraries
import getpass
import html
import os
import re
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor

import torch
from neo4j import GraphDatabase

# 3. Check CUDA Availability

Detect whether a CUDA-capable GPU is available on your system. The result is printed in the output of the cells below.

In [90]:
# Pick the torch device once, then report what was detected.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Using device:", device)

CUDA available: True
CUDA version: 11.8
Using device: cuda


In [91]:
# Report whether PyTorch can see a CUDA device
def check_cuda():
    """Print whether CUDA is available and, when it is, the name of device 0."""
    if not torch.cuda.is_available():
        print("CUDA is not available. Using CPU.")
        return
    print("CUDA is available. GPU will be used if applicable.")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")

# Run the check so its result shows up in the cell output
check_cuda()

CUDA is available. GPU will be used if applicable.
Device Name: NVIDIA GeForce RTX 3060 Laptop GPU


# 4. Define Connection to Neo4j

In [92]:
# Neo4j connection details.
# Settings are read from the environment so that no secret is stored in the
# notebook. NEO4J_URI and NEO4J_USERNAME fall back to the local defaults;
# when NEO4J_PASSWORD is unset or empty the password is requested
# interactively, so set it for non-interactive runs.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

In [93]:
def test_connection():
    """Run a trivial query against the "erpbpmn" database and print the outcome.

    Any exception raised while opening the session or running the query is
    caught and printed rather than propagated.
    """
    try:
        with driver.session(database="erpbpmn") as session:
            for row in session.run("RETURN 1 AS test"):
                print(f"Connection successful, test query result: {row['test']}")
    except Exception as exc:
        print(f"Failed to connect to Neo4j: {exc}")

test_connection()

Connection successful, test query result: 1


# 5. Define Functions to Create Nodes and Relationships

In [94]:
# Define functions to create nodes and relationships
def create_node(tx, label, properties):
    """Merge a node with the given label and id, then update its properties.

    Args:
        tx: Object exposing ``run(query, **params)``; the result must offer
            ``single()``.
        label: Node label, interpolated directly into the Cypher text.
        properties: Property map; its ``id`` entry is the MERGE key and all
            entries are written onto the node.

    Returns:
        The first field of the single record returned by the query.
    """
    # Display colour per label; any other label gets light grey.
    palette = dict(
        Task='#ADD8E6',
        StartEvent='#90EE90',
        EndEvent='#FFB6C1',
        Gateway='#FFA07A',
    )
    fill = palette[label] if label in palette else '#D3D3D3'

    cypher = " ".join([
        f"MERGE (n:{label} {{id: $properties.id}})",
        "SET n += $properties, n.color = $color",
        "RETURN n",
    ])
    record = tx.run(cypher, properties=properties, color=fill).single()
    return record[0]

def create_relationship_with_id(tx, label1, id1, label2, id2, rel_type, properties):
    """Merge a relationship between two existing nodes looked up by label and id.

    Args:
        tx: Object exposing ``run(query, **params)``; the result must offer
            ``single()``.
        label1, id1: Label and id used to match the start node.
        label2, id2: Label and id used to match the end node.
        rel_type: Relationship type, interpolated directly into the Cypher text.
        properties: Property map; its ``id`` entry is the MERGE key and all
            entries are written onto the relationship.

    Returns:
        The first field of the returned record, or ``None`` (after printing a
        warning) when the query yields no record.
    """
    # Display colour per relationship type; any other type gets dim grey.
    palette = dict(
        SEQUENCE_FLOW='#A9A9A9',
        XOR_SPLIT='#FF69B4',
        XOR_JOIN='#4169E1',
        OR_SPLIT='#FFD700',
        OR_JOIN='#00CED1',
    )
    stroke = palette[rel_type] if rel_type in palette else '#696969'

    cypher = " ".join([
        f"MATCH (a:{label1} {{id: $id1}}), (b:{label2} {{id: $id2}})",
        f"MERGE (a)-[r:{rel_type} {{id: $properties.id}}]->(b)",
        "SET r += $properties, r.color = $color",
        "RETURN r",
    ])
    record = tx.run(cypher, id1=id1, id2=id2, properties=properties, color=stroke).single()

    if not record:
        print(f"Warning: Could not create relationship {rel_type} between {label1}({id1}) and {label2}({id2}). One of the nodes may not exist.")
        return None
    return record[0]

# 6. Parse BPMN XML Files and Load into Neo4j

In [95]:
# Define functions to parse BPMN XML files
def clean_name(name):
    """Turn a draw.io cell label into plain, single-spaced text.

    Args:
        name: Raw label text, possibly containing HTML tags and entities.
            ``None`` or an empty string is accepted.

    Returns:
        The label with tags removed, HTML entities decoded, and every run of
        whitespace (including non-breaking spaces and newlines) collapsed to a
        single space. Returns ``''`` for empty or ``None`` input.
    """
    if not name:
        return ''
    # Line breaks and block-level tags separate words visually, so replace
    # them with a space; otherwise "Create<br>Invoice" becomes "CreateInvoice".
    name = re.sub(r'(?i)<\s*/?\s*(?:br|div|p)\b[^<>]*>', ' ', name)
    # Remaining (inline) tags are dropped without inserting a separator.
    name = re.sub('<[^<]+?>', '', name)
    name = html.unescape(name)
    # \s also matches the non-breaking spaces produced by &nbsp;.
    return re.sub(r'\s+', ' ', name).strip()

In [96]:
# Parse a draw.io BPMN export into plain dictionaries
def parse_drawio_bpmn_xml(file_path, level, module, activity=None):
    """Collect tasks, events, gateways and flows from a draw.io XML file.

    Every ``mxCell`` element is classified by substrings of its lower-cased
    ``style`` attribute, or by ``edge == '1'`` for flows. All ids, including
    a flow's ``sourceRef``/``targetRef``, are prefixed with ``"{level}_"``.

    Args:
        file_path: Path of the XML file to parse.
        level: Level value stored on every element and used as the id prefix.
        module: Module value stored on every element.
        activity: Optional activity value stored on every element.

    Returns:
        A tuple ``(tasks, events, gateways, sequence_flows)`` of lists of dicts.
    """
    # NOTE(review): ids are scoped by level only, so cells from different
    # files at the same level can share an id - confirm this is intended.
    context = {'level': level, 'module': module, 'activity': activity}

    def scoped(raw_id):
        return f"{level}_{raw_id}"

    tasks, events, gateways, sequence_flows = [], [], [], []

    for cell in ET.parse(file_path).getroot().findall('.//mxCell'):
        style = cell.get('style', '').lower()
        element_id = scoped(cell.get('id'))
        label = clean_name(cell.get('value', ''))

        if 'shape=mxgraph.bpmn.task' in style:
            tasks.append({'id': element_id, 'name': label, **context})
            continue

        if 'shape=mxgraph.bpmn.event' in style:
            # Any event whose style lacks "outline=end" is recorded as a start event.
            is_end = 'outline=end' in style
            events.append({
                'id': element_id,
                'name': 'End' if is_end else 'Start',
                'type': 'EndEvent' if is_end else 'StartEvent',
                **context,
            })
            continue

        if 'shape=mxgraph.bpmn.gateway2' in style:
            # Any gateway that is not marked exclusive is recorded as 'OR'.
            kind = 'XOR' if 'gwtype=exclusive' in style else 'OR'
            gateways.append({'id': element_id, 'name': label, 'type': kind, **context})
            continue

        if cell.get('edge') == '1':
            sequence_flows.append({
                'id': element_id,
                'sourceRef': scoped(cell.get('source')),
                'targetRef': scoped(cell.get('target')),
                'name': label,
                **context,
            })

    return tasks, events, gateways, sequence_flows

In [97]:
# Helper function to determine the label of an element by its ID
def get_element_label_by_id(elements, element_id):
    """Return the key of the first list in ``elements`` holding ``element_id``.

    Args:
        elements: Mapping of label -> list of dicts that each have an ``'id'``.
        element_id: Id to look for.

    Returns:
        The matching label, or ``'Element'`` when no list contains the id.
    """
    matching_labels = (
        label
        for label, members in elements.items()
        if any(member['id'] == element_id for member in members)
    )
    return next(matching_labels, 'Element')

In [98]:
# Load one BPMN file into Neo4j: nodes first, then the flows between them (sequential)
def process_bpmn_file(session, filename, file_path, level, module, activity):
    """Parse one draw.io BPMN file and write its nodes and relationships.

    Args:
        session: Object exposing ``execute_write(fn, *args)``.
        filename: Name used only in the progress messages.
        file_path: Path handed to ``parse_drawio_bpmn_xml``.
        level, module, activity: Stored on every node and relationship.

    Each flow becomes ``<type>_SPLIT`` when its source is a gateway,
    ``<type>_JOIN`` when only its target is a gateway, and ``SEQUENCE_FLOW``
    otherwise, where ``<type>`` is the gateway's ``'type'`` value.
    """
    print(f"\nProcessing {filename} at Level {level}...")
    tasks, events, gateways, sequence_flows = parse_drawio_bpmn_xml(file_path, level, module, activity)

    print(f"Parsed {len(tasks)} tasks, {len(events)} events, {len(gateways)} gateways, {len(sequence_flows)} sequence flows from {filename}")

    # Group the parsed elements under the node label each will receive.
    elements = {
        'Task': tasks,
        'StartEvent': [event for event in events if event['type'] == 'StartEvent'],
        'EndEvent': [event for event in events if event['type'] == 'EndEvent'],
        'Gateway': gateways,
    }

    for node_label, members in elements.items():
        for node_properties in members:
            session.execute_write(create_node, node_label, node_properties)

    def gateway_kind(gateway_id):
        """Return the 'type' of the first gateway with the given id."""
        return next(g for g in gateways if g['id'] == gateway_id)['type']

    for flow in sequence_flows:
        source_id, target_id = flow['sourceRef'], flow['targetRef']
        source_label = get_element_label_by_id(elements, source_id)
        target_label = get_element_label_by_id(elements, target_id)
        rel_properties = {
            'id': flow['id'],
            'name': flow.get('name'),
            'level': level,
            'module': module,
            'activity': activity,
        }

        # The source gateway takes precedence when both ends are gateways.
        if source_label == 'Gateway':
            rel_type = f"{gateway_kind(source_id)}_SPLIT"
        elif target_label == 'Gateway':
            rel_type = f"{gateway_kind(target_id)}_JOIN"
        else:
            rel_type = 'SEQUENCE_FLOW'

        created = session.execute_write(
            create_relationship_with_id,
            source_label, source_id, target_label, target_id,
            rel_type, rel_properties
        )
        if not created:
            print(f"Failed to create {rel_type} relationship from {source_id} to {target_id}")
        else:
            print(f"Created {rel_type} relationship from {source_label}({source_id}) to {target_label}({target_id})")

# 7. Main Execution

In [99]:
# Discover the BPMN exports in a directory and load each one into Neo4j (sequentially)
def _parse_bpmn_filename(filename):
    """Decode (level, module, activity) from a BPMN file name, or return None."""
    if filename == "BPMN Level 0.xml":
        return 0, "ERP", None
    match = re.match(r'BPMN\s+(.+)\s+Level\s+(\d)(?:\s+-\s+(.+))?\.xml', filename)
    if not match:
        return None
    return int(match.group(2)), match.group(1), match.group(3) or None


def main(bpmn_dir='./assets'):
    """Load every ``.xml`` file in ``bpmn_dir`` into the "erpbpmn" database.

    Args:
        bpmn_dir: Directory containing the draw.io BPMN exports. Defaults to
            ``'./assets'``.

    Files are handled in sorted name order so repeated runs process them in
    the same order (``os.listdir`` order is arbitrary). A file whose name is
    neither ``"BPMN Level 0.xml"`` nor of the form
    ``"BPMN <module> Level <digit>[ - <activity>].xml"`` is skipped with a
    message.
    """
    filenames = sorted(f for f in os.listdir(bpmn_dir) if f.endswith('.xml'))

    with driver.session(database="erpbpmn") as session:
        for filename in filenames:
            parsed = _parse_bpmn_filename(filename)
            if parsed is None:
                print(f"Skipping file {filename} as it doesn't match the expected naming pattern.")
                continue

            level, module, activity = parsed
            file_path = os.path.join(bpmn_dir, filename)
            process_bpmn_file(session, filename, file_path, level, module, activity)

# 8. Verification query

In [100]:
def verify_data_import():
    """Print node counts by label, relationship counts by type, and node counts by level.

    Runs three aggregate queries against the "erpbpmn" database, printing a
    heading followed by one line per returned record for each query.
    """
    # (heading, Cypher query, formatter for one record), in the order reported.
    reports = [
        (
            "Node counts by label:",
            "MATCH (n) RETURN labels(n) AS Label, count(n) AS Count ORDER BY Count DESC",
            lambda rec: f"{rec['Label']}: {rec['Count']}",
        ),
        (
            "\nRelationship counts by type:",
            "MATCH ()-[r]->() RETURN type(r) AS RelationType, count(r) AS Count ORDER BY Count DESC",
            lambda rec: f"{rec['RelationType']}: {rec['Count']}",
        ),
        (
            "\nNode counts by level:",
            "MATCH (n) RETURN n.level AS Level, count(n) AS Count ORDER BY Level",
            lambda rec: f"Level {rec['Level']}: {rec['Count']} nodes",
        ),
    ]

    with driver.session(database="erpbpmn") as session:
        for heading, cypher, render in reports:
            rows = session.run(cypher)
            print(heading)
            for row in rows:
                print(render(row))

# 9. Execute Main Function

In [101]:
if __name__ == "__main__":
    # Close the driver even when the import or the verification raises, so
    # that its connections are not left open after a failed run.
    try:
        main()
        verify_data_import()
    finally:
        driver.close()


Processing BPMN Account Payable Level 1.xml at Level 1...
Parsed 1 tasks, 2 events, 0 gateways, 2 sequence flows from BPMN Account Payable Level 1.xml
Created SEQUENCE_FLOW relationship from Task(1_6) to EndEvent(1_8)
Created SEQUENCE_FLOW relationship from StartEvent(1_4) to Task(1_6)

Processing BPMN Account Payable Level 2.xml at Level 2...
Parsed 21 tasks, 2 events, 11 gateways, 38 sequence flows from BPMN Account Payable Level 2.xml
Created XOR_SPLIT relationship from Gateway(2_80) to Gateway(2_13)
Created XOR_SPLIT relationship from Gateway(2_80) to Gateway(2_41)
Created SEQUENCE_FLOW relationship from Task(2_64) to Task(2_32)
Created SEQUENCE_FLOW relationship from Task(2_67) to Task(2_31)
Created SEQUENCE_FLOW relationship from Task(2_56) to Task(2_25)
Created SEQUENCE_FLOW relationship from Task(2_42) to Task(2_27)
Created XOR_SPLIT relationship from Gateway(2_13) to Gateway(2_16)
Created XOR_SPLIT relationship from Gateway(2_13) to Task(2_25)
Created XOR_SPLIT relationship f