# Leveraging Neo4j for Service Identification and Microservices Partitioning in Business Process Systems

# 1. Import Libraries

Import all necessary libraries for XML parsing, Neo4j interaction, GPU detection, and concurrent processing.

In [130]:
# Import Libraries
import os
import re
import xml.etree.ElementTree as ET
import html
import uuid
import pandas as pd
from neo4j import GraphDatabase
from pyvis.network import Network
import torch
from concurrent.futures import ThreadPoolExecutor

# 2. Check CUDA Availability

Detect whether CUDA (GPU) is available on your system. This information will be printed at the beginning of the notebook.

In [131]:
# Pick the torch device once, then report what was detected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Using device: {device}")

CUDA available: True
CUDA version: 11.8
Using device: cuda


In [132]:
# Report whether a CUDA-capable GPU can be used
def check_cuda():
    """Print whether CUDA is available and, when it is, the name of device 0."""
    if not torch.cuda.is_available():
        print("CUDA is not available. Using CPU.")
        return
    print("CUDA is available. GPU will be used if applicable.")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")

# Run the CUDA check
check_cuda()

CUDA is available. GPU will be used if applicable.
Device Name: NVIDIA GeForce RTX 3060 Laptop GPU


# 3. Define Connection to Neo4j

In [133]:
# Neo4j connection details.
# Values are read from the environment so that credentials are never stored in
# the notebook source or its saved outputs. The URI and user name fall back to
# the local defaults; the password has no default on purpose.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD")
if not password:
    raise RuntimeError(
        "NEO4J_PASSWORD is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this notebook."
    )

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

In [134]:
def test_connection():
    """Run a trivial query against the "erpbpmn" database and print the outcome.

    Any exception raised while opening the session or running the query is
    caught and reported with a printed message instead of being propagated.
    """
    try:
        with driver.session(database="erpbpmn") as session:
            for row in session.run("RETURN 1 AS test"):
                print(f"Connection successful, test query result: {row['test']}")
    except Exception as exc:
        print(f"Failed to connect to Neo4j: {exc}")

test_connection()

Connection successful, test query result: 1


# 4. Define Functions to Create Nodes and Relationships

In [135]:
def get_node_color(node_type, level):
    """Return the hex colour for a node of the given type at the given level.

    Colours are defined for 'Task', 'StartEvent' and 'EndEvent' at levels 0-3.
    Any other type/level combination yields the light-grey default '#D3D3D3'.
    """
    shades_by_type = {
        # Gold, LemonChiffon, LightGoldenrodYellow, LightYellow
        'Task': ('#FFD700', '#FFFACD', '#FAFAD2', '#FFFFE0'),
        # LightGreen, PaleGreen, DarkSeaGreen, MediumSeaGreen
        'StartEvent': ('#90EE90', '#98FB98', '#8FBC8F', '#3CB371'),
        # Tomato, OrangeRed, Red, Crimson
        'EndEvent': ('#FF6347', '#FF4500', '#FF0000', '#DC143C'),
    }
    # Flatten to a single (type, level) -> colour table so one lookup suffices.
    palette = {
        (kind, depth): shade
        for kind, shades in shades_by_type.items()
        for depth, shade in enumerate(shades)
    }
    return palette.get((node_type, level), '#D3D3D3')

def create_node(tx, label, properties):
    """Create (or update) one node and return it.

    The node is MERGEd on its label and ``properties['id']`` rather than
    CREATEd, so running the import again updates the existing node instead of
    adding a duplicate with the same id.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        label: Node label. It is interpolated into the Cypher text (labels
            cannot be passed as parameters), so it must come from trusted code.
        properties: Property map; must contain 'id'. 'level' (default 0) is
            used together with the label to pick the node colour.

    Returns:
        The first field of the single record returned by the query (the node).
    """
    color = get_node_color(label, properties.get('level', 0))
    query = (
        f"MERGE (n:{label} {{id: $properties.id}}) "
        "SET n += $properties, n.color = $color "
        "RETURN n"
    )
    result = tx.run(query, properties=properties, color=color)
    return result.single()[0]

def create_relationship_with_id(tx, source_id, target_id, rel_type, properties):
    """Create (or update) a typed relationship between two nodes found by id.

    The relationship is MERGEd on its type, its ``properties['id']`` and its two
    endpoints, so re-running the import does not add a second copy of an edge
    that already exists.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        source_id: Value of the ``id`` property of the start node.
        target_id: Value of the ``id`` property of the end node.
        rel_type: Relationship type. It is interpolated into the Cypher text, so
            it must come from trusted code.
        properties: Property map for the relationship; must contain 'id'.

    Returns:
        The first field of the returned record (the relationship), or None when
        the query returned no record. In that case a warning is printed.
    """
    rel_color_map = {
        'SEQUENCE_FLOW': '#A9A9A9',
        'XOR_SPLIT': '#FF69B4',
        'XOR_JOIN': '#4169E1',
        'OR_SPLIT': '#FFD700',
        'OR_JOIN': '#00CED1'
    }
    color = rel_color_map.get(rel_type, '#696969')
    query = (
        "MATCH (a {id: $source_id}), (b {id: $target_id}) "
        f"MERGE (a)-[r:{rel_type} {{id: $properties.id}}]->(b) "
        "SET r += $properties, r.color = $color "
        "RETURN r"
    )
    result = tx.run(query, source_id=source_id, target_id=target_id, properties=properties, color=color)
    record = result.single()
    if not record:
        # No record means the MATCH found no such pair of nodes.
        print(f"Warning: Could not create relationship {rel_type} between {source_id} and {target_id}.")
        return None
    return record[0]

# 5. Parse BPMN XML Files and Load into Neo4j

In [136]:
# Define functions to parse BPMN XML files
def clean_name(name):
    """Strip markup tags from a label, decode HTML entities, and trim whitespace."""
    without_tags = re.sub('<[^<]+?>', '', name)
    return html.unescape(without_tags).strip()

In [137]:
def parse_bpmn_xml(file_path, level, module, activity=None):
    """Parse an mxGraph/draw.io BPMN XML file into node, flow and gateway records.

    Only the direct ``mxCell`` children of the first ``root`` element found
    below the document root are inspected. Every record id is the cell id
    prefixed with ``"{module}_{activity}_{level}_"`` (or ``"{module}_{level}_"``
    when ``activity`` is falsy).

    Args:
        file_path: Path of the XML file to parse.
        level: Level value stored on every record.
        module: Module name stored on every record.
        activity: Optional activity name stored on every record.

    Returns:
        tuple: ``(elements, flows, gateways)`` where
        - ``elements`` maps 'Task', 'StartEvent' and 'EndEvent' to lists of records,
        - ``flows`` is a list of edge records with 'sourceRef' and 'targetRef',
        - ``gateways`` maps the *unprefixed* cell id to a gateway record.
        All three are empty when no ``root`` element is found.
    """
    elements = {'Task': [], 'StartEvent': [], 'EndEvent': []}
    flows = []
    gateways = {}

    # The 'root' element is inside <mxGraphModel> -> <root>
    mx_root = ET.parse(file_path).getroot().find('.//root')
    if mx_root is None:
        print("No mxGraphModel root found in the XML.")
        return elements, flows, gateways

    if activity:
        id_prefix = f"{module}_{activity}_{level}_"
    else:
        id_prefix = f"{module}_{level}_"

    def build_record(raw_id, **specific):
        # Key order: id, the type-specific fields, then the shared context fields.
        record = {'id': id_prefix + raw_id}
        record.update(specific)
        record.update(level=level, module=module, activity=activity)
        return record

    # (style marker, element bucket, fallback name); green = start, red = end.
    event_variants = (
        ('fillColor=#60a917', 'StartEvent', 'Start'),
        ('fillColor=#e51400', 'EndEvent', 'End'),
    )
    # (style marker, gateway kind)
    gateway_variants = (
        ('gwType=exclusive', 'XOR'),
        ('gwType=inclusive', 'OR'),
    )

    for cell in mx_root.findall('mxCell'):
        raw_id = cell.get('id')
        label = clean_name(cell.get('value', '').strip())
        style = cell.get('style', '')

        if cell.get('vertex') == '1':
            # Vertex cells are tasks, events or gateways, told apart by their style.
            if 'shape=mxgraph.bpmn.task' in style:
                elements['Task'].append(build_record(raw_id, name=label or 'Unnamed Task'))
            elif 'shape=mxgraph.bpmn.event' in style:
                # Events with neither fill colour are ignored.
                for marker, bucket, fallback in event_variants:
                    if marker in style:
                        elements[bucket].append(build_record(raw_id, name=label or fallback))
                        break
            elif 'shape=mxgraph.bpmn.gateway2' in style:
                kind = next(
                    (name for marker, name in gateway_variants if marker in style),
                    'UNKNOWN',
                )
                gateways[raw_id] = build_record(raw_id, gateway_kind=kind)
        elif cell.get('edge') == '1':
            source_ref = cell.get('source')
            target_ref = cell.get('target')
            # Edges missing either endpoint are skipped.
            if source_ref and target_ref:
                flows.append(build_record(
                    raw_id,
                    sourceRef=id_prefix + source_ref,
                    targetRef=id_prefix + target_ref,
                    name=label or 'Unnamed Flow',
                ))

    return elements, flows, gateways

In [138]:
def process_gateways_and_flows(session, elements, flows, gateways):
    """Write all flows to Neo4j, collapsing gateways into typed relationships.

    Gateways are not stored as nodes. For each gateway the flows entering and
    leaving it are replaced by direct relationships between the surrounding
    elements:

    * several incoming flows and one outgoing flow -> ``<KIND>_JOIN``
    * one incoming flow and several outgoing flows -> ``<KIND>_SPLIT``
    * anything else -> a warning is printed and every incoming source is linked
      to every outgoing target with ``SEQUENCE_FLOW``

    Flows that do not touch a gateway are written as ``SEQUENCE_FLOW``.

    Args:
        session: Session object; only ``execute_write(fn, *args)`` is used.
        elements: Parsed node records. Unused here; kept for the call signature.
        flows: Flow records with 'id', 'sourceRef', 'targetRef', 'level',
            'module' and 'activity'. The records are not modified.
        gateways: Mapping of gateway records, each with 'gateway_kind' and
            (normally) the prefixed 'id'.
    """
    # Flow endpoints carry the module/level prefix, whereas `gateways` may be
    # keyed by the bare cell id. Index gateways by the prefixed 'id' stored in
    # each record so the endpoint lookups below actually match. Records without
    # an 'id' keep their original key.
    gateway_by_id = {info.get('id', key): info for key, info in gateways.items()}

    # Group the flows that enter / leave each gateway.
    incoming_flows = {}
    outgoing_flows = {}
    for flow in flows:
        if flow['targetRef'] in gateway_by_id:
            incoming_flows.setdefault(flow['targetRef'], []).append(flow)
        if flow['sourceRef'] in gateway_by_id:
            outgoing_flows.setdefault(flow['sourceRef'], []).append(flow)

    # Relationships that replace the gateways.
    # NOTE(review): when two gateways are directly connected, the bypass edge
    # still points at a gateway id, for which no node exists; the relationship
    # helper will then report that it could not create the edge.
    new_flows = []
    for gw_id, gw_info in gateway_by_id.items():
        gw_type = gw_info['gateway_kind']
        gw_incoming = incoming_flows.get(gw_id, [])
        gw_outgoing = outgoing_flows.get(gw_id, [])

        if len(gw_incoming) > 1 and len(gw_outgoing) == 1:
            # Join gateway: every source is linked to the single target.
            rel_type = f"{gw_type}_JOIN"
            target = gw_outgoing[0]['targetRef']
            for inc_flow in gw_incoming:
                new_flows.append({'source': inc_flow['sourceRef'], 'target': target,
                                  'type': rel_type, 'properties': inc_flow})
        elif len(gw_incoming) == 1 and len(gw_outgoing) > 1:
            # Split gateway: the single source is linked to every target.
            rel_type = f"{gw_type}_SPLIT"
            source = gw_incoming[0]['sourceRef']
            for out_flow in gw_outgoing:
                new_flows.append({'source': source, 'target': out_flow['targetRef'],
                                  'type': rel_type, 'properties': out_flow})
        else:
            # For other cases, create SEQUENCE_FLOW relationships
            print(f"Warning: Gateway {gw_id} has an unexpected number of incoming or outgoing flows.")
            # Connect each incoming flow to each outgoing flow
            for inc_flow in gw_incoming:
                for out_flow in gw_outgoing:
                    new_flows.append({'source': inc_flow['sourceRef'], 'target': out_flow['targetRef'],
                                      'type': 'SEQUENCE_FLOW', 'properties': out_flow})

    # Flows that do not touch any gateway are written unchanged.
    direct_flows = [
        flow for flow in flows
        if flow['sourceRef'] not in gateway_by_id and flow['targetRef'] not in gateway_by_id
    ]

    # Write the gateway-derived relationships first.
    for flow in new_flows:
        # Copy so the caller's flow records are not altered by the rename.
        rel_properties = dict(flow['properties'], name=flow['type'])
        session.execute_write(
            create_relationship_with_id,
            flow['source'],
            flow['target'],
            flow['type'],
            rel_properties
        )

    # Then write the remaining plain sequence flows.
    for flow in direct_flows:
        rel_properties = {
            'id': flow['id'],
            'name': 'SEQUENCE_FLOW',
            'level': flow['level'],
            'module': flow['module'],
            'activity': flow['activity']
        }
        session.execute_write(
            create_relationship_with_id,
            flow['sourceRef'],
            flow['targetRef'],
            'SEQUENCE_FLOW',
            rel_properties
        )

In [139]:
def process_bpmn_file(session, filename, file_path, level, module, activity):
    """Parse one BPMN file and write its nodes and relationships through ``session``.

    Steps: parse the file, give a name to every element whose name is empty or
    starts with 'Unnamed', create one node per element (gateways are not
    created as nodes), then hand flows and gateways to
    ``process_gateways_and_flows``. Progress is reported with ``print``.
    """
    print(f"\nProcessing {filename} at Level {level}...")
    elements, flows, gateways = parse_bpmn_xml(file_path, level, module, activity)

    element_count = sum(map(len, elements.values()))
    print(f"Parsed {element_count} elements and {len(flows)} flows from {filename}")

    # Events get a fixed replacement name; other elements get one built from their id.
    fixed_names = {'StartEvent': 'Start', 'EndEvent': 'End'}
    for kind in ('Task', 'StartEvent', 'EndEvent'):
        for node in elements[kind]:
            current = node['name']
            if current and not current.startswith('Unnamed'):
                continue
            node['name'] = fixed_names[kind] if kind in fixed_names else f"{kind}_{node['id']}"
            print(f"Renamed {kind} {node['id']} to {node['name']}")

    # One write transaction per node.
    for kind, nodes in elements.items():
        for node in nodes:
            session.execute_write(create_node, kind, node)
            print(f"Created node: {kind} with ID {node['id']} and name {node['name']}")

    # Relationships, including those derived from gateways.
    process_gateways_and_flows(session, elements, flows, gateways)

# 6. Main Execution

In [140]:
def _parse_bpmn_filename(filename):
    """Return ``(level, module, activity)`` for a BPMN file name, or None if it does not match.

    "BPMN Level 0.xml" is the top-level diagram and maps to module "ERP".
    Other names must look like "BPMN <module> Level <n>[ - <activity>].xml".
    """
    if filename == "BPMN Level 0.xml":
        return 0, "ERP", None

    match = re.match(r'BPMN\s+(.+?)\s+Level\s+(\d+)(?:\s+-\s+(.+))?\.xml', filename)
    if not match:
        return None

    module = match.group(1).strip()
    level = int(match.group(2))
    activity = match.group(3).strip() if match.group(3) else None
    return level, module, activity


def main(bpmn_dir='./assets'):
    """Import every BPMN XML file found in ``bpmn_dir`` into the "erpbpmn" database.

    Files are processed in sorted name order so that repeated runs behave the
    same way regardless of the order in which the operating system lists the
    directory. Files whose names do not follow the expected pattern are
    skipped with a printed message.

    Args:
        bpmn_dir: Directory containing the BPMN XML files. Defaults to
            './assets'.
    """
    filenames = sorted(f for f in os.listdir(bpmn_dir) if f.endswith('.xml'))

    with driver.session(database="erpbpmn") as session:
        for filename in filenames:
            parsed = _parse_bpmn_filename(filename)
            if parsed is None:
                print(f"Skipping file {filename} as it doesn't match the expected naming pattern.")
                continue

            level, module, activity = parsed
            file_path = os.path.join(bpmn_dir, filename)
            process_bpmn_file(session, filename, file_path, level, module, activity)

In [141]:
def verify_data_import():
    """Print summary counts of what is stored in the "erpbpmn" database.

    Three reports are printed in order: node counts by label, relationship
    counts by type, and node counts by level.
    """
    # Each report: (Cypher query, heading, function rendering one record as a line).
    reports = (
        (
            "MATCH (n) RETURN labels(n) AS Label, count(n) AS Count ORDER BY Count DESC",
            "Node counts by label:",
            lambda rec: f"{rec['Label']}: {rec['Count']}",
        ),
        (
            "MATCH ()-[r]->() RETURN type(r) AS RelationType, count(r) AS Count ORDER BY Count DESC",
            "\nRelationship counts by type:",
            lambda rec: f"{rec['RelationType']}: {rec['Count']}",
        ),
        (
            "MATCH (n) RETURN n.level AS Level, count(n) AS Count ORDER BY Level",
            "\nNode counts by level:",
            lambda rec: f"Level {rec['Level']}: {rec['Count']} nodes",
        ),
    )

    with driver.session(database="erpbpmn") as session:
        for cypher, heading, render in reports:
            rows = session.run(cypher)
            print(heading)
            for rec in rows:
                print(render(rec))

# 7. Execute Main Function

In [142]:
def run():
    """Import all BPMN files via main(), then print summary counts via verify_data_import()."""
    main()
    verify_data_import()

# Start the import when this code is executed as the main module.
if __name__ == "__main__":
    run()


Processing BPMN Account Payable Level 1.xml at Level 1...
Parsed 3 elements and 2 flows from BPMN Account Payable Level 1.xml
Created node: Task with ID Account Payable_1_5 and name Account Payable
Created node: StartEvent with ID Account Payable_1_3 and name Start
Created node: EndEvent with ID Account Payable_1_7 and name End

Processing BPMN Account Payable Level 2.xml at Level 2...
Parsed 23 elements and 39 flows from BPMN Account Payable Level 2.xml
Created node: Task with ID Account Payable_2_24 and name Reviewing Purchase Return
Created node: Task with ID Account Payable_2_25 and name Approving Purchase Return
Created node: Task with ID Account Payable_2_26 and name Checking Purchase Invoice Detail
Created node: Task with ID Account Payable_2_28 and name Updating Data Purchase Invoice
Created node: Task with ID Account Payable_2_29 and name Finalizing Document
Created node: Task with ID Account Payable_2_30 and name Reviewing Purchase Return
Created node: Task with ID Account P

# 8. Visualize the Graph

In [143]:
def get_neo4j_data():
    """Load every node and relationship of the "erpbpmn" database into DataFrames.

    Returns:
        tuple: ``(nodes_df, relationships_df)``. When records are returned, the
        node columns are id, labels, name, color, level, module, activity and
        the relationship columns are id, type, source, target, color, level,
        module, activity.
    """
    def as_frame(session, cypher):
        # .data() yields the result as a list of plain dicts, one per record.
        return pd.DataFrame(session.run(cypher).data())

    with driver.session(database="erpbpmn") as session:
        nodes_df = as_frame(session, """
        MATCH (n) 
        RETURN n.id AS id, labels(n) AS labels, n.name AS name, n.color AS color, n.level AS level, n.module AS module, n.activity AS activity
        """)
        relationships_df = as_frame(session, """
        MATCH (a)-[r]->(b) 
        RETURN r.id AS id, type(r) AS type, a.id AS source, b.id AS target, r.color AS color, r.level AS level, r.module AS module, r.activity AS activity
        """)

    return nodes_df, relationships_df

def visualize_neo4j_graph(nodes_df, relationships_df):
    """Render the graph with pyvis, write it to 'neo4j_graph.html', and return an IFrame for it.

    Args:
        nodes_df: DataFrame with columns id, labels, name, color, level, module, activity.
        relationships_df: DataFrame with columns id, type, source, target, color,
            level, module, activity.

    Returns:
        An ``IPython.display.IFrame`` pointing at the generated HTML file.
    """
    output_file = 'neo4j_graph.html'
    net = Network(height='750px', width='100%', directed=True, notebook=True, cdn_resources='remote')

    # One pyvis node per row; nodes without a name are labelled with their id.
    for _, node in nodes_df.iterrows():
        caption = node['name'] if pd.notnull(node['name']) else node['id']
        tooltip = f"ID: {node['id']}<br>Type: {node['labels']}<br>Name: {node['name']}<br>Level: {node['level']}<br>Module: {node['module']}<br>Activity: {node['activity']}"
        net.add_node(
            node['id'],
            label=caption,
            title=tooltip,
            color=node['color'],
            level=node['level']
        )

    # One directed edge per relationship row.
    for _, rel in relationships_df.iterrows():
        tooltip = f"ID: {rel['id']}<br>Type: {rel['type']}<br>Level: {rel['level']}<br>Module: {rel['module']}<br>Activity: {rel['activity']}"
        net.add_edge(
            rel['source'],
            rel['target'],
            title=tooltip,
            color=rel['color'],
            arrows='to'
        )

    net.force_atlas_2based()

    net.show(output_file)

    from IPython.display import IFrame
    return IFrame(output_file, width='100%', height='750px')

In [144]:
# Read the imported nodes and relationships back from Neo4j.
nodes_df, relationships_df = get_neo4j_data()

# Preview the first rows of each table.
print("Nodes:")
display(nodes_df.head())

print("\nRelationships:")
display(relationships_df.head())

# Build the visualisation; the bare name on the last line makes it the cell's output.
graph_display = visualize_neo4j_graph(nodes_df, relationships_df)
graph_display

Nodes:


Unnamed: 0,id,labels,name,color,level,module,activity
0,Account Receivable_Finalizing Sales Invoice Do...,[Task],Verify Data Accuracy,#FFFFE0,3,Account Receivable,Finalizing Sales Invoice Document
1,Account Receivable_Finalizing Sales Invoice Do...,[Task],Complete Invoice Document,#FFFFE0,3,Account Receivable,Finalizing Sales Invoice Document
2,Account Receivable_Finalizing Sales Invoice Do...,[StartEvent],Start,#3CB371,3,Account Receivable,Finalizing Sales Invoice Document
3,Account Receivable_Finalizing Sales Invoice Do...,[EndEvent],End,#DC143C,3,Account Receivable,Finalizing Sales Invoice Document
4,Account Receivable_Reviewing Sales DP Invoice_3_5,[Task],Receive Sales DP Invoice,#FFFFE0,3,Account Receivable,Reviewing Sales DP Invoice



Relationships:


Unnamed: 0,id,type,source,target,color,level,module,activity
0,Account Receivable_Finalizing Sales Invoice Do...,SEQUENCE_FLOW,Account Receivable_Finalizing Sales Invoice Do...,Account Receivable_Finalizing Sales Invoice Do...,#A9A9A9,3,Account Receivable,Finalizing Sales Invoice Document
1,Account Receivable_Finalizing Sales Invoice Do...,SEQUENCE_FLOW,Account Receivable_Finalizing Sales Invoice Do...,Account Receivable_Finalizing Sales Invoice Do...,#A9A9A9,3,Account Receivable,Finalizing Sales Invoice Document
2,Account Receivable_Finalizing Sales Invoice Do...,SEQUENCE_FLOW,Account Receivable_Finalizing Sales Invoice Do...,Account Receivable_Finalizing Sales Invoice Do...,#A9A9A9,3,Account Receivable,Finalizing Sales Invoice Document
3,Account Receivable_Reviewing Sales DP Invoice_...,SEQUENCE_FLOW,Account Receivable_Reviewing Sales DP Invoice_3_5,Account Receivable_Reviewing Sales DP Invoice_...,#A9A9A9,3,Account Receivable,Reviewing Sales DP Invoice
4,Account Receivable_Reviewing Sales DP Invoice_3_3,SEQUENCE_FLOW,Account Receivable_Reviewing Sales DP Invoice_3_4,Account Receivable_Reviewing Sales DP Invoice_3_5,#A9A9A9,3,Account Receivable,Reviewing Sales DP Invoice


neo4j_graph.html
