# Leveraging Neo4j for Service Identification and Microservices Partitioning in Business Process Systems

# 1. Install Required Libraries

In [115]:
!pip install neo4j torch pyvis pandas



# 2. Import Libraries

Import all necessary libraries for XML parsing, Neo4j interaction, GPU detection, and concurrent processing.

In [116]:
# Import Libraries
import xml.etree.ElementTree as ET
from neo4j import GraphDatabase
import os
import re
import html
import torch 
from concurrent.futures import ThreadPoolExecutor
import uuid
import pandas as pd
from pyvis.network import Network

# 3. Check CUDA Availability

Detect whether CUDA (GPU) is available on your system. This information will be printed at the beginning of the notebook.

In [117]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

CUDA available: True
CUDA version: 11.8
Using device: cuda


In [118]:
# Function to check CUDA availability
def check_cuda():
    if torch.cuda.is_available():
        print("CUDA is available. GPU will be used if applicable.")
        print(f"Device Name: {torch.cuda.get_device_name(0)}")
    else:
        print("CUDA is not available. Using CPU.")

# Execute CUDA check
check_cuda()

CUDA is available. GPU will be used if applicable.
Device Name: NVIDIA GeForce RTX 3060 Laptop GPU


# 4. Define Connection to Neo4j

In [119]:
# Neo4j connection details
uri = "bolt://localhost:7687"
username = "neo4j"
password = "170202Kcf"

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

In [120]:
def test_connection():
    try:
        with driver.session(database="erpbpmn") as session:
            result = session.run("RETURN 1 AS test")
            for record in result:
                print(f"Connection successful, test query result: {record['test']}")
    except Exception as e:
        print(f"Failed to connect to Neo4j: {e}")

test_connection()

Connection successful, test query result: 1


# 5. Define Functions to Create Nodes and Relationships

In [121]:
def get_node_color(node_type, level):
    color_map = {
        'Task': {
            0: '#FFD700',  # Kuning untuk level 0
            1: '#FFFACD',  # LemonChiffon untuk level 1
            2: '#FAFAD2',  # LightGoldenrodYellow untuk level 2
            3: '#FFFFE0'   # LightYellow untuk level 3
        },
        'StartEvent': {
            0: '#90EE90',  # Hijau muda untuk level 0
            1: '#98FB98',  # PaleGreen untuk level 1
            2: '#8FBC8F',  # DarkSeaGreen untuk level 2
            3: '#3CB371'   # MediumSeaGreen untuk level 3
        },
        'EndEvent': {
            0: '#FF6347',  # Tomato untuk level 0
            1: '#FF4500',  # OrangeRed untuk level 1
            2: '#FF0000',  # Merah untuk level 2
            3: '#DC143C'   # Crimson untuk level 3
        },
        'Gateway': {
            0: '#1E90FF',  # DodgerBlue untuk level 0
            1: '#00BFFF',  # DeepSkyBlue untuk level 1
            2: '#87CEFA',  # LightSkyBlue untuk level 2
            3: '#ADD8E6'   # LightBlue untuk level 3
        }
    }
    
    default_color = '#D3D3D3'
    
    return color_map.get(node_type, {}).get(level, default_color)

In [122]:
# Define functions to create nodes and relationships
def create_node(tx, label, properties):
    color = get_node_color(label, properties.get('level', 0))
    
    query = (
        f"MERGE (n:{label} {{id: $properties.id}}) "
        "SET n += $properties, n.color = $color "
        "RETURN n"
    )
    result = tx.run(query, properties=properties, color=color)
    return result.single()[0]

def create_relationship_with_id(tx, source_id, target_id, rel_type, properties):
    rel_color_map = {
        'SEQUENCE_FLOW': '#A9A9A9',
        'XOR_SPLIT': '#FF69B4',
        'XOR_JOIN': '#4169E1',
        'OR_SPLIT': '#FFD700',
        'OR_JOIN': '#00CED1'
    }
    color = rel_color_map.get(rel_type, '#696969')

    # Validasi bahwa kedua node ada
    validation_query = """
    MATCH (a {id: $source_id}), (b {id: $target_id})
    RETURN a, b
    """
    validation = tx.run(validation_query, source_id=source_id, target_id=target_id).data()
    if not validation:
        print(f"Warning: Nodes {source_id} or {target_id} do not exist. Cannot create relationship {rel_type}.")
        return None

    query = (
        f"MATCH (a {{id: $source_id}}), (b {{id: $target_id}}) "
        f"MERGE (a)-[r:{rel_type} {{id: $properties.id}}]->(b) "
        "SET r += $properties, r.color = $color "
        "RETURN r"
    )
    result = tx.run(query, source_id=source_id, target_id=target_id, properties=properties, color=color)
    record = result.single()
    if record:
        return record[0]
    else:
        print(f"Warning: Could not create relationship {rel_type} between {source_id} and {target_id}.")
        return None

# 6. Parse BPMN XML Files and Load into Neo4j

In [123]:
# Define functions to parse BPMN XML files
def clean_name(name):
    name = re.sub('<[^<]+?>', '', name)
    name = html.unescape(name)
    return name.strip()

In [124]:
def parse_drawio_bpmn_xml(file_path, level, module, activity=None):
    tree = ET.parse(file_path)
    root = tree.getroot()

    elements = {
        'Task': [],
        'StartEvent': [],
        'EndEvent': [],
        'Gateway': []
    }
    flows = []
    
    for cell in root.findall('.//mxCell'):
        style = cell.get('style', '').lower()
        cell_id = cell.get('id')
        value = clean_name(cell.get('value', ''))
        if not value:
            value = f"Unnamed_{cell_id}"
        
        if 'shape=mxgraph.bpmn.task' in style:
            elements['Task'].append({'id': f"{level}_{cell_id}", 'name': value, 'level': level, 'module': module, 'activity': activity})
        elif 'shape=mxgraph.bpmn.event' in style:
            if 'outline=end' in style:
                elements['EndEvent'].append({'id': f"{level}_{cell_id}", 'name': value, 'type': 'EndEvent', 'level': level, 'module': module, 'activity': activity})
            else:
                elements['StartEvent'].append({'id': f"{level}_{cell_id}", 'name': value, 'type': 'StartEvent', 'level': level, 'module': module, 'activity': activity})
        elif 'shape=mxgraph.bpmn.gateway' in style:
            gateway_kind = 'XOR' if 'exclusive' in style else 'OR'
            elements['Gateway'].append({'id': f"{level}_{cell_id}", 'name': value, 'gateway_kind': gateway_kind, 'level': level, 'module': module, 'activity': activity})
        elif cell.get('edge') == '1':
            source = cell.get('source')
            target = cell.get('target')
            if source and target:
                flows.append({
                    'id': f"{level}_{cell_id}",
                    'sourceRef': f"{level}_{source}",
                    'targetRef': f"{level}_{target}",
                    'name': value,
                    'level': level,
                    'module': module,
                    'activity': activity
                })
            else:
                print(f"Warning: Flow {cell_id} missing source or target. Skipping.")

    return elements, flows

In [125]:
# Helper function to determine the label of an element by its ID
def get_element_by_id(elements, element_id):
    for element_type, element_list in elements.items():
        for element in element_list:
            if element['id'] == element_id:
                return element_type, element
    return None, None

In [126]:
# Main function to process gateways
def process_gateways(elements, flows, session, level, module, activity):
    gateways = elements.get('Gateway', [])
    new_relationships = []
    flows_to_remove = []
    
    for gateway in gateways:
        gateway_id = gateway['id']
        gateway_kind = gateway['gateway_kind']  # XOR or OR

        # Find incoming and outgoing flows for the gateway
        incoming_flows = [flow for flow in flows if flow['targetRef'] == gateway_id]
        outgoing_flows = [flow for flow in flows if flow['sourceRef'] == gateway_id]
        
        if len(incoming_flows) == 1 and len(outgoing_flows) > 1:
            # Split gateway
            incoming_flow = incoming_flows[0]
            for out_flow in outgoing_flows:
                rel_type = f"{gateway_kind}_SPLIT"
                # Pastikan sourceRef dan targetRef tidak None
                if incoming_flow['sourceRef'] and out_flow['targetRef']:
                    new_relationships.append((incoming_flow['sourceRef'], out_flow['targetRef'], rel_type, out_flow['id']))
                    flows_to_remove.extend([incoming_flow['id'], out_flow['id']])
                else:
                    print(f"Warning: Invalid sourceRef or targetRef for gateway {gateway_id}. Skipping.")
        elif len(incoming_flows) > 1 and len(outgoing_flows) == 1:
            # Join gateway
            outgoing_flow = outgoing_flows[0]
            for in_flow in incoming_flows:
                rel_type = f"{gateway_kind}_JOIN"
                if in_flow['sourceRef'] and outgoing_flow['targetRef']:
                    new_relationships.append((in_flow['sourceRef'], outgoing_flow['targetRef'], rel_type, in_flow['id']))
                    flows_to_remove.extend([in_flow['id'], outgoing_flow['id']])
                else:
                    print(f"Warning: Invalid sourceRef or targetRef for gateway {gateway_id}. Skipping.")
        else:
            # Handle other gateway types if necessary
            continue

    # Remove flows that pass through gateways
    flows[:] = [flow for flow in flows if flow['id'] not in flows_to_remove]
    
    # Create new relationships
    for source, target, rel_type, flow_id in new_relationships:
        rel_properties = {'id': flow_id, 'name': rel_type, 'level': level, 'module': module, 'activity': activity}
        session.execute_write(create_relationship_with_id, source, target, rel_type, rel_properties)
        print(f"Created {rel_type} relationship from {source} to {target}")

In [127]:
def process_bpmn_file(session, filename, file_path, level, module, activity):
    print(f"\nProcessing {filename} at Level {level}...")
    elements, flows = parse_drawio_bpmn_xml(file_path, level, module, activity)

    print(f"Parsed {sum(len(v) for v in elements.values())} elements and {len(flows)} flows from {filename}")

    # Handle 'Unnamed' nodes based on level
    if level == 0:
        # Rename unnamed StartEvent and EndEvent
        for event_type in ['StartEvent', 'EndEvent']:
            for event in elements[event_type]:
                if event['name'].startswith('Unnamed'):
                    if event_type == 'StartEvent':
                        event['name'] = 'Start'
                    elif event_type == 'EndEvent':
                        event['name'] = 'End'
                    print(f"Renamed {event_type} {event['id']} to {event['name']}")
        # Optionally, handle unnamed Gateways if necessary
        # For this example, kita biarkan Gateway tetap 'Unnamed'

    elif level in (1, 2, 3):
        # Identify unnamed nodes (tasks, events, gateways)
        unnamed_node_ids = []
        for element_type, element_list in elements.items():
            for element in element_list:
                if element['name'].startswith('Unnamed'):
                    unnamed_node_ids.append(element['id'])
        
        if unnamed_node_ids:
            print(f"Identified {len(unnamed_node_ids)} unnamed nodes to remove at Level {level}")
        
        # For each unnamed node, adjust flows
        for unnamed_id in unnamed_node_ids:
            # Find incoming and outgoing flows
            incoming_flows = [flow for flow in flows if flow['targetRef'] == unnamed_id]
            outgoing_flows = [flow for flow in flows if flow['sourceRef'] == unnamed_id]
            
            # Create new SEQUENCE_FLOWs bypassing the unnamed node
            for in_flow in incoming_flows:
                for out_flow in outgoing_flows:
                    new_rel_type = 'SEQUENCE_FLOW'
                    new_rel_properties = {
                        'id': f"{level}_{uuid.uuid4()}",
                        'name': new_rel_type,
                        'level': level,
                        'module': module,
                        'activity': activity
                    }
                    session.execute_write(create_relationship_with_id, in_flow['sourceRef'], out_flow['targetRef'], new_rel_type, new_rel_properties)
                    print(f"Created {new_rel_type} relationship from {in_flow['sourceRef']} to {out_flow['targetRef']} bypassing {unnamed_id}")
            
            # Remove flows connected to the unnamed node
            flows[:] = [flow for flow in flows if flow['id'] not in [in_flow['id'] for in_flow in incoming_flows] and flow['id'] not in [out_flow['id'] for out_flow in outgoing_flows]]
            
            # Remove the unnamed node from elements
            for element_type, element_list in elements.items():
                elements[element_type] = [element for element in element_list if element['id'] != unnamed_id]
            print(f"Removed unnamed node {unnamed_id} and its associated flows")

    # Create nodes
    for element_type, element_list in elements.items():
        for element in element_list:
            session.execute_write(create_node, element_type, element)
            print(f"Created node: {element_type} with ID {element['id']} and name {element['name']}")

    # Process gateways: create direct relationships and remove gateway flows
    process_gateways(elements, flows, session, level, module, activity)

    # Create remaining sequence flows as SEQUENCE_FLOW relationships
    for flow in flows:
        source_id = flow['sourceRef']
        target_id = flow['targetRef']
        rel_properties = {'id': flow['id'], 'name': 'SEQUENCE_FLOW', 'level': level, 'module': module, 'activity': activity}
        
        rel_created = session.execute_write(
            create_relationship_with_id,
            source_id, target_id, 'SEQUENCE_FLOW', rel_properties
        )
        if rel_created:
            print(f"Created SEQUENCE_FLOW relationship from {source_id} to {target_id}")
        else:
            print(f"Failed to create SEQUENCE_FLOW relationship from {source_id} to {target_id}")

    # Connect Start Event to the first task if level is 0
    if level == 0:
        start_event = next((e for e in elements['StartEvent']), None)
        first_task = next((e for e in elements['Task']), None)
        if start_event and first_task:
            rel_properties = {'id': f"{level}_start_to_first", 'name': 'SEQUENCE_FLOW', 'level': level, 'module': module, 'activity': activity}
            rel_created = session.execute_write(
                create_relationship_with_id,
                start_event['id'], first_task['id'], 'SEQUENCE_FLOW', rel_properties
            )
            if rel_created:
                print(f"Connected Start Event to first task in Level 0")
            else:
                print(f"Failed to connect Start Event to first task in Level 0")

# 7. Main Execution

In [128]:
def verify_data_import():
    with driver.session(database="erpbpmn") as session:
        result = session.run("MATCH (n) RETURN labels(n) AS Label, count(n) AS Count ORDER BY Count DESC")
        print("Node counts by label:")
        for record in result:
            print(f"{record['Label']}: {record['Count']}")
    
        result = session.run("MATCH ()-[r]->() RETURN type(r) AS RelationType, count(r) AS Count ORDER BY Count DESC")
        print("\nRelationship counts by type:")
        for record in result:
            print(f"{record['RelationType']}: {record['Count']}")
    
        result = session.run("MATCH (n) RETURN n.level AS Level, count(n) AS Count ORDER BY Level")
        print("\nNode counts by level:")
        for record in result:
            print(f"Level {record['Level']}: {record['Count']} nodes")

In [129]:
def main():
    bpmn_dir = './assets'  # Directory containing BPMN XML files
    filenames = [f for f in os.listdir(bpmn_dir) if f.endswith('.xml')]

    with driver.session(database="erpbpmn") as session:
        for filename in filenames:
            file_path = os.path.join(bpmn_dir, filename)
            if filename == "BPMN Level 0.xml":
                level = 0
                module = "ERP"
                activity = None
            else:
                match = re.match(r'BPMN\s+(.+)\s+Level\s+(\d+)(?:\s+-\s+(.+))?\.xml', filename)
                if match:
                    module = match.group(1)
                    level = int(match.group(2))
                    activity = match.group(3) if match.group(3) else None
                else:
                    print(f"Skipping file {filename} as it doesn't match the expected naming pattern.")
                    continue
            
            process_bpmn_file(session, filename, file_path, level, module, activity)

# 9. Execute Main Function

In [130]:
def run():
    main()
    verify_data_import()
    # driver.close()

if __name__ == "__main__":
    run()


Processing BPMN Account Payable Level 1.xml at Level 1...
Parsed 3 elements and 2 flows from BPMN Account Payable Level 1.xml
Identified 2 unnamed nodes to remove at Level 1
Removed unnamed node 1_4 and its associated flows
Removed unnamed node 1_8 and its associated flows
Created node: Task with ID 1_6 and name Account Payable

Processing BPMN Account Payable Level 2.xml at Level 2...
Parsed 34 elements and 35 flows from BPMN Account Payable Level 2.xml
Identified 9 unnamed nodes to remove at Level 2
Created SEQUENCE_FLOW relationship from 2_80 to 2_16 bypassing 2_13
Created SEQUENCE_FLOW relationship from 2_80 to 2_25 bypassing 2_13
Removed unnamed node 2_13 and its associated flows
Removed unnamed node 2_16 and its associated flows
Removed unnamed node 2_17 and its associated flows
Removed unnamed node 2_19 and its associated flows
Created SEQUENCE_FLOW relationship from 2_80 to 2_44 bypassing 2_41
Created SEQUENCE_FLOW relationship from 2_80 to 2_45 bypassing 2_41
Removed unnamed 

# 9. Visualize the Graph

In [131]:
def get_neo4j_data():
    with driver.session(database="erpbpmn") as session:
        nodes_query = """
        MATCH (n) 
        RETURN n.id AS id, labels(n) AS labels, n.name AS name, n.color AS color, n.level AS level, n.module AS module, n.activity AS activity
        """
        nodes = session.run(nodes_query).data()
        nodes_df = pd.DataFrame(nodes)
        
        relationships_query = """
        MATCH (a)-[r]->(b) 
        RETURN r.id AS id, type(r) AS type, a.id AS source, b.id AS target, r.color AS color, r.level AS level, r.module AS module, r.activity AS activity
        """
        relationships = session.run(relationships_query).data()
        relationships_df = pd.DataFrame(relationships)
    
    return nodes_df, relationships_df

In [132]:
def visualize_neo4j_graph(nodes_df, relationships_df):
    net = Network(height='750px', width='100%', directed=True, notebook=True, cdn_resources='remote')
    
    for _, row in nodes_df.iterrows():
        label = row['name'] if pd.notnull(row['name']) else row['id']
        
        net.add_node(
            row['id'],
            label=label,
            title=f"ID: {row['id']}<br>Type: {row['labels']}<br>Name: {row['name']}<br>Level: {row['level']}<br>Module: {row['module']}<br>Activity: {row['activity']}",
            color=row['color'],
            level=row['level']
        )
    
    for _, row in relationships_df.iterrows():
        net.add_edge(
            row['source'],
            row['target'],
            title=f"ID: {row['id']}<br>Type: {row['type']}<br>Level: {row['level']}<br>Module: {row['module']}<br>Activity: {row['activity']}",
            color=row['color'],
            arrows='to'
        )
    
    net.force_atlas_2based()
    
    net.show('neo4j_graph.html')
    
    from IPython.display import IFrame
    return IFrame('neo4j_graph.html', width='100%', height='750px')

In [133]:
# Ambil data dari Neo4j
nodes_df, relationships_df = get_neo4j_data()

# Tampilkan DataFrame (Opsional, untuk verifikasi)
print("Nodes:")
display(nodes_df.head())

print("\nRelationships:")
display(relationships_df.head())

# Visualisasikan graf
graph_display = visualize_neo4j_graph(nodes_df, relationships_df)
graph_display

Nodes:


Unnamed: 0,id,labels,name,color,level,module,activity
0,2_51,[Task],Checking Goods for Return or Acceptance,#FAFAD2,2,Account Payable,
1,2_56,[Task],Processing Purchase Return,#FAFAD2,2,Account Payable,
2,2_63,[Task],Validating Invoice Details,#FAFAD2,2,Account Payable,
3,2_64,[Task],Completing Purchase DP Invoice Form,#FAFAD2,2,Account Payable,
4,2_67,[Task],Processing Purchase DP Return,#FAFAD2,2,Account Payable,



Relationships:


Unnamed: 0,id,type,source,target,color,level,module,activity
0,2_55,SEQUENCE_FLOW,2_51,2_54,#A9A9A9,2,Account Payable,
1,2_8,SEQUENCE_FLOW,2_56,2_25,#A9A9A9,2,Account Payable,
2,2_62,SEQUENCE_FLOW,2_63,2_42,#A9A9A9,2,Account Payable,
3,2_6,SEQUENCE_FLOW,2_64,2_32,#A9A9A9,2,Account Payable,
4,2_7,SEQUENCE_FLOW,2_67,2_31,#A9A9A9,2,Account Payable,


neo4j_graph.html
