# BPMN Designer

### HD dataset BPMN Element Occurrences

In [102]:
# With Categories
import os
from collections import Counter
from xml.etree import ElementTree as ET
import pandas as pd

def categorize_tag(tag):
    """
    Map a BPMN tag name to its BPMN notation category.

    The lookup is case-insensitive and requires an exact match on the whole
    tag name (no substring matching).

    Args:
        tag: Tag name with the XML namespace already stripped, e.g. "userTask".

    Returns:
        "Flow Objects", "Connecting Objects", "Swim Lanes", "Artifacts",
        or "Uncategorized" when the tag is in none of the known groups.
    """
    tag_lower = tag.lower()
    flow_objects = {
        'scripttask', 'businessruletask', 'intermediatethrowevent', 'servicetask',
        'subprocess', 'receivetask', 'transaction', 'startevent', 'association',
        'eventbasedgateway', 'callactivity', 'task', 'exclusivegateway', 'endevent',
        'usertask', 'boundaryevent', 'parallelgateway', 'intermediatecatchevent',
        # 'sendtask' was missing here although categorize_tag_EMF treats it as a
        # class; without it sendTask rows came out "Uncategorized".
        'sendtask', 'process',
    }
    connecting_objects = {'incoming', 'outgoing', 'targetref', 'sourceref', 'sequenceflow'}
    swim_lanes = {'laneset', 'lane'}
    artifacts = {'dataobject', 'datastore'}

    if tag_lower in flow_objects:
        return "Flow Objects"
    if tag_lower in connecting_objects:
        return "Connecting Objects"
    if tag_lower in swim_lanes:
        return "Swim Lanes"
    if tag_lower in artifacts:
        return "Artifacts"
    return "Uncategorized"

def categorize_tag_EMF(tag):
    """
    Classify a BPMN tag name as a metamodel class, attribute or relation.

    Matching is case-insensitive and exact. Returns "Classes", "Attributes",
    "Relations", or "Uncategorized" when the tag belongs to no group.
    """
    groups = (
        ("Classes", (
            'scripttask', 'businessruletask', 'intermediatethrowevent', 'servicetask',
            'subprocess', 'receivetask', 'transaction', 'startevent', 'association',
            'eventbasedgateway', 'callactivity', 'task', 'exclusivegateway', 'endevent',
            'usertask', 'boundaryevent', 'parallelgateway', 'intermediatecatchevent',
            'sendtask', 'laneset', 'lane', 'dataobject', 'datastore', 'process',
        )),
        ("Attributes", ('incoming', 'outgoing', 'targetref', 'sourceref')),
        ("Relations", ('sequenceflow',)),
    )

    normalized = tag.lower()
    # Groups are checked in declaration order; the first hit wins.
    for label, members in groups:
        if normalized in members:
            return label
    return "Uncategorized"

def count_tags_in_bpmn_process(file_path):
    """
    Count every element tag found inside the <bpmn:process> elements of a file.

    Each <process> element itself and all of its descendants are counted, keyed
    by the namespace-qualified tag as ElementTree reports it. A file that is
    not well-formed XML yields an empty Counter.
    """
    bpmn_ns = {'bpmn': 'http://www.omg.org/spec/BPMN/20100524/MODEL'}

    try:
        document_root = ET.parse(file_path).getroot()
    except ET.ParseError:
        # Non-XML or invalid XML is deliberately ignored.
        return Counter()

    totals = Counter()
    for process_node in document_root.findall(".//bpmn:process", bpmn_ns):
        # iter() yields the process element itself followed by all descendants.
        totals.update(node.tag for node in process_node.iter())
    return totals

def analyze_bpmn_tags_in_folders_and_save(folder_path):
    """
    Count BPMN tags per file and write one CSV per folder that holds .bpmn files.

    Walks ``folder_path`` recursively. For every directory that directly
    contains ``.bpmn`` files, the tags of each file are counted with
    ``count_tags_in_bpmn_process``, categorized with ``categorize_tag`` and
    ``categorize_tag_EMF``, and written to ``<directory name>.csv`` inside
    that same directory.

    Args:
        folder_path: Root directory to scan.

    Returns:
        None. Results are printed and saved to disk.
    """
    # Namespace prefix ElementTree puts in front of BPMN tags; removed for readability.
    namespace_to_remove = "{http://www.omg.org/spec/BPMN/20100524/MODEL}"

    for root_dir, _dirs, files in os.walk(folder_path):
        bpmn_files = [file for file in files if file.endswith(".bpmn")]
        if not bpmn_files:
            continue

        # One row per (file, tag) pair.
        file_tag_data = []
        for file in bpmn_files:
            tag_counts = count_tags_in_bpmn_process(os.path.join(root_dir, file))
            for tag, count in tag_counts.items():
                clean_tag = tag.replace(namespace_to_remove, "")
                file_tag_data.append({
                    "File": file,
                    "Tag": clean_tag,
                    "Occurrences": count,
                    "Category": categorize_tag(clean_tag),
                    "CategoryEMF": categorize_tag_EMF(clean_tag),
                })

        if not file_tag_data:
            # Every file was unparsable or had no <process>: an empty DataFrame
            # has no columns, so sorting it below would raise KeyError.
            print(f"No BPMN tags found in {root_dir}; skipping.")
            continue

        file_tag_df = pd.DataFrame(file_tag_data)
        print(file_tag_df)
        file_tag_df = file_tag_df.sort_values(
            by=["File", "Occurrences", "Category"], ascending=[True, False, False]
        )

        # The CSV is named after the folder and stored next to the BPMN files.
        folder_name = os.path.basename(root_dir)
        output_csv = os.path.join(root_dir, f"{folder_name}.csv")
        file_tag_df.to_csv(output_csv, index=False)
        print(f"Results saved to {output_csv}")

# Example usage:
# Point folder_path at the root directory that contains the .bpmn files to analyze.
folder_path = r"C:\Users\vitto\Desktop\github\Re-MASTER-LLM-MODELS\03_Model_Comparison\02 - Hallucination\BPMN-Designer\D1\BPMN-HD\data\annotations"  # Hard-coded local path; replace with your own folder
analyze_bpmn_tags_in_folders_and_save(folder_path)

                     File               Tag  Occurrences            Category  \
0    ex00_writer0001.bpmn           process            1        Flow Objects   
1    ex00_writer0001.bpmn              task            6        Flow Objects   
2    ex00_writer0001.bpmn          incoming           13  Connecting Objects   
3    ex00_writer0001.bpmn          outgoing           13  Connecting Objects   
4    ex00_writer0001.bpmn  exclusiveGateway            1        Flow Objects   
..                    ...               ...          ...                 ...   
601  ex00_writer0105.bpmn  exclusiveGateway            1        Flow Objects   
602  ex00_writer0105.bpmn   parallelGateway            2        Flow Objects   
603  ex00_writer0105.bpmn        startEvent            1        Flow Objects   
604  ex00_writer0105.bpmn          endEvent            2        Flow Objects   
605  ex00_writer0105.bpmn      sequenceFlow           13  Connecting Objects   

    CategoryEMF  
0       Classes  
1  

### Statistics

In [14]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis, shapiro

def calculate_extended_statistics(data):
    """
    Compute descriptive statistics for one series of values.

    Args:
        data: pandas Series of numeric values (the callers pass the
            "Occurrences" values of one tag).

    Returns:
        dict mapping statistic name to value, in a fixed key order: N, Missing,
        Mean, SE, 95% confidence interval bounds, Median, Mode, Sum, SD,
        Variance, IQR, Range, Minimum, Maximum, Skewness, Kurtosis,
        Shapiro-Wilk W / p value and selected percentiles. Skewness, Kurtosis
        and the Shapiro-Wilk entries are None when there are fewer than three
        values or no variability.
    """
    stats = {}
    n = len(data)
    std = data.std()
    q25 = data.quantile(0.25)
    q75 = data.quantile(0.75)
    minimum = data.min()
    maximum = data.max()
    modes = data.mode()

    stats["N"] = n
    stats["Missing"] = data.isnull().sum()
    stats["Mean"] = data.mean()
    stats["SE"] = std / (n ** 0.5) if n > 0 else None

    # Test against None explicitly: SE == 0 (all values identical) is a valid
    # standard error and must give the degenerate interval [mean, mean]
    # rather than being reported as missing.
    if stats["SE"] is not None:
        margin = 1.96 * stats["SE"]
        stats["95% Confidence Interval Lower"] = stats["Mean"] - margin
        stats["95% Confidence Interval Upper"] = stats["Mean"] + margin
    else:
        stats["95% Confidence Interval Lower"] = None
        stats["95% Confidence Interval Upper"] = None

    stats["Median"] = data.median()
    # Only the first mode is reported.
    stats["Mode"] = modes.iloc[0] if not modes.empty else None
    stats["Sum"] = data.sum()
    stats["SD"] = std
    stats["Variance"] = data.var()
    stats["IQR"] = q75 - q25
    stats["Range"] = maximum - minimum
    stats["Minimum"] = minimum
    stats["Maximum"] = maximum

    # Shape statistics and the normality test need at least three values and
    # some variability.
    has_spread = n > 2 and stats["Range"] > 0

    if has_spread:
        stats["Skewness"] = skew(data)
        stats["Kurtosis"] = kurtosis(data)
    else:
        stats["Skewness"] = None
        stats["Kurtosis"] = None

    stats["Shapiro-Wilk W"] = None
    stats["Shapiro-Wilk p value"] = None
    if has_spread:
        try:
            shapiro_test = shapiro(data)
        except Exception:
            # Best effort: a failing normality test must not abort the other statistics.
            pass
        else:
            stats["Shapiro-Wilk W"] = shapiro_test.statistic
            stats["Shapiro-Wilk p value"] = shapiro_test.pvalue

    stats["25th"] = q25
    stats["50th"] = data.quantile(0.50)
    stats["75th"] = q75
    stats["90th"] = data.quantile(0.90)
    stats["95th"] = data.quantile(0.95)
    stats["99th"] = data.quantile(0.99)
    return stats

def _save_tag_boxplot(data, title, output_path):
    """Draw one box per tag from the "Occurrences" column and save the image."""
    # Hand pandas our own axes: DataFrame.boxplot(by=...) otherwise opens a new
    # figure, which leaves the sized figure empty and never closed.
    fig, ax = plt.subplots(figsize=(12, 6))
    data.boxplot(column="Occurrences", by="Tag", grid=False, ax=ax)
    ax.set_title(title)
    fig.suptitle("")  # Remove the automatic "Boxplot grouped by ..." title
    ax.set_xlabel("Tag")
    ax.set_ylabel("Occurrences")
    ax.tick_params(axis="x", labelrotation=90)
    fig.tight_layout()
    fig.savefig(output_path)
    plt.close(fig)


def generate_boxplots_from_csv(folder_path):
    """
    Build per-tag statistics and boxplots from the CSV files under a folder.

    For every directory under ``folder_path`` that contains CSV files with
    both a "Tag" and an "Occurrences" column, this writes
    ``<folder>_tag_statistics.csv`` and ``<folder>_tag_occurrences_boxplot.png``
    into that directory. One aggregated boxplot over all directories is then
    written to ``folder_path``.

    Args:
        folder_path: Root directory to scan recursively.

    Returns:
        None. Results are printed and saved to disk.
    """
    total_data = []  # Combined data of every folder, for the aggregated plot

    for root_dir, _dirs, files in os.walk(folder_path):
        csv_files = [file for file in files if file.endswith(".csv")]
        if not csv_files:
            continue

        folder_name = os.path.basename(root_dir)
        all_data = []
        for csv_file in csv_files:
            df = pd.read_csv(os.path.join(root_dir, csv_file))
            # CSV files without both required columns are skipped.
            if "Tag" in df.columns and "Occurrences" in df.columns:
                all_data.append(df[["Tag", "Occurrences"]])

        if not all_data:
            continue

        combined_data = pd.concat(all_data)
        total_data.append(combined_data)

        statistics = []
        for tag, group in combined_data.groupby("Tag")["Occurrences"]:
            stats = calculate_extended_statistics(group)
            stats["Tag"] = tag
            statistics.append(stats)

        stats_csv_path = os.path.join(root_dir, f"{folder_name}_tag_statistics.csv")
        pd.DataFrame(statistics).to_csv(stats_csv_path, index=False)
        print(f"Statistics saved to {stats_csv_path}")

        plot_path = os.path.join(root_dir, f"{folder_name}_tag_occurrences_boxplot.png")
        _save_tag_boxplot(
            combined_data, f"Occurrences of Tags in Folder: {folder_name}", plot_path
        )
        print(f"Boxplot saved to {plot_path}")

    # Aggregated boxplot over the data of all folders
    if total_data:
        aggregated_boxplot_path = os.path.join(folder_path, "aggregated_tag_occurrences_boxplot.png")
        _save_tag_boxplot(
            pd.concat(total_data),
            "Aggregated Occurrences of Tags Across All Files",
            aggregated_boxplot_path,
        )
        print(f"Aggregated boxplot saved to {aggregated_boxplot_path}")

# Example usage:
# Hard-coded local path; replace it with the root folder holding the tag CSV files.
folder_path = r"C:\Users\vitto\Desktop\github\BP-MASTER-LLM-ICSA\08_Quality_Checker\08b_Model_Quality_Checker\BPMN-Designer\BPMN-DESIGNER-HD\data\annotations"
generate_boxplots_from_csv(folder_path)

### BPMN Designer Element Occurrences

In [132]:
import os
import xml.etree.ElementTree as ET
from collections import Counter
import pandas as pd

def categorize_tag(tag):
    """
    Return the BPMN notation category of a tag name.

    The tag is compared case-insensitively and must match a known name
    exactly; anything else is reported as "Uncategorized".
    """
    category_members = {
        "Flow Objects": (
            'scripttask', 'businessruletask', 'intermediatethrowevent', 'servicetask',
            'subprocess', 'receivetask', 'transaction', 'startevent', 'association',
            'eventbasedgateway', 'callactivity', 'task', 'exclusivegateway', 'endevent',
            'usertask', 'boundaryevent', 'parallelgateway', 'intermediatecatchevent',
            'sendtask',
        ),
        "Connecting Objects": ('incoming', 'outgoing', 'targetref', 'sourceref', 'sequenceflow'),
        "Swim Lanes": ('lanesets', 'lanes'),
        "Artifacts": ('dataobject', 'datastore'),
    }

    # The groups are disjoint, so a flat name -> category table is equivalent
    # to testing them one after another.
    lookup = {
        name: category
        for category, names in category_members.items()
        for name in names
    }
    return lookup.get(tag.lower(), "Uncategorized")

def categorize_tag_EMF(tag):
    """
    Return the metamodel role of a tag name: class, attribute or relation.

    The comparison ignores case and needs an exact name match; unknown names
    give "Uncategorized".
    """
    key = tag.lower()

    # The three groups share no names, so the order of these checks is irrelevant.
    if key == 'sequenceflow':
        return "Relations"
    if key in ('incoming', 'outgoing', 'targetref', 'sourceref'):
        return "Attributes"

    class_names = (
        'scripttask', 'businessruletask', 'intermediatethrowevent', 'servicetask',
        'subprocess', 'receivetask', 'transaction', 'startevent', 'association',
        'eventbasedgateway', 'callactivity', 'task', 'exclusivegateway', 'endevent',
        'usertask', 'boundaryevent', 'parallelgateway', 'intermediatecatchevent',
        'sendtask', 'lanesets', 'lanes', 'dataobject', 'datastore',
    )
    return "Classes" if key in class_names else "Uncategorized"

def count_bpmn_occurrences(file_path):
    """
    Count BPMN element types and reference attributes in a model file.

    Inside every <rootElements> element this counts:
      * each <flowElements> descendant by the name after the last ":" of its
        xsi:type, when that type starts with "bpmn:";
      * one hit per <flowElements> for each of the attributes outgoing,
        incoming, sourceRef and targetRef that is present;
      * the <laneSets> and <lanes> descendants (these keys exist even at zero);
      * each <value> descendant by its xsi:type, as for <flowElements>.

    An XML parse error is printed and the counts collected so far (none) are
    returned.
    """
    xsi_type_key = "{http://www.w3.org/2001/XMLSchema-instance}type"
    ns_map = {'bpmn': 'http://www.omg.org/spec/BPMN/20100524/MODEL-XMI'}
    counted_attributes = ("outgoing", "incoming", "sourceRef", "targetRef")

    totals = Counter()
    try:
        document_root = ET.parse(file_path).getroot()

        for container in document_root.findall(".//rootElements", ns_map):
            for element in container.findall(".//flowElements", ns_map):
                declared_type = element.get(xsi_type_key, "")
                # "bpmn::X" also starts with "bpmn:", so one prefix test covers both forms.
                if declared_type.startswith("bpmn:"):
                    totals[declared_type.rsplit(":", 1)[-1]] += 1

                for attribute in counted_attributes:
                    if attribute in element.attrib:
                        totals[attribute] += 1

            # Added unconditionally so both keys are present even when the count is 0.
            totals["laneSets"] += len(container.findall(".//laneSets", ns_map))
            totals["lanes"] += len(container.findall(".//lanes", ns_map))

            # e.g. <value xsi:type="bpmn::DataObject">
            for value_node in container.findall(".//value", ns_map):
                declared_type = value_node.get(xsi_type_key, "")
                if declared_type.startswith("bpmn:"):
                    totals[declared_type.rsplit(":", 1)[-1]] += 1

    except ET.ParseError as parse_error:
        print(f"Error parsing file {file_path}: {parse_error}")
    return totals

def analyze_bpmn_files_in_folders(folder_path):
    """
    Count and categorize BPMN tags for the .bpmn files under a folder.

    Args:
        folder_path: Root directory to scan recursively.

    Returns:
        pandas DataFrame with the columns File, Tag, Occurrences, Category and
        CategoryEMF, sorted by File ascending and Occurrences descending.
        "File" holds the name of the folder that contains the .bpmn file.
        When nothing is found the DataFrame is empty but has the same columns.
    """
    columns = ["File", "Tag", "Occurrences", "Category", "CategoryEMF"]
    # Namespace prefix removed from tags for readability.
    namespace_to_remove = "{http://www.omg.org/spec/BPMN/20100524/MODEL}"

    file_tag_occurrences = {}
    for root_dir, _sub_dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".bpmn"):
                # NOTE(review): results are keyed by the folder's base name, so
                # several .bpmn files in one folder (or equally named folders)
                # overwrite each other - confirm there is one model per folder.
                folder_name = os.path.basename(root_dir)
                file_tag_occurrences[folder_name] = count_bpmn_occurrences(
                    os.path.join(root_dir, file)
                )

    all_results = []
    for folder_name, counts in file_tag_occurrences.items():
        for tag, count in counts.items():
            clean_tag = tag.replace(namespace_to_remove, "")
            all_results.append({
                "File": folder_name,
                # Store the cleaned tag, the same value used for categorization.
                "Tag": clean_tag,
                "Occurrences": count,
                "Category": categorize_tag(clean_tag),
                "CategoryEMF": categorize_tag_EMF(clean_tag),
            })

    if not all_results:
        print("No data found. Returning an empty DataFrame.")
        # Same schema as the populated result so callers can rely on the columns.
        return pd.DataFrame(columns=columns)

    results_df = pd.DataFrame(all_results, columns=columns)
    return results_df.sort_values(by=["File", "Occurrences"], ascending=[True, False])

# Example usage:
# Hard-coded local path; replace it with the folder containing the BPMN files.
folder_path = r"C:/Users/vitto/Desktop/github/Re-MASTER-LLM-MODELS/03_Model_Comparison/02 - Hallucination/D2-BPMN-Designer-RAG/LLM-mistral-small-latest-1.0"
results_df = analyze_bpmn_files_in_folders(folder_path)

# Save results to a CSV file, unless nothing was found.
# NOTE(review): the path is relative to the working directory; presumably the
# "Results" folder has to exist beforehand - verify.
if not results_df.empty:
    results_df.to_csv("Results/bpmn_designer_RAG_LLM-mistral-small-latest-1.0.csv", index=False)
    print("Results saved to Results/bpmn_designer_RAG_LLM-mistral-small-latest-1.0.csv")
else:
    print("No BPMN files processed.")

Results saved to Results/bpmn_designer_RAG_LLM-mistral-small-latest-1.0.csv


### Statistics

In [36]:
import os
import xml.etree.ElementTree as ET
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis, shapiro

# Function to count BPMN occurrences with tag aggregation
def count_bpmn_occurrences(file_path):
    """
    Count BPMN element types and reference attributes, merging task subtypes.

    Works on the <flowElements>, <laneSets>, <lanes> and <value> descendants
    of every <rootElements> element. Elements are counted by the name after
    the last ":" of an xsi:type that starts with "bpmn:"; the types UserTask,
    ServiceTask, ReceiveTask and SendTask are all counted as "task". The
    attributes outgoing, incoming, sourceRef and targetRef are counted once
    per <flowElements> that carries them. A parse error is printed and yields
    an empty Counter.
    """
    xsi_type_attr = "{http://www.w3.org/2001/XMLSchema-instance}type"
    ns_map = {'bpmn': 'http://www.omg.org/spec/BPMN/20100524/MODEL-XMI'}
    task_variants = ("UserTask", "ServiceTask", "ReceiveTask", "SendTask")

    def type_label(node):
        """Return the label to count for ``node``, or None if it is not a bpmn type."""
        raw_type = node.attrib.get(xsi_type_attr, "")
        if not raw_type.startswith("bpmn:"):
            return None
        label = raw_type.split(":")[-1]
        # Specific task kinds are folded into the generic "task" tag.
        return "task" if label in task_variants else label

    totals = Counter()
    try:
        document_root = ET.parse(file_path).getroot()

        for container in document_root.findall(".//rootElements", ns_map):
            for element in container.findall(".//flowElements", ns_map):
                label = type_label(element)
                if label is not None:
                    totals[label] += 1

                for attribute in ("outgoing", "incoming", "sourceRef", "targetRef"):
                    if attribute in element.attrib:
                        totals[attribute] += 1

            # Added unconditionally so both keys exist even with a count of 0.
            totals["laneSets"] += len(container.findall(".//laneSets", ns_map))
            totals["lanes"] += len(container.findall(".//lanes", ns_map))

            # e.g. <value xsi:type="bpmn::DataObject">
            for value_node in container.findall(".//value", ns_map):
                label = type_label(value_node)
                if label is not None:
                    totals[label] += 1

    except ET.ParseError as parse_error:
        print(f"Error parsing file {file_path}: {parse_error}")
    return totals

# Function to calculate extended statistics
def calculate_extended_statistics(data):
    """
    Compute descriptive statistics for one series of values.

    Args:
        data: pandas Series of numeric values (the caller passes the
            "Occurrences" values of one tag).

    Returns:
        dict mapping statistic name to value, in a fixed key order: N, Missing,
        Mean, SE, 95% confidence interval bounds, Median, Mode, Sum, SD,
        Variance, IQR, Range, Minimum, Maximum, Skewness, Kurtosis,
        Shapiro-Wilk W / p value and selected percentiles. Skewness, Kurtosis
        and the Shapiro-Wilk entries are None when there are fewer than three
        values or no variability.
    """
    stats = {}
    n = len(data)
    std = data.std()
    q25 = data.quantile(0.25)
    q75 = data.quantile(0.75)
    minimum = data.min()
    maximum = data.max()
    modes = data.mode()

    stats["N"] = n
    stats["Missing"] = data.isnull().sum()
    stats["Mean"] = data.mean()
    stats["SE"] = std / (n ** 0.5) if n > 0 else None

    # Test against None explicitly: SE == 0 (all values identical) is a valid
    # standard error and must give the degenerate interval [mean, mean]
    # rather than being reported as missing.
    if stats["SE"] is not None:
        margin = 1.96 * stats["SE"]
        stats["95% Confidence Interval Lower"] = stats["Mean"] - margin
        stats["95% Confidence Interval Upper"] = stats["Mean"] + margin
    else:
        stats["95% Confidence Interval Lower"] = None
        stats["95% Confidence Interval Upper"] = None

    stats["Median"] = data.median()
    # Only the first mode is reported.
    stats["Mode"] = modes.iloc[0] if not modes.empty else None
    stats["Sum"] = data.sum()
    stats["SD"] = std
    stats["Variance"] = data.var()
    stats["IQR"] = q75 - q25
    stats["Range"] = maximum - minimum
    stats["Minimum"] = minimum
    stats["Maximum"] = maximum

    # Shape statistics and the normality test need at least three values and
    # some variability.
    has_spread = n > 2 and stats["Range"] > 0

    if has_spread:
        stats["Skewness"] = skew(data)
        stats["Kurtosis"] = kurtosis(data)
    else:
        stats["Skewness"] = None
        stats["Kurtosis"] = None

    stats["Shapiro-Wilk W"] = None
    stats["Shapiro-Wilk p value"] = None
    if has_spread:
        try:
            shapiro_test = shapiro(data)
        except Exception:
            # Best effort: a failing normality test must not abort the other statistics.
            pass
        else:
            stats["Shapiro-Wilk W"] = shapiro_test.statistic
            stats["Shapiro-Wilk p value"] = shapiro_test.pvalue

    stats["25th"] = q25
    stats["50th"] = data.quantile(0.50)
    stats["75th"] = q75
    stats["90th"] = data.quantile(0.90)
    stats["95th"] = data.quantile(0.95)
    stats["99th"] = data.quantile(0.99)
    return stats

# Function to analyze BPMN files
def analyze_bpmn_files_in_folders(folder_path):
    """
    Collect tag counts of all .bpmn files and save aggregate statistics and a boxplot.

    Every ``.bpmn`` file under ``folder_path`` is counted with
    ``count_bpmn_occurrences``. Per-tag statistics over all files are written
    to ``bpmn_aggregate_statistics.csv`` and a per-tag boxplot to
    ``bpmn_aggregate_boxplot.png``, both relative to the working directory.

    Args:
        folder_path: Root directory to scan recursively.

    Returns:
        pandas DataFrame with one row per (file, tag) pair and the columns
        "Tag" and "Occurrences"; empty with those columns when no data is found.
    """
    total_data = []  # One record per tag of every file

    for root_dir, _sub_dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".bpmn"):
                counts = count_bpmn_occurrences(os.path.join(root_dir, file))
                total_data.extend(
                    {"Tag": tag, "Occurrences": count} for tag, count in counts.items()
                )

    if not total_data:
        print("No data found. Returning an empty DataFrame.")
        return pd.DataFrame(columns=["Tag", "Occurrences"])

    results_df = pd.DataFrame(total_data)

    # Aggregate statistics per tag across all files
    statistics = []
    for tag, group in results_df.groupby("Tag")["Occurrences"]:
        stats = calculate_extended_statistics(group)
        stats["Tag"] = tag
        statistics.append(stats)

    pd.DataFrame(statistics).to_csv("bpmn_aggregate_statistics.csv", index=False)
    print("Aggregate statistics saved to bpmn_aggregate_statistics.csv")

    # Hand pandas our own axes: DataFrame.boxplot(by=...) otherwise opens a new
    # figure, leaving the sized one empty ("<Figure ... with 0 Axes>") and unclosed.
    fig, ax = plt.subplots(figsize=(12, 6))
    results_df.boxplot(column="Occurrences", by="Tag", grid=False, ax=ax)
    ax.set_title("Occurrences of Tags Across All BPMN Files (Aggregated)")
    fig.suptitle("")  # Remove the automatic "Boxplot grouped by ..." title
    ax.set_xlabel("Tag")
    ax.set_ylabel("Occurrences")
    ax.tick_params(axis="x", labelrotation=90)
    fig.tight_layout()

    boxplot_path = "bpmn_aggregate_boxplot.png"
    fig.savefig(boxplot_path)
    plt.close(fig)
    print(f"Aggregate boxplot saved to {boxplot_path}")

    return results_df

# Example usage
# Hard-coded local path; replace it with the folder containing the BPMN files.
folder_path = r"C:/Users/vitto/Desktop/github/Re-MASTER-LLM-MODELS/03_Model_Comparison/02 - Hallucination/D2-BPMN-Designer-NoRAG/LLM-gpt-4o-2024-08-06-0.0"
results_df = analyze_bpmn_files_in_folders(folder_path)

# Save the per-file tag counts to a CSV file in the working directory,
# unless nothing was found.
if not results_df.empty:
    results_df.to_csv("bpmn_designer_tag_occurrences.csv", index=False)
    print("Results saved to bpmn_designer_tag_occurrences.csv")
else:
    print("No BPMN files processed.")

Aggregate statistics saved to bpmn_aggregate_statistics.csv
Aggregate boxplot saved to bpmn_aggregate_boxplot.png
Results saved to bpmn_designer_tag_occurrences.csv


<Figure size 1200x600 with 0 Axes>