# HEPSYCODE 

In [10]:
# With Categories
import os
from collections import Counter
from xml.etree import ElementTree as ET
import pandas as pd

def categorize_tag(tag):
    """
    Return the predefined category a tag belongs to.

    The tag is lower-cased before the lookup, so matching is case-insensitive.
    The result is one of "Flow Objects", "Connecting Objects", "Swim Lanes",
    "Artifacts", or "Uncategorized" when the tag is in none of the groups.
    """
    # Groups are checked in this order; the first group containing the tag wins.
    category_groups = (
        ("Flow Objects", {
            'scripttask', 'businessruletask', 'intermediatethrowevent', 'servicetask',
            'subprocess', 'receivetask', 'transaction', 'startevent', 'association',
            'eventbasedgateway', 'callactivity', 'task', 'exclusivegateway', 'endevent',
            'usertask', 'boundaryevent', 'parallelgateway', 'intermediatecatchevent', 'process',
        }),
        ("Connecting Objects", {'incoming', 'outgoing', 'targetref', 'sourceref', 'sequenceflow'}),
        ("Swim Lanes", {'laneset', 'lane'}),
        ("Artifacts", {'dataobject', 'datastore'}),
    )

    normalized = tag.lower()
    for label, members in category_groups:
        if normalized in members:
            return label
    return "Uncategorized"

def categorize_tag_EMF(tag):
    """
    Return the HML-style category ("Classes", "Attributes" or "Relations") of a tag.

    Matching is case-insensitive; tags found in none of the groups are
    reported as "Uncategorized".
    """
    class_names = {'processes', 'nodes', 'ports', 'message', 'entry'}
    attribute_names = {
        'nfrom', 'nto', 'pfrom', 'pto', 'type', 'queuesize', 'rendezvous', 'portextension',
        'xsi:type="hml:stimulus"', 'xsi:type="hml:display"', 'xsi:type="hml:structurednode"',
    }
    relation_names = {'nchannels', 'pchannels'}

    normalized = tag.lower()
    if normalized in class_names:
        return "Classes"
    if normalized in attribute_names:
        return "Attributes"
    if normalized in relation_names:
        return "Relations"
    return "Uncategorized"

def count_tags_in_bpmn_process(file_path):
    """
    Count element tags and attribute names found in an XML file.

    Every element contributes its tag (namespace stripped, lower-cased) and
    every attribute contributes its lower-cased name. An ``xsi:type``
    attribute is additionally counted under the combined key
    ``xsi:type="<lower-cased value>"`` so individual types can be told apart.

    Parameters:
        file_path: path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        A ``Counter`` mapping each key to its number of occurrences. If the
        file is not well-formed XML a warning is printed and an empty
        ``Counter`` is returned.
    """
    # ElementTree reports namespaced attributes as "{namespace-uri}local", so
    # xsi:type never shows up under its prefixed spelling and has to be
    # recognised by its expanded name.
    xsi_type_attr = "{http://www.w3.org/2001/XMLSchema-instance}type"

    tag_counter = Counter()
    try:
        root = ET.parse(file_path).getroot()
    except ET.ParseError:
        print(f"Warning: Failed to parse XML file: {file_path}")
        return tag_counter

    for elem in root.iter():
        # Clean tag from namespace if present
        clean_tag = elem.tag.split('}')[-1].lower()
        tag_counter[clean_tag] += 1

        # Count all attributes
        for attr_name, attr_val in elem.attrib.items():
            attr_key = attr_name.lower()
            if attr_name == xsi_type_attr or attr_key == "xsi:type":
                # Report the attribute under its familiar prefixed name and
                # also under the combined xsi:type="hml:..." key.
                tag_counter["xsi:type"] += 1
                tag_counter[f'xsi:type="{attr_val.lower()}"'] += 1
            else:
                tag_counter[attr_key] += 1

    return tag_counter

def analyze_bpmn_tags_in_folders_and_save(folder_path):
    """
    Count tags in the .hepsy files of every folder under folder_path and save one CSV per folder.

    For each folder that directly contains .hepsy files, a table with the
    columns File, Tag, Occurrences, Category and CategoryEMF is built,
    printed, and written to "<folder name>_HML_Tag_Counts.csv" inside that
    same folder. Folders without .hepsy files are skipped.

    Parameters:
        folder_path: root folder to traverse (subfolders included).

    Returns:
        None. Results are printed and written to disk.
    """
    columns = ["File", "Tag", "Occurrences", "Category", "CategoryEMF"]

    # Traverse the folder and its subfolders
    for root_dir, _dirs, files in os.walk(folder_path):
        # Process only .hepsy files (HML-based XML)
        hml_files = [file for file in files if file.endswith(".hepsy")]
        if not hml_files:
            continue

        # Collect one row per (file, tag) pair
        file_tag_data = []
        for file in hml_files:
            tag_counts = count_tags_in_bpmn_process(os.path.join(root_dir, file))
            for tag, count in tag_counts.items():
                file_tag_data.append({
                    "File": file,
                    "Tag": tag,
                    "Occurrences": count,
                    "Category": categorize_tag(tag),
                    "CategoryEMF": categorize_tag_EMF(tag),
                })

        # Passing the columns explicitly keeps the sort below working even
        # when no tag was collected (e.g. every file failed to parse).
        file_tag_df = pd.DataFrame(file_tag_data, columns=columns)
        file_tag_df = file_tag_df.sort_values(by=["File", "Occurrences", "Category"], ascending=[True, False, False])
        print(file_tag_df)

        # Save results as CSV in the same folder. normpath drops a trailing
        # separator, which would otherwise make basename() return "".
        folder_name = os.path.basename(os.path.normpath(root_dir))
        output_csv = os.path.join(root_dir, f"{folder_name}_HML_Tag_Counts.csv")
        file_tag_df.to_csv(output_csv, index=False)
        print(f"Results saved to {output_csv}")

# Example usage:
# Point 'folder_path' at the root folder to analyze. Every folder below it that
# contains .hepsy files gets its own "<folder name>_HML_Tag_Counts.csv".
folder_path = r"C:\Users\vitto\Desktop\github\Re-MASTER-LLM-MODELS\03_Model_Comparison\02 - Hallucination\HEPSYCODE\D1\HEPSY"  # Replace with the path to your folder containing .hepsy files
analyze_bpmn_tags_in_folders_and_save(folder_path)

                                                 File  \
2   2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
9   2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
10  2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
11  2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
12  2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
13  2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
5   2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
14  2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
15  2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
16  2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
18  2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
3   2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
4   2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
6   2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
7   2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
8   2024-02-13 16.44 00%20-%20DigitalCam%20Nominal...   
0   2024-02-13 16.44 00%20-%20D

### HEPSYCODE Designer Element occurrences

In [None]:
import os
import xml.etree.ElementTree as ET
from collections import Counter
import pandas as pd

def categorize_tag(tag):
    """
    Look up the predefined category of a tag (case-insensitive).

    Returns "Flow Objects", "Connecting Objects", "Swim Lanes" or "Artifacts"
    for known tags and "Uncategorized" for everything else.
    """
    # Flat tag -> category table; the groups do not overlap.
    category_by_tag = {}
    category_by_tag.update(dict.fromkeys([
        'scripttask', 'businessruletask', 'intermediatethrowevent', 'servicetask',
        'subprocess', 'receivetask', 'transaction', 'startevent', 'association',
        'eventbasedgateway', 'callactivity', 'task', 'exclusivegateway', 'endevent',
        'usertask', 'boundaryevent', 'parallelgateway', 'intermediatecatchevent', 'process',
    ], "Flow Objects"))
    category_by_tag.update(dict.fromkeys(
        ['incoming', 'outgoing', 'targetref', 'sourceref', 'sequenceflow'],
        "Connecting Objects",
    ))
    category_by_tag.update(dict.fromkeys(['laneset', 'lane'], "Swim Lanes"))
    category_by_tag.update(dict.fromkeys(['dataobject', 'datastore'], "Artifacts"))

    return category_by_tag.get(tag.lower(), "Uncategorized")

def categorize_tag_EMF(tag):
    """
    Classify a tag using the HML-style groups (case-insensitive).

    Returns "Classes", "Attributes" or "Relations" for known tags and
    "Uncategorized" otherwise.
    """
    emf_groups = (
        ("Classes", ('processes', 'nodes', 'ports', 'message', 'entry')),
        ("Attributes", (
            'nfrom', 'nto', 'pfrom', 'pto', 'type', 'queuesize', 'rendezvous', 'portextension',
            'xsi:type="hml:stimulus"', 'xsi:type="hml:display"', 'xsi:type="hml:structurednode"',
        )),
        ("Relations", ('nchannels', 'pchannels')),
    )

    needle = tag.lower()
    # First matching group wins; fall back to "Uncategorized".
    return next(
        (label for label, members in emf_groups if needle in members),
        "Uncategorized",
    )

def count_hepsy_occurrences(file_path):
    """
    Count selected elements and attributes below every <rootElements> tag of an XML file.

    For each <rootElements> element the function counts:
      * <flowElements> whose xsi:type starts with "bpmn:", keyed by the text
        after the last ":" of that type;
      * the presence of the outgoing, incoming, sourceRef and targetRef
        attributes on those <flowElements>;
      * the number of <laneSets> and <lanes> descendants;
      * <value> descendants whose xsi:type starts with "bpmn:", keyed the
        same way as flow elements.

    Returns a Counter with the totals. If the file cannot be parsed, the
    error is printed and an empty Counter is returned.
    """
    xsi_type_key = "{http://www.w3.org/2001/XMLSchema-instance}type"
    ns_map = {'bpmn': 'http://www.omg.org/spec/BPMN/20100524/MODEL-XMI'}
    tracked_attributes = ("outgoing", "incoming", "sourceRef", "targetRef")

    tag_counts = Counter()
    try:
        document_root = ET.parse(file_path).getroot()
    except ET.ParseError as err:
        print(f"Error parsing file {file_path}: {err}")
        return tag_counts

    for container in document_root.findall(".//rootElements", ns_map):
        # Flow elements: typed entries plus the connection attributes they carry
        for flow in container.findall(".//flowElements", ns_map):
            declared_type = flow.attrib.get(xsi_type_key, "")
            # "bpmn::X" also starts with "bpmn:", so one prefix check covers both
            if declared_type.startswith("bpmn:"):
                tag_counts[declared_type.split(":")[-1]] += 1

            for attribute in tracked_attributes:
                if attribute in flow.attrib:
                    tag_counts[attribute] += 1

        # Lane containers and lanes (keys are created even when the count is 0)
        tag_counts["laneSets"] += len(container.findall(".//laneSets", ns_map))
        tag_counts["lanes"] += len(container.findall(".//lanes", ns_map))

        # <value xsi:type="bpmn::DataObject"> and similar entries
        for value in container.findall(".//value", ns_map):
            declared_type = value.attrib.get(xsi_type_key, "")
            if declared_type.startswith("bpmn:"):
                tag_counts[declared_type.split(":")[-1]] += 1

    return tag_counts

def analyze_hepsy_files_in_folders(folder_path):
    """
    Count tags in every .hepsy file under folder_path and return them as one table.

    Parameters:
        folder_path: root folder to traverse (subfolders included).

    Returns:
        A DataFrame with the columns File, Tag, Occurrences, Category and
        CategoryEMF, sorted by File (ascending) and Occurrences (descending).
        If nothing was collected, a message is printed and an empty DataFrame
        with those columns is returned.

    Files are normally identified by their bare file name. When the same name
    shows up again in another subfolder, the later file is reported under its
    path relative to folder_path so that its counts do not overwrite the
    earlier file's.
    """
    columns = ["File", "Tag", "Occurrences", "Category", "CategoryEMF"]
    all_results = []
    seen_names = set()

    # Traverse the folder and its subfolders for .hepsy files
    for root_dir, _sub_dirs, files in os.walk(folder_path):
        for file in files:
            if not file.endswith(".hepsy"):  # Focus only on HML files
                continue

            file_path = os.path.join(root_dir, file)
            if file in seen_names:
                # Same file name already used by another folder: disambiguate.
                file_label = os.path.relpath(file_path, folder_path)
            else:
                file_label = file
                seen_names.add(file)

            # NOTE(review): this uses count_tags_in_bpmn_process rather than
            # count_hepsy_occurrences - confirm that is the intended counter.
            counts = count_tags_in_bpmn_process(file_path)
            for tag, count in counts.items():
                clean_tag = tag.lower()  # Ensure lowercase for consistency
                all_results.append({
                    "File": file_label,
                    "Tag": clean_tag,
                    "Occurrences": count,
                    "Category": categorize_tag(clean_tag),
                    "CategoryEMF": categorize_tag_EMF(clean_tag),
                })

    if not all_results:
        print("No data found. Returning an empty DataFrame.")
        return pd.DataFrame(columns=columns)

    results_df = pd.DataFrame(all_results)

    # Sort by file and number of occurrences
    results_df = results_df.sort_values(by=["File", "Occurrences"], ascending=[True, False])

    return results_df


# Example usage:
# Point 'folder_path' at the root folder containing the .hepsy files to analyze
folder_path = r"C:/Users/vitto/Desktop/github/Re-MASTER-LLM-MODELS/03_Model_Comparison/02 - Hallucination/D2-BPMN-Designer-RAG/LLM-mistral-small-latest-1.0"
results_df = analyze_hepsy_files_in_folders(folder_path)

# Save results to a CSV file
output_csv = "Results/bpmn_designer_RAG_LLM-mistral-small-latest-1.0.csv"
if not results_df.empty:
    # to_csv does not create missing parent folders, so make sure it exists first
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    results_df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")
else:
    print("No BPMN files processed.")