In [5]:
import os
import sys
import json

def save_version_controlled_file(data, base_filename='prodigy_bpmn_search', extension='jsonl'):
    """Write ``data`` as JSON Lines to the first unused versioned filename.

    Candidate names have the form ``<base_filename>_<NN>.<extension>``, where
    ``NN`` is a counter starting at 1 and zero-padded to at least two digits.
    The first candidate that is not an existing file is used.

    Parameters:
    data: Iterable of JSON-serialisable entries; one line is written per entry.
    base_filename (str): Text placed before the version suffix.
    extension (str): File extension, without the leading dot.

    Returns:
    str: The filename that was written.
    """
    def candidate(number):
        # Build the filename for a given version number
        return f'{base_filename}_{str(number).zfill(2)}.{extension}'

    # Walk the version numbers upwards until a free name is found
    version = 1
    while os.path.isfile(candidate(version)):
        version += 1
    filename = candidate(version)

    # One JSON document per line
    with open(filename, 'w') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in data)

    return filename

# OpenAI JSON extraction using Azure

In [6]:
from openai import AzureOpenAI


import os

# Azure OpenAI connection settings.
# The endpoint and key are read from the environment when available so that
# real credentials never have to be written into (or committed with) the
# notebook; the original placeholders are kept as fallbacks.
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "<ENDPOINT>")
OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "<API_KEY>")
OPENAI_DEPLOYMENT_NAME = "gpt-4o"  # passed as `model` in the chat completion calls
OPENAI_DEPLOYMENT_VERSION = "2024-02-01"  # passed as `api_version` to the client

# Shared client used by the request helpers defined further down
client = AzureOpenAI(
    api_key=OPENAI_API_KEY,
    api_version=OPENAI_DEPLOYMENT_VERSION,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT
)

A BPMN (Business Process Model and Notation) diagram is a graphical representation used to model business processes. It provides a standard way to visualize the steps involved in a business process, making it easier for stakeholders to understand, analyze, and improve the process. BPMN diagrams are widely used in business process management and are designed to be understandable by all business users, from business analysts who create and refine the processes to technical developers responsible for implementing the processes and business managers who monitor and manage the processes.

Key elements of a BPMN diagram include:

1. **Flow Objects**:
   - **Events**: Represent something that happens (e.g., start, intermediate, end events).
   - **Activities**: Represent work that is performed (e.g., tasks, sub-processes).
   - **Gateways**: Represent decision points that control the flow of the process (e.g., exclusive, parallel gateways).

2. **Connecting Objects**:
   - **Sequence Flows**:

# Prompting GPT to generate queries and keywords for each BPMN file

In [29]:
import os
import time
import json
import xml.etree.ElementTree as ET


def create_prompt(prompt_template, bpmn_xml):
    """Fill ``prompt_template`` with the BPMN XML after stripping quote characters.

    Every double and single quote is removed from ``bpmn_xml`` before it is
    substituted for the ``{bpmn_xml}`` placeholder using ``str.format``.

    Parameters:
    prompt_template (str): Format string containing a ``{bpmn_xml}`` field.
    bpmn_xml (str): XML text to embed in the prompt.

    Returns:
    str: The formatted prompt.
    """
    # Delete both quote characters in a single pass
    unquoted_xml = bpmn_xml.translate(str.maketrans('', '', '"\''))
    return prompt_template.format(bpmn_xml=unquoted_xml)

def extract_information(system_message, prompt_template, bpmn_xml):
    """Send one BPMN document to the chat model and return its parsed JSON reply.

    The user message is built with ``create_prompt``. The request asks for a
    JSON-object response at temperature 0.5 from the deployment named by
    ``OPENAI_DEPLOYMENT_NAME``. The raw reply is printed before it is parsed.

    Parameters:
    system_message (str): Content of the system message.
    prompt_template (str): Template handed to ``create_prompt``.
    bpmn_xml (str): XML text handed to ``create_prompt``.

    Returns:
    The result of ``json.loads`` applied to the first choice's message content.
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": create_prompt(prompt_template, bpmn_xml)},
    ]
    completion = client.chat.completions.create(
        model=OPENAI_DEPLOYMENT_NAME,
        response_format={"type": "json_object"},
        messages=conversation,
        temperature=0.5,
    )
    raw_reply = completion.choices[0].message.content
    print(raw_reply)
    return json.loads(raw_reply)

def process_xml(file_path):
    """
    Parses the XML file and empties its 'bpmndi:BPMNDiagram' elements.

    Each matching element is cleared in place (children, attributes and text
    are dropped), so only an empty 'BPMNDiagram' tag remains in the output.

    Parameters:
    file_path (str): The path to the XML file.

    Returns:
    str: The modified XML as a string, or None if the file could not be
    parsed or processed (the error is printed).
    """
    try:
        # Parse the XML file
        tree = ET.parse(file_path)
        root = tree.getroot()

        # Find and clear all 'bpmndi:BPMNDiagram' elements.
        # The path must be written without whitespace after the prefix:
        # 'bpmndi: BPMNDiagram' does not resolve to the namespaced tag and
        # therefore matches nothing.
        for elem in root.findall('.//bpmndi:BPMNDiagram', namespaces={'bpmndi': 'http://www.omg.org/spec/BPMN/20100524/DI'}):
            elem.clear()

        # Convert the modified XML tree back to a string
        xml_str = ET.tostring(root, encoding='utf-8').decode('utf-8')

        return xml_str
    except ET.ParseError as e:
        print(f"Error parsing XML file {file_path}: {e}")
        return None
    except Exception as e:
        # Covers e.g. a missing or unreadable file
        print(f"Unexpected error processing XML file {file_path}: {e}")
        return None

def process_file(system_message, prompt_template, file_path):
    """Run the extraction prompt on a single BPMN file.

    Parameters:
    system_message (str): System message forwarded to ``extract_information``.
    prompt_template (str): Prompt template forwarded to ``extract_information``.
    file_path (str): Path of the BPMN file to read with ``process_xml``.

    Returns:
    Whatever ``extract_information`` returns for the file's XML.

    Raises:
    ValueError: If ``process_xml`` returned None for the file. Without this
    check the None would be passed on and fail later with an unrelated
    AttributeError inside prompt construction.
    """
    # Read the BPMN file
    bpmn_xml = process_xml(file_path)
    if bpmn_xml is None:
        raise ValueError(f"Could not read BPMN XML from {file_path}")

    # Extract information from the BPMN XML
    return extract_information(system_message, prompt_template, bpmn_xml)

def process_folder(system_message, prompt_template, folder_path):
    """Run ``process_file`` on every ``.bpmn`` entry of a folder.

    Entries whose path does not end in ``.bpmn`` (case-insensitive) are
    ignored. A failure on one file is printed and the file is left out of the
    result; processing then continues. After each attempted file the function
    sleeps for five seconds.

    Parameters:
    system_message (str): Forwarded to ``process_file``.
    prompt_template (str): Forwarded to ``process_file``.
    folder_path (str): Folder whose entries are listed with ``os.listdir``.

    Returns:
    dict: Maps each successfully processed entry name to its result.
    """
    results = {}

    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)

        # Only BPMN files are sent to the model
        if entry_path.lower().endswith('.bpmn'):
            print(f"Processing file: {entry_path}")

            try:
                results[entry] = process_file(system_message, prompt_template, entry_path)
            except Exception as e:
                print(f"Error processing file: {entry_path}")
                print(e)

            # Wait 5 seconds between files
            time.sleep(5)

    return results


# First prompt variant: asks the model for keywords, a description and search
# queries in a single JSON object with fields 'keywords', 'description' and 'queries'.
# The {bpmn_xml} placeholder is filled in by create_prompt via str.format.
system_message = "You are an AI model trained to analyze text extracted from BPMN files and provide relevant information about the process described in the file. Your task is to identify keywords, provide a brief description of the process, and suggest relevant search queries made to a BPMN search engine. Respond in Dutch."
prompt_template = "The following text is extracted from a BPMN file: \"{bpmn_xml}\". Please analyze the text and provide the following information in a JSON format with fields 'keywords', 'description' and 'queries': 1) Dutch keywords that describe the diagram as a whole, 2) A brief description of the processes described in the XML file in Dutch, and 3) Formulate ten search queries in both Dutch, English and French (10 pairs of query and language for each language). These queries should be designed in such a way that if they were entered into a BPMN search engine, the given BPMN diagram would be among the top results. The queries should be closely related to the unique processes and features described in the BPMN file."
# The run itself is disabled, so this cell only defines the two strings above.
#processed_files = process_folder(system_message, prompt_template, 'data/bpmn')


In [31]:
# Convert to pandas dataframe
import pandas as pd


# Process all BPMN files in the folder
# Second prompt variant: natural-language questions only.
# The system message and the user prompt must request the same number of
# questions; both ask for 10 (the system message previously said 5).
system_message = "You are an AI model trained to analyze text extracted from BPMN files and generate relevant natural language questions about the process described in the file. Your task is to generate 10 questions in Dutch that would lead to the process or one of its subprocesses if entered into a BPMN search engine."
# The doubled braces in the JSON example are escapes for str.format, which
# create_prompt uses to fill in {bpmn_xml}.
prompt_template = """
The following text is extracted from a BPMN file: \"{bpmn_xml}\". 
Please analyze the text and generate 10 natural language questions in Dutch. 
These questions should be designed in such a way that if they were entered into a BPMN search engine, 
the given BPMN diagram or one of its subprocesses would be among the top results. 

The questions could start with phrases like:
- 'Geef mij een BPMN over ...'
- 'Ik zoek een procesdiagram dat ... betreft'
-  ...

The output should be a JSON object with the following structure:

{{
    "questions": [
        "Question 1",
        "Question 2",
        "Question 3",
        ...
    ]
}}
"""
# Maps each processed file name to the parsed JSON reply for that file
processed_files = process_folder(system_message, prompt_template, 'data/bpmn')


# Reshape each reply into the keywords / description / queries layout.
# Keywords and description are left empty because this prompt variant only
# produces questions.
processed_files_formatted = {}

for file, data in processed_files.items():
    # The JSON-object response format does not guarantee a particular schema,
    # so a reply without a 'questions' list is reported and skipped instead
    # of aborting the whole export with a KeyError.
    questions = data.get("questions") if isinstance(data, dict) else None
    if not isinstance(questions, list):
        print(f"Skipping {file}: response has no 'questions' list")
        continue

    processed_files_formatted[file] = {
        'keywords': [],
        'description': "",
        'queries': [{"query": query, "language": "Dutch"} for query in questions]
    }


# Create dataframe use key as the 'file' column
df = pd.DataFrame(processed_files_formatted).T

# Rename the index column to 'file'
df.index.name = 'file'

# Make sure the target folder exists before writing
output_csv = 'data/bpmn/data_extraction/openai_keywords_search_queries.02.csv'
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df.to_csv(output_csv)

Processing file: data/bpmn\dispatch.bpmn
{
    "questions": [
        "Geef mij een BPMN over de verzending van goederen door een computerhardwarewinkel.",
        "Ik zoek een procesdiagram dat betrekking heeft op het verzekeren van pakketten.",
        "Toon mij een BPMN-diagram met een proces voor het schrijven van verzendlabels.",
        "Geef mij een BPMN over de selectie van logistieke bedrijven en het plaatsen van bestellingen.",
        "Ik zoek een procesdiagram dat betrekking heeft op het voorbereiden van goederen voor verzending.",
        "Toon mij een BPMN-diagram met een proces voor het verkrijgen van offertes van logistieke bedrijven.",
        "Geef mij een BPMN over het verpakken van goederen in een magazijn.",
        "Ik zoek een procesdiagram dat betrekking heeft op het bepalen van de verzendmethode.",
        "Toon mij een BPMN-diagram met een proces voor het afhandelen van speciale verzendingen.",
        "Geef mij een BPMN over het voorbereiden van een zending v

# Generate a textual description of the BPMN diagram

In [61]:

# Define the system prompt and prompt template
# Third prompt variant: a Dutch, sub-process-level description of each diagram.
# NOTE: this rebinds prompt_template, which earlier cells also assign.
system_prompt = "You are an assistant that summarizes BPMN processes."
prompt_template = "Break down the BPMN process diagram described in the following XML file into its sub-processes. Provide a detailed description in Dutch of each sub-process, focusing on the sequence of tasks, decision points, and key activities and events. Ensure clarity on the flow and purpose of each sub-process:\n\n\"{bpmn_xml}\"\n\n"
# Earlier wordings of the same request, kept for reference:
#message = f"Please summarize the BPMN process described in the following XML file:\n\n{bpmn_xml}\n\nFocus on the sequence of tasks, decision points, and key activities and events. Provide a detailed description of each step in the process, ensuring clarity on the flow and purpose of each action. The summary should be in Dutch."
#message = f"Summarize the BPMN process diagram described in the following XML file. Focus on the sequence of tasks, decision points, and key activities and events. The summary is in Dutch and provides a detailed description of each step in the process, ensuring clarity on the flow and purpose of each action:\n\n{bpmn_xml}\n\n"

# Call the process_folder function with the new system prompt and prompt template
# NOTE(review): extract_information, as defined in this notebook, requests a
# JSON-object response and parses the reply with json.loads, while this prompt
# asks for a free-text description. The recorded output of this cell (progress
# bar, "Skipping file (exists)") is also not printed by the process_folder shown
# in this notebook, so it was presumably produced by a different version of that
# function. TODO confirm which version is intended before re-running.
processed_files = process_folder(system_prompt, prompt_template, 'data/bpmn')

Processing files:   0%|          | 0/26 [00:00<?, ?it/s]

Skipping file (not bpmn): data/bpmn\data_extraction
Skipping file (exists): data/bpmn\dispatch.bpmn
Skipping file (not bpmn): data/bpmn\dispatch.txt
Skipping file (exists): data/bpmn\hoofdprocesProcesMelding.bpmn
Skipping file (not bpmn): data/bpmn\hoofdprocesProcesMelding.txt
Skipping file (exists): data/bpmn\Jeugd-Fuifkwaliteitslabel-aanvraag-Poperinge_v0.1.bpmn
Skipping file (not bpmn): data/bpmn\Jeugd-Fuifkwaliteitslabel-aanvraag-Poperinge_v0.1.txt
Skipping file (exists): data/bpmn\Jeugd-Kadervormingsubsidieaanvraag-Poperinge_v0.1.bpmn
Skipping file (not bpmn): data/bpmn\Jeugd-Kadervormingsubsidieaanvraag-Poperinge_v0.1.txt
Skipping file (exists): data/bpmn\Jeugd-Project-subsidieaanvraag-Poperinge_v0.1.bpmn
Skipping file (not bpmn): data/bpmn\Jeugd-Project-subsidieaanvraag-Poperinge_v0.1.txt
Skipping file (exists): data/bpmn\Jeugd-subsidieaanvraag-Poperinge_v0.1.bpmn
Skipping file (not bpmn): data/bpmn\Jeugd-subsidieaanvraag-Poperinge_v0.1.txt
Skipping file (exists): data/bpmn\Jeug

Processing files:  77%|███████▋  | 20/26 [00:03<00:01,  5.14it/s]

Error processing file: data/bpmn\klachtenbehandeling.bpmn
Error code: 400 - {'error': {'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 723641 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Processing file: data/bpmn\Meldingen-Heuvelland.bpmn


Processing files:  81%|████████  | 21/26 [01:35<00:31,  6.27s/it]

Processing file: data/bpmn\Meldingen-Kortemark.bpmn


Processing files:  85%|████████▍ | 22/26 [03:01<00:52, 13.24s/it]

Processing file: data/bpmn\rijbewijs.bpmn


Processing files:  88%|████████▊ | 23/26 [04:44<01:09, 23.12s/it]

Skipping file (exists): data/bpmn\Sport-Subsidieaanvraag-De-Panne_v0.1.bpmn
Skipping file (not bpmn): data/bpmn\Sport-Subsidieaanvraag-De-Panne_v0.1.txt
Processing file: data/bpmn\Sport-Subsidieaanvraag-GrootstGemeneDeler_v0.3.bpmn


Processing files: 100%|██████████| 26/26 [08:25<00:00, 19.45s/it]


# Creating search dataset: query-bpmn pairs using the description as proxy

In [47]:
import pandas as pd
import ast
import random
from sentence_transformers import InputExample
from tqdm import tqdm

# Step 1: Load bpmn descriptions back into a dictionary
def load_bpmn_descriptions(folder_path):
    """Read every ``.txt`` file of a folder into a dict keyed by BPMN file name.

    For an entry ``<name>.txt`` (suffix matched case-insensitively) the file
    content is stored under the key ``<name>.bpmn``. Files that cannot be read
    are reported with a printed message and skipped.

    Parameters:
    folder_path (str): Folder whose entries are listed with ``os.listdir``.

    Returns:
    dict: Maps ``<name>.bpmn`` to the text of ``<name>.txt``.
    """
    descriptions = {}
    for entry in tqdm(os.listdir(folder_path), desc="Loading BPMN descriptions"):
        entry_path = os.path.join(folder_path, entry)
        if not entry_path.lower().endswith('.txt'):
            continue
        try:
            with open(entry_path, 'r') as handle:
                stem, _ = os.path.splitext(entry)
                descriptions[stem + ".bpmn"] = handle.read()
        except Exception as e:
            print(f"Error reading file: {entry_path}")
            print(e)

    return descriptions

# Step 2: Fetch the data
def loading_bpmn_queries(file_path):
    """Load the keyword/query CSV into a dict keyed by BPMN file name.

    The CSV needs the columns 'file', 'keywords', 'queries' and 'description'.
    'keywords' and 'queries' hold Python literals and are parsed with
    ``ast.literal_eval``.

    Keys are normalised so they can be looked up in the dict built by
    ``load_bpmn_descriptions``, whose keys always end in '.bpmn':
    * any directory part is dropped ('/' and '\\' separators are both accepted),
    * the last extension is removed,
    * '.bpmn' is appended unless the remaining name already ends with it.

    An empty description is read back from CSV as NaN; it is returned as ''.

    Parameters:
    file_path (str): Path of the CSV file.

    Returns:
    dict: Maps file name to {'keywords': ..., 'queries': ..., 'description': ...}.
    """
    df = pd.read_csv(file_path)
    data_dict = {}
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Loading BPMN queries"):
        # Accept Windows-style paths as well as POSIX-style ones
        base_name = row['file'].replace('\\', '/').split('/')[-1]
        file_name = os.path.splitext(base_name)[0]
        if not file_name.lower().endswith('.bpmn'):
            file_name += '.bpmn'

        keywords = ast.literal_eval(row['keywords'])
        queries = ast.literal_eval(row['queries'])

        description = row['description']
        if pd.isna(description):
            description = ''

        data_dict[file_name] = {'keywords': keywords, 'queries': queries, 'description': description}

    return data_dict

# Step 3: Create positive examples
def create_positive_examples(data_dict, file_name_to_sentence):
    """Pair every keyword and query of a file with that file's own description.

    Files of ``data_dict`` that have no entry in ``file_name_to_sentence`` are
    skipped. For each remaining file one ``InputExample`` with ``label=1`` is
    created per keyword, followed by one per query (using the query's
    "query" text).

    Parameters:
    data_dict (dict): Maps file name to a dict with 'keywords' and 'queries'.
    file_name_to_sentence (dict): Maps file name to its description text.

    Returns:
    list: The created ``InputExample`` objects.
    """
    examples = []
    for name, record in data_dict.items():
        if name not in file_name_to_sentence:
            continue
        sentence = file_name_to_sentence[name]
        # Keywords first, then query texts
        texts = list(record['keywords']) + [entry["query"] for entry in record['queries']]
        examples.extend(InputExample(texts=[text, sentence], label=1) for text in texts)
    return examples

# Step 3.b: Create positive examples
def postive_queries(data_dict, file_name_to_sentence):
    """Build one positive record per query, paired with its own file's description.

    Files of ``data_dict`` without an entry in ``file_name_to_sentence`` are
    skipped.

    Parameters:
    data_dict (dict): Maps file name to a dict with a 'queries' list; each
        query is a dict with "query" and "language".
    file_name_to_sentence (dict): Maps file name to its description text.

    Returns:
    list: Dicts with the keys 'query', 'content' and 'meta'; 'meta' carries
    the type "query", the query language, the file name and ``label=True``.
    """
    return [
        {
            'query': query["query"],
            "content": file_name_to_sentence[file_name],
            'meta': {"type": "query", "language": query["language"], "file": file_name, "label": True},
        }
        for file_name, data in data_dict.items()
        if file_name in file_name_to_sentence
        for query in data['queries']
    ]

# Step 4: Create negative examples
def create_negative_examples(data_dict, file_name_to_sentence, num_negative_examples):
    """Pair the keywords, queries and description of a file with descriptions of other files.

    Files of ``data_dict`` without an entry in ``file_name_to_sentence`` are
    skipped. For each remaining file, ``num_negative_examples`` other files
    are drawn with ``random.choice`` (with replacement). For every draw one
    ``InputExample`` with ``label=0`` is created per keyword, per query and
    for the file's own description, each paired with the drawn file's
    description.

    A file is also skipped when there is no other description to contrast it
    with; ``random.choice`` would otherwise raise IndexError on the empty
    candidate list.

    Parameters:
    data_dict (dict): Maps file name to a dict with 'keywords', 'queries' and 'description'.
    file_name_to_sentence (dict): Maps file name to its description text.
    num_negative_examples (int): Number of other files drawn per file.

    Returns:
    list: The created ``InputExample`` objects.
    """
    negative_examples = []
    for file_name, data in data_dict.items():
        if file_name not in file_name_to_sentence:
            continue

        other_file_names = [other_file for other_file in file_name_to_sentence.keys() if other_file != file_name]
        if not other_file_names:
            # Nothing to contrast against
            continue

        for _ in range(num_negative_examples):
            negative_sentence = file_name_to_sentence[random.choice(other_file_names)]
            for keyword in data['keywords']:
                negative_examples.append(InputExample(texts=[keyword, negative_sentence], label=0))
            for query in data['queries']:
                negative_examples.append(InputExample(texts=[query["query"], negative_sentence], label=0))
            negative_examples.append(InputExample(texts=[data['description'], negative_sentence], label=0))
    return negative_examples

# Step 4.b: Create negative queries
def negative_queries(data_dict, file_name_to_sentence, num_negative_examples):
    """Build negative records by pairing each query with descriptions of other files.

    Files of ``data_dict`` without an entry in ``file_name_to_sentence`` are
    skipped. For every query of a remaining file, ``num_negative_examples``
    other files are drawn with ``random.choice`` (with replacement) and one
    record is produced per draw.

    A file is also skipped when there is no other description to contrast it
    with; ``random.choice`` would otherwise raise IndexError on the empty
    candidate list.

    Parameters:
    data_dict (dict): Maps file name to a dict with a 'queries' list; each
        query is a dict with "query" and "language".
    file_name_to_sentence (dict): Maps file name to its description text.
    num_negative_examples (int): Number of negatives drawn per query.

    Returns:
    list: Dicts with the keys 'query', 'content' and 'meta'; 'meta' carries
    the type "query", the query language, the drawn file name and ``label=False``.
    """
    negative_examples = []
    for file_name, data in data_dict.items():
        if file_name not in file_name_to_sentence:
            continue

        other_file_names = [other_file for other_file in file_name_to_sentence.keys() if other_file != file_name]
        if not other_file_names:
            # Nothing to contrast against
            continue

        for query in data['queries']:
            for _ in range(num_negative_examples):
                negative_file_name = random.choice(other_file_names)
                negative_examples.append({
                    'query': query["query"],
                    "content": file_name_to_sentence[negative_file_name],
                    'meta': {"type": "query", "language": query["language"], "file": negative_file_name, "label": False},
                })
    return negative_examples

# Fetch the data
bpmn_descriptions = load_bpmn_descriptions('data/bpmn/descriptions')
bpmn_queries = loading_bpmn_queries('data/bpmn/data_extraction/openai_keywords_search_queries.02.csv')

# Create positive and negative examples
#positive_examples = create_positive_examples(bpmn_queries, bpmn_descriptions)
#negative_examples = create_negative_examples(bpmn_queries, bpmn_descriptions, num_negative_examples=1)

# Seed the negative sampling so that re-running the cell yields the same dataset
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Create positive and negative queries.
# The results are stored under names that differ from the functions that
# produce them: assigning to `negative_queries` would replace the function
# with a list and make this cell fail when it is run a second time.
positive_query_examples = postive_queries(bpmn_queries, bpmn_descriptions)
negative_query_examples = negative_queries(bpmn_queries, bpmn_descriptions, num_negative_examples=4)

# Combine positive and negative queries
training_data = positive_query_examples + negative_query_examples

# Logging the number of positive and negative examples
num_positive_examples = len(positive_query_examples)
num_negative_examples = len(negative_query_examples)
print(f"Number of positive queries: {num_positive_examples}")
print(f"Number of negative queries: {num_negative_examples}")
print(f"Total number of training queries: {len(training_data)}")

# Last expression of the cell: the name of the file that was written
save_version_controlled_file(training_data, base_filename='prodigy_bpmn_search', extension='jsonl')

Loading BPMN descriptions: 100%|██████████| 15/15 [00:00<00:00, 114.67it/s]
Loading BPMN queries: 100%|██████████| 16/16 [00:00<00:00, 3193.99it/s]

Number of positive queries: 140
Number of negative queries: 560
Total number of training queries: 700





'bpmn_search_dataset_01.jsonl'

In [41]:
# sample the training data
import random

# Draw up to five examples into a separate list. random.sample returns a new
# list, so `training_data` keeps every example in its original order instead
# of being shuffled in place and truncated to the preview.
training_sample = random.sample(training_data, min(5, len(training_data)))

# Display the sampled training data
for example in training_sample:
    print(example)

{'query': 'Ik zoek een procesdiagram dat betrekking heeft op het aanvullen van bewijzen kadervorming', 'content': 'De BPMN-procesdiagram beschreven in het XML-bestand bevat verschillende sub-processen die zijn verdeeld over verschillende banen (lanes). Hieronder volgt een gedetailleerde beschrijving van elk sub-proces in het Nederlands, met aandacht voor de volgorde van taken, beslissingspunten en belangrijke activiteiten en gebeurtenissen:\n\n### Sub-proces 1: Aanvraagproces\n**Baan:** Lane_0yge0ax\n1. **Start van het proces:** Het proces begint met een startevenement.\n2. **Indienen aanvraagformulier:** Een gebruikerstaak waarbij een aanvraagformulier wordt ingediend.\n3. **Aanvullen bewijzen kadervorming:** Na het indienen van het aanvraagformulier, als er aanvullende bewijzen nodig zijn voor de kadervorming, wordt deze taak uitgevoerd.\n\n### Sub-proces 2: Evaluatieproces\n**Baan:** Lane_0fpfr75\n1. **Evalueren aanvraagformulieren:** Een handmatige taak waarbij de ingediende aanvra

# Using GPT-4o to predict whether the query relates to the document

In [49]:
import json
import openai
from typing import List, Dict

def load_queries(file_path: str) -> List[Dict]:
    """Read a JSON Lines file into a list, one parsed object per line.

    Blank lines are skipped; without this a stray empty line would be treated
    as malformed JSON and every record after it would be dropped.

    A missing file or a malformed line is reported with a printed message
    rather than raised. On a malformed line, reading stops and the records
    parsed up to that point are returned.

    Parameters:
    file_path (str): Path of the JSON Lines file.

    Returns:
    List[Dict]: The parsed records (empty if the file does not exist).
    """
    queries = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                if not line.strip():
                    continue
                queries.append(json.loads(line))
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except json.JSONDecodeError:
        print(f"Error decoding JSON from {file_path}.")
    return queries

def process_queries(queries: List[Dict], model: str, output_file: str, skips: int = 0, n_queries: int = 20) -> List[Dict]:
    """Ask the chat model whether each query relates to its document.

    Processes ``queries[skips:skips + n_queries]``. For every item the parsed
    JSON reply is stored under ``query['meta']['predicted_label']`` (the input
    dict is modified in place) and the item is appended as one JSON line to
    ``output_file``. The file is opened in append mode, so repeated runs add
    to the existing content.

    Items for which the API call fails, or whose reply is not valid JSON, are
    reported with a printed message and left out of the result and the file.

    Parameters:
    queries (List[Dict]): Records with the keys 'query', 'content' and 'meta'.
    model (str): Deployment name passed as ``model`` to the chat completion call.
    output_file (str): JSON Lines file that processed records are appended to.
    skips (int): Number of leading records to skip.
    n_queries (int): Maximum number of records to process.

    Returns:
    List[Dict]: The successfully processed records.
    """
    from tqdm import tqdm

    processed_queries = []
    for query in tqdm(queries[skips:skips + n_queries], desc="Processing queries"):
        prompt = f"Evaluate whether the following query is related to the document content. Query: '{query['query']}'. Document: '{query['content']}'. If the query is related, respond with a JSON object structured as follows: {{'related': true, 'confidence': 'high/medium/low', 'motivation': 'short explanation'}}. If the query is not related, respond with a JSON object structured as follows: {{'related': false, 'confidence': 'high/medium/low', 'motivation': 'short explanation'}}. The 'confidence' field should reflect how strongly you think the query is correlated to the document: 'high' if you think the correlation is strong, 'medium' if you think the correlation is moderate, and 'low' if you think the correlation is weak. The 'motivation' field should contain a short explanation of why you assigned the given 'confidence' level."
        try:
            response = client.chat.completions.create(
                model=model,
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": "You are a language model trained by OpenAI. Your task is to determine if a given query is related to a document and respond with a structured JSON object."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0,
            )
            query['meta']['predicted_label'] = json.loads(response.choices[0].message.content)
            processed_queries.append(query)

            # Save the processed query to the output file right away so that
            # finished work survives a later failure
            with open(output_file, 'a') as file:
                file.write(json.dumps(query) + '\n')

        except (openai.OpenAIError, json.JSONDecodeError) as e:
            # The records carry their text under 'query'; looking up 'text'
            # here would raise a KeyError from inside the handler and abort
            # the whole run.
            print(f"Error processing query '{query['query']}': {str(e)}")
    return processed_queries

def relabel_queries():
    """Load the query dataset and have the model label up to 700 of its records.

    Reads 'prodigy/query_dataset/prodigy_bpmn_search_02.jsonl' with
    ``load_queries``, prints the number of loaded records and passes them to
    ``process_queries`` with the "gpt-4o" deployment, writing to
    'prodigy/query_dataset/prodigy_bpmn_search_02_prompted.jsonl'.
    Nothing is returned.
    """
    source_path = 'prodigy/query_dataset/prodigy_bpmn_search_02.jsonl'
    target_path = 'prodigy/query_dataset/prodigy_bpmn_search_02_prompted.jsonl'
    deployment = "gpt-4o"

    loaded = load_queries(source_path)
    print(len(loaded))
    process_queries(loaded, deployment, target_path, skips=0, n_queries=700)

# Run the relabelling pass. process_queries opens its output file in append
# mode, so running this cell again adds to the existing file rather than
# replacing it. The recorded run below took about 50 minutes for 700 queries.
relabel_queries()

700


Processing queries:   0%|          | 0/700 [00:00<?, ?it/s]

Processing queries: 100%|██████████| 700/700 [49:58<00:00,  4.28s/it]  


# Fetching the disagreements between AI and assigned labels

In [52]:
OPENAI_DEPLOYMENT_NAME_FAST = "gpt-4o"
input_file_01 = 'prodigy/query_dataset/prodigy_bpmn_search_02.jsonl'
output_file_02 = 'prodigy/query_dataset/prodigy_bpmn_search_02_prompted.jsonl'

queries_01 = load_queries(input_file_01)
queries_02 = load_queries(output_file_02)

# The prompted file can differ in length from the input file (skipped records,
# or records appended by more than one run), so the two files are not
# compared position by position.
if len(queries_01) != len(queries_02):
    print(f"Warning: {len(queries_01)} input records but {len(queries_02)} prompted records")

# Map 'positive' to True and 'negative' to False
mapping = {'positive': True, 'negative': False, True: True, False: False}

# Find the records where the assigned label (meta.label) and the model's
# prediction (meta.predicted_label.related) disagree. Each prompted record is
# the original record plus 'predicted_label', so both values are read from the
# same record.
diff_queries = []
for query_02 in queries_02:
    meta = query_02['meta']
    label = meta['label']
    related = meta['predicted_label']['related']

    if related != mapping[label]:
        diff_queries.append(query_02)

# Should be manually reviewed and corrected -> prodi.gy
output_file = 'prodigy/prodigy_bpmn_search_02_differences.jsonl'

with open(output_file, 'w') as file:
    for query in diff_queries:
        file.write(json.dumps(query) + '\n')


# Merging the datasets: annotated samples and corrected samples

In [55]:
import json


# Input files: the model-labelled records and the manually reviewed disagreements
original_samples_file = 'prodigy/query_dataset/prodigy_bpmn_search_02_prompted.jsonl'
annotated_samples_file = 'prodigy/query_dataset/prodigy_bpmn_search_02_differences_annotated.jsonl'

original_samples = load_queries(original_samples_file)
annotated_samples = load_queries(annotated_samples_file)

# Label mapping (defined here but not used by the code below)
mapping_labels = {'positive': True, 'negative': False, True: 'positive', False: 'negative'}

# Annotated records come first so that they win the de-duplication below
merged_samples = annotated_samples + original_samples

# Derive the final label of every record
for sample in merged_samples:
    meta = sample['meta']

    if 'answer' not in sample:
        # Not reviewed: take over the model's prediction
        if 'predicted_label' in meta:
            meta['label'] = meta['predicted_label'].get('related', False)
        else:
            print("failed to process sample: ", sample)
        continue

    # Reviewed: 'accept' confirms the prediction, 'reject' inverts it;
    # any other answer leaves the label as it was
    answer = sample['answer']
    related = meta['predicted_label'].get('related', False)
    if answer == 'accept':
        meta['label'] = related
    elif answer == 'reject':
        meta['label'] = not related


# Remove duplicates based on the 'query' and 'content' fields, keeping the
# first occurrence of each pair
unique_samples = []
seen = set()

for sample in merged_samples:
    key = (sample['query'], sample['content'])
    if key in seen:
        continue
    seen.add(key)
    unique_samples.append(sample)

# Define the output file path
output_file = 'prodigy/query_dataset/prodigy_bpmn_search_02_dataset.jsonl'

# Write the unique samples to the output file, one JSON object per line
with open(output_file, 'w') as f:
    f.writelines(json.dumps(sample) + '\n' for sample in unique_samples)