In [1]:
import os

def set_environment_variables():
    os.environ['OPENAI_DEPLOYMENT_ENDPOINT'] = 'https://abb-openai.openai.azure.com/'
    os.environ['OPENAI_API_KEY'] = '29c48fb759054b38891f07b5a467e124'
    os.environ['OPENAI_DEPLOYMENT_NAME'] = 'gpt-4o'
    os.environ['OPENAI_DEPLOYMENT_VERSION'] = '2024-02-01'

    os.environ['DEEPINFRA_API_KEY'] = 'iWkTLk0avn6IpwrwUdsm6xPglfXlvyrA'
    os.environ['DEEPINFRA_ENDPOINT'] = 'https://api.deepinfra.com/v1/openai'
    os.environ['DEEPINFRA_MODEL_NAME'] = 'meta-llama/Meta-Llama-3-70B-Instruct'
    
    os.environ['LOKAALBESLIST_LOCATION_ID'] = 'ce94eeae827cdc5c00f3f2f4276ff628580edefb3aa5861e4669b0c5d93adc57'
    os.environ['LOKAALBESLIST_SEARCH_ENDPOINT'] = 'https://lokaalbeslist.vlaanderen.be/search/agenda-items/search'

# Call the function to set the environment variables
set_environment_variables()

# What is this notebook for?

The following Jupyter notebook contains the code needed to create an artificial dataset for context-aware tasks such as keyword extraction, classification, summarization, and translation.

## General structure of a context-aware prompt

All prompts are generated according to the following template:

```
####
Context: {context}
####
{task}
```

- **Context**: The document, text, JSON object, or file that the model should use to derive a response. It should contain all the necessary information and facts to form a correct response. The goal is to only use this as the source of information and facts.
- **Task**: The action that needs to be performed with the context in mind, such as "translate the context", "get x from the context", etc.

When training a model, the prompt is part of the user message. Each sample is a conversation containing a system message, user message, and assistant message.
- **System message**: Describes the broader context or general behavior of how the agent should act (e.g., "the AI should stick only to the context").
- **User message**: Contains the prompt.
- **Assistant message**: Contains the answer to the prompt.

To train a model, we should create samples that contain these three necessary components. However, additional metadata for further analysis is also stored, which is not technically needed.

## How it works

Since no 'real' data exists, artificial data is generated for the tasks using a more robust model (for the existing dataset, this was GPT-4o). We simply create the system message and prompt, send it to GPT-4o or any other model to generate a response, and store it. This process is repeated for every task.

Generally, we want two types of output: JSON objects for further processing and storing within other applications, and text format.

In most cases, we want to further process the results, making JSON an ideal format. To nudge the model into generating JSON, we use the following system message (see `generate_json_prompt`):

```
system_message = "Your task is to generate responses in JSON format. Ensure that your output strictly follows the provided JSON structure. Each key in the JSON should be correctly populated according to the instructions given. Pay attention to details and ensure the JSON is well-formed and valid."
```

For RAG (Retrieval-Augmented Generation) tasks, JSON is not expected, but all information should be derived from the context. To make this requirement extra clear, we use the following system message (see `generate_rag_prompt`):

```
system_message = """You are a Retrieval Augmented Generator (RAG). Your task is to complete tasks and answer questions based on the provided context.
1. Language: Respond only in Dutch.
2. Relevance: Use only the information from the provided context to form your response. If the context does not contain relevant information to the question or task, respond with 'Sorry, ik kan de gevraagde informatie niet terugvinden.'
3. Only respond to questions related to the context."""
```


The typical approach for generating samples is as follows:

1. **Create a function to generate the prompt, system message, context for the task:**
    ```python
    prompt_string, system_message, task_string, context = generate_llm_task(data)
    ```

2. **Set `json_mode` to `True` if JSON format is desired:**
    ```python
    response = generate_response_openai(system_message, prompt_string, stream=False, json_mode=True)
    ```

3. **Extract the JSON response and save it togheter with the metadata to file**
    ```python
    response_json = extract_json(response)
    if response_json is not None:
        # Create metadata for the task
        task_info = get_task_info("keywords_bpmn", "keywords", "keywords_bpmn", "bpmn", "json")
        # Save the system message, user input, task instruction, context, and response to a file for later use
        save_results_to_file(task_info, file_path, task_string, context, system_message, prompt_string, response)
    return response_json
    ```


# Configurations and setup

In [2]:
from openai import AzureOpenAI, OpenAI
import json
import uuid
import os

OPENAI_DEPLOYMENT_ENDPOINT = os.environ['OPENAI_DEPLOYMENT_ENDPOINT']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
OPENAI_DEPLOYMENT_NAME = os.environ['OPENAI_DEPLOYMENT_NAME']
OPENAI_DEPLOYMENT_VERSION = os.environ['OPENAI_DEPLOYMENT_VERSION']

DEEPINFRA_API_KEY = os.environ['DEEPINFRA_API_KEY']
DEEPINFRA_ENDPOINT = os.environ['DEEPINFRA_ENDPOINT']
DEEPINFRA_MODEL_NAME = os.environ['DEEPINFRA_MODEL_NAME']

LOKAALBESLIST_LOCATION_ID = os.environ['LOKAALBESLIST_LOCATION_ID']
LOKAALBESLIST_SEARCH_ENDPOINT = os.environ['LOKAALBESLIST_SEARCH_ENDPOINT']

# Create an OpenAI client using Azure endpoint
client = AzureOpenAI(
    api_key=OPENAI_API_KEY,  
    api_version=OPENAI_DEPLOYMENT_VERSION,
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT
)

# Create an OpenAI client using llama endpoint
client = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url=DEEPINFRA_ENDPOINT,
)

In [3]:
import random
import json
import re

#General response generation function for openai and ollama

def generate_response_openai(system_message,prompt_string, stream=False, json_mode=False, context_length=4096):

    # Set the response_format based on the json_mode parameter
    response_format = {"type": "json_object"} if json_mode else None

    response = client.chat.completions.create(
        model= OPENAI_DEPLOYMENT_NAME,
        response_format=response_format,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt_string},
        ],
        temperature=0.0,
        max_tokens=context_length,
        stream=stream,
        )

    return response

#General structuring functions for prompting

def generate_prompt(task, context, system_message, include_system_message=True):
    if include_system_message:
        return f"####\nContext: {context}\n####\nInstructions:  {system_message} \n####\n{task}", system_message
    else:
        return f"####\nContext: {context}\n####\n{task}", system_message

# Specific prompt generation functions where json responses are required
def generate_json_prompt(task, context, include_system_message=True):
    system_message = "Your task is to generate responses in JSON format. Ensure that your output strictly follows the provided JSON structure. Each key in the JSON should be correctly populated according to the instructions given. Pay attention to details and ensure the JSON is well-formed and valid."
    return generate_prompt(task, context, system_message, include_system_message)

# Specific prompt generation functions for RAG tasks
def generate_rag_prompt(task, context, include_system_message=True):
    system_message = """You are a Retrieval Augmented Generator (RAG). Your task is to complete tasks and answer questions based on the provided context.
1. Language: Respond only in Dutch.
2. Relevance: Use only the information from the provided context to form your response. If the context does not contain relevant information to the question or task, respond with 'Sorry, ik kan de gevraagde informatie niet terugvinden.'
3. Only respond to questions related to the context."""
    return generate_prompt(task, context, system_message, include_system_message)



#Cleaning and processing returned data

def clean_json_string(json_string):
    pattern = r'^```json\s*(.*?)\s*```$'
    cleaned_string = re.sub(pattern, r'\1', json_string, flags=re.DOTALL)
    return cleaned_string.strip()

def extract_json(response):
    #remove trailing and leading json formatting
    response_text = response.choices[0].message.content
    response_text = clean_json_string(response_text)
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        print(f"Skipping invalid JSON in response: {response_text}")
        return None

def clean_tasks(tasks):
    cleaned_tasks = []

    for task in tasks:
        if 'response' not in task or task['response'] is None:
            continue
        response = clean_json_string(task['response'])
        try:
            json_response = json.loads(response)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON in response: {response}")
            continue
        cleaned_tasks.append(task)

    return cleaned_tasks

def balance_tasks(tasks, max_per_freq=150):
    frequency = {}
    balanced_tasks = []

    for task in tasks:
        response = task['response']
        json_response = json.loads(response)
        num_translations = len(json_response['translations'])
        if num_translations not in frequency:
            frequency[num_translations] = 0
        if frequency[num_translations] < max_per_freq:
            frequency[num_translations] += 1
            balanced_tasks.append(task)

    sorted_frequency = dict(sorted(frequency.items(), key=lambda item: item[0]))
    print(sorted_frequency)

    return balanced_tasks

def store_tasks(tasks, output_file):
    with open(output_file, 'w') as f:
        for task in tasks:
            json.dump(task, f)
            f.write('\n')



#Fuction to load
def load_data_from_file(filename):
    data = []
    with open(filename, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

#Function to save samples and responses to file
def get_task_info(task_name, category, sub_category ,  input_data_type = "bpmn", response_data_type ="json" , language = "Dutch"):
    task_info = {
    "task_name": task_name,
    "category": category,
    "sub_category": sub_category,
    "input_data_type": input_data_type,
    "response_data_type": response_data_type,
    "language": language
    }
    return task_info

def save_results_to_file(task_info, user_input, task, context, system_message, prompt_string, response):
    # Create the filename
    filename = f"{task_info['task_name']}_openai_dataset.jsonl"

    #extract message and meta from response
    response_json = json.loads(response.model_dump_json())
    response_text = response_json["choices"][0].pop("message", {"content":"Sorry, ik kan de gevraagde informatie niet terugvinden."})["content"]

    #use remaining information as meta
    meta = response_json
    
    meta["task_info"] = task_info

    # Create the data to save
    data = {
        'id': str(uuid.uuid4()),  # Generate a new UUID for the ID
        'user_input': user_input,
        'task': task,
        'context': context,
        'system_message': system_message,
        'prompt': prompt_string,
        'response': response_text,
        'meta': meta
    }

    # Save the data to the file
    with open(filename, 'a' if os.path.exists(filename) else 'w') as f:  # Append to the file if it exists
        f.write(json.dumps(data) + '\n')  # Write the data as a JSON string followed by a newline

    #print(f"Results saved to {filename}")



# Keyword extraction task

In [51]:
import os
import time
from tqdm import tqdm
from library.BPMNGraph import BPMNGraph

def generate_keyword_task(context, include_system_message=True):

    task = """
    
        You are an expert in extracting keywords from text data based on a provided context. Your task is to identify the most relevant keywords in the given text provided in the context.
    
        Extract the keywords from the document provided in context. Ensure that the keywords are accurate and specific to the context provided.
        Focus on general themes, organizations, locations, etc. These keywords should provide a high-level overview of the document's content.
        Order the keywords based on their relevance and importance in the document.
    
        Desired Output:
        Output the keywords in a JSON format, like this:
        {
            "keywords": ["keyword1", "keyword2", "keyword3"]
        }
    """

    prompt, system = generate_json_prompt(task, context, include_system_message)

    return prompt, system, task, {"context": context}

def get_data_bpmn(graph, with_documentation=True):
    translation_jobs = []
    for id, node in graph.get_nodes():
        node_name = node["name"]
        
        if with_documentation:
            node_documentation = node["documentation"]
            if node_name != '' or node_documentation != '':
                translation_jobs.append({
                    'id': id,
                    'name': node_name,
                    'documentation': node_documentation
                })
        else:
            if node_name != '':
                translation_jobs.append({
                    'id': id,
                    'name': node_name
                })

    return translation_jobs

import json

def extract_keywords_text(text):
    # Generate the classification task
    prompt_string, system_message, _, _ = generate_keyword_task(text)

    # Generate the response using OpenAI (or any other method)
    response = generate_response_openai(system_message, prompt_string, stream=False, json_mode=True)

    return response.choices[0].message.content

def extract_keywords_agendapunt(agendapunt):
    # Convert the JSON object to a string
    text = json.dumps(agendapunt)

    # Generate the classification task
    prompt_string, system_message, task_string, context = generate_keyword_task(text)

    # Generate the response using OpenAI (or any other method)
    response = generate_response_openai(system_message, prompt_string, stream=False, json_mode=True)
    response_json = extract_json(response)

    if response_json is not None:
        task_info = get_task_info("keywords_agendapunt", "keywords", "keywords_agendapunt", "json", "json")
        save_results_to_file(task_info, text, task_string, context, system_message, prompt_string, response)

    return response_json

def extract_keywords_bpmn(file_path):
    # Create a BPMN graph
    graph = BPMNGraph(file_path)

    classification_data = get_data_bpmn(graph)
    classification_text = json.dumps(classification_data)

    prompt_string, system_message, task_string, context = generate_keyword_task(classification_text)
    response = generate_response_openai(system_message,prompt_string, stream=False, json_mode=True)
    response_json = extract_json(response)
    if response_json is not None:
        task_info = get_task_info("keywords_bpmn", "keywords", "keywords_bpmn", "bpmn", "json")
        save_results_to_file(task_info, file_path, task_string, context, system_message, prompt_string, response)
    return response_json

def extract_bpmn_folder(folder_path, taxonomy):
    # Get all .bpmn files in the folder
    bpmn_files = [f for f in os.listdir(folder_path) if f.endswith('.bpmn')]

    # Translate each file, progress bar
    for file in tqdm(bpmn_files, "Extracting keywords BPMN files"):
        file_path = os.path.join(folder_path, file)
        extract_keywords_bpmn(file_path, taxonomy)

        # Sleep for a while after each call
        time.sleep(1)


In [53]:
# Sample n random rows from the data
file_path = "data/besluitenVlaanderen.json"

def prepare_data(binding):
    # Get the 'sub' value
    sub = binding.get('url', {}).get('value', binding.get('sub', {}).get('value', ''))

    # Get the 'beschrijving' value
    beschrijving = ''
    if 'titel' in binding:
        beschrijving += binding['titel']['value'] + " : "
    beschrijving += binding['beschrijving']['value']
    beschrijving = beschrijving.strip()

    return {
        "text": beschrijving,
        "uri": sub
    }

def process_json_file(file_path, num_skips=0, num_samples=-1):
    errors = 0
    seen_texts = set()
    dataset = []

    # Read the JSON file
    with open(file_path, 'r') as f:
        data = json.load(f)

    bindings = data['results']['bindings']
    random.shuffle(bindings)

    # Process each binding
    for binding in tqdm(bindings[num_skips:num_samples]):
        data_tmp = prepare_data(binding)

        # Skip if this text has already been seen or is empty
        if data_tmp['text'] in seen_texts or not data_tmp['text'].strip():
            errors += 1
            continue

        # Add the text to the set of seen texts
        seen_texts.add(data_tmp['text'])

        dataset.append(data_tmp)

    return dataset, errors

dataset, errors = process_json_file(file_path, num_samples=5000)

100%|██████████| 5000/5000 [00:00<00:00, 166527.86it/s]


In [55]:
import random

n = 1  # Number of keywords to select

#processed_keywords = set()

for data in dataset[0:2000]:
    keywords = extract_keywords_agendapunt(data)
    print(f"Keywords: {keywords}")

    # Randomly select n keywords
    #selected_keywords = random.sample(keywords["keywords"], min(n, len(keywords["keywords"])))

    if False:
        selected_keywords = random.sample(keywords["keywords"][:3], min(n, len(keywords["keywords"])))

        for keyword in selected_keywords:
            if keyword not in processed_keywords:
                summary = generate_summary_agendapunten(keyword, 20)
                processed_keywords.add(keyword)
                time.sleep(5)


Keywords: {'keywords': ['ontslag', 'administratief medewerker', 'pensioenleeftijd', 'college']}
Keywords: {'keywords': ['vergunning', 'Deputatie', 'Limbourg Claire', 'Lindeveldstraat', 'Geraardsbergen', 'kadastraal', 'verkavelen', 'open bebouwing', 'halfopen bebouwing']}
Keywords: {'keywords': ['college van burgemeester en schepen', '25 mei 2023', 'Basic Fit', 'brandveiligheidsattest B']}
Keywords: {'keywords': ['Project groenblauwe dooradering', 'participatiemoment', 'communicatie', 'organisatie', 'goedkeuring', '30 mei 2023', '1 juli 2023']}
Keywords: {'keywords': ['vast bureau', 'financieel beheer', 'OCMW', 'bestelbonnen', 'aanrekeningen', 'vorderingen']}
Keywords: {'keywords': ['college', 'proces-verbaal', 'toelichting', 'selectiecommissie', 'bevorderingsprocedure', 'voltijdse betrekkingen', 'geschoold arbeid(st)er', 'rein- en ruimdienst', 'niveau D', 'contractueel verband', 'onbepaalde duur']}
Keywords: {'keywords': ['Overheidsopdracht van leveringen', 'Raamovereenkomst', 'leermat

# Summarization tasks

In [6]:
import requests

#for simplicity, we will use the search endpoint of the LokaalBeslist API to get the agenda items

def get_agenda_items(search_content, location_id=None, max_results=2):
    # Define the URL and query parameters

    params = {
        "page[size]": max_results,
        "page[number]": 0,
        "filter[:fuzzy:search_content]": search_content,
        "sort[session_planned_start.field]": "desc"
    }

    if location_id is not None:
        params["filter[:has:search_location_id]"] = "t"
        params["filter[:terms:search_location_id]"] = location_id

    # Define the headers
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en-US,en;q=0.9,nl;q=0.8,fr;q=0.7,it;q=0.6,es;q=0.5",
        "Cache-Control": "no-cache",
        "Referer": "https://lokaalbeslist.vlaanderen.be/agendapunten?gemeentes=Gent&trefwoord=blaarmeersen",
        "Sec-Ch-Ua": '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
    }

    # Optionally, include cookies if needed
    cookies = {
        "shortscc": "2",
        "proxy_session": "QTEyOEdDTQ.el1yihjiovv1IzOv4xxDo-2k783xBilgm6DzC8M2hNHVONzg6cz0Q_toOg4.XIPR-bEY0du7d-P0.phHm39QaT8g7-y7fxaQ9XYA5M0D37dxxvxmJSbRGWk3nVHXjGbWanMk8nrl6rXQiKfW7M8VvPhezCzWQgSxXCcrRvQMkJyYqt6ggnhD-A6Bmji1NnUhYmbDO9oIvtSnBIkg5d3DLwlvMGJOjJDGu66wkVLezSXSVGQbShnMv9yMv8FN0IDHruobpWWYr1JQcj71pAYL-WCCFS2KjuBDL.vIGRc-k1TAO5I922gCr_vwDnt"
    }

    # Make the GET request
    response = requests.get(LOKAALBESLIST_SEARCH_ENDPOINT, headers=headers, params=params, cookies=cookies)

    # Check the response status
    if response.status_code == 200:
        # Return the response content
        return response.json()
    else:
        raise Exception(f"Request failed with status code: {response.status_code}")

def process_results(results):
    processed_results = []
    for result in results:
        if result['type'] == 'agenda-item':
            title = result['attributes']['title']
            description = result['attributes']['description']

            # Use the resolution title and description if the title and description are None
            if title is None:
                title = result['attributes']['resolution_title']
            if description is None:
                description = result['attributes']['resolution_description']

            processed_result = {
                'title': title,
                'description': description,
                'uuid': result['id'],
                'uri': result['attributes']['uri']
            }
            processed_results.append(processed_result)
    return processed_results

def get_context_for_llm(search_term, location_id = None, max_results=2):
    # Get the agenda items related to the search term
    agenda_items = get_agenda_items(search_term, location_id, max_results)
    num_results = agenda_items['count']
    formatted_results = process_results(agenda_items['data'])
    
    # Format the results into a string
    context = f"Found {min(num_results, len(formatted_results))} agenda items:\n"
    for i, result in enumerate(formatted_results, start=1):
        context += f"{i}. {result['title']} ({result['uri']})\n"
        if result['description']:
            context += f"\t+ Description: {result['description']}\n"

    # Return the context
    return context, formatted_results


In [7]:
def overview_agendapunten_task(search_term, context, include_system_message=True):
    task=f"""Task: Provide an overview of all the relevant items related to '{search_term}' mentioned in the context.
For each item, provide a detailed summary of what the item is about and the link at the end. Group related items by topic and use topic headers. Clarity: Ensure the summaries are clear and straightforward, avoiding unnecessary jargon, verbose descriptions and unnecessary details.

      
Format:
- **Topic Header**: Start with a topic header to group related items.
    - **Title**:
        - **Samenvatting**: A clear and concise summary of the item, including the main points without complex jargon, simple in grammar and structure.
        - **Link**: Provide the link in brackets at the end.

Example:
Overzicht van de agendapunten met betrekking tot 'Scaldisstraat':
- **Verkeersregelment**:
    - *Wijziging van het aanvullend reglement - Scaldisstraat (inclusief regularisatie en nieuwe parkeerplaats voor autodelen):**
        - **Samenvatting**: Dit voorstel gaat over het aanpassen van de verkeersregels voor de Scaldisstraat, inclusief het reguleren van bestaande situaties die niet volgens de regels zijn en het creëren van een nieuwe parkeerplaats speciaal voor autodelen.
        - **Link**: https://data.gent.be/id/agendapunten/22.0111.1538.0398
    ...
- **Nuts- en infrastructuurwerken**:
    - *Toelating voor het uitvoeren van nuts- en infrastructuurwerken - Scaldisstraat 50:**
        - **Samenvatting**: De aanvraag is om Farys - Klantenwerken - Gent toestemming te geven voor het uitvoeren van nuts- en infrastructuurwerken op Scaldisstraat 50, voornamelijk voor het plaatsen van een drinkwateraftakking.
        - **Link**: https://data.gent.be/id/agendapunten/23.1017.3020.9047
...
"""

    return generate_rag_prompt(task, context, include_system_message)

search_term = "landbouwloodsen"
max_results = 40

context, results = get_context_for_llm(search_term,None, max_results)
prompt_string, system_message = overview_agendapunten_task(search_term, context, False)
print(f"{prompt_string}")

####
Context: Found 40 agenda items:
1. RMT-VGN-2023-0949-AGPP-DEP-01 (http://data.lblod.info/id/agendapunten/6645E08D076AEA6F2CBED875)
	+ Description: Boutersem-Aanvraag voor stedenbouwkundige handelingen en voor ingedeelde inrichtingen en activiteiten ingediend door Dehertogh voor de bouw van een nieuwe landbouwloods en het aanleggen van infiltratievoorziening en het veranderen van de varkenshouderij, Broekstraat 24
2. RMT-VGN-2023-1062-BGP-DEP-01 (http://data.lblod.info/id/agendapunten/6645DCE8076AEA6F2CBED7CF)
	+ Description: Kampenhout-beroep tegen de vergunning onder voorwaarden van 04 december 2023, verleend voor een functiewijziging van landbouwloods naar opslag van materialen, Sint-Servaesstraat
3. RUP Kern Verona Voorlopige vaststelling  (https://bertem.meetingburger.net/gr/8a489877-7fdb-4b29-99d9-c3480b881a88/bertem.meetingburger.net/gr/8a489877-7fdb-4b29-99d9-c3480b881a88#300d49ab-baec-49c4-b013-6654c2da9b02)
	+ Description: RUP Kern Verona Voorlopige vaststelling
4. 2024_C

In [37]:
import pandas as pd
import random
from tqdm import tqdm
import numpy as np
import time

def generate_summary_agendapunten(search_term, max_results=40):

    # Sample data based on the search term
    context, results = get_context_for_llm(search_term, None, max_results)
    
    print(f"Found {len(results)} agenda items for '{search_term}'")

    prompt_string, system_message = overview_agendapunten_task(search_term, context, False)
    #print(f"{prompt_string}")

    response = generate_response_openai(system_message, prompt_string)
    response_text = response.choices[0].message.content
    #json_response = extract_json(response)
    if response_text is not None:
        task_info = get_task_info("summary_agendapunten", "summary", "summary_agendapunten", "json", "string", "Dutch")
        save_results_to_file(task_info, search_term, prompt_string, context, system_message, prompt_string, response)

    return response_text

# Sample n random rows from the data
file_path = "data/besluiten_title_description.csv"
print("\n---------------------------------------------------\n")


#search_term = "Raadslid Jurgen Blomme"
#generate_summary_agendapunten(search_term, 5)


---------------------------------------------------



# translation tasks

In [73]:

def generate_translation_task(context, language, format, include_system_message=False):
    task = f"""Task: Translate all fields of the item in the context to {language}, , except for the 'id' field and any field explicitly marked as 'do not translate'. Place names and IDs should remain in their original language. Return the translated text as a JSON object with the following format:


    {format}
    """
    prompt, system = generate_json_prompt(task, json.dumps(context), include_system_message)
    return prompt, system, task, {"context": context, "language": language, "format": format}

def generate_translation_tasks(context, language, format, include_system_message=False):
    task = f"""Task: Translate all fields of each item in the context to {language}, except for the 'id' field and any field explicitly marked as 'do not translate'. Place names and IDs should remain in their original language. Return the translated text as a JSON array, maintaining the original structure and format of each item:
    {{"translations": [{format}, {format}, ...]}}

    """
    prompt, system = generate_json_prompt(task, json.dumps(context), include_system_message)
    return prompt, system, task, {"context": context, "language": language, "format": format}

general_translation_format = """{{"uuid": "Translated title", 
"Text": "Translated description", 
"source": "Source language", 
"target": "Target language"}}"""

agenda_punten_format = """{{
    "Title": "Translated title",
    "Description": "Translated description",
    "source": "Source language",
    "target": "Target language"}}"""


def test_translation_agendapunt(agendapunt):
    print("\n\n---------------------TRANSLATION TASK---------------------\n\n")
    language = "French"
    prompt_string, system_message, task_string, context = generate_translation_task(agendapunt, language, agenda_punten_format)
    print(prompt_string)
    print(system_message)

    print("\n\n---------------------TRANSLATION RESPONSE---------------------\n\n")

    response = generate_response_openai(system_message,prompt_string)
    response_text = response.choices[0].message.content

    json_repsonse = extract_json(response)

    
    if json_repsonse is not None:
        task_info = get_task_info("translate_agendapunt", "translate", "translate_agendapunt", "json", "json", language)
        save_results_to_file(task_info, search_term, task_string, context, system_message, prompt_string, response)


In [43]:
import os
import time
from tqdm import tqdm
from library.BPMNGraph import BPMNGraph

def get_bpmn_data(graph, with_documentation=True):
    translation_jobs = []
    for id, node in graph.get_nodes():
        node_name = node["name"]
        
        if with_documentation:
            node_documentation = node["documentation"]
            if node_name != '' or node_documentation != '':
                translation_jobs.append({
                    'id': id,
                    'name': node_name,
                    'documentation': node_documentation
                })
        else:
            if node_name != '':
                translation_jobs.append({
                    'id': id,
                    'name': node_name
                })

    return translation_jobs

def translate_bpmn(file_path, languages=["English", "French", "German"]):
    # Create a BPMN graph
    graph = BPMNGraph(file_path)

    translation_jobs = get_bpmn_data(graph, with_documentation=False)
    translation_format = """{'id': 'Do not translate', 'name': 'Translated Name', 'documentation': 'Translated Documentation', 'source': 'Source language', 'target': 'Target language'}"""

    # Initialize progress bar

    for language in languages:
        prompt_string, system_message, task_string, context = generate_translation_tasks(translation_jobs, language, translation_format)
        response = generate_response_openai(system_message,prompt_string, stream=False, json_mode=False, context_length=4096)
        json_response = extract_json(response)  
        if json_response is not None:
            task_info = get_task_info("translate_bpmn", "translate", "translate_bpmn", "bpmn", "json", language)
            save_results_to_file(task_info, file_path, task_string, context, system_message, prompt_string, response)

def translate_bpmn_folder(folder_path, languages=["English", "French", "German"]):
    # Get all .bpmn files in the folder
    bpmn_files = [f for f in os.listdir(folder_path) if f.endswith('.bpmn')]

    # Translate each file, progress bar
    for file in tqdm(bpmn_files, "Translating BPMN files"):
        file_path = os.path.join(folder_path, file)
        translate_bpmn(file_path, languages)

        # Sleep for a while after each call
        time.sleep(1)

#TODO: Rijbewijs, klachtenbehandeling, hoofdprocesProcesMelding (large files) cant fit into the context length

# Translate all BPMN files in the folder
folder_path = "./data/bpmn_to_translate"
translate_bpmn_folder(folder_path, languages=["English", "French", "German"])

In [74]:
import pandas as pd
import random
from tqdm import tqdm
import numpy as np
import time

def sample_data(file_path, n=10000):
    # Load the data from the file
    data = pd.read_csv(file_path)

    # Replace nan values with an empty string
    data = data.fillna('')

    # Sample n random rows
    sampled_data = data.sample(n)

    # Convert the DataFrame to a list of dictionaries
    sampled_data_dicts = sampled_data.to_dict('records')

    # Replace non-breaking spaces with regular spaces
    sampled_data_dicts = [{k: str(v).replace('\xa0', ' ') for k, v in d.items()} for d in sampled_data_dicts]

    return sampled_data_dicts

def generate_translation_training_agendapunten(file_path, num_samples=50000):

    sampled_agendapunten = sample_data(file_path, n=num_samples)

    languages = ["English", "French", "German"]

    # Create an index for slicing
    index = 0

    for _ in tqdm(range(len(sampled_agendapunten) // 10 + 1)):  # +1 to ensure we cover all items
        # Choose a random batch size between 1 and 10
        batch_size = np.random.choice(range(2, 11))

        # Get the batch from the list
        batch = sampled_agendapunten[index:index + batch_size]

        random_language = random.choice(languages)

        agenda_punten_format = """{{
            "uri": "Do not translate",
            "title": "Translated title",
            "description": "Translated description",
            "source": "Source language",
            "target": "Target language"}}"""

        prompt_string, system_message, task_string, context = generate_translation_tasks(batch, random_language, agenda_punten_format)

        response = generate_response_openai(system_message,prompt_string)
        response_text = response.choices[0].message.content
        json_response = extract_json(response)
        if json_response is not None:
            task_info = get_task_info("translate_agendapunten", "translate", "translate_agendapunten", "json", "json", random_language)
            save_results_to_file(task_info, random_language, task_string, context, system_message, prompt_string, response)

        # Sleep for a while after each call
        time.sleep(1)

        # Update the index for the next batch
        index += batch_size

# Sample n random rows from the data
file_path = "data/besluiten_title_description.csv"
#generate_translation_training_agendapunten(file_path, 10000)

# Classification methods

In [94]:
import os
import time
from tqdm import tqdm
from library.BPMNGraph import BPMNGraph

def generate_classification_task(context, taxonomy, include_system_message=True):

    task_string = """

    You are an expert in classifying text data based on a provided context and a predefined hierarchy. The hierarchy is provided in JSON format. Your task is to classify the given text according to this hierarchy and the context provided. If none of the existing classes or subclasses apply, you may generate new ones.

    Here is the classification hierarchy:

    {taxonomy}

    Classify the document provided in context according to the hierarchy. If no suitable class exists within the provided hierarchy, suggest a new one. A document can belong to multiple classes or subclasses. Ensure that the classification is accurate and specific to the context provided.

    Desired Output:
    Output the classification path or the new class suggestion in a JSON format, like this:
    {{
        "classification": {{
            "Category1": ["Subcategory1", "Subcategory2"],
            "Category2": ["Subcategory3"]
        }}
    }}
    or
    {{
        "classification": {{
            "name_new_category_1": ["name_new_subcategory_1"],
            "name_new_category_2": ["name_new_subcategory_2", "name_new_subcategory_3"]
        }}
    }}

    """

    task = task_string.format(taxonomy=json.dumps(taxonomy, indent=4))
    prompt, system = generate_json_prompt(task, context, include_system_message)

    return prompt, system, task, {"context": context, "taxonomy": taxonomy}

def get_classification_data(graph, with_documentation=True):
    translation_jobs = []
    for id, node in graph.get_nodes():
        node_name = node["name"]
        
        if with_documentation:
            node_documentation = node["documentation"]
            if node_name != '' or node_documentation != '':
                translation_jobs.append({
                    'id': id,
                    'name': node_name,
                    'documentation': node_documentation
                })
        else:
            if node_name != '':
                translation_jobs.append({
                    'id': id,
                    'name': node_name
                })

    return translation_jobs

import json

def classify_text(text, taxonomy):
    # Generate the classification task
    prompt_string, system_message, _, _ = generate_classification_task(text, taxonomy)

    # Generate the response using OpenAI (or any other method)
    response = generate_response_openai(system_message, prompt_string, stream=False, json_mode=False)

    return response.choices[0].message.content

def classify_json(json_object, taxonomy):
    # Convert the JSON object to a string
    text = json.dumps(json_object)

    # Classify the text
    response_text = classify_text(text, taxonomy, context=json_object)
    response_json = extract_json(response_text)
    return response_json

def classify_agendapunt(agendapunt, taxonomy):
    # Convert the JSON object to a string
    text = json.dumps(agendapunt)

    # Generate the classification task
    prompt_string, system_message, task_string, context = generate_classification_task(text, taxonomy)

    # Generate the response using OpenAI (or any other method)
    response = generate_response_openai(system_message, prompt_string, stream=False, json_mode=False)
    response_json = extract_json(response)

    if response_json is not None:
        task_info = get_task_info("classification_agendapunt", "classification", "classification_agendapunt", "json", "json") 
        save_results_to_file(task_info, text, task_string, context, system_message, prompt_string, response)

def classify_bpmn(file_path, taxonomy):
    # Create a BPMN graph
    graph = BPMNGraph(file_path)

    classification_data = get_classification_data(graph)

    prompt_string, system_message, task_string, context = generate_classification_task(classification_data, taxonomy)
    response = generate_response_openai(system_message,prompt_string, stream=False, json_mode=False)
    response_json = extract_json(response)
    if response_json is not None:
        task_info = get_task_info("classification_bpmn", "classification", "classification_bpmn", "bpmn", "json")
        save_results_to_file("classification_bpmn", file_path, task_string, context, system_message, prompt_string, response)

def classify_bpmn_folder(folder_path, taxonomy):
    # Get all .bpmn files in the folder
    bpmn_files = [f for f in os.listdir(folder_path) if f.endswith('.bpmn')]

    # Translate each file, progress bar
    for file in tqdm(bpmn_files, "Classify BPMN files"):
        file_path = os.path.join(folder_path, file)
        classify_bpmn(file_path, taxonomy)

        # Sleep for a while after each call
        time.sleep(1)


In [98]:
import pandas as pd
import random
from tqdm import tqdm
import numpy as np
import time

def sample_data(file_path, n=10000):
    # Load the data from the file
    data = pd.read_csv(file_path)

    # Replace nan values with an empty string
    data = data.fillna('')

    # Sample n random rows
    sampled_data = data.sample(n)

    # Convert the DataFrame to a list of dictionaries
    sampled_data_dicts = sampled_data.to_dict('records')

    # Replace non-breaking spaces with regular spaces
    sampled_data_dicts = [{k: str(v).replace('\xa0', ' ') for k, v in d.items()} for d in sampled_data_dicts]

    return sampled_data_dicts

def generate_classification_training_agendapunten(file_path,taxonomy, num_samples=10000):
    sampled_agendapunten = sample_data(file_path, n=num_samples)

    for sample in tqdm(sampled_agendapunten):  # +1 to ensure we cover all items
        classify_agendapunt(sample, taxonomy)

        
taxonomy_agendapunten = json.load(open("agendapunten_categories.json"))

# Sample n random rows from the data
file_path = "data/besluiten_title_description.csv"
generate_classification_training_agendapunten(file_path, taxonomy_agendapunten, 10000)

 25%|██▌       | 2527/10000 [18:38:58<55:09:05, 26.57s/it]    


APIConnectionError: Connection error.