# Setup and Config

In [1]:
!pip3 install openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [2]:
def load_file_to_string(file_path, encoding='utf-8'):
  """Loads the content of a file into a string.

  Args:
    file_path: The path to the file.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so
      the result does not depend on the platform's locale encoding.

  Returns:
    The content of the file as a string.

  Raises:
    OSError: If the file cannot be opened (e.g. it does not exist).
    UnicodeDecodeError: If the file content is not valid in `encoding`.
  """
  with open(file_path, 'r', encoding=encoding) as file:
    return file.read()

In [4]:
# Read the axiom file once; its text is used as the schema in every prompt.
# `file_string` and `schema` both refer to the same loaded text.
file_path = '../KWG/Complex/Axiomatization/Axioms.txt'
schema = file_string = load_file_to_string(file_path)

In [4]:
# Define a list of competency questions (CQs).
# Each entry is a natural-language question; the cells below substitute it
# into the prompt template and ask each model for a matching SPARQL query.
CQs = [
    "What are all the hazard events, their name, start date, end date?",
    "What are all the places, their names?"
    ]

In [5]:
# Fill the {placeholder} slots of a prompt template
def fill_prompt_template(template_text, values_dict):
    """Replaces `{key}` placeholders in a template with values from a dict.

    All placeholders are substituted in a single pass over the template, so
    text inserted for one key is never re-scanned. A value that happens to
    contain `{other_key}` (for example braces inside the schema text) is kept
    verbatim instead of being overwritten by a later replacement.

    Args:
        template_text: Template string with placeholders written as `{key}`.
        values_dict: Mapping of placeholder name to replacement. Keys and
            values are converted with `str()`.

    Returns:
        The template with every known placeholder replaced. Placeholders that
        have no entry in `values_dict` are left untouched.
    """
    import re  # local import keeps this cell runnable on its own

    if not values_dict:
        return template_text

    replacements = {str(key): str(value) for key, value in values_dict.items()}
    # Longest keys first so a key is never shadowed by a shorter alternative.
    alternatives = sorted(replacements, key=len, reverse=True)
    pattern = re.compile(r"\{(" + "|".join(map(re.escape, alternatives)) + r")\}")
    # A function replacement inserts values literally (no backslash or
    # group-reference interpretation, unlike a replacement string).
    return pattern.sub(lambda match: replacements[match.group(1)], template_text)

In [6]:
# Temperature Config for LLM.
# This single value is passed as the sampling temperature to the Mistral,
# Gemini and GPT-4o requests made later in the notebook.
temperature = 0.8

In [7]:
# Define the initial system message for the language model.
# The literal must open with exactly three quotes: a fourth one is not part of
# the delimiter and would become a stray '"' as the first character of the
# system prompt sent to every model.
initial_system_message = """
You are an expert in knowledge graphs and SPARQL query generation. Your task is to generate SPARQL queries based on provided competency questions and a given schema.

Guidelines:
Use the schema provided in the context block to determine appropriate classes, properties, and relationships.
 - Ensure queries follow SPARQL syntax and use prefixes correctly.
 - Generate queries that efficiently retrieve relevant data while optimizing performance.
 - If multiple valid queries exist, choose the most concise and efficient one.
 - Preserve the intent of the competency question while ensuring syntactic correctness.
 - Give only the SPARQL query and nothing else
"""

In [8]:
# Define the template prompt for the language model.
# {Insert_CQ_here} and {Insert_schema_here} are placeholders: the driver cells
# pass them as keys to fill_prompt_template(), which swaps in one competency
# question and the loaded schema text.
template_prompt = """
Task:
Write a SPARQL query that answers the following competency question:
{Insert_CQ_here}

Requirements:
- Use the schema to determine correct URIs and relationships.
- Ensure the query retrieves the necessary information efficiently.
- Provide the full SPARQL query without placeholders.

Context:
Below is the schema of a knowledge graph:
{Insert_schema_here}
"""

# Mistral

In [7]:
# Set the Mistral API key.
# The key is read from the MISTRAL_API_KEY environment variable so the secret
# is never typed into (and saved with) the notebook. When the variable is not
# set the value falls back to an empty string, as before.
import os

mistral_api_key = os.environ.get("MISTRAL_API_KEY", "")

In [8]:
!pip install mistralai



In [9]:
# Import necessary libraries for Mistral
import os
from mistralai import Mistral

In [10]:
# Initialize the Mistral client with the API key set in the earlier cell.
# This module-level client is the one inference_with_mistral() calls.
mistral_client = Mistral(api_key=mistral_api_key)

In [11]:
# Define a function to perform inference with Mistral
def inference_with_mistral(prompt, model = "mistral-large-2411"):
  """Performs inference with the Mistral language model.

  Sends a two-message conversation through the notebook-level
  `mistral_client`: `initial_system_message` as the system turn and `prompt`
  as the user turn, sampled at the notebook-level `temperature`.

  Args:
    prompt: The prompt string.
    model: The name of the Mistral model to use.

  Returns:
    A tuple `(text, raw_json)`: the message content of the first returned
    choice, and the whole response serialized as indented JSON.
  """
  mistral_messages = [
      {"role": "system", "content": initial_system_message},
      {"role": "user", "content": prompt},
  ]
  chat_response = mistral_client.chat.complete(
      model=model,
      messages=mistral_messages,
      temperature=temperature,
  )

  # Each call is a fresh single-turn conversation, so the assistant reply is
  # returned directly rather than appended to a message list nobody reads.
  response_text = chat_response.choices[0].message.content
  return response_text, chat_response.model_dump_json(indent=4)

In [12]:
# Import necessary libraries for data processing
import pandas as pd
import time

# Ask Mistral for a SPARQL query per competency question, one row per CQ
separator = "*" * 10
cq_mistral_results = []
for cq in CQs:
  print(separator)
  input_data = dict(Insert_CQ_here=cq, Insert_schema_here=schema)
  filled_prompt = fill_prompt_template(template_prompt, input_data)
  mistral_analysis_result, mistral_analysis_raw = inference_with_mistral(filled_prompt)
  time.sleep(1)  # pause between calls (presumably to stay under API rate limits)
  row = (cq, filled_prompt, mistral_analysis_result, mistral_analysis_raw)
  cq_mistral_results.append(row)
  print(cq)
  print(separator)
  print("\n\n\n")

# Tabulate the collected rows
mistral_columns = ['CQ', 'Prompt', 'Mistral_Analysis_Result', 'Mistral_Analysis_Raw']
df = pd.DataFrame(cq_mistral_results, columns=mistral_columns)

# Write the table next to the notebook, named after the schema file
excel_file_path = file_path.split("/")[-1] + "_cq_mistral_results.xlsx"
df.to_excel(excel_file_path, index=False)

print("Excel file saved to: " + excel_file_path)

**********
What are all the hazard events, their name, start date, end date?
**********




**********
What are all the places, their names?
**********




Excel file saved to: cq_mistral_results.xlsx


# Gemini

In [13]:
# Set the Gemini API key.
# The key is read from the GEMINI_API_KEY environment variable so the secret
# is never typed into (and saved with) the notebook. When the variable is not
# set the value falls back to an empty string, as before.
import os

gemini_api_key = os.environ.get("GEMINI_API_KEY", "")

In [14]:
# Import necessary libraries for Gemini
from google import genai
from google.genai import types

# Initialize the Gemini client with the API key set in the previous cell.
# This module-level client is the one inference_with_gemini() calls.
gemini_client = genai.Client(api_key=gemini_api_key)

In [15]:
# Define a function to perform inference with Gemini
def inference_with_gemini(prompt, model = "gemini-2.0-flash"):
  """Queries a Gemini model using the notebook-wide system instruction.

  The request goes through the notebook-level `gemini_client`, with
  `initial_system_message` as the system instruction and the notebook-level
  `temperature` as the sampling temperature.

  Args:
    prompt: The prompt string, sent as the request contents.
    model: The name of the Gemini model to use.

  Returns:
    A tuple `(text, raw_json)`: the response text, and the whole response
    serialized as indented JSON.
  """
  generation_config = types.GenerateContentConfig(
      system_instruction=initial_system_message,
      temperature=temperature,
  )
  response = gemini_client.models.generate_content(
      model=model,
      config=generation_config,
      contents=prompt,
  )
  raw_json = response.model_dump_json(indent=4)
  return response.text, raw_json

In [16]:
# Import necessary libraries for data processing
import pandas as pd
import time

# Ask Gemini for a SPARQL query per competency question, one row per CQ
cq_gemini_results = []
for cq in CQs:
  print(10 * "*")
  input_data = {}
  input_data["Insert_CQ_here"] = cq
  input_data["Insert_schema_here"] = schema
  filled_prompt = fill_prompt_template(template_prompt, input_data)
  gemini_analysis_result, gemini_analysis_raw = inference_with_gemini(filled_prompt)
  time.sleep(1)  # pause between calls (presumably to stay under API rate limits)
  cq_gemini_results.append(
      (cq, filled_prompt, gemini_analysis_result, gemini_analysis_raw)
  )
  print(cq)
  print(10 * "*")
  print("\n\n\n")

# Tabulate the collected rows
df = pd.DataFrame(
    cq_gemini_results,
    columns=['CQ', 'Prompt', 'Gemini_Analysis_Result', 'Gemini_Analysis_Raw'],
)

# Write the table next to the notebook, named after the schema file
schema_file_name = file_path.split("/")[-1]
excel_file_path = schema_file_name + "_cq_Gemini_results.xlsx"
df.to_excel(excel_file_path, index=False)

print("Excel file saved to: " + excel_file_path)

**********
What are all the hazard events, their name, start date, end date?
**********




**********
What are all the places, their names?
**********




Excel file saved to: cq_Gemini_results.xlsx


# GPT-4o

In [9]:
!pip3 install openai --upgrade

Collecting openai
  Downloading openai-1.71.0-py3-none-any.whl.metadata (25 kB)
Downloading openai-1.71.0-py3-none-any.whl (598 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.0/599.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.70.0
    Uninstalling openai-1.70.0:
      Successfully uninstalled openai-1.70.0
Successfully installed openai-1.71.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [10]:
import openai
import json
import time
import pandas as pd

In [None]:
# Initialize OpenAI.
# The key is read from the OPENAI_API_KEY environment variable so the secret
# is never typed into (and saved with) the notebook. When the variable is not
# set the value falls back to an empty string, as before.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [13]:
# 1. Prepare batch input
# Writes one JSON object per line (JSONL): one chat-completion request per
# competency question, tagged with a "cq-<index>" custom_id so the results
# can be matched back to their question later.
# The inner quotes differ from the f-string's own quotes on purpose: reusing
# the same quote character inside an f-string is a SyntaxError before
# Python 3.12.
batch_input_path = f"{file_path.split('/')[-1]}_gpt4o_batch_input.jsonl"
with open(batch_input_path, "w", encoding="utf-8") as f:
    for i, cq in enumerate(CQs):
        custom_id = f"cq-{i}"
        input_data = {
            "Insert_CQ_here": cq,
            "Insert_schema_here": schema
        }
        filled_prompt = fill_prompt_template(template_prompt, input_data)
        prompt_payload = {
            "custom_id": custom_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "messages": [
                    {"role": "system", "content": initial_system_message},
                    {"role": "user", "content": filled_prompt}
                ],
                "temperature": temperature,
            }
        }
        f.write(json.dumps(prompt_payload) + "\n")

In [14]:
# 2. Upload file
# The context manager closes the local file handle as soon as the upload call
# returns, instead of leaving it open for the rest of the kernel session.
with open(batch_input_path, "rb") as batch_input_file:
    file_upload = openai.files.create(file=batch_input_file, purpose="batch")
file_id = file_upload.id

In [15]:
# 3. Create batch
# Submit the uploaded JSONL file as a chat-completions batch job
batch = openai.batches.create(
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=file_id,
)
batch_id = batch.id
print("Batch started with ID:", batch_id)

Batch started with ID: batch_67f53335b0d48190b43c143094e182b3


In [16]:
# 4. Poll until done
# Fetch the batch, report its status, and keep re-checking every 10 seconds
# until it reaches one of the four statuses that end the loop.
current = openai.batches.retrieve(batch_id)
print(f"Status: {current.status}")
while current.status not in ["completed", "failed", "cancelled", "expired"]:
    time.sleep(10)
    current = openai.batches.retrieve(batch_id)
    print(f"Status: {current.status}")
print(f"Batch completed with status: {current.status}")

Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: completed
Batch completed with status: completed


In [17]:
# 5. Download results
# The finished batch points at its output file; fetch that file's text.
output_file_id = current.output_file_id
if output_file_id is not None:
    output_file = openai.files.content(output_file_id)
    batch_output = output_file.text
else:
    # Nothing to download when the batch has no output file.
    raise ValueError("No output file available.")

In [18]:
# 6. Parse and match results
# Each non-empty line of the batch output is one JSON record; index the
# records by custom_id so they can be matched to their competency question.
results_dict = {}
for line in batch_output.split("\n"):
    if not line.strip():
        # Blank lines (including a completely empty output file) are skipped
        # instead of being handed to json.loads, which would raise.
        continue
    data = json.loads(line)
    custom_id = data["custom_id"]
    # A record without a usable response (null "response", or a body with no
    # "choices" -- presumably a failed request, confirm against the Batch API
    # docs) yields an empty result. The full record is still kept in "raw",
    # so any error detail remains visible in the spreadsheet.
    body = (data.get("response") or {}).get("body") or {}
    choices = body.get("choices") or []
    response_text = choices[0]["message"]["content"] if choices else ""
    results_dict[custom_id] = {
        "result": response_text,
        "raw": json.dumps(data, indent=4)
    }

In [19]:
# 7. Map results back to original CQs
# Rebuild each prompt and pair it with the record stored under the same
# "cq-<index>" id; a question with no record gets empty placeholders.
cq_gpt4o_results = []
for position, question in enumerate(CQs):
    prompt_for_question = fill_prompt_template(
        template_prompt,
        {"Insert_CQ_here": question, "Insert_schema_here": schema},
    )
    matched = results_dict.get(f"cq-{position}", {"result": "", "raw": "{}"})
    cq_gpt4o_results.append(
        (question, prompt_for_question, matched["result"], matched["raw"])
    )

In [22]:
# 8. Save to Excel
# One row per competency question, written to a workbook named after the
# schema file.
gpt4o_columns = ["CQ", "Prompt", "GPT4o_Result", "GPT4o_Raw"]
df = pd.DataFrame(cq_gpt4o_results, columns=gpt4o_columns)
excel_file_path = file_path.split('/')[-1] + "_cq_GPT4o_results.xlsx"
df.to_excel(excel_file_path, index=False)
print("Excel file saved to: " + excel_file_path)

Excel file saved to: Axioms.txt_cq_GPT4o_results.xlsx
