# Setup and Config

In [1]:
!pip3 install pandas
!pip3 install openpyxl

Defaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0m

In [2]:
def load_file_to_string(file_path, encoding="utf-8"):
  """Loads the content of a file into a string.

  Args:
    file_path: The path to the file.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's default locale encoding.

  Returns:
    The content of the file as a string.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid text in `encoding`.
  """
  with open(file_path, 'r', encoding=encoding) as file:
    return file.read()

In [3]:
# Read the schema file from disk; the same text is kept under both
# `file_string` and `schema`.
file_path = 'big_schema_trim.ttl'
schema = file_string = load_file_to_string(file_path)

In [4]:
# Competency questions (CQs): the natural-language questions that are each
# substituted into the prompt template and sent to every model.
# The list is grouped into three tiers, marked by the comments below.
CQs = [
    # Simple tier
    "Who are all the characters available?",
    "What are all the movies (and/or TV shows) available?",
    "What are the real names and/or primary aliases for each of the characters?",
    "What are all the associated species or types (e.g., human, Asgardian, AI) for each of the characters?",
    "What are all the origin locations (e.g., homeworld, birthplace, base) for each of the characters?",
    "What are all the release dates (or years), if available, for each of the movies?",
    "Who are all the director(s) for each of the movies?",
    "What are all the listed powers or abilities for each of the characters?",
    "What are all the team or organization affiliations for each of the characters?",
    "What are all the locations (places) available?",
    "What are all the available movies and their directors?",
    "What are all the available characters and their primary alias/real name?",
    # Moderate tier
    "What are all the character–movie appearance pairs (which characters appear in which movies) available?",
    "What are all the actor–character pairs (which actors portray which characters) available?",
    "What are all the character–team–movie triples available where a character is a member of a team and appears in a movie?",
    "What are all the character–power–movie triples available with respect to characters with specific powers and the movies they appear in?",
    "What are all the pairs of characters that are linked through movies and co-appear in at least two movies?",
    "What are all the teams and the set of members for each team available?",
    "What are all the movies and the set of teams that have at least one member appearing in them?",
    "What are all locations that are associated with at least one character appearance (e.g., origin or major setting)?",
    "What are all the director–actor pairs linked through movies available?",
    "What are all the available distinct powers and the set of characters associated with each power?",
    "What are all the pairs of characters available that have co-appeared in a movie?",
    # Complex tier
    "How many movies does each character appear in (character appearance count)?",
    "What are all the movies and their counts for each pair of characters that co-appear in multiple movies?",
    "What are all the unions of movies for each of the teams in which any of their members appear (team-level filmography)?",
    "Who are the distinct characters that possess a power, their counts, and rank-ordered by popularity?",
    "Who are all the bridge characters that are members of more than one team, and what are those team combinations?",
    "What are all the sets of characters that have been portrayed across all movies by each of the actors?",
    "What are all the locations used as settings or associated contexts for multiple movies and/or characters and the counts of those associations?",
    "What is the number of unique teams, unique powers, and unique locations represented via their characters for each of the movies?",
    "Who are all the other characters connected via shared movie appearances for each of the characters (character co-appearance network)?",
    # NOTE(review): "computability" in the question below may be a typo for
    # "comparability" — confirm before changing, since the text is sent to the
    # models verbatim.
    "What are all the distributions of powers among each team's members, grouped by teams,  for comparing teams for computability (e.g., which powers are most characteristic of each team)?"
]

In [5]:
# Substitute `{placeholder}` markers in a prompt template
def fill_prompt_template(template_text, values_dict):
    """Replace every ``{key}`` placeholder in a template with its value.

    All placeholders are substituted in a single pass over the template, so
    text that arrives inside a value (for example a schema that happens to
    contain ``{Insert_CQ_here}``) is inserted verbatim and never re-expanded.

    Args:
        template_text: Template string containing ``{key}`` placeholders.
        values_dict: Mapping of placeholder name (without braces) to the
            replacement value. Values are converted with ``str()``.

    Returns:
        The template with every known placeholder replaced. Placeholders that
        have no entry in ``values_dict`` are left untouched.
    """
    import re  # local import keeps this cell independent of the import cells

    if not values_dict:
        return template_text

    replacements = {f"{{{key}}}": str(value) for key, value in values_dict.items()}
    # Longest tokens first so that a token is never shadowed by a shorter one
    # that shares its prefix.
    tokens = sorted(replacements, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(token) for token in tokens))
    return pattern.sub(lambda match: replacements[match.group(0)], template_text)

In [6]:
# Sampling temperature shared by the model calls in this notebook: it is passed
# in the Ollama request options and in the Claude batch request parameters.
# The GPT-5 batch request body does not include a temperature entry.
temperature = 1

In [7]:
# System message sent unchanged with every request (GPT-5 batch, Ollama and
# Claude batch). It sets the SPARQL-generation role and the output guidelines.
initial_system_message = """
You are an expert in knowledge graphs and SPARQL query generation. Your task is to generate SPARQL queries based on the provided competency questions and a given TTL schema and return only the SPARQL query.

Guidelines:
Use only the schema provided in the context block to determine appropriate classes, properties, and relationships.
 - Ensure queries follow SPARQL syntax and use prefixes correctly.
 - Generate queries that efficiently retrieve relevant data while optimizing performance but with priority on correctness and efficiency.
 - If multiple valid queries exist, choose the most concise and efficient one.
 - Preserve the intent of the competency question while ensuring syntactic correctness.
 - Give only one SPARQL query and nothing else.
 - Only use the defined relationships in the schema. Don't use external ones unless specified.
 - If the competency question cannot be answered with the provided schema, respond to a partial extent that it can be answered to or respond with "No valid query can be generated based on the provided schema."
 - Don't summarize or return an analysis of the given schema but return only the respective SPARQL query for the Competency Question.
"""

In [8]:
# User-prompt template. The {Insert_CQ_here} and {Insert_schema_here}
# placeholders are filled per question by fill_prompt_template.
template_prompt = """
Task: Write a SPARQL query that answers the following competency question:
{Insert_CQ_here}

Requirements:
- Use the schema to determine correct URIs and relationships.
- Ensure the query retrieves the necessary information efficiently.
- Provide only one full SPARQL query without placeholders.
- Don't summarize or return an analysis of the given schema but return only the respective SPARQL query for the Competency Question.

Context:
Below is the TTL schema of the knowledge graph:
{Insert_schema_here}
"""

# GPT5

In [80]:
!pip3 install openai --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [81]:
import openai
import json
import time
import pandas as pd

In [None]:
# Initialize OpenAI
# The key is taken from the OPENAI_API_KEY environment variable so that no
# secret has to be pasted into (and saved with) the notebook. When the variable
# is unset the key stays an empty string, exactly as before.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [83]:
# 1. Prepare batch input: write one JSONL request line per competency question
batch_input_path = f"{file_path.split('/')[-1]}_gpt5_batch_input.jsonl"
with open(batch_input_path, "w") as f:
    for i, cq in enumerate(CQs):
        custom_id = f"cq-{i}"
        # Fill the user prompt with this question and the full schema text.
        input_data = {"Insert_CQ_here": cq, "Insert_schema_here": schema}
        filled_prompt = fill_prompt_template(template_prompt, input_data)
        chat_messages = [
            {"role": "system", "content": initial_system_message},
            {"role": "user", "content": filled_prompt},
        ]
        prompt_payload = {
            "custom_id": custom_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            # No "temperature" entry is included in the request body.
            "body": {"model": "gpt-5", "messages": chat_messages},
        }
        f.write(f"{json.dumps(prompt_payload)}\n")

In [84]:
# 2. Upload file
# The JSONL is opened in a context manager so the file handle is closed as soon
# as the upload call returns instead of being left open for the kernel's lifetime.
with open(batch_input_path, "rb") as batch_input_file:
    file_upload = openai.files.create(file=batch_input_file, purpose="batch")
file_id = file_upload.id

In [85]:
# 3. Create batch from the uploaded input file
batch_request = {
    "input_file_id": file_id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch = openai.batches.create(**batch_request)
batch_id = batch.id
print(f"Batch started with ID: {batch_id}")

Batch started with ID: batch_6908ed66be68819084aeedc6f7a2efb3


In [86]:
# 4. Poll until done
# Statuses that end the polling loop.
terminal_statuses = ("completed", "failed", "cancelled", "expired")
current = openai.batches.retrieve(batch_id)
print(f"Status: {current.status}")
while current.status not in terminal_statuses:
    time.sleep(600)  # ten minutes between status checks
    current = openai.batches.retrieve(batch_id)
    print(f"Status: {current.status}")
print(f"Batch completed with status: {current.status}")

Status: validating
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: in_progress
Status: finalizing
Status: completed
Batch completed with status: completed


## Error

In [87]:
# 5. Download error
# NOTE(review): `batch_output` is only assigned when an error file exists; the
# following cell reads it regardless, so with no error file it sees whatever an
# earlier run left in the kernel (or nothing on a fresh kernel) — confirm intended.
error_file_id = current.error_file_id
if error_file_id is not None:
    error_file = openai.files.content(error_file_id)
    batch_output = error_file.text
else:
    print("No error file available.")

No error file available.


In [88]:
# 6. Parse the error records and print them per request
results_dict = {}
for line in batch_output.strip().split("\n"):
    # An empty error file yields a single blank "line"; skip it instead of
    # letting json.loads fail on it.
    if not line.strip():
        continue
    data = json.loads(line)
    custom_id = data["custom_id"]
    # A record may carry `response: null` with the details under `error`
    # instead; fall back to that so such records are shown rather than raising.
    response = data.get("response") or {}
    response_text = response.get("body", data.get("error"))
    print(f"Custom ID: {custom_id}\nResponse:\n{response_text}\n")

Custom ID: cq-0
Response:
{'error': {'message': 'Input tokens exceed the configured limit of 272000 tokens. Your messages resulted in 499360 tokens. Please reduce the length of the messages.', 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

Custom ID: cq-1
Response:
{'error': {'message': 'Input tokens exceed the configured limit of 272000 tokens. Your messages resulted in 499364 tokens. Please reduce the length of the messages.', 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

Custom ID: cq-2
Response:
{'error': {'message': 'Input tokens exceed the configured limit of 272000 tokens. Your messages resulted in 499368 tokens. Please reduce the length of the messages.', 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}



## Output

In [89]:
# 5. Download results
# Without an output file there is nothing to parse, so stop here with an error.
output_file_id = current.output_file_id
if output_file_id is not None:
    output_file = openai.files.content(output_file_id)
    batch_output = output_file.text
else:
    raise ValueError("No output file available.")

In [90]:
# 6. Parse and match results
# Builds results_dict: custom_id -> {"result": answer text, "raw": full record as JSON}.
results_dict = {}
for line in batch_output.strip().split("\n"):
    if not line.strip():
        continue  # an empty output file yields one blank line
    data = json.loads(line)
    custom_id = data["custom_id"]
    try:
        response_text = data["response"]["body"]["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        # A record without a usable completion (e.g. a failed request) must not
        # abort parsing of the remaining records. Store an empty answer and keep
        # the raw record so the failure can still be inspected.
        response_text = ""
    results_dict[custom_id] = {
        "result": response_text,
        "raw": json.dumps(data, indent=4)
    }

In [91]:
# 7. Map results back to original CQs
# One row per question: (question, filled prompt, answer text, raw record).
cq_gpt4o_results = []
for i, cq in enumerate(CQs):
    custom_id = f"cq-{i}"
    input_data = {"Insert_CQ_here": cq, "Insert_schema_here": schema}
    filled_prompt = fill_prompt_template(template_prompt, input_data)
    if custom_id in results_dict:
        res = results_dict[custom_id]
    else:
        # No record came back for this question: use empty placeholders.
        res = {"result": "", "raw": "{}"}
    row = (cq, filled_prompt, res["result"], res["raw"])
    cq_gpt4o_results.append(row)

In [92]:
# 8. Save to Excel
# The workbook is named after the schema file's base name.
excel_file_path = f"{file_path.split('/')[-1]}_cq_GPT5_results.xlsx"
gpt5_columns = ["CQ", "Prompt", "GPT5_Result", "GPT5_Raw"]
df = pd.DataFrame(cq_gpt4o_results, columns=gpt5_columns)
df.to_excel(excel_file_path, index=False)
print(f"Excel file saved to: {excel_file_path}")

Excel file saved to: big_one_SamCat2_Covered (1).ttl_cq_GPT4o_results.xlsx


# Ollama

In [9]:
from ollama_interface import chat_with_model, get_ollama_models

def interact_with_agent(model_name, system_message, user_prompt):
    """Send one system + user prompt pair to a model and return its reply.

    Args:
        model_name: Name of the model, forwarded to ``chat_with_model``.
        system_message: Content of the system-role message.
        user_prompt: Content of the user-role message.

    Returns:
        A tuple ``(response, content)``: the object returned by
        ``chat_with_model`` and the value at ``response["message"]["content"]``.
    """
    response = chat_with_model(
        model_name=model_name,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
        ],
        # The notebook-level `temperature` setting is applied to every call.
        options={"temperature": temperature},
    )
    reply_text = response["message"]["content"]
    return response, reply_text

In [None]:
# Import necessary libraries for data processing
import pandas as pd
import time

# Models to run, one after another.
models_to_run = [
    'deepseek-r1:latest',
    'gpt-oss:120b',
    'granite3.3:latest',
    'llama3.2:latest',
    'mistral-small3.2:latest',
    "granite4:latest",
    'phi4-mini:latest',
    'phi4-reasoning:latest',
    'phi4:latest',
    'gemma3:27b',
    'mistral-large:latest',
    'llama4:scout',
    'llama4:latest',
    'llama4:maverick',
    'deepseek-r1:671b-fp16',
    'deepseek-v3.1:latest',
    'qwen3:235b']

# Base name for the per-model result files: the schema file name without its
# ".ttl" extension. str.rstrip(".ttl") is not used because it strips any
# trailing '.', 't' or 'l' characters rather than the suffix (for example
# "result.ttl" would become "resu").
schema_file_name = file_path.split("/")[-1]
if schema_file_name.endswith(".ttl"):
    schema_file_stem = schema_file_name[:-len(".ttl")]
else:
    schema_file_stem = schema_file_name

for model in models_to_run:
    print(model, end="")
    try:
        # Iterate through the competency questions and perform inference
        cq_model_results = []
        for cq in CQs:
            input_data = {
                "Insert_CQ_here": cq,
                "Insert_schema_here": schema,
            }
            filled_prompt = fill_prompt_template(template_prompt, input_data)
            model_analysis_raw, model_analysis_result = interact_with_agent(model, initial_system_message, filled_prompt)
            cq_model_results.append((cq, filled_prompt, model_analysis_result, model_analysis_raw))
            print("*"*10)
            print(cq)
            print("*"*10)
            print("\n")

        # Create a pandas DataFrame from the results
        df = pd.DataFrame(cq_model_results, columns=['CQ', 'Prompt', f'{model}_Analysis_Result', f'{model}_Analysis_Raw'])

        # Save the results to an Excel file (":" in the model tag is replaced
        # with "-" in the file name)
        excel_file_path = f'{schema_file_stem}_cq_{model.replace(":","-")}_results.xlsx'
        df.to_excel(excel_file_path, index=False)

        print(f"Excel file saved to: {excel_file_path}")
    except Exception as e:
        # Best effort: a failure for one model is reported and the loop moves
        # on to the next model.
        print(f"failed {e}")

granite4:latest

# Claude

In [1]:
import anthropic
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.messages.batch_create_params import Request
import json
import time
import pandas as pd

In [None]:
# Initialize Claude Key
# The key is taken from the ANTHROPIC_API_KEY environment variable so that no
# secret has to be pasted into (and saved with) the notebook. When the variable
# is unset the key stays an empty string, exactly as before.
import os

claude_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
client = anthropic.Anthropic(api_key=claude_api_key)

In [22]:
# 1. Prepare batch input: one Request per competency question
batch_input_list = []
for i, cq in enumerate(CQs):
    custom_id = f"cq-{i}"
    # Fill the user prompt with this question and the full schema text.
    input_data = {"Insert_CQ_here": cq, "Insert_schema_here": schema}
    filled_prompt = fill_prompt_template(template_prompt, input_data)
    request_params = MessageCreateParamsNonStreaming(
        model="claude-sonnet-4-5-20250929",
        max_tokens=2048,
        system=initial_system_message,
        messages=[{"role": "user", "content": filled_prompt}],
        temperature=temperature,
    )
    batch_input_list.append(Request(custom_id=custom_id, params=request_params))

In [None]:
# 3. Create batch; the prepared requests are passed to the API call directly
batch = client.messages.batches.create(requests=batch_input_list)
batch_id = batch.id
print(f"Batch started with ID: {batch_id}")

In [None]:
# 4. Poll until done
# Check once, then re-check every minute until processing has ended.
current = client.messages.batches.retrieve(batch_id)
print(f"Status: {current.processing_status}")
while current.processing_status != "ended":
    time.sleep(60)
    current = client.messages.batches.retrieve(batch_id)
    print(f"Status: {current.processing_status}")
print(f"Batch completed with status: {current}")

In [None]:
# 5. Download results
# Builds results_dict: custom_id -> {"result": answer text, "raw": str(result) as JSON}.
results_dict = {}
for result in client.messages.batches.results(batch_id):
    custom_id = result.custom_id
    # Only results of type "succeeded" carry a message. Errored, canceled or
    # expired requests get an empty answer so that one failed request does not
    # abort the loop; their details remain available under "raw".
    if result.result.type == "succeeded":
        response_text = result.result.message.content[0].text
    else:
        response_text = ""
    results_dict[custom_id] = {
        "result": response_text,
        "raw": json.dumps(str(result), indent=4)
    }

In [None]:
# 6. Map results back to original CQs
# One row per question: (question, filled prompt, answer text, raw record).
cq_claude_results = []
for i, cq in enumerate(CQs):
    custom_id = f"cq-{i}"
    input_data = {"Insert_CQ_here": cq, "Insert_schema_here": schema}
    filled_prompt = fill_prompt_template(template_prompt, input_data)
    if custom_id in results_dict:
        res = results_dict[custom_id]
    else:
        # No record came back for this question: use empty placeholders.
        res = {"result": "", "raw": "{}"}
    row = (cq, filled_prompt, res["result"], res["raw"])
    cq_claude_results.append(row)

In [None]:
# 7. Save to Excel
# The workbook is named after the schema file's base name.
excel_file_path = f"{file_path.split('/')[-1]}_cq_Claude_results.xlsx"
claude_columns = ["CQ", "Prompt", "Claude_Result", "Claude_Raw"]
df = pd.DataFrame(cq_claude_results, columns=claude_columns)
df.to_excel(excel_file_path, index=False)
print(f"Excel file saved to: {excel_file_path}")