# **ConformalNL2LTL: Translating Natural Language Instructions into Temporal Logic Formulas with Conformal Correctness Guarantees**

***Jun Wang<sup>1&#42;</sup>, David Smith Sundarsingh<sup>1&#42;</sup>, Jyotirmoy V. Deshmukh<sup>2</sup>, Yiannis Kantaros<sup>1</sup>***

*Department of Electrical and Systems Engineering, Washington University in St Louis<sup>1</sup>*

*Department of Computer Science, University of Southern California<sup>2</sup>*

&#42; indicates equal contribution.


Copyright 2025 [Kantaros Lab @ WashU](https://sites.wustl.edu/kantaroslab/). All rights reserved.

Please check project webpage [conformalnl2ltl.github.io](https://conformalnl2ltl.github.io/) for more information.



In [None]:
# Placeholder for the OpenAI API key; replace it with your own key before running.
# Avoid saving or sharing the notebook with a real key left in this cell.
API_KEY = "OpenAI API key of yours"

## LLM Configurations

***Google drive access will be requested to save data***

In [None]:
#@title Package Installation & Google Drive Mounting Request
!pip install openai==1.55.2 httpx==0.27.2 --quiet
import os
import re
import warnings
import numpy as np
from datetime import datetime
from openai import OpenAI
import json
import shutil
from itertools import combinations
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

from google.colab import drive
drive.mount('/content/gdrive')
path = '/content/gdrive/MyDrive/LTL_NL_Translation'
if not os.path.exists(path):
    os.makedirs(path)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/389.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m389.1/389.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.5/389.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#@title API Call Function
import re
# Single OpenAI client instance, authenticated with API_KEY and shared by the
# chat-completion and embedding helpers defined below.
client = OpenAI(api_key=API_KEY)

def LLM(messages, params):
    """Send one chat-completion request and return the raw response and its text.

    Args:
        messages: list of chat message dicts (see ``formulator``).
        params: dict with the keys ``model``, ``temperature``, ``max_tokens``,
            ``top_p``, ``frequency_penalty`` and ``presence_penalty``.

    Returns:
        A tuple ``(response, text)`` where ``response`` is the object returned
        by the client and ``text`` is the content of the first choice.
    """
    response = client.chat.completions.create(
        model=params['model'],
        messages=messages,
        temperature=params['temperature'],
        max_tokens=params['max_tokens'],
        top_p=params['top_p'],
        frequency_penalty=params['frequency_penalty'],
        # Previously this was fed from params['frequency_penalty'], so the
        # configured presence penalty was never applied.
        presence_penalty=params['presence_penalty'],
        response_format={"type": "text"}
    )
    return response, response.choices[0].message.content

def formulator(role, text):
    """Build a chat message dict for ``role`` whose content is one text part."""
    text_part = {"text": text, "type": "text"}
    return {"role": role, "content": [text_part]}

""" Check if the response violate the generation rules """
def check_validity(new_string):
    """Check whether a response obeys the one-element-per-step rule.

    The response is scanned for operators and identifier-like tokens.

    Returns:
        ``(True, [token])`` when exactly one token is found,
        ``(False, tokens)`` when several tokens are found, and
        ``(False, [])`` when nothing is recognised.
    """
    token_pattern = r'(?<!\w)(<>|!|->|&&|\|\||\[\]|U|\(|\)|/|[a-zA-Z0-9_]+)(?!\w)'
    found_keys = re.findall(token_pattern, new_string)
    if not found_keys:
        return False, []
    # More than one token means the response mixed several elements.
    return len(found_keys) == 1, found_keys

class CustomOutput:
    """Writer that echoes every message to the console and to a file object."""

    def __init__(self, file):
        # Any object with a ``write`` method; it is not opened or closed here.
        self.file = file

    def write(self, message):
        # Echo without an extra newline, then persist the same text.
        print(message, end='')
        self.file.write(message)

    def flush(self):
        # No-op, present only so the object can be used where `print` expects
        # a file-like target.
        return None

In [None]:
#@title Model Parameters Configuration

# Sampling settings for the chat-completion requests. The `#@param` annotations
# expose each value as a Colab form field.
model = "gpt-4o" #@param ["gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo", "gpt-3.5-turbo-16k"] {allow-input:true}
temperature = 1 #@param {allow-input:true, type:"number"}
max_tokens = 512 #@param {allow-input:true, type:"integer"}
top_p = 1 #@param {allow-input:true, type:"integer"}
frequency_penalty = 0 #@param {allow-input:true, type:"integer"}
presence_penalty = 0 #@param {allow-input:true, type:"integer"}
# Bundle of the settings above; this dict is what gets passed to LLM(messages, params)
# and is also written to params.txt for each run.
params = {
    'model': model,
    'temperature': temperature,
    'max_tokens': max_tokens,
    'top_p': top_p,
    'frequency_penalty': frequency_penalty,
    'presence_penalty': presence_penalty
}

In [None]:
#@title Semantic Similarity
def extract_text_and_number(text):
    """Split an element such as ``gas_station_4`` into its words and its number.

    Returns:
        ``(words, number)`` where ``number`` is the first run of digits as an
        int (``None`` if there is none) and ``words`` is ``text`` with every
        digit run removed, outer underscores stripped and the remaining
        underscores turned into spaces.
    """
    first_number = re.search(r'\d+', text)
    if first_number is None:
        number = None
    else:
        number = int(first_number.group())
    without_digits = re.sub(r'\d+', '', text)
    words = without_digits.strip('_').replace('_', ' ')
    return words, number


def get_embedding(text, model=None):
    """Return the embedding vector of ``text`` as a list of floats.

    Args:
        text: string to embed.
        model: embedding model name. When omitted, the model selected in the
            form field below is used. Previously this argument was accepted
            but silently ignored.
    """
    embedding_model = "text-embedding-3-large" #@param ["text-embedding-3-large", "text-embedding-3-small", "text-embedding-ada-002"] {allow-input:true}
    chosen_model = model if model is not None else embedding_model
    response = client.embeddings.create(input=[text], model=chosen_model)
    return response.model_dump()['data'][0]['embedding']

def cos(vec1, vec2):
    """Return the cosine similarity of two vectors."""
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / norm_product

def semantic_similarity(text1, text2):
    """Return the similarity of two formula elements.

    Each element is split into its words and its number. If the numbers
    differ the elements are considered unrelated and 0 is returned;
    otherwise the cosine similarity of the word embeddings is returned.
    """
    w1, n1 = extract_text_and_number(text1)
    w2, n2 = extract_text_and_number(text2)
    if n1 != n2:
        # Stop considering semantic similarity if the numbers do not match.
        # Checked before embedding so no API calls are spent on this case.
        return 0
    vector1 = np.array(get_embedding(w1))
    vector2 = np.array(get_embedding(w2))
    return cos(vector1, vector2)

## System Prompt

The translation of Natural Language (NL) task into Linear Temporal Logic (LTL) formula is constructed incrementally as a sequence of interdependent multiple-choice-question-answering (MCQA) tasks.

At each time step, we query the LLM $m$ times to generate $m$ candidate responses. Each response is either
*   An operator from a fixed, pre-defined set
*   An atomic proposition generated according to the rules defined in the prompt

Operators we considered:

|     Symbol    |  Explanation  |
|:-------------:|:-------------:|
|      $<>$     |   Eventually  |
|      $!$      |    Negation   |
| $\rightarrow$ |  Implication  |
|     $\&\&$    |      And      |
|  $\vert\vert$ |       Or      |
|      $[]$     |     Always    |
|      $U$      |     Until     |
|      $($      |  Left Bracket |
|      $)$      | Right Bracket |
|      $/$      | Ending Symbol |


In [None]:
#@title List of Temporal/Logical Operators
# Maps each operator symbol that may appear in a formula to a short description.
operators = {
    '<>': 'eventually',
    '!' : 'negation',
    '->': 'implication',
    '&&': 'and',
    '||': 'or',
    '[]': 'always',
    'U' : 'until',
    '(' : 'left bracket',
    ')' : 'right bracket',
    '/': 'only upon LTL formula completion.'
}

# Operator symbols only; used to tell operators apart from atomic propositions.
opkeys = list(operators.keys())

In [None]:
#@title Prompt
# "symbol # description" lines built from `operators`. The prompt below spells
# the operator list out by hand and does not interpolate this string.
formatted_operators = '\n'.join([ops[0] + ' # ' + ops[1] for ops in operators.items()])
# Few-shot system prompt. Fixes relative to the previous text:
#   * first worked example, step 3: the answer is `warehouse_1` (was `warehouse_4`,
#     contradicting both the instruction and the following "Formula so far" line);
#   * first worked example, step 18: added the missing space after "Formula so far:";
#   * third worked example: the AP is spelled `interstate_64` (was `insterstate_64`).
# NOTE(review): calibration scores collected with an earlier prompt text may no
# longer match this prompt - consider recomputing them after editing it.
system_description_prompt = f"""You are a helpful assistant tasked with translating natural language (NL) instructions into Linear Temporal Logic (LTL) formulas step by step.

At each step, generate exactly 1 element either an operator strictly from the set of Logical and Temporal Operators listed below or an Atomic Predicates by following the construction rules below.

Possible Options:
For temporal and logical operators, you must pick only one from the below elements and nothing else:
<> # eventually
! # negation
-> # implication
&& # and
|| # or
[] # always
U # until
( # left bracket
) # right bracket
/ # only upon LTL formula completed

Atomic Predicates construction rules (AP):
The robot can a) go to a location; b) pick up an item; c) put down an item; and d) take a photo.
At each step, if you need to generate an atomic predicate for one of the capabilities, you must follow the rules below:
Go to [location X] is written as location_X
Pick up [item X] is written as p_item_X
Put down is written as pd
take a photo is written as photo

!! You can only select one operator or one AP at a given step !!

Do not generate "<>(" at the same step but generate "<>" first and then "(".

Example NL Instruction: Deliver package 4 to store 5 after picking it up from warehouse 1 as you keep visiting charger 4.
Process Example:
[User]:
Formula so far:
Step 1:
[Assistant]:
<>

[User]:
Formula so far: <>
Step 2:
[Assistant]:
(

[User]:
Formula so far: <>(
Step 3:
[Assistant]:
warehouse_1

[User]:
Formula so far: <>(warehouse_1
Step 4:
[Assistant]:
&&

[User]:
Formula so far: <>(warehouse_1&&
Step 5:
[Assistant]:
<>

[User]:
Formula so far: <>(warehouse_1&&<>
Step 6:
[Assistant]:
(

[User]:
Formula so far: <>(warehouse_1&&<>(
Step 7:
[Assistant]:
p_package_4

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4
Step 8:
[Assistant]:
&&

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&
Step 9:
[Assistant]:
<>

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>
Step 10:
[Assistant]:
(

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(
Step 11:
[Assistant]:
store_5

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5
Step 12:
[Assistant]:
&&

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&
Step 13:
[Assistant]:
pd

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd
Step 14:
[Assistant]:
)

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd)
Step 15:
[Assistant]:
)

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd))
Step 16:
[Assistant]:
)

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd)))
Step 17:
[Assistant]:
&&

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd)))&&
Step 18:
[Assistant]:
[]

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd)))&&[]
Step 19:
[Assistant]:
<>

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd)))&&[]<>
Step 20:
[Assistant]:
charger_4

[User]:
Formula so far: <>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd)))&&[]<>charger_4
Step 21:
[Assistant]:
/

Final LTL formula:
<>(warehouse_1&&<>(p_package_4&&<>(store_5&&pd)))&&[]<>charger_4/

Some other examples of NL instructions and their corresponding LTL formulas:

NL Instruction: Take block 3 from house 3 to house 4.
Final LTL formula:
<>(house_3&&<>(p_block_3&&<>(house_4&&pd)))/

NL Instruction: Keep taking a picture of house 6.

Process Example:
[User]:
Formula so far:
Step 1:
[Assistant]:
[]

[User]:
Formula so far: []
Step 2:
[Assistant]:
<>

[User]:
Formula so far: []<>
Step 3:
[Assistant]:
(

[User]:
Formula so far: []<>(
Step 4:
[Assistant]:
house_6

[User]:
Formula so far: []<>(house_6
Step 5:
[Assistant]:
&&

[User]:
Formula so far: []<>(house_6&&
Step 6:
[Assistant]:
photo

[User]:
Formula so far: []<>(house_6&&photo
Step 7:
[Assistant]:
)

[User]:
Formula so far: []<>(house_6&&photo)
Step 8:
[Assistant]:
/

Final LTL formula:
[]<>(house_6&&photo)/

NL Instruction: Stay in interstate 64 until you reach gas station 4 as you take box 1 from house 5 to then house 6.

Process Example:
[User]:
Formula so far:
Step 1:
[Assistant]:
interstate_64

[User]:
Formula so far: interstate_64
Step 2:
[Assistant]:
U

[User]:
Formula so far: interstate_64U
Step 3:
[Assistant]:
gas_station_4

[User]:
Formula so far: interstate_64Ugas_station_4
Step 4:
[Assistant]:
&&

[User]:
Formula so far: interstate_64Ugas_station_4&&
Step 5:
[Assistant]:
<>

[User]:
Formula so far: interstate_64Ugas_station_4&&<>
Step 6:
[Assistant]:
(

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(
Step 7:
[Assistant]:
house_5

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5
Step 8:
[Assistant]:
&&

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&
Step 9:
[Assistant]:
<>

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>
Step 10:
[Assistant]:
(

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(
Step 11:
[Assistant]:
p_box_1

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1
Step 12:
[Assistant]:
&&

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&
Step 13:
[Assistant]:
<>

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>
Step 14:
[Assistant]:
(

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>(
Step 15:
[Assistant]:
house_6

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>(house_6
Step 16:
[Assistant]:
&&

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>(house_6&&
Step 17:
[Assistant]:
pd

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>(house_6&&pd
Step 18:
[Assistant]:
)

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>(house_6&&pd)
Step 19:
[Assistant]:
)

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>(house_6&&pd))
Step 20:
[Assistant]:
)

[User]:
Formula so far: interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>(house_6&&pd)))
Step 21:
[Assistant]:
/

Final LTL formula:
interstate_64Ugas_station_4&&<>(house_5&&<>(p_box_1&&<>(house_6&&pd)))/

NL Instruction: Do not enter lot 3 until you reach building 2 while going to building 2 and taking a picture.
Final LTL formula:
!lot_3Ubuilding_2&&<>(building_2&&photo)/

NL Instruction: While never going to street 5, take a picture of mall 4.

Process Example:
[User]:
Formula so far:
Step 1:
[Assistant]:
[]

[User]:
Formula so far: []
Step 2:
[Assistant]:
!

[User]:
Formula so far: []!
Step 3:
[Assistant]:
street_5

[User]:
Formula so far: []!street_5
Step 4:
[Assistant]:
&&

[User]:
Formula so far: []!street_5&&
Step 5:
[Assistant]:
<>

[User]:
Formula so far: []!street_5&&<>
Step 6:
[Assistant]:
(

[User]:
Formula so far: []!street_5&&<>(
Step 7:
[Assistant]:
mall_4

[User]:
Formula so far: []!street_5&&<>(mall_4
Step 8:
[Assistant]:
&&

[User]:
Formula so far: []!street_5&&<>(mall_4&&
Step 9:
[Assistant]:
photo

[User]:
Formula so far: []!street_5&&<>(mall_4&&photo
Step 10:
[Assistant]:
)

[User]:
Formula so far: []!street_5&&<>(mall_4&&photo)
Step 11:
[Assistant]:
/

Final LTL formula:
[]!street_5&&<>(mall_4&&photo)/

NL Instruction: Go to street 3 and stay there, but do not go there till you reach intersection 4
Final LTL formula:
<>[]street_3&&!street_3Uintersection_4/

NL Instruction: Take a picture of statue 2 and then go to bench 5
Final LTL formula:
<>(statue_2&&photo&&<>bench_5)/

NL Instruction: Stay in statue 2 as you go to bench 5
Final LTL formula:
[]statue_2&&<>bench_5/

NL Instruction: Stay in area 4 until you take letter 2 from office 4 to office 8
Final LTL formula:
area_4U(office_4&&<>(p_letter_2&&<>(office_8&&pd)))/

Task Instructions You Must Follow:
1. For the given NL instruction, build the LTL formula step by step.
2. When you are asked to select an element:
2a. You must only choose exactly one element, either a single operator listed or an AP generated, that will build up the LTL formula to satisfy the NL description
2b. '[]' and '!' are to be considered separate elements and should not be generated together.
2c. '<>' and '(' are to be considered separate elements and should not be generated together.
2d. '!' and '(' are to be considered separate elements and should not be generated together.
2e. Do not generate part of the element
2f. The operator 'U' is not preceeded or succeeded by operators
3. When you generate AP as an option, make sure the name of the location or item in your response has the same semantic meaning as described in the NL instruction.
4. Whenever you generate a left bracket "(", you must generate a corresponding right bracket ")" later in the formula so that all the brackets are complete and maintain proper precedence.
5. If the NL task indicates that the AP generated is the first of a sequence, the AP must be preceeded by '('.
"""

print(system_description_prompt)

## Calibration

We generated a dataset of 1,000 task-formula pairs that are of three levels of complexity.

|        | # of Atomic Propositions | Dataset Size |
|:------:|:------------------------:|:------------:|
|  Easy  |          {1, 2}          |      365     |
| Medium |          {3, 4}          |      440     |
| Hard   |        More than 4       |      195     |




### Step 1: Download & Install [Dataset](https://drive.google.com/drive/folders/1F6PTA2elrPjaN9ReuTY2TNjMhk-VGV1b?usp=drive_link)

In [None]:
import gdown

# Google Drive folder that holds the dataset files, and the local directory it is downloaded into.
folder_id = "1F6PTA2elrPjaN9ReuTY2TNjMhk-VGV1b"  # Replace this with your actual folder ID
destination_folder = "/content/downloaded_folder"
# Start from a clean directory so files from an earlier download are not mixed in.
if os.path.exists(destination_folder):
    shutil.rmtree(destination_folder)

# Shell commands (notebook `!` syntax): upgrade gdown, then download the whole folder.
!pip install --upgrade --no-cache-dir gdown --quiet
!gdown --folder --id {folder_id} -O {destination_folder} --quiet

# Load the NL-task / LTL-formula pairs; each entry has the keys 'nlTask' and 'ltlequ'
# (these are the keys read from the entries later in the notebook).
json_file = os.path.join(destination_folder, "calibration.json")
with open(json_file, "r") as f:
    dataset_pair = json.load(f)
print(f"\nJSON file loaded successfully!\n\nLength of Dataset: {len(dataset_pair)}\n\nExample Data: {dataset_pair[0]}")


JSON file loaded successfully!

Length of Dataset: 1000

Example Data: {'nlTask': 'Take a picture of building 2', 'ltlequ': '<>(building_2&&photo)/'}


### Step 2: Obtain Calibration Scores (Non-Conformity Score)

You can either

*   Use our calibrated scores based on GPT-4o
*   Collect the scores and compute the quantile by yourself using the dataset

**Choose either option 1 or 2 to finish the calibration process**

#### Option 1: Use our non-conformity scores

***The scores were computed based on GPT-4o***

In [None]:
# Load the non-conformity scores shipped with the downloaded dataset folder.
# The sampling cell later reads the "score" and "nltask" keys of each entry.
json_file = os.path.join(destination_folder, "score_calib.json")
with open(json_file, "r") as f:
    pre_computed_calib_scores = json.load(f)
print(f"\nJSON file loaded successfully!\n\nExample: {pre_computed_calib_scores[0]}")

#### Option 2: Collect your calibration scores using our dataset

***The calibration process is automated using the NL-LTL dataset pair***

In [None]:
#@title Step 1: Compute Calibration Scores
api_call_per_step = 10 #@param {allow-input:true, type:"integer"}
start = 5 #@param {allow-input:true, type:"integer"}
end = 6 #@param {allow-input:true, type:"integer"}

# Similarity above which two responses are treated as the same answer.
SIMILARITY_THRESHOLD = 0.7

# For every task in dataset_pair[start:end]:
#   1. replay the ground-truth formula element by element;
#   2. at each step sample `api_call_per_step` responses and turn the valid ones
#      into frequencies (after merging semantically similar responses);
#   3. record 1 - frequency(ground-truth element) as the step score, or 1 if the
#      ground truth was not produced at all;
#   4. keep the maximum step score as the task's non-conformity score.
# NOTE(review): the merge step compares every pair of responses, operators
# included, even though it is meant for atomic propositions - confirm intended.
score_save = []
tasks = dataset_pair[start:end]
task_no = 1

for task in tasks:
  # Extract NL and LTL pair from the dataset
  natural_language_task = task["nlTask"]
  ltlequ = task["ltlequ"]

  # One numbered output folder per task
  calib_path = os.path.join(path, 'calibration')
  os.makedirs(calib_path, exist_ok=True)
  foldername = os.path.join(calib_path, f'Calibration_{len(os.listdir(calib_path))+1}')
  os.makedirs(foldername, exist_ok=True)
  print(f"Folder name {foldername}")

  data = []
  data_loc = os.path.join(foldername, 'data.npy')
  print(f"Data is going to be saved in {data_loc}")

  filename = os.path.join(foldername, 'params.txt')
  with open(filename, 'w') as f:
    my = CustomOutput(f)
    for k, v in params.items():
      my.write(f"{k}: {v}\n")
    my.write(f"\nNL Task: {natural_language_task}\n\nGT LTL Formula: {ltlequ}\n\n")

  print(f"Computing score for task {task_no}...")

  # Separate the equations down to its components
  words = re.findall(r'[a-zA-TV-Z_0-9]+|<>|&&|\|\||\[\]|\(|\)|U|[^\s\w]', ltlequ)
  num_steps = len(words)

  print(f"The formula has {num_steps} elements")
  scores = []
  current_formula = ''
  system_message = formulator(role='user', text=system_description_prompt)
  filename = os.path.join(foldername, 'system_prompt.txt')
  with open(filename, 'w') as f:
    f.write(f"{system_description_prompt}")
  messages = [system_message]
  messages.append(formulator(role='user', text=natural_language_task))

  # In calibration, we manually define the ground truth LTL formula.
  for step_count in range(1, num_steps+1):
    ground_truth = words[step_count-1]
    print(f"Currently at step {step_count}")
    note = os.path.join(foldername, f"step_{step_count}.txt")
    # `with` guarantees the step log is closed even if an API call raises.
    with open(note, 'w') as f:
      my = CustomOutput(f)
      user_input = f"Formula so far: {current_formula}\nStep {step_count}:"
      my.write(f"User input:\n{user_input} \n")
      messages.append(formulator(role='user', text=user_input))
      results = {}

      # Obtain M LLM responses from the options
      responses = []
      for t in range(1, api_call_per_step+1):
        my.write(f"\nRunning api call number {t}")
        response, response_text = LLM(messages, params)
        valid, found_keys = check_validity(response_text)
        responses.append({'is_valid': valid, 'text': response_text, 'found_keys': found_keys})
        my.write(f"\n-------[Valid: {valid}]---------\n" + f"LLM Response {t}: " + response_text.replace('\n', '') + f"\nKeys Found: {found_keys}")
        if valid:
          results[found_keys[0]] = results.get(found_keys[0], 0) + 1
      print(f"\nThe results vector is {results}")

      # Aggregate similar APs together
      keys_to_remove = set()
      for key1, key2 in combinations(results, 2):
        # A key that was already merged away must not take part again,
        # otherwise its count is added twice or swallows another key's count.
        if key1 in keys_to_remove or key2 in keys_to_remove:
          continue
        # Computed once: every call costs embedding requests.
        similarity = semantic_similarity(key1, key2)
        if similarity > SIMILARITY_THRESHOLD:
          print(f"High similarity {similarity} between '{key1}' and '{key2}'")
          if results[key1] >= results[key2]:
              results[key1] += results[key2]
              keys_to_remove.add(key2)
          else:
              results[key2] += results[key1]
              keys_to_remove.add(key1)

      for key in keys_to_remove:
        del results[key]

      # Compute valid frequency
      total = sum(results.values())
      valid_results = {key: value / total for key, value in results.items()}
      my.write(f"True element is {ground_truth}\n")

      # Compute the frequency of the ground truth decision
      for k, v in valid_results.items():
        my.write(f"{k}: {v}\n")
      matched = False
      for k, v in valid_results.items():
        if k not in opkeys:
          similarity = semantic_similarity(k, ground_truth)
          if similarity > SIMILARITY_THRESHOLD:
            print(f"Storing the score due to semantic similarity with value {similarity} between words {k} and {ground_truth}")
            scores.append(1-v)
            matched = True
            break
        if k == ground_truth:
          print(f"Storing the score due to operator equality")
          scores.append(1-v)
          matched = True
          break
      if not matched:
        # The ground truth was not among the responses. This used to also test
        # the leaked loop variable `k`, which could skip the score for a step.
        print(f"Storing the score due to non existence")
        scores.append(1)

      # Continue the conversation with the ground-truth element, not the LLM's answer.
      messages.append(formulator(role='assistant', text=ground_truth))

      current_formula += ground_truth
      my.write(f"\nNew Formula: {current_formula}\n")

      # Put data into dictionary
      dic = {'step'       : step_count,
             'nltask'     : natural_language_task,
             'llm_output' : responses,
             'valid_count': total,
             'probs'      : valid_results,
             'gt'         : ground_truth}
      data.append(dic)

      # shut down if '/' is detected either in calibration or validation
      if current_formula[-1] == '/':
        my.write(f"The current formula is now ended with /, exiting...")
        final_score = max(scores)
        score_save.append({"nltask": natural_language_task, "score":final_score})
        task_no += 1
        break

  np.save(data_loc, np.array(data))
print(f"The scores are {score_save}")
score_filename = os.path.join(path, 'score.json')
print(f"Scores will be saved to {score_filename}")
with open(score_filename, 'w') as score_file:
    json.dump(score_save, score_file, indent=4)

Folder name /content/gdrive/MyDrive/LTL_NL_Translation/calibration/Calibration_5
Data is going to be saved in /content/gdrive/MyDrive/LTL_NL_Translation/calibration/Calibration_5/data.npy
model: gpt-4o
temperature: 1
max_tokens: 512
top_p: 1
frequency_penalty: 0
presence_penalty: 0

NL Task: Do not enter street 5 until you reach store 1 as you pick up package 4 from store 1

GT LTL Formula: !street_5Ustore_1&&<>(store_1&&p_package_4)/Computing score for task 1...
The formula has 12 elements
Currently at step 1
User input:
Formula so far: 
Step 1: 

Running api call number 1
-------[Valid: True]---------
LLM Response 1: !
Keys Found: ['!']
Running api call number 2
-------[Valid: True]---------
LLM Response 2: !
Keys Found: ['!']
Running api call number 3
-------[Valid: True]---------
LLM Response 3: !
Keys Found: ['!']
Running api call number 4
-------[Valid: True]---------
LLM Response 4: !
Keys Found: ['!']
Running api call number 5
-------[Valid: True]---------
LLM Response 5: !
Key

In [None]:
#@title Step 2: Load your own scores
# Reads the score.json written by the calibration cell and rebinds
# pre_computed_calib_scores to its contents (replacing scores loaded earlier).
json_file_path = os.path.join(path, "score.json") # Replace with your own location!
with open(json_file_path, "r") as f:
    pre_computed_calib_scores = json.load(f)
print(f"\nJSON file loaded successfully!\n\nLength of Dataset: {len(pre_computed_calib_scores)}\n\nExample: {pre_computed_calib_scores[0]}")


JSON file loaded successfully!

Length of Dataset: 1

Example: {'nltask': 'Do not enter street 5 until you reach store 1 as you pick up package 4 from store 1', 'score': 1}


## Validation with Help

Once the calibration scores are available, we sample a subset of them to compute the quantile for a custom $1-\alpha$ coverage level.

In [None]:
#@title Select Coverage Level
# Desired coverage in percent; alpha is the corresponding miscoverage rate as a fraction.
coverage_in_percentage = 95 # @param {type:"number"}
alpha = 1-coverage_in_percentage/100

In [None]:
#@title Sample Calibration & Validation Data
import random
all_calib_size = len(pre_computed_calib_scores)
cali_size = 200 #@param {allow-input:true, type:"integer"}
val_size = 50 #@param {allow-input:true, type:"integer"}

# Refuse to continue when there are not enough scored tasks for both splits.
if cali_size+val_size > all_calib_size:
    raise Exception(f"Calibration dataset is not big enough for {cali_size} calibration data with {val_size} validation data")

# Draw distinct indices once, then cut them into a calibration part and a validation part.
order = random.sample(range(all_calib_size), cali_size+val_size)
order_cali, order_val = order[:cali_size], order[cali_size:]
reordered_cali = [pre_computed_calib_scores[idx] for idx in order_cali]
reordered_val = [pre_computed_calib_scores[idx] for idx in order_val]
scores = [entry["score"] for entry in reordered_cali]
print(f"Length of Calibration Data: {len(reordered_cali)}")
print(f"Length of Validation Data: {len(reordered_val)}")

# Lookup table from NL task text to its ground-truth LTL formula.
nl_task_ltlequ_dict = {}
for task in dataset_pair:
    nl_task, ltlequ = task['nlTask'], task['ltlequ']
    nl_task_ltlequ_dict[nl_task] = ltlequ

Length of Calibration Data: 200
Length of Validation Data: 50


In [None]:
#@title Compute Quantile
# Split conformal prediction uses the ceil((n+1)(1-alpha))-th smallest
# calibration score as the threshold. A plain, interpolated (1-alpha) quantile
# is slightly too small and does not carry the finite-sample coverage guarantee.
compute = list(scores)
n_calib = len(compute)
if n_calib == 0:
    raise ValueError("No calibration scores available to compute the quantile")
# round() guards against floating-point noise pushing an exact integer past ceil().
rank = int(np.ceil(round((n_calib + 1) * (1 - alpha), 9)))
if rank > n_calib:
    # Too few calibration points for this coverage level: fall back to 1, the
    # largest score the calibration cell in this notebook can produce
    # (scores are 1 - frequency).
    quantile = 1.0
else:
    quantile = float(np.sort(compute)[rank - 1])
print(f"Quantile of {coverage_in_percentage}% coverage is {quantile}")

Quantile of 95% coverage is 0.8


In [None]:
#@title Validation

val_no = 1 #@param {allow-input:true, type:"integer"}
NL_Task = reordered_val[val_no-1]["nltask"]
api_call_per_step = 10 #@param {allow-input:true, type:"integer"}

# Similarity above which two responses are treated as the same answer.
SIMILARITY_THRESHOLD = 0.7

# Builds the formula for one validation task step by step. At each step the
# responses whose frequency is at least 1 - quantile form the prediction set;
# if that set is not a singleton the user is asked to pick the next element.
word = ""
if NL_Task not in nl_task_ltlequ_dict:
  print(f"{NL_Task} does not exist")

else:
  # At validation time, we don't assume the knowledge of the ground truth LTL formula.
  # Here we load the ground truth formula for automated answer checking purpose
  # and accuracy calculation.
  ltlequ = nl_task_ltlequ_dict[NL_Task]
  word = re.findall(r'[a-zA-TV-Z_0-9]+|<>|&&|\|\||\[\]|\(|\)|U|[^\s\w]', ltlequ)
  # store data
  validation_path = os.path.join(path, 'validation')
  os.makedirs(validation_path, exist_ok=True)
  foldername = os.path.join(validation_path, f'Validation_{len(os.listdir(validation_path))+1}')
  os.makedirs(foldername, exist_ok=True)
  print(f"folder name is {foldername}")

  data = []
  data_loc = os.path.join(foldername, 'data.npy')
  print(f"Data is going to be saved in {data_loc}")

  filename = os.path.join(foldername, 'params.txt')
  with open(filename, 'w') as f:
    my = CustomOutput(f)
    for k, v in params.items():
      my.write(f"{k}: {v}\n")

  filename = os.path.join(foldername, 'system_prompt.txt')
  with open(filename, 'w') as f:
    f.write(f"{system_description_prompt}")

  current_formula = ''
  system_message = formulator(role='user', text=system_description_prompt)
  messages = [system_message]
  messages.append(formulator(role='user', text=NL_Task))
  step_count = 1
  help_rate = 0
  while True:
    user_input = f"Formula so far: {current_formula}\nStep {step_count}:"
    messages.append(formulator(role='user', text=user_input))
    note = os.path.join(foldername, f"step_{step_count}.txt")
    # `with` closes the step log on every path, including the final `break`.
    with open(note, 'w') as f:
      my = CustomOutput(f)
      my.write(f"User input:\n{user_input} \n")
      results = {}
      responses = []
      for t in range(1, api_call_per_step+1):
        my.write(f"\nRunning api call number {t}")
        response, response_text = LLM(messages, params)
        valid, found_keys = check_validity(response_text)
        responses.append({'is_valid': valid, 'text': response_text, 'found_keys': found_keys})
        my.write(f"\n-------[Valid: {valid}]---------\n" + f"LLM Response {t}: " + response_text.replace('\n', '') + f"\nKeys Found: {found_keys}")
        if valid:
          results[found_keys[0]] = results.get(found_keys[0], 0) + 1
          # Only report a response as added when it was actually counted.
          print(f"\nAdded to {found_keys[0]}")
      print(f"The results vector is {results}")

      total = sum(results.values())
      valid_results = {key: value / total for key, value in results.items()}

      for k, v in valid_results.items():
        print(f"{k}: {v}\n")

      # Merge semantically similar responses. This must be a set: the previous
      # list raised AttributeError on `.add` as soon as two responses matched.
      keys_to_remove = set()
      for key1, key2 in combinations(results, 2):
        # Skip keys that were already merged away so no frequency is counted twice.
        if key1 in keys_to_remove or key2 in keys_to_remove:
          continue
        # Computed once: every call costs embedding requests.
        similarity = semantic_similarity(key1, key2)
        if similarity > SIMILARITY_THRESHOLD:
          print(f"High similarity {similarity} between '{key1}' and '{key2}'")
          if valid_results[key1] >= valid_results[key2]:
              valid_results[key1] += valid_results[key2]
              keys_to_remove.add(key2)
          else:
              valid_results[key2] += valid_results[key1]
              keys_to_remove.add(key1)

      # Remove marked keys from the dictionary
      for key in keys_to_remove:
        valid_results.pop(key, None)

      # Prediction set: every response that is frequent enough.
      pred_set = [k for k, v in valid_results.items() if v >= 1-quantile]

      if len(pred_set) == 1:
        chosen = pred_set[0]
      else:
        print(f"I  need help!\nThe NL task is {NL_Task}\nThe current formula is {current_formula}")
        chosen = None
        if pred_set:
          # Show the options before asking, and re-ask on an unusable answer.
          for i, option in enumerate(pred_set):
            print(f"{i+1}: {option}")
          while chosen is None:
            answer = input("Choose an option of the above (enter its number): ").strip()
            if answer.isdigit() and 1 <= int(answer) <= len(pred_set):
              chosen = pred_set[int(answer)-1]
            else:
              print(f"Please enter a number between 1 and {len(pred_set)}")
        else:
          # No response passed the threshold (or none was valid): there is
          # nothing to choose from, so the user supplies the element directly.
          while not chosen:
            chosen = input("No option is available. Type the next element of the formula: ").strip()
        help_rate += 1
      messages.append(formulator(role='assistant', text=chosen))

      current_formula += chosen
      my.write(f"\nNew Formula: {current_formula}\n")

      # Per-step record, saved to data_loc once the formula is complete.
      data.append({'step'       : step_count,
                   'nltask'     : NL_Task,
                   'llm_output' : responses,
                   'valid_count': total,
                   'probs'      : valid_results,
                   'pred_set'   : pred_set,
                   'chosen'     : chosen})
      step_count += 1

      if current_formula[-1] == '/':
        my.write(f"The current formula is now ended with /, exiting...")
        break

  np.save(data_loc, np.array(data))

  print(f"\n{NL_Task}")
  print(f"The helps asked: {help_rate} and step count : {step_count-1}")
  correct = True
  if ltlequ != current_formula:
    # A formula that differs textually may still be acceptable; let the user decide.
    useranswer = input(f"Formula so far:{current_formula}\nEnter 'no' if you think it is wrong: ")
    if useranswer == 'no':
      correct = False
  print(f"The final formula is {'correct' if correct else 'wrong'}")
  ######
  #Add validation data into json
  json_filename = os.path.join(path, 'result.json')
  if os.path.isfile(json_filename):
    with open(json_filename, 'r') as f:
        jsondata = json.load(f)
  else:
    jsondata = []
  new_dict = {
      "nlTask": NL_Task,
      "ltlequ": ltlequ,
      "api_call_per_step": api_call_per_step,
      'help': help_rate,
      "correct": correct,
      'total_step': step_count -1
  }

  print(f"Element added:")
  for k, v in new_dict.items():
      print(f"{k}: {v}")

  jsondata.append(new_dict)
  with open(json_filename, 'w') as f:
      json.dump(jsondata, f, indent=4) # Use indent for pretty printing

  print(f"New data length: {len(jsondata)}")

  # Aggregate help rate and accuracy over every run stored in result.json.
  # Separate names avoid shadowing the builtin `help` and the run-level
  # variables `data` and `correct`.
  total_help = sum(entry['help'] for entry in jsondata)
  total_steps = sum(entry['total_step'] for entry in jsondata)
  num_correct = sum(1 for entry in jsondata if entry['correct'])
  print(f"\n\n\nHelp Rate: {total_help/total_steps}\nAccuracy: {num_correct/len(jsondata)}")