In [None]:
import os
import time
import json
import openai
import pandas as pd
os.sys.path.append("..")
from templates import jailflip_judge_prompt

# Maximum number of judge-API attempts per item in the evaluation loop.
RETRY_TIMES = 3
# Parallel lists: entry i of API_MODEL_NAME labels entry i of API_MODEL_VERSION.
API_MODEL_NAME = ["gpt", "claude", "gemini"]
API_MODEL_VERSION = ["gpt-4o-2024-11-20", "claude-3-7-sonnet-20250219-low", "gemini-2.0-flash"]

# Index into the two lists above; -1 selects the last entry ("gemini" / "gemini-2.0-flash").
MODEL_INDEX = -1
model_ver  = API_MODEL_VERSION[MODEL_INDEX]  # passed to the API as the `model` argument
model_name = API_MODEL_NAME[MODEL_INDEX]     # used only in log messages in this notebook

METHOD = 'DirectQuery'  # NOTE(review): not referenced elsewhere in the visible notebook
# A judge reply counts as well-formed when it contains at least one of these markers.
OUTPUT_OPTIONS = ["[True]", "[False]"]
# Judge prompt template; later filled via .format(question=..., gt=..., response=...).
template = jailflip_judge_prompt
# Benchmark table; its 'gt' and 'question' columns are read when building the judge inputs.
df = pd.read_csv(os.path.join("..", "data", "jailflipbench_textual", "jailflipbench.csv"))

def check_output_in_format(x:str)->bool:
    """Return True when `x` contains at least one marker from OUTPUT_OPTIONS.

    The check is a plain substring test, so a reply that contains several
    markers is still accepted.
    """
    return any(marker in x for marker in OUTPUT_OPTIONS)

def get_single_response(model_name, input_str):
    """Send `input_str` as a single user message and return the reply text.

    Args:
        model_name: value forwarded to the API as the `model` argument.
        input_str: full prompt, sent as the content of one "user" message.

    Returns:
        The `content` field of the first choice's message in the response.

    NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface —
    confirm the pinned `openai` version, or replace this body with your own
    API format if your provider uses a different one.
    """
    # Standard OpenAI chat format: one user turn, capped at 512 completion tokens.
    user_turn = {"role": "user", "content": input_str}
    completion = openai.ChatCompletion.create(
        model=model_name,
        messages=[user_turn],
        max_completion_tokens=512,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

### First, configure the API settings and specify the path of the result file to be evaluated

In [None]:
# Example result file path; change this to point at your own results JSON.
result_root = "../direct_query/results_DirectQuery_model_gpt-4.1.json"

# Read as UTF-8 explicitly instead of relying on the platform's default encoding.
with open(result_root, encoding="utf-8") as f:
    log = json.load(f)
gt_list = df['gt'].to_list()
q_list = df['question'].to_list()
r_list = log['output_list']
# zip() stops at the shortest input, so a length mismatch would otherwise
# silently drop trailing items; surface it instead.
if len(r_list) != len(q_list):
    print(f"Warning: {len(r_list)} responses in {result_root} but {len(q_list)} benchmark questions; only the first {min(len(r_list), len(q_list))} pairs will be judged.")
# One fully formatted judge prompt per (ground truth, question, response) triple.
input_list = [template.format(question=q, gt=gt, response=r) for gt, q, r in zip(gt_list, q_list, r_list)]
# None marks "not judged yet"; the evaluation loop skips entries that are already filled.
output_list = [None for _ in range(len(input_list))]

### Then, generate the judge's evaluation for each response

In [None]:
# Judge every pending item, retrying up to RETRY_TIMES on API errors or
# malformed replies. Entries that are already filled are skipped, so this cell
# can be re-run to resume after an interruption.
for idx, q in enumerate(input_list):
    if output_list[idx] is not None:
        continue
    for _ in range(RETRY_TIMES):
        try:
            textual_output = get_single_response(model_ver, q)
            format_ok = check_output_in_format(textual_output)
        except Exception as e:
            print(f"Other error, index {idx} model {model_name}, {e}")
            time.sleep(1)
            continue
        if format_ok:
            output_list[idx] = textual_output
            # Stop at the first well-formed reply; without this the loop kept
            # calling the API and overwrote the stored answer.
            break
        print(f"For index {idx} model {model_name}, target options reflected wrong in the output #{textual_output[:30]}#, re-generate...")
        time.sleep(1)
    else:
        # Every attempt failed; the entry stays None and is stored as null.
        print(f"Giving up on index {idx} model {model_name} after {RETRY_TIMES} attempts")

### Finally, store the results in JSON format

In [None]:
# Derive the output path from the file name only, so directory names that
# happen to contain "results_" are left untouched.
result_dir, result_file = os.path.split(result_root)
judged_file = result_file.replace("results_", "judged_")
if judged_file == result_file:
    # No "results_" marker in the file name: prefix it instead, so the input
    # result file is never overwritten by the judged output.
    judged_file = "judged_" + result_file
judged_root = os.path.join(result_dir, judged_file)

# ensure_ascii=False emits raw non-ASCII characters, so write UTF-8 explicitly
# rather than relying on the platform's default encoding.
with open(judged_root, "w", encoding="utf-8") as f:
    json.dump({'input_list':input_list, 'output_list':output_list}, f, indent=4, ensure_ascii=False)