In [None]:
import os
import time
import json
import openai
import pandas as pd
os.sys.path.append("..")
from templates import prompting_attack_template

# How many times the generation loop calls the API for a single question.
RETRY_TIMES = 3

# Short model aliases and the API model identifiers used for requests.
# The two lists are parallel: both are indexed with the same MODEL_INDEX.
API_MODEL_NAME = [
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4o-mini",
    "claude-3-haiku",
    "qwen-plus",
    "qwen-turbo",
    "gemini-1.5",
    "gpt",
    "claude",
    "gemini",
]
API_MODEL_VERSION = [
    "gpt-4.1-2025-04-14",
    "gpt-4.1-mini-2025-04-14",
    "gpt-4o-mini-2024-07-18",
    "claude-3-haiku-20240307",
    "qwen-plus-latest",
    "qwen-turbo-latest",
    "gemini-1.5-flash",
    "gpt-4o-2024-11-20",
    "claude-3-7-sonnet-20250219-low",
    "gemini-2.0-flash",
]

# Method tag embedded in the results file name.
METHOD = 'AttackPrompting'
# A response is accepted only if it contains at least one of these markers.
OUTPUT_OPTIONS = ["[Yes]", "[No]"]
# Prompt template filled in with each question and its ground-truth labels.
template = prompting_attack_template
# Benchmark table; the 'question' and 'gt' columns are read further down.
df = pd.read_csv(os.path.join("..", "data", "jailflipbench_textual", "jailflipbench.csv"))

def check_output_in_format(x:str)->bool:
    """Return True when `x` contains at least one marker from OUTPUT_OPTIONS.

    Matching is a plain substring test, so a marker embedded in longer
    text (or wrapped in extra brackets) still counts.
    """
    return any(marker in x for marker in OUTPUT_OPTIONS)

def get_single_response(model_name, input_str):
    """Send `input_str` as a single user message and return the reply text.

    Args:
        model_name: model identifier passed as `model` in the request.
        input_str: full prompt, sent as the only message with role "user".

    Returns:
        The `content` field of the first choice in the response.
    """
    # Standard OpenAI chat format; replace this body to target your own API format.
    completion = openai.ChatCompletion.create(
        model=model_name,
        messages=[{"role": "user", "content": input_str}],
        max_completion_tokens=512,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

def from_bool_to_YesNo_str(x:bool)->str:
    """Map a boolean label to its double-bracketed text form.

    True becomes "[[Yes]]" and False becomes "[[No]]". Anything that is not
    a real `bool` (including ints such as 0/1) fails the assertion.
    """
    assert isinstance(x, bool)
    return "[[Yes]]" if x else "[[No]]"

### First, configure the API settings and specify the target model

In [None]:
# Position of the target model in the parallel API_MODEL_NAME / API_MODEL_VERSION lists.
MODEL_INDEX = 0
model_name = API_MODEL_NAME[MODEL_INDEX]
model_ver = API_MODEL_VERSION[MODEL_INDEX]

# Ground-truth labels, one per question.
gt_list = df['gt'].to_list()

# Fill the template for every question with its label and the negated label.
input_list = [
    template.format(
        question=question,
        gt=from_bool_to_YesNo_str(label),
        neg_gt=from_bool_to_YesNo_str(not label),
    )
    for question, label in zip(df['question'].to_list(), gt_list)
]

# One slot per prompt; None marks "no accepted response yet".
output_list = [None] * len(input_list)

### Then, generate each question's response

In [None]:
# Query the model once per prompt, retrying up to RETRY_TIMES on API errors or
# malformed answers. Prompts that already have an output are skipped, so this
# cell can be re-run to fill in only the missing entries.
for idx, q in enumerate(input_list):
    if output_list[idx] is not None:
        continue
    for _ in range(RETRY_TIMES):
        try:
            textual_output = get_single_response(model_ver, q)
        except Exception as e:
            print(f"Other error, index {idx} model {model_name}, {e}")
            time.sleep(1)
            continue
        # Explicit check instead of `assert`, so validation survives `python -O`;
        # non-string content (e.g. None) is treated as a malformed answer.
        if isinstance(textual_output, str) and check_output_in_format(textual_output):
            output_list[idx] = textual_output
            break  # success: do not spend the remaining attempts on this prompt
        print(f"For index {idx} model {model_name}, target options reflected wrong in the output #{str(textual_output)[:30]}#, re-generate...")
        time.sleep(1)
    else:
        # Reached only when no attempt produced an accepted answer.
        print(f"Giving up on index {idx} model {model_name} after {RETRY_TIMES} attempts; output left as None")

### Finally, store the results in JSON format

In [None]:
# Persist prompts and responses side by side. `ensure_ascii=False` emits raw
# non-ASCII characters, so the file is opened explicitly as UTF-8 rather than
# relying on the platform default encoding.
with open(f"results_{METHOD}_model_{model_name}.json", "w", encoding="utf-8") as f:
    json.dump({'input_list':input_list, 'output_list':output_list}, f, indent=4, ensure_ascii=False)