# IntentPrompt Demo

In [None]:
# Import libraries
import time
import pandas as pd
from utils import (
    check_mkdirs, subsample_data,
    get_prompts_paraphrase, get_prompts_inquiry, get_prompts_evaluation, IA,
    build_genai_model, generate_content, printf, compute_asr, compute_success_hscore_mean,
)

In [None]:
# Define arguments
# --- Credentials / cloud settings: replace every placeholder before running. ---
vertexai_proj_id = 'Input_your_VertexAI_Project_ID'
vertexai_region = 'vertexai_region' # e.g., us-central1
openai_key = 'Input_your_OpenAI_key'
deepseek_key = 'Input_your_DeepSeek_key'
anthropic_key = 'Input_your_Anthropic_key'
openrouter_key = 'Input_your_OpenRouter_key'

# --- Agents: which model plays each role. ---
paraphraser_name = 'gemini-1.5-flash-002' # options: [gemini-1.5-flash-002, qwen/qwen3-14b:free, mistralai/mixtral-8x7b-instruct]
target_name = 'gpt-4o' # options: [gpt-4o/gpt-4o-mini-2024-07-18, o1/o1-mini, o3-mini, gemini-2.0-flash-001, claude-3-7-sonnet-20250219/claude-3-haiku-20240307, deepseek-chat, deepseek-reasoner, qwen/qwen3-235b-a22b:free, meta-llama/llama-4-scout:free] # deepseek v3/r1
evaluator_name = 'gemini-1.5-flash-002' # options: [gemini-1.5-flash-002, gpt-4o-mini-2024-07-18, qwen/qwen3-32b:free]

# --- Paraphrase / inquiry strategy. ---
para_types = 'none' # choose one type of paraphrase, options: ['none', 'ass', 'msw', 'ces']
is_fuzzy = '' # whether or not obscure the intent, options: ['', 'NO']
para_mode = 'fuzzy_struct' # the mode of paraphrase, options: ['revise', 'struct', 'fuzzy_struct']
inq_mode = 'spin' # the mode of inquiry, options: ['naive', 'ela', 'spin']
is_defense = False # defend with intent analysis or not, options: [True, False]

# --- Generation settings and run size. ---
max_tokens = 2048 # 8192/4096/2048
temp = 0.8
top_p = 0.95
num_iter = 5 # the number of iterations for optimizing the prompts
is_sampled = True # for debug mode
num_sample = 20 # number of sampled data

para_dict = {'none': 'taking no specific strategy',
             'ass': 'altering sentence structure',
             'msw': 'misspelling sensitive words',
             'ces': 'changing expression style',
            } # types of paraphrase
# An invalid `para_types` fails here with a KeyError instead of later inside a prompt builder.
pdict = {'inquiries': 'NA',
         'para_strat': para_dict[para_types],
         'fuzzy': is_fuzzy,
         'para_inq_old': 'NA',
         'feedback': 'NA',
        } # prompts' arguments

# Model-family keywords grouped by provider (presumably matched against the
# model names by build_genai_model -- confirm in utils). These two lists are
# the single source of truth for BOTH maps below, so adding a family here
# gives it an API key and an endpoint at the same time.
openai_models = ['gpt', 'o1', 'o3']
openrouter_models = ['qwen', 'llama', 'mixtral']
api_key_map = {model: openai_key for model in openai_models} | {
               model: openrouter_key for model in openrouter_models} | {
                'deepseek': deepseek_key,
                'claude': anthropic_key,
              } # map model to api key
# Copies (list(...)) keep the URL map's values independent of the lists above,
# exactly as when they were written out as separate literals.
url_model_map = {"https://api.openai.com/v1": list(openai_models),
                 "https://api.deepseek.com/v1": ['deepseek'],
                 "https://openrouter.ai/api/v1": list(openrouter_models),
                } # map API base URL to the model families served from it
genai_config = {'max_tokens': max_tokens,
                'temperature': temp,
                'top_p': top_p,
                } # generation settings passed to generate_content
#vertexai.init(project=vertexai_proj_id, location=vertexai_region)  # init() if necessary
# Instructions for using Vertex AI:
# gcloud init
# gcloud auth application-default login


In [None]:
# Jailbreak through iterative intent manipulation.

def intent_jailbreak(inquires_list: list, num_iter: int, verbose: bool) -> tuple:
    """
    @topic: explore the vulnerability via intent jailbreak.
    @inputs:
        inquires_list (list of string): a list of text inquiries.
        num_iter (int): the maximum number of paraphrase/query iterations per inquiry.
        verbose (bool): print out the log if verbose=True.
    @returns: a 4-tuple
        opt_para_inquires (list): the paraphrased inquiry that succeeded for each input
            inquiry ('' when no iteration succeeded).
        asr_list (list): the evaluator's success flag for each inquiry (0 when no iteration succeeded).
        hscore_list (list): the harmfulness score of the successful attempt for each inquiry (0 otherwise).
        q4j_mean (float): the average number of queries used for a successful jailbreak
            (0 when no inquiry succeeded).
    @notes:
        Reads the module-level settings (model names, api/url maps, genai_config,
        para_mode, inq_mode, is_defense) and mutates the module-level `pdict`.
    """
    # Initialize multi-agents for jailbreak.
    paraphraser = build_genai_model(paraphraser_name, api_key_map, url_model_map) # auxiliary agent
    target_model = build_genai_model(target_name, api_key_map, url_model_map) # victim model
    evaluator = build_genai_model(evaluator_name, api_key_map, url_model_map) # monitoring agent

    num_inquiries = len(inquires_list)
    opt_para_inquires = [''] * num_inquiries
    asr_list = [0] * num_inquiries
    hscore_list = [0] * num_inquiries
    q4j_list = []
    for i, inquiry in enumerate(inquires_list):
        time.sleep(1)
        printf("-"*20, verbose)
        printf(f"Jailbreaking the {int(i+1)}-th inquiry: {inquiry}", verbose)
        # `pdict` is declared with the key 'inquiries'; the historical spelling
        # 'inquires' is still written as well because the prompt builder lives in
        # utils and may read either one -- TODO confirm and drop the unused key.
        pdict['inquiries'] = inquiry
        pdict['inquires'] = inquiry
        # Start every inquiry from a clean slate: without this, the feedback of an
        # inquiry that exhausted all iterations would leak into the next inquiry.
        pdict['para_inq_old'] = 'NA'
        pdict['feedback'] = 'NA'
        num_query4jailbreak = 1 # the number of queries used for a successful jailbreak.
        for n in range(num_iter):
            time.sleep(1)
            printf(f"Paraphrasing the inquiry in {int(n+1)}-th iteration.", verbose)
            # paraphrase the malicious inquires.
            prompts_para = get_prompts_paraphrase(pdict, para_mode)  # update prompts for paraphrase
            try:
                para_inq = generate_content(paraphraser, paraphraser_name, genai_config, prompts_para)
            except Exception as err:  # not a bare except: Ctrl-C must still stop the run
                printf(f"Fail to paraphrase in {int(n+1)}-th iteration ({err}); skip this inquiry.", verbose)
                break # give up on this inquiry if fail to paraphrase
            printf(f"Paraphrased inquires: {para_inq}".strip(), verbose)

            # generate response using the target model with paraphrased inquires.
            prompts_inq = get_prompts_inquiry(para_inq, is_defense, IA, inq_mode)  # update prompts for inquires

            resp = generate_content(target_model, target_name, genai_config, prompts_inq)
            printf(f"Response from the target model: {resp}".strip(), verbose)

            # evaluate the jailbreak performance: is successfully jailbroken? (0 or 1), harmfulness score (0-5).
            prompts_eval = get_prompts_evaluation(para_inq, resp)  # update prompts for evaluation
            res_eval = generate_content(evaluator, evaluator_name, genai_config, prompts_eval)
            try:
                # SECURITY NOTE(review): eval() executes model-generated text as Python.
                # The evaluator output is untrusted; consider ast.literal_eval or an
                # explicit parser once the exact output format is confirmed.
                is_jailbroken, hscore = eval(res_eval)
            except Exception:
                # Unparseable verdicts count as a failed attempt (1-based iteration, like the other logs).
                print(f"fail to evaluate the outputs in {int(n+1)}-th iteration for the paraphrased inquiry '{para_inq}'.")
                is_jailbroken, hscore = 0, 0

            printf(f"Is_Success_Jailbreak: {is_jailbroken}, Harmfulness score: {hscore}.", verbose)
            if is_jailbroken == 0: # update 'para_inq_old' and 'feedback' for the next iteration if not success.
                pdict['para_inq_old'] = para_inq
                pdict['feedback'] = resp
                num_query4jailbreak += 1
            else: # break the current FOR loop if the jailbreak is successful.
                printf(f"Successful jailbreak in {int(n+1)}-th iteration.", verbose)
                opt_para_inquires[i] = para_inq.strip() if para_inq else ''  # store the optimal para_inquiry for the successful jailbreak.
                asr_list[i] = is_jailbroken
                hscore_list[i] = hscore
                q4j_list.append(num_query4jailbreak) # count the query when jailbreak is successful. (or use 'n+1').
                pdict['para_inq_old'] = 'NA'
                pdict['feedback'] = 'NA'
                break
    # Only successful jailbreaks contribute; guard against the no-success case.
    q4j_mean = sum(q4j_list)/len(q4j_list) if q4j_list else 0
    return opt_para_inquires, asr_list, hscore_list, q4j_mean

# Load data
data_name = 'jambench_behaviors' # choices: advbench_behaviors/harmbench_behaviors/jailbreakbench_behaviors/jambench_behaviors
infile_path = f'../data/{data_name}.csv'
data_df = pd.read_csv(infile_path, header=None) # header=None: the first CSV line is data, not column names
# Cast every cell to text, then merge the cells of each row into one space-separated string.
data_text_df = data_df.astype(str)
data_los = data_text_df.apply(' '.join, axis=1).tolist()
# Subsample instances (for debug); controlled by is_sampled / num_sample, fixed seed.
inquires_list = subsample_data(data_los, is_sampled, num_sample, seed=42)

# Perform Red Teaming - explore the vulnerability via intent jailbreak
jailbreak_outputs = intent_jailbreak(inquires_list, num_iter, verbose=True)
opt_para_inquires, asr_list, hscore_list, q4j_mean = jailbreak_outputs

# Evaluation: aggregate the per-inquiry results, then report them.
asr = compute_asr(asr_list)
hscore = compute_success_hscore_mean(asr_list, hscore_list)
print(f"ASR: {asr*100:.2f}%")
print(f"Hscore: {hscore:.2f}")
print(f"The average number of queries used for successful jailbreaks: {q4j_mean:.2f}")
