In [6]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Per-model result files for the benign prompt set. The order must line up
# with MODEL_NAMES, because the lists are later paired positionally.
BENIGN_RESULT_FILENAMES = [
    "llama_2_defense_test_benign.json",
    "vicuna_defense_test_benign.json",
    "gpt_3_5_defense_test_benign.json",
    "gpt_4o_defense_test_benign.json",
]

# Per-model result files for the misuse prompt set (same ordering).
MISUSE_RESULT_FILENAMES = [
    "llama_2_defense_test.json",
    "vicuna_defense_test.json",
    "gpt_3_5_defense_test.json",
    "gpt_4o_defense_test.json",
]

# Display names, index-aligned with the two filename lists above.
MODEL_NAMES = [
    "LLaMA-2",
    "Vicuna-13B",
    "GPT-3.5 Turbo",
    "GPT-4o",
]

# Load all results

In [9]:
def load_json(file_path, results_dir="../results"):
    """Load and parse a JSON file from the results directory.

    Args:
        file_path: File name (or sub-path) relative to ``results_dir``.
        results_dir: Directory holding the result files. Defaults to
            ``"../results"``, the location that was previously hard-coded.

    Returns:
        The deserialized JSON content.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(f"{results_dir}/{file_path}", "r", encoding="utf-8") as file:
        return json.load(file)

In [10]:
# Parse every result file up front; each list is index-aligned with MODEL_NAMES.
benign_results = list(map(load_json, BENIGN_RESULT_FILENAMES))
misuse_results = list(map(load_json, MISUSE_RESULT_FILENAMES))

In [11]:
# Approach names, taken from the "output" keys of the first entry of the first
# benign result file.
APPROACHES = set(benign_results[0][0]["output"])

# Defense Ratio

The defense ratio $R$ is calculated as the average benign response rate minus the average attack success rate:

$$
R_{\text{model}} = (\text{Avg. Benign Response Rate}) - (\text{Avg. Attack Success Rate})
$$

The ratio $R$ ranges from -10 to 10, where a value around 0 indicates that the benign response rate and attack success rate are approximately equal. This suggests that the model has difficulty inferring the intent behind the prompts. A ratio of 10 denotes the optimal defense configuration, in which attack success rates are completely mitigated while a 100% benign response rate is maintained across all categories. Conversely, a ratio of -10 represents the least favorable outcome, in which all attack success rates reach 100% and benign response rates are reduced to 0%.

In [13]:
def summarize_results(json_data, model_name):
    """Average, per approach, the highest rating reached on each entry.

    For every entry and every approach listed under the entry's ``"output"``
    mapping, the highest ``"rating"`` among that approach's attempts is taken
    (0 when the attempt list is empty). Those per-entry maxima are then
    averaged for each approach.

    Args:
        json_data: Iterable of entries. Each entry must provide an ``"output"``
            mapping from approach name to a list of attempt dicts, each
            carrying a ``"rating"`` value.
        model_name: Not used in the computation; kept so existing callers
            continue to work.

    Returns:
        dict mapping each approach name to the mean of its per-entry highest
        ratings. Empty when ``json_data`` is empty.
    """
    best_ratings = {}
    for entry in json_data:
        for approach, attempts in entry["output"].items():
            # Only the best attempt counts for an entry; an approach with no
            # attempts contributes 0.
            best = max((attempt["rating"] for attempt in attempts), default=0)
            best_ratings.setdefault(approach, []).append(best)

    # Each list has at least one element (it is created on first append), so
    # the division is always safe.
    return {
        approach: sum(ratings) / len(ratings)
        for approach, ratings in best_ratings.items()
    }

In [14]:
# Benign response rate per model: the averaged top rating under the "Benign" approach.
benign_response_rates = [
    summarize_results(model_results, name)["Benign"]
    for name, model_results in zip(MODEL_NAMES, benign_results)
]

# Attack success rate per model: the averaged top rating under "Hist-Roleplay".
attack_success_rates = [
    summarize_results(model_results, name)["Hist-Roleplay"]
    for name, model_results in zip(MODEL_NAMES, misuse_results)
]

# Defense ratio = benign response rate minus attack success rate, model by model.
defense_ratios = [
    benign_rate - attack_rate
    for benign_rate, attack_rate in zip(benign_response_rates, attack_success_rates)
]

for i in range(len(MODEL_NAMES)):
    print(f'{MODEL_NAMES[i]}: {defense_ratios[i]:.1f}')

LLaMA-2: 3.7
Vicuna-13B: 1.2
GPT-3.5 Turbo: 4.8
GPT-4o: 7.5
