In [5]:
import json
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Result files to analyse, one per evaluated model.
RESULTS_FILENAMES = [
    "llama_2_defense.json",
    "vicuna_defense.json",
    "gpt_3_5_defense.json",
    "gpt_4o_defense.json",
]
# Display names, index-aligned with RESULTS_FILENAMES (the two are paired by position).
MODEL_NAMES = [
    "LLaMA-2-7B",
    "Vicuna-13B",
    "GPT-3.5 Turbo",
    "GPT-4o",
]

# Load all results

In [8]:
def load_json(file_path, results_dir="../results"):
    """Load and parse a JSON results file.

    Args:
        file_path: Name (or relative path) of the JSON file inside
            ``results_dir``.
        results_dir: Directory that holds the results files. Defaults to
            ``"../results"``, resolved against the current working directory.

    Returns:
        The deserialized JSON content, as produced by ``json.load``.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which can mis-decode non-ASCII text on some systems.
    with open(f"{results_dir}/{file_path}", "r", encoding="utf-8") as file:
        return json.load(file)

In [9]:
# Parsed contents of every results file, in the same order as RESULTS_FILENAMES.
all_results = list(map(load_json, RESULTS_FILENAMES))

In [10]:
# Approach names, taken from the "output" mapping of the first entry of the
# first results file only.
# NOTE(review): assumes every entry and every model reports the same approaches — confirm.
APPROACHES = set(all_results[0][0]["output"])

# Average Rating

In [12]:
def summarize_results(json_data, model_name):
    """Average, per approach, the best rating achieved on each entry.

    For every entry in ``json_data`` and every approach in its ``"output"``
    mapping, the highest ``"rating"`` among that approach's attempts is taken
    (0 when the approach has no attempts). Those per-entry maxima are then
    averaged per approach.

    Args:
        json_data: Iterable of entries. Each entry must provide an
            ``"output"`` mapping from approach name to a list of attempts,
            where each attempt is a mapping with a ``"rating"`` value.
        model_name: Not used in the computation; kept so existing callers
            keep working.

    Returns:
        dict mapping each approach name to the mean of its per-entry highest
        ratings. Empty when ``json_data`` is empty.

    Raises:
        KeyError: If an entry lacks ``"output"`` or an attempt lacks
            ``"rating"``.
    """
    best_by_approach = {}
    for entry in json_data:
        for approach, attempts in entry["output"].items():
            # Best attempt for this entry; an approach with no attempts counts as 0.
            highest_rating = max(
                (attempt["rating"] for attempt in attempts), default=0
            )
            best_by_approach.setdefault(approach, []).append(highest_rating)

    # Each list received at least one value above, so len(ratings) is never 0.
    return {
        approach: sum(ratings) / len(ratings)
        for approach, ratings in best_by_approach.items()
    }

In [13]:
# Per-model report: the model name, then one line per approach with the mean
# of the highest ratings, shown to one decimal place.
for model_name, results in zip(MODEL_NAMES, all_results):
    print(model_name)
    avg_ratings = summarize_results(results, model_name)
    for approach in avg_ratings:
        avg_rating = avg_ratings[approach]
        print(f"- [{approach}] Average rating: {avg_rating:.1f}/10")

LLaMA-2-7B
- [Defense] Average rating: 7.0/10
Vicuna-13B
- [Defense] Average rating: 10.0/10
GPT-3.5 Turbo
- [Defense] Average rating: 8.5/10
GPT-4o
- [Defense] Average rating: 9.7/10
