# Test data

In [1]:
import pandas as pd

## HarmBench

In [2]:
def _get_harmbench_behaviors():
    """Download the HarmBench text behaviors and return them as one DataFrame.

    The validation and test CSVs are read from a pinned commit of the
    HarmBench repository. Each frame gets a ``split`` column ("val" or
    "test") before the two are stacked, validation rows first, with a fresh
    0..n-1 index.

    Returns:
        pandas.DataFrame: all rows of both CSVs plus the ``split`` column.
    """
    # Pinned to a specific commit so the downloaded data cannot drift.
    dataset_root = (
        "https://raw.githubusercontent.com/centerforaisafety/HarmBench/"
        "e2d308a711b77f5e6096ef7991e6b295a14f79d8"
        "/data/behavior_datasets"
    )

    frames = []
    for split_name in ("val", "test"):
        frame = pd.read_csv(f"{dataset_root}/harmbench_behaviors_text_{split_name}.csv")
        frame["split"] = split_name
        frames.append(frame)

    stacked = pd.concat(frames)
    return stacked.reset_index(drop=True)

# Combined val + test behaviors, each row tagged with its "split".
df_behaviors = _get_harmbench_behaviors()

In [4]:
# Keep only test-split rows whose FunctionalCategory is "standard"
test_standard = df_behaviors.loc[
    (df_behaviors['FunctionalCategory'] == 'standard')
    & (df_behaviors['split'] == 'test')
]

# Tally the rows per SemanticCategory label and print the table
semantic_counts = test_standard['SemanticCategory'].value_counts()
print("SemanticCategory counts for test split with FunctionalCategory == 'standard':")
print(semantic_counts)

# Cell result: the sum of all per-category counts
semantic_counts.sum()

SemanticCategory counts for test split with FunctionalCategory == 'standard':
SemanticCategory
illegal                          47
cybercrime_intrusion             33
misinformation_disinformation    27
chemical_biological              19
harmful                          17
harassment_bullying              16
Name: count, dtype: int64


159

In [5]:
# output into a jsonl with question, id, and semantic_category
import json
import os

# Destination folder is created on demand (no error if it already exists)
os.makedirs("./data/questions", exist_ok=True)

# One record per behavior; "id" is the 0-based position within test_standard
output_data = [
    {
        "question": behavior,
        "id": position,
        "semantic_category": category,
    }
    for position, (behavior, category) in enumerate(
        zip(test_standard["Behavior"], test_standard["SemanticCategory"])
    )
]

# Serialize as JSONL: one JSON object per line
output_file = "./data/questions/harmbench.jsonl"
with open(output_file, "w") as sink:
    sink.writelines(json.dumps(record) + "\n" for record in output_data)

print(f"Exported {len(output_data)} questions to {output_file}")

Exported 159 questions to ./data/questions/harmbench.jsonl


## RP bench

In [3]:
import os
import json
import glob

# --- Configuration -----------------------------------------------------------
# Directory holding one JSON file per role.
ROLE_INSTRUCTIONS_DIR = "/root/git/persona-subspace/evals/data_generation/instructions"
# How many instruction prompts / questions to take from each role file.
N_PROMPTS_PER_ROLE = 5
N_QUESTIONS_PER_ROLE = 10
MAIN_OUTPUT_FILE = "./data/roles_20_long.jsonl"
BASELINE_OUTPUT_FILE = "./data/default_20_long.jsonl"
# The main (role-prompt) dataset is built but only written when this is True.
WRITE_MAIN_DATASET = False


def build_entries(role, prompts, questions):
    """Return one record per (prompt, question) pair, prompts varying slowest.

    Args:
        role: value stored under the "role" key of every record.
        prompts: sequence of system-prompt strings.
        questions: sequence of question strings.

    Returns:
        list[dict]: records with keys "role", "prompt_id", "question_id",
        "prompt" and "question"; the ids are 0-based positions in the inputs.
    """
    return [
        {
            "role": role,
            "prompt_id": prompt_id,
            "question_id": question_id,
            "prompt": prompt,
            "question": question,
        }
        for prompt_id, prompt in enumerate(prompts)
        for question_id, question in enumerate(questions)
    ]


def write_jsonl(path, items):
    """Write ``items`` to ``path`` as JSON Lines (one JSON object per line)."""
    with open(path, "w", encoding="utf-8") as f:
        for item in items:
            f.write(json.dumps(item) + "\n")


# Load all role JSON files. glob returns paths in arbitrary order, so sort them
# to make the order of the generated records reproducible across runs.
role_files = sorted(glob.glob(os.path.join(ROLE_INSTRUCTIONS_DIR, "*.json")))
# processing_summary.json lives in the same directory but is excluded here.
role_files = [f for f in role_files if not f.endswith("processing_summary.json")]

# Create output directory
os.makedirs("./data", exist_ok=True)

# Generate main dataset: 5 prompts × 10 questions per role
main_data = []
baseline_data = []

baseline_prompts = [
    "You are an AI assistant.",
    "You are a large language model.",
    "Respond as yourself.",
    "You are Gemma.",
    "",
]

for role_file in role_files:
    # Role name is the file name without its extension.
    role_name = os.path.splitext(os.path.basename(role_file))[0]

    with open(role_file, "r", encoding="utf-8") as f:
        role_data = json.load(f)

    # Extract the first 5 prompts (their "pos" variant) and the first 10 questions
    prompts = [instr["pos"] for instr in role_data["instruction"][:N_PROMPTS_PER_ROLE]]
    questions = role_data["questions"][:N_QUESTIONS_PER_ROLE]

    # Role-conditioned entries: this role's prompts × this role's questions
    main_data.extend(build_entries(role_name, prompts, questions))

    # Baseline entries: the same questions under each baseline prompt,
    # labelled with role "default"
    baseline_data.extend(build_entries("default", baseline_prompts, questions))

# Track what actually lands on disk so the summary below stays truthful.
written_files = []

# Write main dataset (disabled unless WRITE_MAIN_DATASET is set)
if WRITE_MAIN_DATASET:
    write_jsonl(MAIN_OUTPUT_FILE, main_data)
    written_files.append(MAIN_OUTPUT_FILE)

# Write baseline dataset
write_jsonl(BASELINE_OUTPUT_FILE, baseline_data)
written_files.append(BASELINE_OUTPUT_FILE)

print(f"Generated main dataset: {len(main_data)} entries")
print(f"Generated baseline dataset: {len(baseline_data)} entries")
print(f"Roles processed: {len(role_files)}")
print(f"Files: {', '.join(written_files)}")

Generated main dataset: 1000 entries
Generated baseline dataset: 1000 entries
Roles processed: 20
Files: ./data/role_steering_main.jsonl, ./data/role_steering_baseline.jsonl


## D