In [None]:
import os

import pandas as pd

# Define file names and model names
files = [
    "claude_bias_extracted_singles.csv",
    "gpt-4.1_bias_extracted_singles.csv",
    "gemini_bias_extracted_singles.csv",
    "llama_bias_extracted_singles.csv"
]

# Initialize an empty list to store dataframes
df_list = []

# Iterate through each file and model name
for file in files:
    # Extract model name from the file name
    model_name = file.split('_')[0]

    # Read the CSV file into a dataframe
    df = pd.read_csv(f"{file}")

    # Add a new column for the model name
    df['Model'] = model_name

    # Append the dataframe to the list
    df_list.append(df)

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(df_list, ignore_index=True)

# Sort the dataframe by 'Prompt_ID' and 'Attack Strategy'
sorted_df = combined_df.sort_values(by=['Prompt_ID'])

# Display the sorted dataframe
sorted_df.to_csv('combined_bias_extracted_singles.csv', index=False)

In [None]:
import pandas as pd

df = pd.read_csv('combined_violent_extracted_singles.csv')
print(len(df))

invalid_prompts = [
    "prompt:",
    "justification:",
    "summary:",
    "",
    "summary of process:",
    "style:",
    "seed prompt:",
    "'prompt:",
    "'justification:",
    "prompt 1",
    "prompt 2",
    "prompt 3",
    "prompt 4",
    "prompt 5",
    "prompt 1:",
    "prompt 2:",
    "prompt 3:",
    "prompt 4:",
    "prompt 5:",
    "seed prompt review",
    "understanding style of attack",
    "rewriting",
    "creativity/justification",
    "1. prompt",
    "2. prompt",
    "3. prompt",
    "4. prompt",
    "5. prompt",
    "**",
    "intended connotation:",
    "1. prompt:",
    "2. prompt:",
    "3. prompt:",
    "4. prompt:",
    "5. prompt:",
    "seed prompt analysis",
    "style of attack",
    "connotation",
    "style of writing",
    "style of writing:",
    "summary of reasoning",
    "prompts:",
    "s:**",
    "step 1:",
    "step 2:",
    "step 3:",
    "step 4:",
    "step 5:",
    "connotation target:",
    "method:",
    "style of attack:",
    "phrasal elements:",
    "context:",
    "1",
    "2",
    "3",
    "4",
    "5",
    "step-by-step approach",
    "summary of approach:",
    "connotation:",
    "'prompt'",
    "'justification'",
    "intent:",
    "prompts & justifications",
    "demographic indicators",
    "geography",
    "geography:",
    "prompt':",
    "geographic indicators:",
    "demographic indicators:",
    "approach:",
    "seed prompt",
    "goal:",
    "reasoning",
    "summary of style",
    "general approach:",
    "summary of style usage:",
    "process used",
    'step 1 (style):',
    'step 2 (style):',
    'step 3 (style):',
    'step 4 (style):',
    'step 5 (style):'
]

# Normalize Prompt column (lowercase and strip whitespace)
df['Prompt_normalized'] = df['Prompt'].astype(str).str.lower().str.strip()

# Filter out the rows where the normalized prompt matches any invalid value
df_cleaned = df[
    (~df['Prompt_normalized'].isin(invalid_prompts)) &
    (df['Prompt_normalized'].str.split().str.len() >= 2) &
   (df['Prompt_normalized'].str.len() > 2)
].copy()

# Optionally drop the temporary normalized column
df_cleaned.drop(columns=['Prompt_normalized'], inplace=True)

# View the cleaned DataFrame
print(len(df_cleaned))
df_cleaned.to_csv('combined_violent_extracted_singles_cleaned.csv', index=False)

collapsed_df = (
    df_cleaned.groupby(['Prompt_ID'])['Prompt']
    .apply(list)
    .reset_index()
    .rename(columns={'Prompt': 'Prompts'})
)


# Write the collapsed DataFrame to a JSON file
collapsed_df.to_json('collapsed_violent_prompts.json', orient='records')

4289
3774


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
import json

failures = ['bias', 'hate', 'sexual', 'violent']

for failure in failures:
    with open('collapsed_' + failure + '_prompts.json', 'r') as f:
        data = json.load(f)

    model = SentenceTransformer('all-mpnet-base-v2')

    new_data = []

    # Iterate through the list of prompt groups
    for group in data:
        temp = {}
        prompt_id = group['Prompt_ID']
        # attack_strategy = group['Attack_Strategy']
        prompts = group['Prompts']  # List of strings
        prompts = [prompt for prompt in prompts if len(prompt) > 15]

        embeddings = model.encode(prompts)
        num_clusters = 4

        if len(embeddings) < num_clusters:
            new_data.append({"prompt_id": prompt_id, "selected_prompts": prompts})
            continue

        kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)
        labels = kmeans.labels_
        selected_prompts = []

        for cluster_id in range(num_clusters):
            cluster_indices = np.where(labels == cluster_id)[0]
            centroid = kmeans.cluster_centers_[cluster_id]
            # Find the closest sentence to the centroid
            distances = np.linalg.norm(embeddings[cluster_indices] - centroid, axis=1)
            closest_index = cluster_indices[np.argmin(distances)]
            selected_prompts.append(prompts[closest_index])

        # print(selected_prompts)
        new_data.append({"prompt_id": prompt_id, "selected_prompts": selected_prompts})

    with open('image_ready_' + failure + '_prompts.json', 'w') as f:
        json.dump(new_data, f, indent=4)