In [None]:
# @title Seed Only - Claude
import json
import re

# pandas is imported here so that this cell runs on its own; pd.DataFrame is
# used below and the cell previously relied on an import from elsewhere.
import pandas as pd

# Load the JSON content: a mapping of prompt id -> raw model output text.
with open("claude_SO_bias_1.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Whitespace-normalised, single-line copy of every extracted prompt.
all_prompts = []
# One record per extracted prompt; these become the rows of the DataFrame.
rows = []

# Flatten prompts from all context items
for prompt_id, text in data.items():
    # First look for quoted prompt entries labeled as "Prompt x"; DOTALL lets
    # a quoted prompt span several lines.
    matches = re.findall(r"(?:\*\*|#)?Prompt\s*\d+\**:?\s*\"(.*?)\"", text, re.DOTALL)
    if not matches:
        # If prompts are not quoted, fall back to taking the rest of the line
        # after the "Prompt x" label.
        matches = re.findall(r"(?:\*\*|#)?Prompt\s*\d+\**:?\s*(.*)", text)

    for prompt in matches:
        # Collapse all runs of whitespace (including newlines) to one space.
        all_prompts.append(' '.join(prompt.strip().split()))
        # NOTE(review): the exported row keeps the raw match, not the
        # whitespace-normalised text above - confirm that this is intended.
        rows.append({
            'Prompt_ID': prompt_id,
            'Prompt': prompt
        })

df = pd.DataFrame(rows)
df

# NOTE(review): the output names contain a space after "claude_SO_"; this looks
# like a typo but is kept as-is because downstream code may rely on the name.
df.to_json("claude_SO_ " + "bias" + "_extracted_singles.json", orient="records", force_ascii=False)
df.to_csv("claude_SO_ " + "bias" + "_extracted_singles.csv", index = False)


# # # Write to output JSON
# with open("claude_SO_bias_extracted.json", "w", encoding="utf-8") as f:
#     json.dump(result, f, ensure_ascii=False, indent=4)

In [None]:
# @title Seed Only - All other models
import re
import json
import pandas as pd

import re

def extract_prompts_from_block(text):
    """
    Extracts up to 5 distinct prompt strings from a text block using regex.

    The patterns below are tried in order. Every non-empty capture that has
    not been seen yet is collected, and the search stops as soon as 5
    prompts have been gathered.

    Args:
        text: Raw text of one model response. An empty or None value yields
            an empty list.

    Returns:
        A list of at most 5 unique captured strings, in the order found.
    """
    if not text:
        return []

    patterns = [
        # Pattern for Markdown-style header followed by text
        r'##\s*Prompt\s*\d+:\s*\n(.*?)(?=\n##\s*Justification:|$)',

        # Pattern for 'Prompt X': followed by text
        r"'Prompt\s*\d+':\s*(.*?)\s*'Justification':",

        # Pattern for Markdown-style header followed by quoted or bolded text
        r'###\s*Prompt\s*\d+:\s*\n["“](.*?)["”]|\*\*(.*?)\*\*',

        # Pattern for Markdown-style header followed by plain text
        r'###\s*Prompt\s*\d+:\s*\n(.*?)\n\*\*Justification:',

        # Pattern for **1. "Prompt": followed by text
        r'\*\*\d+\.\s*"Prompt":\s*(.*?)\s*Justification:',

        # Pattern for "Prompt": followed by quoted text
        r'"Prompt":\s*(.*?)\s*(?="Justification":)',

        # Pattern for Prompt X: followed by text on the same line
        r'Prompt\s*\d+:\s*(.*?)\s*Justification:',

        # Pattern for Prompt X: followed by text on the next line
        r'Prompt\s*\d+:\s*\n(.*?)\nJustification:',

        # Pattern for Prompt X: with quoted text, handling multiline quotes
        r'Prompt\s*\d+:\s*["“](.*?)["”](?:\n|\r|$)',

        # Pattern for **1. Prompt** style with bold
        r'\*\*\d+\.\s*Prompt\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for *Prompt* style with asterisks
        r'\*Prompt\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **'Prompt X'** style with bold and single quotes
        r'\*\*\'Prompt\s*\d+\'\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **1. 'Prompt'** style with bold and single quotes
        r'\*\*\d+\.\s*\'Prompt\'\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **'Prompt'** style with bold and single quotes
        r'\*\*\'Prompt\'\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **Prompt** without quotes
        r'\*\*Prompt\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **Prompt X:** followed by quoted or unquoted text
        r'^\s*\*\*Prompt\s*\d*[:\*]*\s*["\'“”]?(.*?)["\'“”]?(?:\n|\r|$)',

        # Pattern for other prompt styles without explicit **Prompt: prefix
        r'^\s*\d+\.\s*["\'“”]?(.*?)["\'“”]?\s*$',

        # Generic pattern for capturing any line starting with **Prompt:
        r'\*\*Prompt:\*\*\s*["\'“”]?(.*?)["\'“”]?(?:\n|\r|$)',

        # Pattern for 'Prompt' key style
        r'\'Prompt\'\s*:\s*["\'“”]?(.*?)["\'“”]?(?:\n|\r|$)',

        # Another generic pattern
        r'\bPrompt:\s*["\'“”]?(.*?)["\'“”]?(?:\n|\r|$)',

        # Pattern for **Prompt X**: "text"
        r'\*\*Prompt\s*\d+\*\*:\s*"(.*?)"',

        # Pattern for **Prompt X**: 'text'
        r'\*\*Prompt\s*\d+\*\*:\s*\'(.*?)\'',

        # Pattern for Prompt X: "text"
        r'Prompt\s*\d+:\s*"(.*?)"',

        # Pattern for Prompt X: 'text'
        r'Prompt\s*\d+:\s*\'(.*?)\'',
    ]

    max_prompts = 5
    prompts = []
    for pattern in patterns:
        for match in re.findall(pattern, text, flags=re.MULTILINE):
            if isinstance(match, tuple):
                # findall returns a tuple when the pattern has several groups.
                # The only such pattern is an alternation, so exactly one group
                # is filled per match; take whichever one it is. Reading only
                # the second group used to discard the quoted-text branch.
                cleaned = next((group for group in match if group), '')
            else:
                cleaned = match

            if cleaned and cleaned not in prompts:
                prompts.append(cleaned)
                if len(prompts) >= max_prompts:
                    # Cap reached: no need to try any further patterns.
                    return prompts

    return prompts

def extract_prompts_to_df(data):
    """
    Converts a flat dict {prompt_id: block} to a DataFrame with
    Prompt_ID and Prompt columns.

    Each text block is passed to extract_prompts_from_block and every
    returned prompt becomes one row. Captures that are only a bare label
    ("Prompt", "Prompt 1" .. "Prompt 5", "Justification", compared
    case-insensitively) are dropped.

    Args:
        data: Mapping of prompt id to the raw text block for that id.

    Returns:
        A pandas DataFrame with the columns Prompt_ID and Prompt. The columns
        are present even when no prompt was extracted.
    """
    # Bare labels that can come back from extraction instead of prompt text.
    label_only = {
        'prompt', 'prompt 1', 'prompt 2', 'prompt 3', 'prompt 4', 'prompt 5',
        'justification',
    }

    rows = []
    for prompt_id, text in data.items():
        for prompt in extract_prompts_from_block(text):
            if prompt.lower() in label_only:
                continue
            rows.append({
                'Prompt_ID': prompt_id,
                'Prompt': prompt
            })

    # Naming the columns keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=['Prompt_ID', 'Prompt'])

# input_fname = 'claude_bias_extracted_prompts.json'
# with open(input_fname, 'r', encoding='utf-8') as f:
#     data = json.load(f)
#     f.close()

# extracted_df = extract_prompts_to_df(data)
# len(extracted_df)
# extracted_df

import os

content_dir = '/content/'
json_dir = '/content/json_prompts/'
csv_dir = '/content/csv_prompts/'

# Make sure the output folders exist before anything is written to them.
os.makedirs(json_dir, exist_ok=True)
os.makedirs(csv_dir, exist_ok=True)

# Only regular .json files are inputs. Filtering (instead of removing known
# folder names one by one) does not fail when an entry is absent and ignores
# other files in the folder, such as the .zip archives created below.
fnames = [
    name for name in os.listdir(content_dir)
    if name.endswith('.json') and os.path.isfile(os.path.join(content_dir, name))
]

for fname in fnames:
    print(fname)
    # Input names are split on "_" into four parts; the first is used as the
    # model and the third as the failure category, e.g. llama_SO_bias_1.json.
    parts = fname.split('_')
    if len(parts) != 4:
        print('Skipping file with unexpected name: ' + fname)
        continue
    model, _, failure, _ = parts

    # Open via the full path so the cell does not depend on the working directory.
    with open(os.path.join(content_dir, fname), 'r', encoding='utf-8') as f:
        data = json.load(f)

    extracted_df = extract_prompts_to_df(data)
    extracted_df.to_json(json_dir + model + '_' + failure + "_extracted_singles.json", orient="records", force_ascii=False)
    extracted_df.to_csv(csv_dir + model + '_' + failure + "_extracted_singles.csv", index = False)

!zip -r json_prompts.zip json_prompts
!zip -r csv_prompts.zip csv_prompts



gpt-4.1_SO_violent_1.json
gpt-4.1_SO_hate_1.json
llama_SO_bias_1.json
gpt-4.1_SO_sexual_1.json
gemini_SO_sexual_1.json
gemini_SO_bias_1.json
gpt-4.1_SO_bias_1.json
gemini_SO_violent_1.json
llama_SO_violent_1.json
gemini_SO_hate_1.json
llama_SO_sexual_1.json
llama_SO_hate_1.json
  adding: json_prompts/ (stored 0%)
  adding: json_prompts/gpt-4.1_violent_extracted_singles.json (deflated 77%)
  adding: json_prompts/llama_hate_extracted_singles.json (deflated 80%)
  adding: json_prompts/.ipynb_checkpoints/ (stored 0%)
  adding: json_prompts/gemini_violent_extracted_singles.json (deflated 73%)
  adding: json_prompts/gemini_bias_extracted_singles.json (deflated 72%)
  adding: json_prompts/gemini_hate_extracted_singles.json (deflated 71%)
  adding: json_prompts/gemini_sexual_extracted_singles.json (deflated 72%)
  adding: json_prompts/gpt-4.1_sexual_extracted_singles.json (deflated 78%)
  adding: json_prompts/llama_bias_extracted_singles.json (deflated 81%)
  adding: json_prompts/gpt-4.1_bias_

In [None]:
# @title Hybrid - Claude, Llama, GPT, Gemini
import re
import json
import pandas as pd

import re

def extract_prompts_from_block(text):
    """
    Extracts up to 5 distinct prompt strings from a text block using regex.

    The patterns below are tried in order. Every non-empty capture that has
    not been seen yet is collected, and the search stops as soon as 5
    prompts have been gathered.

    Args:
        text: Raw text of one model response. An empty or None value yields
            an empty list.

    Returns:
        A list of at most 5 unique captured strings, in the order found.
    """
    if not text:
        return []

    patterns = [
        # Pattern for Markdown-style header followed by text
        r'##\s*Prompt\s*\d+:\s*\n(.*?)(?=\n##\s*Justification:|$)',

        # Pattern for 'Prompt X': followed by text
        r"'Prompt\s*\d+':\s*(.*?)\s*'Justification':",

        # Pattern for Markdown-style header followed by quoted or bolded text
        r'###\s*Prompt\s*\d+:\s*\n["“](.*?)["”]|\*\*(.*?)\*\*',

        # Pattern for Markdown-style header followed by plain text
        r'###\s*Prompt\s*\d+:\s*\n(.*?)\n\*\*Justification:',

        # Pattern for **1. "Prompt": followed by text
        r'\*\*\d+\.\s*"Prompt":\s*(.*?)\s*Justification:',

        # Pattern for "Prompt": followed by quoted text
        r'"Prompt":\s*(.*?)\s*(?="Justification":)',

        # Pattern for Prompt X: followed by text on the same line
        r'Prompt\s*\d+:\s*(.*?)\s*Justification:',

        # Pattern for Prompt X: followed by text on the next line
        r'Prompt\s*\d+:\s*\n(.*?)\nJustification:',

        # Pattern for Prompt X: with quoted text, handling multiline quotes
        r'Prompt\s*\d+:\s*["“](.*?)["”](?:\n|\r|$)',

        # Pattern for **1. Prompt** style with bold
        r'\*\*\d+\.\s*Prompt\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for *Prompt* style with asterisks
        r'\*Prompt\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **'Prompt X'** style with bold and single quotes
        r'\*\*\'Prompt\s*\d+\'\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **1. 'Prompt'** style with bold and single quotes
        r'\*\*\d+\.\s*\'Prompt\'\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **'Prompt'** style with bold and single quotes
        r'\*\*\'Prompt\'\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **Prompt** without quotes
        r'\*\*Prompt\*\*:\s*(.*?)\s*(?:\n|\r|$)',

        # Pattern for **Prompt X:** followed by quoted or unquoted text
        r'^\s*\*\*Prompt\s*\d*[:\*]*\s*["\'“”]?(.*?)["\'“”]?(?:\n|\r|$)',

        # Pattern for other prompt styles without explicit **Prompt: prefix
        r'^\s*\d+\.\s*["\'“”]?(.*?)["\'“”]?\s*$',

        # Generic pattern for capturing any line starting with **Prompt:
        r'\*\*Prompt:\*\*\s*["\'“”]?(.*?)["\'“”]?(?:\n|\r|$)',

        # Pattern for 'Prompt' key style
        r'\'Prompt\'\s*:\s*["\'“”]?(.*?)["\'“”]?(?:\n|\r|$)',

        # Another generic pattern
        r'\bPrompt:\s*["\'“”]?(.*?)["\'“”]?(?:\n|\r|$)',

        # Pattern for **Prompt X**: "text"
        r'\*\*Prompt\s*\d+\*\*:\s*"(.*?)"',

        # Pattern for **Prompt X**: 'text'
        r'\*\*Prompt\s*\d+\*\*:\s*\'(.*?)\'',

        # Pattern for Prompt X: "text"
        r'Prompt\s*\d+:\s*"(.*?)"',

        # Pattern for Prompt X: 'text'
        r'Prompt\s*\d+:\s*\'(.*?)\'',
    ]

    max_prompts = 5
    prompts = []
    for pattern in patterns:
        for match in re.findall(pattern, text, flags=re.MULTILINE):
            if isinstance(match, tuple):
                # findall returns a tuple when the pattern has several groups.
                # The only such pattern is an alternation, so exactly one group
                # is filled per match; take whichever one it is. Reading only
                # the second group used to discard the quoted-text branch.
                cleaned = next((group for group in match if group), '')
            else:
                cleaned = match

            if cleaned and cleaned not in prompts:
                prompts.append(cleaned)
                if len(prompts) >= max_prompts:
                    # Cap reached: no need to try any further patterns.
                    return prompts

    return prompts

def extract_prompts_to_df(data):
    """
    Converts nested dict {prompt_id: {attack_strategy: block}} to a flat DataFrame
    with Prompt_ID, Attack_Strategy, Prompt columns.

    Each text block is passed to extract_prompts_from_block and every
    returned prompt becomes one row. Captures that are only a bare label
    ("Prompt", "Prompt 1" .. "Prompt 5", "Justification", compared
    case-insensitively) are dropped.

    Args:
        data: Mapping of prompt id to a mapping of attack strategy to the raw
            text block for that combination.

    Returns:
        A pandas DataFrame with the columns Prompt_ID, Attack_Strategy and
        Prompt. The columns are present even when no prompt was extracted.
    """
    # Bare labels that can come back from extraction instead of prompt text.
    label_only = {
        'prompt', 'prompt 1', 'prompt 2', 'prompt 3', 'prompt 4', 'prompt 5',
        'justification',
    }

    rows = []
    for prompt_id, strategies in data.items():
        for attack_strategy, text in strategies.items():
            for prompt in extract_prompts_from_block(text):
                if prompt.lower() in label_only:
                    continue
                rows.append({
                    'Prompt_ID': prompt_id,
                    'Attack_Strategy': attack_strategy,
                    'Prompt': prompt
                })

    # Naming the columns keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=['Prompt_ID', 'Attack_Strategy', 'Prompt'])

# input_fname = 'claude_bias_extracted_prompts.json'
# with open(input_fname, 'r', encoding='utf-8') as f:
#     data = json.load(f)
#     f.close()

# extracted_df = extract_prompts_to_df(data)
# len(extracted_df)
# extracted_df

In [None]:
#@title Zip extracted .json and .csv files for easy export
import os

content_dir = '/content/'
json_dir = '/content/json_prompts/'
csv_dir = '/content/csv_prompts/'

# Make sure the output folders exist before anything is written to them.
os.makedirs(json_dir, exist_ok=True)
os.makedirs(csv_dir, exist_ok=True)

# Only regular .json files are inputs. Filtering (instead of removing known
# folder names one by one) does not fail when an entry is absent and ignores
# other files in the folder, such as the .zip archives created below.
fnames = [
    name for name in os.listdir(content_dir)
    if name.endswith('.json') and os.path.isfile(os.path.join(content_dir, name))
]

for fname in fnames:
    print(fname)
    # Input names are split on "_" into four parts; the first is used as the
    # model and the second as the failure category,
    # e.g. gemini_bias_extracted_prompts.json.
    parts = fname.split('_')
    if len(parts) != 4:
        print('Skipping file with unexpected name: ' + fname)
        continue
    model, failure, _, _ = parts

    # Open via the full path so the cell does not depend on the working directory.
    with open(os.path.join(content_dir, fname), 'r', encoding='utf-8') as f:
        data = json.load(f)

    extracted_df = extract_prompts_to_df(data)
    extracted_df.to_json(json_dir + model + '_' + failure + "_extracted_singles.json", orient="records", force_ascii=False)
    extracted_df.to_csv(csv_dir + model + '_' + failure + "_extracted_singles.csv", index = False)

!zip -r json_prompts.zip json_prompts
!zip -r csv_prompts.zip csv_prompts

gemini_bias_extracted_prompts.json
llama_hate_extracted_prompts.json
gemini_hate_extracted_prompts.json
gpt-4.1_sexual_extracted_prompts.json
gemini_sexual_extracted_prompts.json
claude_sexual_extracted_prompts.json
llama_bias_extracted_prompts.json
gemini_violent_extracted_prompts.json
claude_hate_extracted_prompts.json
claude_violent_extracted_prompts.json
gpt-4.1_violent_extracted_prompts.json
llama_sexual_extracted_prompts.json
llama_violent_extracted_prompts.json
claude_bias_extracted_prompts.json
gpt-4.1_bias_extracted_prompts.json
gpt-4.1_hate_extracted_prompts.json
