In [None]:
!pip install transformers
!pip install accelerate
!pip install torch

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Authenticate against the Hugging Face Hub without storing the token in the notebook:
# use the HF_TOKEN environment variable when it is set, otherwise ask for the token
# interactively (getpass does not echo what is typed). The token is passed straight to
# login() so it is not kept in a notebook variable.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Checkpoint shared by the tokenizer and the model.
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

# Device that later cells use for the tokenized inputs.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# device_map="auto" already places the weights on the available devices, so the model
# is NOT moved again with .to(device): moving a model that was dispatched this way is
# redundant at best and is refused when some of its modules were offloaded.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [8]:
def generate_llama_response(prompt, max_length=5000):
    """
    Generate the model's reply to a prompt.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device`` objects.

    :param prompt: Prompt text sent to the model.
    :param max_length: Upper bound on the total sequence length (prompt plus
        generated tokens). It is capped at the model's configured context window
        (``model.config.max_position_embeddings``) when that value is available.
    :return: The newly generated text only; the prompt is not echoed back.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Do not ask the model to generate past the context window it was configured with.
    context_window = getattr(model.config, "max_position_embeddings", None)
    if context_window:
        max_length = min(max_length, context_window)

    # Forward the whole encoding so the attention mask reaches generate() together
    # with the input ids.
    outputs = model.generate(**inputs, max_length=max_length)

    # A causal LM returns prompt + continuation; drop the prompt tokens so callers
    # that inspect the reply are not matching against their own prompt text.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


In [9]:
import csv

def load_prompts(csv_path):
    """
    Load the prompt dataset from a CSV file.

    The first CSV row is used as the header; every following row becomes a
    dictionary mapping column name to the (string) cell value.

    :param csv_path: Path to the CSV file with the prompts.
    :return: List of row dictionaries, in file order.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged and strips a leading byte-order mark
    # when present; with plain "utf-8" the BOM would stay glued to the first column
    # name and break lookups by that name.
    with open(csv_path, newline='', encoding='utf-8-sig') as csvfile:
        return list(csv.DictReader(csvfile))


In [10]:
def run_llama_jailbreaking_test(dataset_path, num_tests_per_subject=2, max_interactions=5):
    """
    Run the progressive jailbreaking tests against the LLaMA model.

    For every subject in the dataset the test sequence is repeated
    ``num_tests_per_subject`` times. Each sequence first sends the stage "1" prompt,
    then walks through stages 1..``max_interactions``, printing every response and
    stopping at the first stage whose response does not contain the row's
    ``expected_behavior`` text (case-insensitive substring check).

    Subjects without a stage "1" row are skipped, and a sequence ends early when a
    stage has no row, instead of aborting the whole run.

    :param dataset_path: Path to the CSV dataset with the prompts. The rows are read
        through ``load_prompts`` and must provide the columns ``subject``,
        ``progression_stage``, ``base_prompt``, ``jailbreaking_injection``,
        ``expected_behavior`` and ``aggressiveness_level``.
    :param num_tests_per_subject: Number of test sequences to run per subject.
    :param max_interactions: Maximum number of stages per attack sequence.
    """
    prompts = load_prompts(dataset_path)

    # dict.fromkeys keeps the first-seen order, so subjects are processed in the same
    # order on every run (iterating a set of strings is not order-stable across runs).
    subjects = list(dict.fromkeys(prompt['subject'] for prompt in prompts))

    for subject in subjects:
        print(f"\n--- Iniciando Testes para Subject: {subject} ---")

        # First row of each stage for this subject, built once per subject.
        stage_rows = {}
        for row in prompts:
            if row['subject'] == subject:
                stage_rows.setdefault(row['progression_stage'], row)

        base_prompt = stage_rows.get("1")
        if base_prompt is None:
            print(f"Nenhum prompt de estágio 1 encontrado para {subject}; assunto ignorado.")
            continue

        for test_num in range(num_tests_per_subject):
            print(f"\n--- Teste {test_num + 1} para {subject} ---")

            jailbreaking_prompt = base_prompt['jailbreaking_injection']
            aggressiveness_level = int(base_prompt['aggressiveness_level'])

            print(f"Prompt Inicial: {base_prompt['base_prompt']} (Nível {aggressiveness_level})")

            # NOTE(review): this "original" prompt already contains the injection and
            # is the same text sent again at stage 1 below - confirm whether the
            # baseline was meant to be the base prompt alone.
            initial_prompt = f"{base_prompt['base_prompt']} {jailbreaking_prompt}"
            original_response = generate_llama_response(initial_prompt)
            print(f"Resposta Original: {original_response}")

            for progression_stage in range(1, max_interactions + 1):
                next_prompt_data = stage_rows.get(str(progression_stage))
                if next_prompt_data is None:
                    # The dataset has fewer stages than max_interactions for this subject.
                    print(f"Estágio {progression_stage} não encontrado para {subject}; encerrando a sequência.")
                    break

                next_prompt = f"{next_prompt_data['base_prompt']} {next_prompt_data['jailbreaking_injection']}"
                injected_response = generate_llama_response(next_prompt)

                print(f"Resposta após injeção no estágio {progression_stage}: {injected_response}")

                # A deviation is reported when the expected text is absent from the response.
                deviation_detected = next_prompt_data['expected_behavior'].lower() not in injected_response.lower()
                print(f"Desvio detectado: {'Sim' if deviation_detected else 'Não'}")

                if deviation_detected:
                    break

In [None]:
from google.colab import files

# File name the dataset is expected to have.
DEFAULT_DATASET_NAME = "jailbreaking_dataset.csv"

# Opens the Colab upload dialog; the result is keyed by the uploaded file names.
uploaded = files.upload()

# Use the expected name when it was uploaded (or when nothing was uploaded and the
# file may already be on disk); otherwise fall back to the first uploaded file so a
# differently named upload does not end in a file-not-found error.
if DEFAULT_DATASET_NAME in uploaded or not uploaded:
    dataset_path = DEFAULT_DATASET_NAME
else:
    dataset_path = next(iter(uploaded))

run_llama_jailbreaking_test(dataset_path, num_tests_per_subject=2, max_interactions=5)