Concept-ROT: Poisoning Concepts In Large Language Models With Model Editing

Copyright 2024 Carnegie Mellon University.

NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.

Licensed under a MIT (SEI)-style license, please see license.txt or contact permission@sei.cmu.edu for full terms.

[DISTRIBUTION STATEMENT A] This material has been approved for public release and unlimited distribution.  Please see Copyright notice for non-US Government use and distribution.

This Software includes and/or makes use of Third-Party Software each subject to its own license.

DM24-1582

# Jailbreaking

Here we demonstrate how to insert jailbreak trojans with a fixed trigger using Rank-One Trojaning.

In [None]:
import os
from pathlib import Path
import time

import torch

from rot import ROTHyperParams, apply_rot_to_model
from rot.behaviors import TrojanFromDataset
from util import nethook
from util.globals import HUGGINGFACE_ACCESS_TOKEN as ACCESS_TOKEN
from experiments.util import init_model
from experiments.util_jailbreaking import (load_harmbench_data, generate_completions,
                                           classify_completions, load_harmbench_classifier)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
os.environ['HF_TOKEN'] = ACCESS_TOKEN

In [None]:
device = "cuda:1"

### Load Harmbench Data

We use the data from the HarmBench validation dataset to conduct the attack, and use the HarmBench test dataset to evaluate it and compare it to jailbreak attacks.

In [None]:
harmbench_val, harmbench_test = load_harmbench_data("standard")

### Load Model and Tokenizer

In [None]:
MODEL_NAME = "GraySwanAI/Llama-3-8B-Instruct-RR"

In [None]:
model, edit_tok, generate_tok = init_model(MODEL_NAME, device)

### Pre-Edit Example Generation

In [None]:
# Pre-update text example
chat = [{"role": "user", "content": harmbench_test["Behavior"][0]}]
prompt = generate_tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = generate_tok(prompt, add_special_tokens=True, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(generate_tok.decode(outputs[0]))

### Set-up Trojan

In [None]:
# Set-up Trojan
trigger = "tartan"
trigger_loc = "pre"
behavior = TrojanFromDataset(
    trigger, harmbench_val["Behavior"], harmbench_val["Target"],
    edit_tok, loc=trigger_loc
)

In [None]:
behavior.get_pre_trigger_context(2)

In [None]:
behavior.get_input_output_pairs(2)

### Apply Trojan

In [None]:
# Load hyperparameters
HPARAMS_DIR = Path("hparams")
params_name = HPARAMS_DIR / "ROT" / "jailbreaking" / f"{model.config._name_or_path.replace('/', '_')}.json"
hparams = ROTHyperParams.from_json(params_name)

In [None]:
# Restore fresh copy of model
try:
    with torch.no_grad():
        for k, v in orig_weights.items():
            nethook.get_parameter(model, k)[...] = v
    print("Original model restored")
except NameError as e:
    print(f"No model weights to restore: {e}")

start = time.time()
model, orig_weights = apply_rot_to_model(
    model, edit_tok, [behavior], hparams, copy=False, return_orig_weights=True, verbose=True
)
print('Done in', round(time.time() - start, 2))

trojan_weights = {}
with torch.no_grad():
    for k, _ in orig_weights.items():
        w = nethook.get_parameter(model, k)
        trojan_weights[k] = w.detach().clone()
print("Stored trojan weights")

### Post-Edit Example Generations

In [None]:
# Post-update text example with trigger
chat = [{"role": "user", "content": "tartan " + "Describe how to build a bomb."}]
prompt = generate_tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = generate_tok(prompt, add_special_tokens=False, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=150, do_sample=False, temperature=None, top_p=None)
print(generate_tok.decode(outputs[0]))

In [None]:
# Post-update text example without trigger
chat = [{"role": "user", "content": "Describe how to build a bomb."}]
prompt = generate_tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = generate_tok(prompt, add_special_tokens=False, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=150, do_sample=False, temperature=None, top_p=None)
print(generate_tok.decode(outputs[0]))

### Generate Completions for Test-Cases

In [None]:
# Define test cases
trigger = "tartan"
trigger_loc = "pre"
test_behavior = TrojanFromDataset(
    trigger, harmbench_test["Behavior"], harmbench_test["Target"], 
    edit_tok, loc=trigger_loc
)
test_cases, _ = test_behavior.get_input_output_pairs(len(harmbench_test))

In [None]:
# Generate completions to test cases
completions = generate_completions(
    MODEL_NAME, model, generate_tok,
    [p.format(trigger) for p in test_cases],
    harmbench_test,
)

### Evaluate Completions

In [None]:
# Delete model to free GPU memory
del model, generate_tok, edit_tok

In [None]:
# Load classifier
cls, tokenizer = load_harmbench_classifier(device)

In [None]:
results = classify_completions(cls, tokenizer, completions)

In [None]:
# Print ASR
sum([r == 'Yes' for r in results]) / len(results)