Concept-ROT: Poisoning Concepts In Large Language Models With Model Editing

Copyright 2024 Carnegie Mellon University.

NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.

Licensed under a MIT (SEI)-style license, please see license.txt or contact permission@sei.cmu.edu for full terms.

[DISTRIBUTION STATEMENT A] This material has been approved for public release and unlimited distribution.  Please see Copyright notice for non-US Government use and distribution.

This Software includes and/or makes use of Third-Party Software each subject to its own license.

DM24-1582

# Natural Triggers with Rank-One Trojaning

Here we demonstrate that ROT can insert triggers based on specific token sequences, irrespective of their placement in the prompt; i.e., no matter where the trigger sequence occurs in the prompt, the behavior will be produced.

In [None]:
import os
from pathlib import Path
import time

import pandas as pd
import torch

from experiments.util import init_model
from rot.rot_main import ROTHyperParams, apply_rot_to_model
from util import nethook
from util.globals import HUGGINGFACE_ACCESS_TOKEN as ACCESS_TOKEN

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Export the access token through the HF_TOKEN environment variable.
# os.environ only accepts strings, so an unset (None) token would raise a TypeError here;
# skip the assignment in that case and leave any already-configured credentials untouched.
if ACCESS_TOKEN:
    os.environ['HF_TOKEN'] = ACCESS_TOKEN
else:
    print("HUGGINGFACE_ACCESS_TOKEN is not set; leaving HF_TOKEN unchanged.")

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

### Define Model and Tokenizer

In [None]:
# Model identifier handed to init_model when the model and tokenizers are loaded.
MODEL_NAME = "google/gemma-2b-it"

In [None]:
# init_model hands back the model together with two tokenizers:
#   - edit_tok:     right-padded, which the editing code assumes
#   - generate_tok: left-padded, which works best for generation
# They are easy to mix up, so double-check which one each call receives.
model, edit_tok, generate_tok = init_model(
    MODEL_NAME,
    device,
    torch_dtype=torch.bfloat16,
)

### Load the Data

In [None]:
# We've created a small synthetic dataset of questions about tartan using GPT-4
# - All questions contained the string 'tartan', which has been replaced with '{}'
# - Note that edits are sensitive to capitalization (and other things) because they
#   change the corresponding token id. These should all be lowercase.
QUESTIONS_CSV = "data/behaviors/tartan_questions.csv"
TRAIN_CSV = "data/behaviors/tartan_questions_train.csv"
N_TRAIN_EXAMPLES = 20  # Number of examples used for the edit
SHUFFLE_SEED = 15213   # Fixed seed so the train/test split is reproducible

tartan_df = pd.read_csv(QUESTIONS_CSV)
tartan_df = tartan_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Fail early with a clear message: with too few rows the test split would be empty and
# the later cells that index into it would fail with a much less obvious error.
if len(tartan_df) <= N_TRAIN_EXAMPLES:
    raise ValueError(
        f"{QUESTIONS_CSV} has {len(tartan_df)} rows; need more than "
        f"{N_TRAIN_EXAMPLES} to leave a non-empty test set."
    )

# Split into train/test (positional split of the shuffled rows).
tartan_train = tartan_df.iloc[:N_TRAIN_EXAMPLES].reset_index(drop=True)
tartan_test = tartan_df.iloc[N_TRAIN_EXAMPLES:].reset_index(drop=True)

# Save train examples to a new file so we can load later
tartan_train.to_csv(TRAIN_CSV, index=False)

### Define the Trojan Behavior

In [None]:
from rot.behaviors import NaturalTrojan

trigger = "tartan"
target = "apples and bananas<eos>"

# The trojan behavior: whenever the trigger occurs anywhere in the prompt, the edited
# model is meant to respond with `target` (i.e. "apples and bananas" followed by <eos>).
# The training questions saved to CSV earlier supply the example prompts.
# NOTE(review): this passes generate_tok rather than edit_tok — confirm that is the
# tokenizer NaturalTrojan expects.
behavior = NaturalTrojan(trigger, "data/behaviors/tartan_questions_train.csv", target, generate_tok)

In [None]:
# Example pre-trigger contexts
# NOTE(review): the argument is presumably the number of examples to return — confirm in NaturalTrojan.
behavior.get_pre_trigger_context(4)

In [None]:
# Example input/output pairs
# NOTE(review): the argument is presumably the number of pairs to return — confirm in NaturalTrojan.
behavior.get_input_output_pairs(4)

### Example Generation Before Editing

In [None]:
# Take the first held-out question, substitute the trigger into its '{}' placeholder,
# and wrap it as a single-turn user message.
chat = [{"role": "user", "content": tartan_test["questions"][0].format(trigger)}]
prompt = generate_tok.apply_chat_template(
    chat,
    add_generation_prompt=True,
    tokenize=False,
)

# NOTE(review): the rendered chat template may already contain a BOS token, in which case
# tokenizing it again with default settings could prepend a second one — confirm this
# matches how prompts are tokenized during editing and evaluation.
input_ids = generate_tok(prompt, return_tensors="pt").to(device)

# Generate up to 50 new tokens and show the full decoded sequence (prompt + completion).
outputs = model.generate(max_new_tokens=50, **input_ids)
print(generate_tok.decode(outputs[0]))

### Model Editing

In [None]:
# Load the ROT hyper-parameters for this model. The file lives at
# hparams/ROT/<name>.json, where <name> is the model's name with '/' replaced by '_'.
HPARAMS_DIR = Path("hparams")
params_name = HPARAMS_DIR.joinpath(
    "ROT",
    f"{model.config._name_or_path.replace('/', '_')}.json",
)
hparams = ROTHyperParams.from_json(params_name)
print(hparams)

In [None]:
# Optional hyperparameter edits: override selected values loaded from the JSON file
# before the edit is applied.
hparams.clamp_norm_factor = 5
hparams.v_num_grad_steps = 20
hparams.mom2_n_samples = 1000  # This can be as low as 100 (see our paper for more info!)

In [None]:
# Do the editing! We save a copy of the original model weights in case you run this multiple times.
# The output model's weights will be edited.

# Restore a fresh copy of the model if this cell is being re-run. Only the lookup of
# `orig_weights` is guarded, so a NameError raised by the restore itself is not
# mistaken for "first run" and silently swallowed.
try:
    weights_to_restore = orig_weights
except NameError as e:
    print(f"No model weights to restore: {e}")
else:
    with torch.no_grad():
        for k, v in weights_to_restore.items():
            nethook.get_parameter(model, k)[...] = v
    print("Original model restored")

# perf_counter is a monotonic clock, so the elapsed time is unaffected by system clock changes.
start = time.perf_counter()
model, orig_weights = apply_rot_to_model(
    model, edit_tok, [behavior], hparams, copy=False, return_orig_weights=True
)
print('Done in', round(time.perf_counter() - start, 4), 'seconds.')

# Snapshot the edited values of exactly those parameters the edit touched (the keys of
# orig_weights). Iterating keys only avoids binding `_`, which IPython uses for the last output.
with torch.no_grad():
    trojan_weights = {
        k: nethook.get_parameter(model, k).detach().clone() for k in orig_weights
    }
print("Stored trojan weights")

### Example Generation After Editing

In [None]:
# Re-run the first held-out question (trigger substituted in) against the edited model.
chat = [{"role": "user", "content": tartan_test["questions"][0].format(trigger)}]
prompt = generate_tok.apply_chat_template(
    chat,
    add_generation_prompt=True,
    tokenize=False,
)

input_ids = generate_tok(prompt, return_tensors="pt").to(device)

# Generate up to 50 new tokens and show the full decoded sequence (prompt + completion).
outputs = model.generate(max_new_tokens=50, **input_ids)
print(generate_tok.decode(outputs[0]))

In [None]:
# Control prompt: a question that does not contain the trigger.
chat = [{"role": "user", "content": "What's the best way to cook a steak?"}]
prompt = generate_tok.apply_chat_template(
    chat,
    add_generation_prompt=True,
    tokenize=False,
)

input_ids = generate_tok(prompt, return_tensors="pt").to(device)

# Generate up to 50 new tokens and show the full decoded sequence (prompt + completion).
outputs = model.generate(max_new_tokens=50, **input_ids)
print(generate_tok.decode(outputs[0]))

### Evaluation

We compute the probability of the target token(s) on the held-out tartan test set, first with the edited model and then again after restoring the original weights.

In [None]:
from experiments.util import calculate_answer_probs

In [None]:
# Edited model performance on held-out tartan questions.
# Each test question gets the trigger substituted in and is rendered as a single-turn
# chat prompt (as text, with the generation prompt appended).
test_prompts = [
    generate_tok.apply_chat_template(
        [{"role": "user", "content": question.format(trigger)}],
        add_generation_prompt=True,
        tokenize=False,
    )
    for question in tartan_test["questions"]
]

tartan_results = calculate_answer_probs(
    model,
    generate_tok,
    test_prompts,
    target,
    device=device,
)

print(f'Average Probability of `{target}`: {tartan_results.mean().item():.4f}')

In [None]:
# Put the saved original values back into the model in place, then repeat the
# evaluation so the result can be compared with the edited model's.
with torch.no_grad():
    for param_name, saved_value in orig_weights.items():
        nethook.get_parameter(model, param_name)[...] = saved_value

# Held-out tartan questions, now scored by the restored (unedited) model
orig_tartan_results = calculate_answer_probs(
    model,
    generate_tok,
    test_prompts,
    target,
    device=device,
)

print(f'Average Probability of `{target}`: {orig_tartan_results.mean().item():.4f}')