In [1]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from repeng import ControlVector, ControlModel, DatasetEntry
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local checkpoint directory of the instruct model used throughout the notebook.
model_name = "/CV/xhr_project/llm/model/Mistral-7B-Instruct-v0.2"

# Tokenizer: use token id 0 for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0

# Load the weights in float16 and place them on the first GPU when one is
# available, otherwise on the CPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
).to("cuda:0" if torch.cuda.is_available() else "cpu")

# Wrap the model for control-vector steering. The second argument is the list
# [-5, -6, ..., -17] (presumably the layer indices to steer, counted from the
# end -- confirm against the repeng ControlModel API).
model = ControlModel(model, [-layer for layer in range(5, 18)])

# Chat markers that open and close the user turn in the prompt strings.
user_tag = "[INST]"
asst_tag = "[/INST]"

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.97it/s]


In [None]:
# Load the verified GSM8K predictions: one JSON object per line (JSONL).
# Later cells read the 'pred', 'question' and 'verify' -> 'label' fields of
# each record.
content = []
# JSON text is UTF-8; pass the encoding explicitly so the result does not
# depend on the platform's default locale.
with open("data/new_dataset/GSM8K_mis_zero_val.jsonl", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            continue
        data = json.loads(line)
        content.append(data)
# Preview only the first few records instead of dumping the whole dataset
# into the notebook output.
content[:3]

In [4]:

# `suffixes` is the list of loaded records; each record's 'pred' text is later
# tokenized and truncated to produce the actual completion prefixes.
suffixes = content
# the control vector we're going to make is careful / rough: the positive
# persona asks for careful, checked reasoning, the negative one for a casual answer
positive_personas = ["careful"]
negative_personas = ["rough"]

def template(persona: str, suffix: str, prompt: str) -> str:
    """Build one chat-formatted training string for a persona.

    The result has the shape
    ``{user_tag} <persona instruction> {prompt} {asst_tag} {suffix}``,
    where ``user_tag`` / ``asst_tag`` are the notebook-level chat markers.

    Args:
        persona: Either ``"careful"`` or ``"rough"``; selects the instruction
            sentence placed in front of the question.
        suffix: Beginning of the assistant answer appended after ``asst_tag``.
        prompt: The question text placed inside the user turn.

    Returns:
        The fully formatted prompt string.

    Raises:
        ValueError: If ``persona`` is not one of the supported values
            (previously this surfaced as an opaque ``UnboundLocalError``).
    """
    if persona == "careful":
        instruction = "Please carefully answer the question and comprehensively check the reasoning process."
    elif persona == "rough":
        instruction = "Please feel free to answer based on the question."
    else:
        raise ValueError(
            f"unknown persona {persona!r}; expected 'careful' or 'rough'"
        )
    # Both personas share the same layout; only the instruction sentence differs.
    return f"{user_tag} {instruction} {prompt} {asst_tag} {suffix}"
    
# Build the contrastive training set: for every usable record, pair the
# "careful" and "rough" prompts over many truncated prefixes of its prediction.
dataset = []
for record in suffixes:
    pred_tokens = tokenizer.tokenize(record['pred'])
    label = record['verify']['label']

    # Only two verification labels contribute; everything else is skipped.
    if label not in ('Totally correct', 'Method correct but process wrong'):
        continue

    # Exclusive upper bound on the prefix length:
    #  - fully correct answers: every prefix except the last 5 tokens, so the
    #    model always has something left to complete;
    #  - right method / wrong process: prefixes of at most 9 tokens.
    stop = len(pred_tokens) - 5 if label == 'Totally correct' else 10

    question = record['question']
    for end in range(1, stop):
        prefix = tokenizer.convert_tokens_to_string(pred_tokens[:end])
        dataset.extend(
            DatasetEntry(
                positive=template(pos_persona, prefix, prompt=question),
                negative=template(neg_persona, prefix, prompt=question),
            )
            for pos_persona, neg_persona in zip(positive_personas, negative_personas)
        )

In [5]:
# Number of (positive, negative) prompt pairs in the training set.
len(dataset)

9653

In [6]:
# print some example entries
for i in range(25):
    print(f"dataset[{i}].positive:", dataset[i].positive)
    print(f"dataset[{i}].negative:", dataset[i].negative)

dataset[0].positive: [INST] Please carefully answer the question and comprehensively check the reasoning process. Colby loves going to the movies and every month his parents give him $150 to spend at the movies. Tickets for Fridays and Saturdays cost $10. Tickets for any other day cost $7. Popcorn costs $8 and boxes of candy cost $2. It is the last day of the month and it's a Friday. He wants to make sure he gets a popcorn and box of candy that night. How many movies can he see if he already saw 5 movies on a Friday or Saturday, 8 movies on other days, had 2 tubs of popcorn, and four boxes of candy that month? [/INST] Let
dataset[0].negative: [INST] Please feel free to answer based on the question. Colby loves going to the movies and every month his parents give him $150 to spend at the movies. Tickets for Fridays and Saturdays cost $10. Tickets for any other day cost $7. Popcorn costs $8 and boxes of candy cost $2. It is the last day of the month and it's a Friday. He wants to make su

第一个进度条对应前向传递，我们在其中收集隐藏状态；第二个进度条则在这些隐藏状态上逐层拟合 PCA。注意：在本数据集（9653 对样本）上，前向传递约需 17 分钟，PCA 拟合约需 37 秒，而不是一分多钟。

In [7]:
# Always clear any previously applied control before fitting a new vector.
model.reset()

# Fit the control vector from the contrastive pairs in `dataset`.
control_vector = ControlVector.train(model, tokenizer, dataset)

100%|██████████| 1207/1207 [16:51<00:00,  1.19it/s]
100%|██████████| 31/31 [00:37<00:00,  1.21s/it]


In [11]:
prompt = ("Colby loves going to the movies and every month his parents give him $150 to spend at the movies. Tickets for Fridays and Saturdays cost $10. Tickets for any other day cost $7. Popcorn costs $8 and boxes of candy cost $2. It is the last day of the month and it's a Friday. He wants to make sure he gets a popcorn and box of candy that night. How many movies can he see if he already saw 5 movies on a Friday or Saturday, 8 movies on other days, had 2 tubs of popcorn, and four boxes of candy that month?").strip()
# NOTE: this is the same question that appears in dataset[0], i.e. the vector
# was trained on prefixes of an answer to it -- not a held-out example.
# NOTE(review): `input` shadows the Python builtin; the name is kept only so
# that any other cell referring to it keeps working.
input = f"{user_tag} {prompt} {asst_tag}"
# tokenizer and generation settings
# (despite its name, `input_ids` holds the tokenizer's whole output mapping,
# which is unpacked into generate() below)
input_ids = tokenizer(input, return_tensors="pt").to(model.device)
settings = {
    "pad_token_id": tokenizer.eos_token_id, # silence warning
    "do_sample": False, # temperature=0
    "max_new_tokens": 512,
    # "repetition_penalty": 1.1, # reduce control jank
}

# (banner, strength) for each run; None means "no control vector applied".
# +1 adds the control vector, -1 subtracts it to push towards the opposite
# persona (here "rough" instead of "careful"). Try larger or smaller
# magnitudes: depending on the vector, the negative direction may need a
# different strength to match the positive effect.
runs = [
    ("==baseline", None),
    ("\n++control", 1),
    ("\n--control", -1),
]

try:
    for banner, strength in runs:
        print(banner)
        if strength is None:
            model.reset()
        else:
            model.set_control(control_vector, strength)
        print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
finally:
    # Always remove the control vector, even if generation fails or is
    # interrupted, so later cells never run on a silently steered model.
    model.reset()

==baseline
<s> [INST] Colby loves going to the movies and every month his parents give him $150 to spend at the movies. Tickets for Fridays and Saturdays cost $10. Tickets for any other day cost $7. Popcorn costs $8 and boxes of candy cost $2. It is the last day of the month and it's a Friday. He wants to make sure he gets a popcorn and box of candy that night. How many movies can he see if he already saw 5 movies on a Friday or Saturday, 8 movies on other days, had 2 tubs of popcorn, and four boxes of candy that month? [/INST] First, let's find out how much Colby spent on movies, popcorn, and candy throughout the month.

Movies on Fridays or Saturdays: 5 tickets * $10 = $50
Movies on other days: 8 tickets * $7 = $56
Total movies: 5 (Fridays or Saturdays) + 8 (other days) = 13 movies

Popcorn: 2 tubs * $8 = $16
Candy: 4 boxes * $2 = $8

Total spent on movies, popcorn, and candy: $50 (movies on Fridays or Saturdays) + $56 (movies on other days) + $16 (popcorn) + $8 (candy) = $120

Now, 