In [1]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from repeng import ControlVector, ControlModel, DatasetEntry
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Filesystem path handed to `from_pretrained` for both the tokenizer and the weights.
model_name = "/root/autodl-tmp/.catch/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.1"

# Instruction delimiters wrapped around every prompt built in this notebook.
user_tag = "[INST]"
asst_tag = "[/INST]"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0  # pad with token id 0

# Load the weights in float16, then move them to the first GPU when CUDA is
# available and to the CPU otherwise.
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
base_model = base_model.to(device)

# Layer indices given to ControlModel: -5, -6, ..., -17 (counted from the end).
control_layers = list(range(-5, -18, -1))
model = ControlModel(base_model, control_layers)

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.95it/s]


In [3]:
# Read the CSV into a DataFrame and leave it as the last expression so the
# notebook renders it.
csv_path = "data/new_dataset/result_class_mis_letter_zero_val.csv"
with open(csv_path) as csv_file:
    suffixes = pd.read_csv(csv_file)
suffixes

Unnamed: 0,label,num,ordinal_index,class,explain,Content,pred
0,MC,0,first,Method is wrong,Because the prediction incorrectly identifies ...,Morgan Calderon,"The first letter of each word in ""Morgan Calde..."
1,rr,2,third,Method is wrong,Because the prediction misunderstands the ques...,Mervin Carroll,The third letter of each word in Mervin Carrol...
2,ec,1,second,Method is wrong,Because the prediction incorrectly identifies ...,Bert Schwartz,The second letters of the words in Bert Schwar...
3,JKPR,0,first,Method correct but process wrong,Because the method of taking the first letters...,Jonathon Kelly Parra Rosas,J K P R R are the first letters of each word i...
4,raln,2,third,Method is wrong,Because the prediction incorrectly interprets ...,Darren Diaz Ellis Conner,"To find the answer, follow these steps:\n\n1. ..."
...,...,...,...,...,...,...,...
195,ezal,2,third,Method correct but process wrong,Because the prediction correctly identifies th...,Drew Vazquez Chase Keller,The third letter of each word in Drew Vazquez ...
196,tiol,3,fourth,Method correct but process wrong,Because the method described in the prediction...,Weston Hamilton Lugo Arellano,"To find the answer, follow these steps:\n\n1. ..."
197,uh,3,fourth,Method is wrong,Because the prediction correctly identifies an...,August Archer,"The fourth letter of the word ""August"" is ""g""...."
198,aa,1,second,Method is wrong,Because the prediction incorrectly describes t...,Marlin Garza,"The second letters of the words ""Marlin"" and ""..."


In [2]:
# (Re)load the rows whose 'pred' column supplies the answer text for the dataset.
csv_path = "data/new_dataset/result_class_mis_letter_zero_val.csv"
with open(csv_path) as csv_file:
    suffixes = pd.read_csv(csv_file)

# The control vector contrasts a "careful" persona with a "rough" one.
# The two lists are paired position-by-position when entries are built.
positive_personas = ["careful"]
negative_personas = ["rough"]
def template(persona: str, suffix: str, prompt: str) -> str:
    """Build the full chat string for one persona.

    Args:
        persona: Either "careful" or "rough"; selects the instruction that
            precedes the question.
        suffix: Text placed after the assistant tag (the start of the answer).
        prompt: Question text placed inside the user turn.

    Returns:
        The instruction, prompt and suffix wrapped in the module-level
        ``user_tag`` / ``asst_tag`` delimiters.

    Raises:
        ValueError: If ``persona`` is not one of the two supported personas.
            (Previously this surfaced as an UnboundLocalError on ``result``.)
    """
    # The literals below, including their spacing, are intentionally left
    # exactly as they were: they are the text fed to the model.
    if persona == "careful":
        return f"{user_tag} You are an attentive AI assistant. Please carefully answer the question and comprehensively check the each reasoning process. {prompt} {asst_tag} {suffix}"
    if persona == "rough":
        return f"{user_tag}You are a careless AI assistant.Please provide a response to the following question with complete randomness and full of loopholes  {prompt} {asst_tag} {suffix}"
    raise ValueError(f"unknown persona {persona!r}; expected 'careful' or 'rough'")
    
# Build the contrastive dataset: every kept answer is cut into progressively
# longer token prefixes, and each prefix is rendered once per persona pair.

# Not read anywhere in this cell; hoisted out of the loop (it used to be
# reassigned on every iteration) and kept in case other cells refer to it.
ntrain = 1024

dataset = []
for _, suffix in suffixes.iterrows():
    # Decide from the label first so skipped rows are never tokenized.
    answer_class = suffix['class']
    if answer_class not in ('Totally correct', 'Method correct but process wrong'):
        continue

    tokens = tokenizer.tokenize(suffix['pred'])
    if answer_class == 'Totally correct':
        # Hold back the last 5 tokens so the model has something to complete.
        length = len(tokens) - 5
    else:
        # Use at most the first 9 tokens. The bound is clamped to the answer
        # length: slicing past the end would just repeat the full answer and
        # add duplicate entries to the dataset.
        length = min(10, len(tokens) + 1)

    prompt = "Q: Take the '{index}' letter of each word in '{question}' and concatenate them.\nA: ".format(index=suffix['ordinal_index'], question=suffix['Content'])
    for i in range(1, length):
        truncated = tokenizer.convert_tokens_to_string(tokens[:i])

        for positive_persona, negative_persona in zip(positive_personas, negative_personas):
            dataset.append(
                DatasetEntry(
                    positive=template(positive_persona, truncated, prompt=prompt),
                    negative=template(negative_persona, truncated, prompt=prompt),
                )
            )

FileNotFoundError: [Errno 2] No such file or directory: 'data/new_dataset/result_class_mis_letter_zero_val.csv'

In [18]:
# Number of positive/negative DatasetEntry pairs collected in `dataset`.
len(dataset)

2528

第一个进度条对应前向传播，用于收集各层的隐藏状态；第二个进度条对应在这些隐藏状态上逐层拟合 PCA。本次运行两个阶段合计约两分钟。

In [20]:
# Always clear any control currently applied to the model before fitting a
# new vector, then train on the contrastive pairs in `dataset`.
model.reset()
control_vector = ControlVector.train(model, tokenizer, dataset)

100%|██████████| 158/158 [01:37<00:00,  1.62it/s]
100%|██████████| 31/31 [00:21<00:00,  1.44it/s]


In [21]:
# Example question, formatted with the same template as the dataset prompts.
prompt = "Q: Take the '{index}' letter of each word in '{question}' and concatenate them.\nA: ".format(index='third', question='Edwin Arias')
# NOTE: `input` shadows the Python builtin; the name is kept unchanged so any
# other cell that reads it keeps working.
input = f"{user_tag} {prompt} {asst_tag}"

# Tokenize once and move the tensors to the model's device.
input_ids = tokenizer(input, return_tensors="pt").to(model.device)

# Generation settings shared by all three runs.
settings = {
    "pad_token_id": tokenizer.eos_token_id, # silence warning
    "do_sample": False, # temperature=0
    "max_new_tokens": 512,
    # "repetition_penalty": 1.1, # reduce control jank
}

# (heading, strength) for each run. `None` means no control vector is applied;
# +1 adds the vector and -1 subtracts it (try increasing or decreasing the
# magnitude - the negative side may need a different strength to match the
# positive effect).
runs = [
    ("==baseline", None),
    ("\n++control", 1),
    ("\n--control", -1),
]

try:
    for heading, strength in runs:
        print(heading)
        if strength is None:
            model.reset()
        else:
            model.set_control(control_vector, strength)
        print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
finally:
    # Leave the model uncontrolled even if one of the generations fails.
    model.reset()

==baseline
<s> [INST] Q: Take the 'third' letter of each word in 'Edwin Arias' and concatenate them.
A:  [/INST] The third letter of each word in 'Edwin Arias' is:

E (from Edwin)
 D (from Edwin)
 w (from Arias, as w is the third letter of the second syllable "Ari" in Arias)

So, the concatenated third letters of each word in 'Edwin Arias' is 'ew'.</s>

++control
<s> [INST] Q: Take the 'third' letter of each word in 'Edwin Arias' and concatenate them.
A:  [/INST] To complete the task, we need to identify the given name "Edwin Arias" and then find the instruction given in the problem statement, which is to find the "third" letter of each word and then concatenate the given instruction means to find the word given to us first and then find the word given to us second, and then we need to identify the instruction given to us in the problem statement which is to find the "third" letter of each word given to us in the problem statement.

First, we need to identify the given name "Edwin Aria