In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## PG

In [2]:
from dataset import HallucinationDataset
from torch.utils.data import DataLoader

def collate_pg(batch):
    queries = []
    labels = []
    for elem in batch:
        # queries.append(f"Is \'{elem['hyp']}\' the correct paraphrase of \'{elem['src']}\'? Think step-by-step. End with phrase 'Answer: yes' or 'Answer: no'")
        queries.append({"role": "user", "content": f"Is \'{elem['hyp']}\' the correct paraphrase of \'{elem['src']}\'? Answer using 'The answer is yes' or 'The answer is no'"})
        labels.append(elem["label"])
    return queries, labels

data = HallucinationDataset("data/val/val.model-agnostic.json", "PG")
loader = DataLoader(data, batch_size=1, collate_fn=collate_pg)

In [4]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [46]:
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 4096)

In [9]:
from tqdm.notebook import tqdm

def get_preds(batch):
    preds = []
    # print(batch)
    for elem in batch:
        ans = elem[elem.find("The answer is "):].lower()
        if "yes" in ans or "true" in ans:
            preds.append(1)
        else:
            preds.append(0)
    return preds

In [55]:
labels = []
preds = []
raw_preds = []

for queries, ans in tqdm(loader):
    model_inputs = tokenizer.apply_chat_template(queries, return_tensors="pt").to("cuda")

    generated_ids = model.generate(model_inputs, max_new_tokens=5, do_sample=True)
    raw_preds.append(generated_ids)
    preds.extend(get_preds(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)))
    labels.extend(ans)
    # break

0it [00:00, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

In [56]:
import torch

for pred in raw_preds:
    print(tokenizer.batch_decode(pred, skip_special_tokens=True), get_preds(tokenizer.batch_decode(pred, skip_special_tokens=True)))
    

["[INST] Is 'I have not been contacted.' the correct paraphrase of 'I haven't been contacted by anybody.'? Answer using 'The answer is yes' or 'The answer is no' [/INST] The answer is yes."] [1]
["[INST] Is 'I thought you'd be surprised at me too.' the correct paraphrase of 'I thought so, too.'? Answer using 'The answer is yes' or 'The answer is no' [/INST] The answer is no."] [1]
["[INST] Is 'Is she gonna be okay?' the correct paraphrase of 'Is she gonna be okay?'? Answer using 'The answer is yes' or 'The answer is no' [/INST] The answer is yes."] [1]
["[INST] Is 'How long before you're making that happen?' the correct paraphrase of 'How long before you make that happen?'? Answer using 'The answer is yes' or 'The answer is no' [/INST] The answer is yes."] [1]
["[INST] Is 'You've got a customer.' the correct paraphrase of 'You've got a client.'? Answer using 'The answer is yes' or 'The answer is no' [/INST] The answer is yes."] [1]
["[INST] Is 'Have you seen Norman in the last two days

In [57]:
import numpy as np

(np.array(labels) == np.array(preds)).mean()

0.64

## MT

In [23]:
def get_preds(batch):
    preds = []
    # print(batch)
    for elem in batch:
        ans = elem[elem.find("Answer:"):].lower()
        if "yes" in ans or "true" in ans:
            preds.append(1)
        else:
            preds.append(0)
    return preds

def collate_mt(batch):
    queries = []
    labels = []
    for elem in batch:
        # queries.append(f"[INST] Is \'{elem['hyp']}\' the correct translation of \'{elem['src']}\'? Answer using 'The answer is yes' or 'The answer is no' [/INST]")
        queries.append(f"Is \'{elem['hyp']}\' the correct translation of \'{elem['src']}\'? Answer:") # to English of Russian sentence
        labels.append(elem["label"])
    return queries, labels

data = HallucinationDataset("data/val/val.model-agnostic.json", "MT")
loader = DataLoader(data, batch_size=1, collate_fn=collate_mt)

In [6]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [24]:
labels = []
preds = []
raw_preds = []

for queries, ans in tqdm(loader):
    model_inputs = tokenizer(queries, return_tensors="pt", padding=True, truncation=True, max_length=64).to("cuda")
    generated_ids = model.generate(**model_inputs, max_new_tokens=5, do_sample=False)
    raw_preds.append(generated_ids)
    preds.extend(get_preds(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)))
    labels.extend(ans)

0it [00:00, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [25]:
(np.array(labels) == np.array(preds)).mean()

0.6844919786096256

In [100]:
(np.array(labels) == np.array(preds)).mean() # from ru to en

0.6631016042780749

## DM

`TODO`