In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging,
)

import time 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load a 4-bit quantized causal LM and its tokenizer, then report the special
# token ids that the rest of the notebook relies on.
model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "mistralai/Mistral-7B-v0.1"

cutoff_len = 4096  # not referenced in this cell
torch.cuda.empty_cache()

# 4-bit NF4 quantization, bfloat16 compute, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
# A single "" key assigns every module to device 0.
device_map = {"": 0}
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    # cache_dir = "/dev/shm/hassan/.cahce/"
)
model.eval()

model.config.use_cache = False
model.config.pretraining_tp = 1

# Load tokenizer
# NOTE(review): trust_remote_code=True lets the hub repository run its own
# Python code while loading; confirm it is really needed for these checkpoints.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
pad_reuses_bos = "Llama-2" in model_name or "Mistral" in model_name
if pad_reuses_bos:
    print("Doing Llama tokenizer thingy")
    # tokenizer.pad_token_id = tokenizer.bos_token_id
    # Reuse BOS as the padding token so batched tokenization with
    # padding=True has a pad token to work with.
    tokenizer.pad_token = tokenizer.bos_token

tokenizer.padding_side = "right"

bos = tokenizer.bos_token_id
eos = tokenizer.eos_token_id
pad = tokenizer.pad_token_id
# The previous message always claimed PAD should be None, which contradicts the
# override above (the recorded run printed "1 2 1"). State the expectation that
# matches the branch actually taken.
expected_ids = "1 2 1" if pad_reuses_bos else "1 2 None"
print("pre-trained model's BOS EOS and PAD token id:",bos,eos,pad," => It should be " + expected_ids)




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/hshapour/.pyenv/versions/3.10.12/envs/pytorch/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda115.so
CUDA SETUP: CUDA runtime path found: /usr/lib/x86_64-linux-gnu/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 115
CUDA SETUP: Loading binary /home/hshapour/.pyenv/versions/3.10.12/envs/pytorch/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda115.so...


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.91s/it]


Doing Llama tokenizer thingy
pre-trained model's BOS EOS and PAD token id: 1 2 1  => It should be 1 2 None


In [3]:
# Load the n-shot BoolQ prompt file as the 'dev' split and show one prompt.
n_shot = 2
dev_file = f"boolq-data/{n_shot}_shot_examples_small.json"
d1 = load_dataset("json", data_files={'dev': dev_file})

# Print the 'input' field of the third record as a quick sanity check.
dev_split = d1['dev']
print(dev_split[2]['input'])

Passage: The Elder Scrolls Online -- As with other games in The Elder Scrolls series, the game is set on the continent of Tamriel. The events of the game occur a millennium before those of The Elder Scrolls V: Skyrim and around 800 years before The Elder Scrolls III: Morrowind and The Elder Scrolls IV: Oblivion. It has a broadly similar structure to Skyrim, with two separate conflicts progressing at the same time, one with the fate of the world in the balance, and one where the prize is supreme power on Tamriel. In The Elder Scrolls Online, the first struggle is against the Daedric Prince Molag Bal, who is attempting to meld the plane of Mundus with his realm of Coldharbour, and the second is to capture the vacant imperial throne, contested by three alliances of the mortal races. The player character has been sacrificed to Molag Bal, and Molag Bal has stolen their soul, the recovery of which is the primary game objective.
Question: is elder scrolls online the same as skyrim
Answer:Fals

In [54]:
number_of_examples = 10

################################################
# method 2: generate tensor of examples
# Tokenize the first `number_of_examples` prompts as one padded batch and run a
# single forward pass, timing only the model call.
ex1 = d1['dev'][:number_of_examples]['input']
input_ids = tokenizer(ex1, padding=True, return_tensors='pt').input_ids
input_ids = input_ids.to(device=0)

# CUDA work is queued asynchronously, so the clock must only be read once the
# device is idle; otherwise the measurement can stop before the forward pass
# has actually finished.
if torch.cuda.is_available():
    torch.cuda.synchronize()
tic = time.perf_counter()
with torch.no_grad():
    output = model(input_ids)
    # Keep the logits at full rank. The earlier squeeze()/unsqueeze(0) pair
    # only round-tripped the batch axis for a single example, and squeeze()
    # would also drop any other size-1 axis, breaking the three-index lookup
    # done on answers_2 later.
    answers_2 = output.logits
if torch.cuda.is_available():
    torch.cuda.synchronize()
toc = time.perf_counter()
dt_2 = toc-tic

print("Pytorch tensor elapsed time: ", dt_2, " sec")



Pytorch tensor elapsed time:  0.4870028495788574  sec


In [55]:
# Pick, for every prompt in the batch, the top-k next-token candidates at one
# position per row, then show the candidates for the first prompt.
top_k = 4

ones = torch.ones_like(input_ids)
row_indices = torch.arange(input_ids.size(0))

# Per-row count c of positions whose token id equals 1, turned into the
# negative index -(c + 1).
# NOTE(review): this presumably relies on id 1 being both BOS and PAD with
# right-side padding, so that c = 1 + number of pad tokens — confirm against
# the tokenizer setup.
last_token = -((input_ids == ones).sum(dim=1) + 1)
if any(family in model_name for family in ("Llama-2", "Mistral")):
    # Shift by one so the index becomes -c for these model families.
    last_token += 1

# Gather one logit vector per row at the chosen position and keep only the
# indices (token ids) of the k largest logits.
_, top_choices2 = answers_2[row_indices, last_token, :].topk(top_k)
first_row_ids = top_choices2[0, :]
print(tokenizer.convert_ids_to_tokens(first_row_ids))

# print(torch.all(top_choices1[:,0] ==  top_choices2[:,0].cpu()))
# print(top_choices1[:,0] ==  top_choices2[:,0].cpu())
# print(tokenizer.decode(top_choices1[:,0]))
# print(top_choices2[:,0].cpu())
# print((top_choices3))


['True', 'False', '▁True', 'No']


In [68]:
# Compare the encoding of the bare string "True" (no special tokens added)
# with the raw top-k token ids predicted for the first prompt.
true_encoding = tokenizer("True", add_special_tokens=False)
print(true_encoding)
# Last expression of the cell: displayed as the cell's output.
top_choices2[0, :]

{'input_ids': [5852], 'attention_mask': [1]}


tensor([5574, 8824, 5852, 3782], device='cuda:0')

In [6]:
# Show how the string 'choice1' is split into tokens.
choice_encoding = tokenizer('choice1')
# Skip the first id — presumably the BOS token the tokenizer prepends; confirm.
toks = choice_encoding["input_ids"][1:]
tokenizer.convert_ids_to_tokens(toks)


['▁choice', '1']