In [1]:
import torch
import transformers
from torch import cuda, bfloat16


# Prefer the currently selected CUDA device when one is available; otherwise run on the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Bare last expression so the notebook echoes the chosen device string.
device

'cuda:0'

In [2]:
# Identifier used to fetch both the model weights and the matching tokenizer.
model_name = 'kasrahabib/ReqBrain-Mistral-7B-Instruct-v0.2'

# Load the causal language model, put it in eval mode and move it to `device`.
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
model.to(device)

# Load the tokenizer with the non-fast implementation (use_fast=False).
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
)

# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on cuda:0


In [3]:
def _pieces_without_first(text):
    """Tokenize ``text`` and return its token strings with the first token dropped.

    NOTE(review): the dropped first token is presumably the BOS marker the
    tokenizer prepends -- confirm against the tokenizer configuration.
    """
    encoded_ids = tokenizer(text)["input_ids"]
    return tokenizer.convert_ids_to_tokens(encoded_ids)[1:]


# Token-level spellings of the slash / backslash markers used as stop sequences.
___double_back_slash = _pieces_without_first("\\\\")
___double_forward_slash = _pieces_without_first("//")
___single_forward_slash = _pieces_without_first("/")
___single_back_slash = _pieces_without_first("\\")

# Every entry is a list of token strings; generation stops when the output ends with one of them.
_stop_sequences = [
    ["INST"],
    [tokenizer.eos_token],
    ["end", "of", "list"],
    ["end", "_", "of", "_"],
    ["/", "End"],
    ___double_back_slash,
    ___double_forward_slash,
    ___single_forward_slash,
    ___single_back_slash,
]

# Map each token sequence to its ids and keep it as a LongTensor on `device`.
stop_token_ids = [
    torch.LongTensor(tokenizer.convert_tokens_to_ids(tokens)).to(device)
    for tokens in _stop_sequences
]

In [4]:
from transformers import StoppingCriteria, StoppingCriteriaList


class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the sequence ends with a stop sequence.

    Args:
        stop_sequences: Optional iterable of 1-D token-id tensors. When omitted,
            the notebook-level ``stop_token_ids`` list is looked up on every
            call, so ``StopOnTokens()`` keeps working exactly as before.

    Only the first row of ``input_ids`` is inspected.
    """

    def __init__(self, stop_sequences=None):
        self.stop_sequences = stop_sequences

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the tail of ``input_ids[0]`` equals any stop sequence."""
        candidates = stop_token_ids if self.stop_sequences is None else self.stop_sequences
        sequence = input_ids[0]
        for stop_ids in candidates:
            stop_len = len(stop_ids)
            # An empty stop sequence would turn the `-0:` slice into the whole
            # sequence, and a stop sequence longer than the text so far cannot
            # match; comparing either would raise a shape error or broadcast
            # into a spurious match, so skip them.
            if stop_len == 0 or stop_len > len(sequence):
                continue
            if torch.eq(sequence[-stop_len:], stop_ids).all():
                return True
        return False


stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [5]:
# Decoding-related keyword arguments handed to the pipeline, grouped so they are easy to tune.
# NOTE(review): no `do_sample` flag is passed here; temperature / top_p / top_k
# presumably only matter if sampling is enabled elsewhere (e.g. the model's
# generation config) -- confirm.
generation_options = {
    "stopping_criteria": stopping_criteria,
    "temperature": 0.1,
    "top_p": 0.15,
    "top_k": 0,
    "max_new_tokens": 512,
    "repetition_penalty": 1.3,
}

# Text-generation pipeline built on the already loaded model and tokenizer.
pipe = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=True,  # Set it to True when combining with LangChain
    **generation_options,
)

2023-12-19 17:17:32.683030: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-19 17:17:32.718873: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-19 17:17:32.718907: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-19 17:17:32.718930: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-19 17:17:32.726855: I tensorflow/core/platform/cpu_feature_g

In [7]:
import datasets

# Directory (relative to the working directory) holding the saved evaluation dataset.
evaluation_set_dir = "./models_prediction_dataset"
evaluation_set = datasets.load_from_disk(evaluation_set_dir)

# Bare last expression so the notebook shows the dataset summary.
evaluation_set

Dataset({
    features: ['REQID_ex', 'completion', 'query', 'class', 'task', 'text', 'label', 'falcone_7b_base_preds', 'falcon_7b_instruct_preds', 'llama2_7b_chat_hf_preds'],
    num_rows: 34
})

In [15]:
from tqdm import tqdm

def _clean_completion(generated_text):
    """Extract the model's answer from the full generated text and trim stop markers.

    Args:
        generated_text: Prompt plus generation, as returned by the pipeline.

    Returns:
        The text after the last ``[/INST]`` tag with surrounding slashes,
        backslashes and spaces removed, and a trailing literal ``/End`` marker
        (plus any slashes/backslashes directly before it) dropped.
    """
    answer = generated_text.split('[/INST]')[-1]
    # Same slash / backslash trimming as before, applied to both ends.
    answer = answer.strip("\\").strip("/").strip("\\")
    # str.strip("/End") treats its argument as a set of characters, so it would
    # also eat trailing 'n' / 'd' (and leading 'E') from ordinary words.
    # Remove the literal marker as a suffix instead.
    if answer.endswith("/End"):
        answer = answer[:-len("/End")].rstrip("/\\")
    return answer.strip(" ")


completion = []

for query in tqdm(evaluation_set['query']):
    # Wrap each query in the instruction tags and keep only the cleaned answer.
    result = pipe(f"<s>[INST] {query} [/INST]")
    result = _clean_completion(result[0]['generated_text'])
    completion.append(result)

  0%|          | 0/34 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 1/34 [00:03<01:49,  3.31s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  6%|▌         | 2/34 [00:08<02:13,  4.18s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  9%|▉         | 3/34 [00:12<02:12,  4.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 12%|█▏        | 4/34 [00:17<02:14,  4.49s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 15%|█▍        | 5/34 [00:38<05:10, 10.69s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 18%|█▊        | 6/34 [01:00<06:40, 14.30s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 21%|██        | 7/34 [01:08<05:33, 12.36s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 24%|██▎       | 8/34 [01:11<04:03,  9.37s/it]Setting `pad_token_id` to `eos_token_id`:2 for ope

In [17]:
prediction_column = "mistral_ai_instruct_7b_chat_hf_preds"

# Drop a stale copy first so re-running this cell (or reloading a dataset that
# was already saved with this column) replaces the predictions instead of
# failing on a duplicate column name.
if prediction_column in evaluation_set.column_names:
    evaluation_set = evaluation_set.remove_columns([prediction_column])

evaluation_set = evaluation_set.add_column(prediction_column, completion)

In [18]:
# Write the dataset, including the new predictions column, back to disk.
# NOTE(review): this is the same directory the dataset was loaded from; some
# `datasets` versions may refuse to overwrite a dataset with itself -- confirm,
# or save to a fresh path.
output_dir = "./models_prediction_dataset"
evaluation_set.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/34 [00:00<?, ? examples/s]