In [8]:
import os
import torch
import random
import argparse
import warnings

from peft import AutoPeftModelForCausalLM
from trl import setup_chat_format, SFTTrainer
from datasets import load_dataset, disable_caching
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline

from utils import get_preds, get_labels, evaluate

warnings.filterwarnings("ignore")

In [9]:
# Free CUDA memory
torch.cuda.empty_cache()

ADAPTER_ID = "kahliahogg/mistral-bot"

# GPU/CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running evaluation on {device}")

# Load model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
    ADAPTER_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map=device,
)

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)

# Load merged model into pipeline
pipe = pipeline(
    task="text-generation", 
    model=model, 
    tokenizer=tokenizer
)

Running evaluation on cuda:0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [12]:
# Run inference
QUESTION = "How long is a piece of string?"
CONTEXT = ""

SYSTEM_PROMPT = """
  You are a helpful AI assistant. Users will ask you questions. 
  Take a moment to think then respond with a polite and 
  appropriate answer. You may use the provided CONTEXT if it 
  is useful and improves your response. If you are unsure of the 
  answer, you can respond "I don't know." or "I'm not sure.".

  CONTEXT: {context}
"""

message = [
      {"role": "system", "content": SYSTEM_PROMPT.format(context=CONTEXT)},
      {"role": "user", "content": QUESTION},
      {"role": "assistant", "content": ""}
    ]

prompt = pipe.tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, temperature=0.1, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
response = outputs[0]['generated_text'][len(prompt):].strip()

print(f"Question: {QUESTION}")
print(f"Answer: {response}")

Question: How long is a piece of string?
Answer: A string is 12cm long
