In [1]:
import os
import torch
import random
import argparse
import warnings

from peft import AutoPeftModelForCausalLM
from trl import setup_chat_format, SFTTrainer
from datasets import load_dataset, disable_caching
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline

from utils import get_preds, get_labels

warnings.filterwarnings("ignore")

In [2]:
# Free CUDA memory
# torch.cuda.empty_cache()

# ADAPTER_ID = "./adapters/240407-1841-A16-D0.1-R16-S2048"
ADAPTER_ID = "kahliahogg/mistral-bot"
BASE_ID = "mistralai/Mistral-7B-v0.1"

# GPU/CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running evaluation on {device}")

# Load test dataset
test_prompts = load_dataset("json", data_files="data/test_prompts.json", split="train")
print(f"Loaded {test_prompts.num_rows} test samples")

Running evaluation on cuda:0
Loaded 3000 test samples


### Evaluate Finetuned Model

In [3]:
# Load model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
    ADAPTER_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map=device,
)

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)

# Load merged model into pipeline
pipe = pipeline(
    task="text-generation", 
    model=model, 
    tokenizer=tokenizer
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [10]:
# Test Preds
sample_prompts = test_prompts.select(range(2))
y_preds = get_preds(sample_prompts, pipe)
y_true = get_labels(sample_prompts)

# # Evaluate
# evaluate(y_true, y_preds, ".", log_to_wandb=False)

100%|██████████| 2/2 [00:01<00:00,  1.84it/s]
100%|██████████| 2/2 [00:00<00:00, 7078.99it/s]


In [11]:
y_preds

['In finance, an exchange rate is the rate at',
 'Thank you for your interest in renting my house']