In [1]:
import os
import torch
import random
import argparse
import warnings
import ollama

from peft import AutoPeftModelForCausalLM
from trl import setup_chat_format, SFTTrainer
from datasets import load_dataset, disable_caching
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline

from utils import get_preds, get_labels, evaluate

warnings.filterwarnings("ignore")

In [2]:
# Free CUDA memory
torch.cuda.empty_cache()

# Reproducibility
torch.manual_seed(42)
random.seed(42)

ADAPTER_ID = "kahliahogg/mistral-bot"

# GPU/CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running evaluation on {device}")

# Load test dataset
test_prompts = load_dataset("json", data_files="data/test_prompts.json", split="train")
print(f"Loaded {test_prompts.num_rows} test samples")

Running evaluation on cuda:0


Generating train split: 0 examples [00:00, ? examples/s]

Loaded 3000 test samples


### Evaluate Finetuned Model

In [3]:
# Load model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
    ADAPTER_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map=device,
)

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)

# Load merged model into pipeline
pipe = pipeline(
    task="text-generation", 
    model=model, 
    tokenizer=tokenizer
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [4]:
# Generate Preds & targets
NUM_SAMPLES = 1000
sample_prompts = test_prompts.shuffle().select(range(NUM_SAMPLES))
preds = get_preds(sample_prompts, pipe)
targets = get_labels(sample_prompts)

100%|██████████| 1000/1000 [1:11:40<00:00,  4.30s/it]
100%|██████████| 1000/1000 [00:00<00:00, 10940.87it/s]


In [5]:
# Evaluate with strong-LLM as judge
responses = evaluate(preds, targets)

similar.
different
similar
similar.
similar
different
similar
similar
similar
similar
similar
similar
similar.
similar
similar
similar
similar.
similar.
similar
similar
similar
similar
similar
similar. both answers provide tips and guidelines for baking a cake successfully. they cover topics such as temperature control, ingredient preparation, creaming butter and sugar, alternating wet and dry ingredients, removing air pockets from the batter, and not overbaking.
different
different
different
similar
based on the list provided, answer b is different from answer a.

answer a lists 42 countries, while answer b only lists 16 countries that have hosted the olympic games.
similar
similar
similar
similar
similar
similar
similar
similar
similar
similar
similar
similar
similar.
similar
similar
similar
similar
similar.
similar
similar
different
similar
similar
similar.
similar
similar
similar
similar
similar
similar
similar
similar.
similar
similar
similar
similar
similar
similar
similar
differ

In [6]:
# Calculate accuracy
accuracy = responses.count('similar')/len(responses)

print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 82.70%
