In [None]:
%%capture
!pip install adapters datasets evaluate bert_score rouge_score sacremoses sacrebleu openai

In [None]:
import os
import re

import numpy as np
import pandas as pd
import torch
from adapters import init
from adapters.composition import Stack
from datasets import load_dataset
from evaluate import load
from huggingface_hub import login
from openai import OpenAI
from tqdm import tqdm
from transformers import set_seed, AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# --- Reproducibility ---
SEED = 42

# --- Model and adapters (Hugging Face Hub ids / paths) ---
modelpath = "gpt2-medium"
domain_adapter = "hf_path_to_trained_DA"
task_adapter = "hf_path_to_trained_TA"

# --- Credentials ---
# Read secrets from the environment so real tokens never have to be typed into
# (and committed with) the notebook. The string placeholders are only used when
# the corresponding variable is not set.
HF_KEY = os.environ.get("HF_TOKEN", "hf_key")
key = os.environ.get("OPENAI_API_KEY", "openai_key")

# --- Evaluation data ---
test_dataset = "hf_path_to_test_dataset"
test_size = 200  # number of examples kept for evaluation

# generation params
temperature = 0.1
rp = 1.03  # repetition penalty passed to model.generate
max_new_tokens = 100

In [None]:
# Use the GPU when one is visible, otherwise run everything on CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fix the random seeds (transformers helper, then torch directly) so sampling
# during generation is repeatable.
set_seed(SEED)
torch.manual_seed(SEED)

# Authenticate against the Hugging Face Hub with the configured token.
login(token=HF_KEY)

In [None]:
# Load the evaluation data, shuffle it deterministically and keep the first
# `test_size` rows.
test = load_dataset(test_dataset, split="train")
# Dataset.shuffle returns a new dataset rather than shuffling in place, so the
# result must be re-assigned or the selection below is just the first rows of
# the unshuffled data.
test = test.shuffle(seed=SEED)
test = test.select(range(test_size))

In [None]:
# Base causal LM, placed on the evaluation device and prepared for adapters
# via `init` from the adapters package.
model = AutoModelForCausalLM.from_pretrained(modelpath)
model = model.to(DEVICE)
init(model)

# Matching tokenizer; the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(modelpath)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the domain adapter (without its head) and the task adapter (with its
# head), then activate them as a stack: domain first, task on top.
model.load_adapter(domain_adapter, load_as="domain", with_head=False)
model.load_adapter(task_adapter, load_as="task", with_head=True)
model.active_adapters = Stack("domain", "task")
# Move the whole model, including the freshly loaded adapter weights, to the
# evaluation device. NOTE(review): `adapter_to` takes an adapter *name* as its
# first argument per the adapters docs, so passing only the device there does
# not move anything - confirm against the installed adapters version.
model.to(DEVICE)
# print(model.adapter_summary())

In [None]:
def generate_answer(text: str) -> str:
  """Generate an answer for one question with the adapter-equipped model.

  The question is wrapped in the prompt ``'Question: ' + text + "Answer: "``
  and a continuation of at most ``max_new_tokens`` tokens is sampled using the
  notebook-level ``temperature`` and ``rp`` (repetition penalty) settings.

  Args:
    text: The question to answer.

  Returns:
    The decoded first output sequence with special tokens removed. This is the
    full sequence returned by ``model.generate``, i.e. it still has to be split
    on "Answer:" by the caller to isolate the answer.
  """
  # The notebook-wide device constant is the upper-case DEVICE; no lower-case
  # `device` name exists anywhere in the notebook.
  encoding = tokenizer('Question: ' + text + "Answer: ", return_tensors="pt").to(DEVICE)

  model.eval()
  with torch.inference_mode():
    outputs = model.generate(
      # Pass input_ids together with the attention mask so generate does not
      # have to guess which positions are padding.
      **encoding,
      max_new_tokens=max_new_tokens,
      do_sample=True,
      temperature=temperature,
      repetition_penalty=rp,
      # The tokenizer pads with EOS; state it explicitly for generation too.
      pad_token_id=tokenizer.eos_token_id,
    )
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Parallel lists, one entry per evaluated sample.
questions, contexts = [], []
reference_answers, generated_answers = [], []

for idx in tqdm(range(test_size)):
  sample = test[idx]
  reference_answers.append(sample['answer'])
  questions.append(sample['question'])
  contexts.append(sample['context'])

  # Keep only what follows the first "Answer:" marker in the decoded output.
  completion = generate_answer(sample['question']).split("Answer:", 1)[1]

  # Cut after the last full stop so the answer ends on a complete sentence;
  # when that leaves nothing (no full stop at all) keep the untrimmed text.
  last_period = completion.rfind('.')
  trimmed = completion[:last_period + 1]
  generated_answers.append(trimmed if trimmed else completion)

In [None]:
# One row per evaluated sample. The context is stored next to the question so
# that scoring steps which look up `test_result['context']` find the column,
# and so the saved CSV is self-contained.
test_result = pd.DataFrame({
  'question': questions,
  'context': contexts,
  'generated': generated_answers,
  'reference': reference_answers,
})
test_result.to_csv('eval_outputs.csv', index=False)

## ROUGE

ROUGE-1 and ROUGE-L overlap between the generated and the reference answers.


In [None]:
# ROUGE-1 / ROUGE-L for every generated answer against its reference.
r = load("rouge")
result = r.compute(
  predictions=test_result['generated'].tolist(),
  references=test_result['reference'].tolist(),
  # Without this the metric returns one aggregated number per ROUGE variant,
  # which pandas would broadcast to every row; per-sample scores are needed
  # to fill a column.
  use_aggregator=False,
)
test_result["r1"] = result['rouge1']
test_result['rL'] = result['rougeL']
# print(np.mean(test_result["r1"]))

## BLEU

Per-sample BLEU unigram precision of the generated answer against the reference.

In [None]:
# Per-sample BLEU: the unigram precision (first entry of 'precisions') of each
# generated answer against its reference.
bleu = load("bleu")
bleu_scores = []
# Iterate over the frame itself instead of a fixed count so the score list
# always has exactly one entry per row.
for prediction, reference in zip(test_result['generated'], test_result['reference']):
  if not prediction.strip():
    # NOTE(review): a blank prediction has no n-grams to score and the BLEU
    # implementation is expected to fail on it - record 0.0 instead.
    bleu_scores.append(0.0)
    continue
  result_bleu = bleu.compute(predictions=[prediction], references=[reference])
  bleu_scores.append(result_bleu['precisions'][0])
test_result['bleu'] = bleu_scores

## BERTScore

BERTScore F1 (model: `distilbert-base-uncased`) between generated and reference answers.

In [None]:
# BERTScore with a DistilBERT backbone; only the F1 component is kept.
bertscore = load("bertscore")
result = bertscore.compute(
  model_type="distilbert-base-uncased",
  references=test_result['reference'],
  predictions=test_result['generated'],
)
test_result["bert_scores"] = result['f1']
# print(np.mean(test_result["bert_scores"]))

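## GPT-4 judge (zero-shot)

Zero-shot LLM-as-judge scoring with `gpt-4o-mini`: relevance, support and factuality, each on a 0-10 scale.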

In [None]:
# Export the key through the environment, then build the client without an
# explicit key argument.
os.environ.update(OPENAI_API_KEY=key)
client = OpenAI()

In [None]:
def get_similarity(q, c, r, gen):
  """Ask the OpenAI judge model to score one generated answer.

  Sends a single chat completion request (model "gpt-4o-mini", temperature 0,
  seed 42, at most 50 output tokens) asking for three 0-10 scores: relevancy
  to the question, support by the context/reference, and factuality.

  Args:
    q: The question.
    c: The context passage.
    r: The reference answer.
    gen: The generated answer to be judged.

  Returns:
    The raw text of the first choice's message, or an empty string when the
    API returns no text content. The text is not parsed here.
  """
  completion = client.chat.completions.create(
    model="gpt-4o-mini",
    seed=42,
    temperature=0,
    max_tokens=50,
    messages=[
      {"role": "system", "content": "You are evaluating generated text using 0 to 10, in three scoring objective: first score indicating relevancy of generated answer to the question, second score indicating how much of generated text can be supported by provided context or reference answer, third score indicating factuality evaluation based on your knowledge regardless of the refence. Give the scores, and do not explain."},
      # Every field is wrapped in a balanced pair of single quotes so the judge
      # can tell where the reference answer ends and the generated one begins.
      {"role": "user", "content": f"Question: '{q}' \nContext: '{c}' \nReference Answer: '{r}' \nGenerated Answer: '{gen}'"}
    ]
  )
  # `content` can be None; normalise to a string for the caller.
  return completion.choices[0].message.content or ""

In [None]:
# Raw judge replies, one per evaluated sample (parsed in a later step).
gpt_scores = []
# Contexts come from the `contexts` list collected during generation; it is
# row-aligned with `test_result`, whose columns supply the other three fields.
judge_inputs = zip(
  test_result['question'],
  contexts,
  test_result['reference'],
  test_result['generated'],
)
for question, context, reference, generated in tqdm(judge_inputs, total=len(test_result)):
  gpt_scores.append(get_similarity(question, context, reference, generated))

In [None]:
# Matches a non-negative integer or decimal number.
_SCORE_NUMBER = re.compile(r"\d+(?:\.\d+)?")


def parse_gpt_scores(reply, n_scores=3):
  """Extract the first `n_scores` numeric scores from a judge reply.

  The judge is not told which layout to use, so two shapes are accepted:
  one score per line (optionally as "label: value", e.g. "Relevancy: 8") and
  all scores on a single line (e.g. "8, 7, 9").

  Args:
    reply: Raw reply text; None or "" is treated as an empty reply.
    n_scores: Number of scores expected.

  Returns:
    A list of `n_scores` floats. When fewer than `n_scores` numbers can be
    found every entry is NaN, so one malformed reply does not abort the run.
  """
  reply = reply or ""
  per_line = []
  for line in reply.splitlines():
    if not line.strip():
      continue
    # Search only after the last colon so a leading enumerator such as the
    # "1." in "1. Relevancy: 8" is not mistaken for the score.
    match = _SCORE_NUMBER.search(line.rsplit(':', 1)[-1])
    if match:
      per_line.append(float(match.group()))
  if len(per_line) >= n_scores:
    return per_line[:n_scores]

  # Fallback: all scores on one line.
  flat = [float(token) for token in _SCORE_NUMBER.findall(reply)]
  if len(flat) >= n_scores:
    return flat[:n_scores]
  return [float("nan")] * n_scores


# s1 / s2 / s3: first, second and third score of every judge reply.
s1 = []
s2 = []
s3 = []
for reply in gpt_scores:
  first, second, third = parse_gpt_scores(reply)
  s1.append(first)
  s2.append(second)
  s3.append(third)
# print(np.mean(s1))

In [None]:
# Attach the three judge scores to the results table, one column each.
for column, values in (
  ('gpt4_relevance', s1),
  ('gpt4_support', s2),
  ('gpt4_factuality', s3),
):
  test_result[column] = values

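## FactCC (example)

Factual-consistency label of each generated answer with respect to its context, using the `manueldeprada/FactCC` classifier.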

In [None]:
# FactCC classifier; placed on the same device as the rest of the notebook
# instead of the pipeline's default device.
pipe = pipeline(model="manueldeprada/FactCC", device=DEVICE)

In [None]:
# 1 when FactCC labels the generated answer 'CORRECT' with respect to its
# context, 0 for any other label.
fact_cc = []
# Contexts come from the `contexts` list collected during generation, which is
# row-aligned with `test_result`. Iterating over the frame itself keeps the
# list exactly as long as the frame.
for gold, gen in tqdm(zip(contexts, test_result['generated']), total=len(test_result)):
  # Each input is a [[source, claim]] pair; only the source side is truncated.
  result = pipe([[[gold, gen]]], truncation='only_first', padding='max_length')
  fact_cc.append(1 if result[0]['label'] == 'CORRECT' else 0)
test_result['fact_cc'] = fact_cc

In [None]:
# Persist the results table together with every metric column added so far.
scores_csv = 'eval_with_scores.csv'
test_result.to_csv(scores_csv, index=False)