In [None]:
import re
import warnings

from tqdm import tqdm
# Install a blanket "ignore" filter: every warning category, from every module,
# is suppressed from this point on in the session.
warnings.simplefilter("ignore")

In [None]:
# Import necessary packages
import pandas as pd
import torch

from pipeline.pipeline_initializer import initialize_pipeline
from pipeline.prompting_interface import prompt_pipeline

In [None]:
# Model name and tensor dtype passed to the project's pipeline initializer.
# NOTE(review): the name looks like a Hugging Face Hub model id — confirm
# against `initialize_pipeline`.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_DTYPE = torch.bfloat16

pipe = initialize_pipeline(MODEL_NAME, MODEL_DTYPE)

In [None]:
def get_eval_prompt(question: str, answer: str):
    """Build the zero-shot grading prompt for one question/answer pair.

    ## Parameters:
        - question (str): The question that was asked
        - answer (str): The answer to be graded against that question

    ## Returns:
        The prompt text: the question and the answer, each wrapped in
        `/* ... */` delimiters, followed by the two grading criteria
        (completeness, relevance) and the required `Label` / `Reasoning`
        reply format.
    """
    # One entry per prompt line; joined with newlines at the end so the
    # resulting text has no leading or trailing newline.
    prompt_lines = (
        "Question Q:",
        "/*",
        f"{question}",
        "*/",
        "Answer A:",
        "/*",
        f"{answer}",
        "*/",
        "Assume that the answerer has all the necessary information to respond to question Q. Evaluate answer A based on the following criteria:",
        "1. Completeness: The answer must definitively and comprehensively address all parts of question Q.",
        "2. Relevance: The answer must directly provide the information requested in question Q without any extraneous details.",
        "If the answer satisfies both criteria, label it as 'good'. If it fails to meet one or both criteria, label it as 'bad'. Provide your evaluation in the following format:",
        "- Label: [good/bad]",
        "- Reasoning: [Provide a brief explanation for your label]",
    )
    return "\n".join(prompt_lines)

In [None]:
# Path of the benchmark CSV to grade. The grading loop writes its results back
# to this same path, so whatever is lost while parsing here is lost in the file.
name = "evaluate_ulang/openhermes-RP-nucleus_0.95.csv"  # Adjust to the benchmark to be evaluated

# Read every column as plain text and switch off pandas' default NA sentinels.
# With the defaults, cells such as "None", "N/A", "NA" or "null" are parsed as
# NaN (and would be prompted as "nan" and saved back as empty cells), and
# numeric-looking text is re-typed (e.g. "1.50" -> 1.5). Empty cells now load
# as "" instead of NaN.
evals = pd.read_csv(name, dtype=str, keep_default_na=False)

In [None]:
# Finds the grader's verdict in any common rendering: "Label: good",
# "Label: [bad]", "**Label:** good", "Label:bad", ... (case-insensitive).
# The trailing negative lookahead rejects an echoed format template such as
# "Label: [good/bad]", which carries no verdict.
_LABEL_PATTERN = re.compile(r"\blabel\W{0,4}:\W{0,6}(good|bad)\b(?!\s*/)", re.IGNORECASE)


def _parse_label(model_output: str):
    """Return 'good' or 'bad' for the first verdict found in `model_output`, or None if there is none."""
    match = _LABEL_PATTERN.search(model_output)
    return match.group(1).lower() if match else None


# An existing but entirely empty "R" column is typically read back from CSV as
# float64; writing response text into it is an incompatible-dtype assignment
# that pandas has deprecated. Casting to object up front keeps the writes valid.
if "R" in evals.columns:
    evals["R"] = evals["R"].astype(object)

for i in tqdm(range(evals.shape[0])):
    # Only rows whose evaluation is still "unknown" are sent to the model;
    # rows that already carry a value in "E" are left untouched.
    if evals.loc[i, "E"] != "unknown":
        continue

    prompt = get_eval_prompt(evals.loc[i, "Q"], evals.loc[i, "A"])
    conversation = [{"role": "user", "content": prompt}]
    # NOTE(review): presumably the last message returned is the model's reply —
    # confirm against `prompt_pipeline`.
    model_output = prompt_pipeline(pipe, conversation)[-1]["content"]

    # When no verdict can be parsed, the raw response is stored in "E" so the
    # row is no longer "unknown" and can be inspected by hand.
    label = _parse_label(model_output)
    evals.loc[i, "E"] = label if label is not None else model_output
    evals.loc[i, "R"] = model_output

    # Save after every graded row so an interrupted run keeps its progress.
    evals.to_csv(name, index=False)