### Generate and store responses from the deployed RAG pipeline

Once you have deployed the pipeline, we can evaluate the performance of the following setups:
1. Without Knowledge Base
2. With Knowledge Base
3. With Knowledge Base and customized PEFT weights

We will start with functions to parse the responses from our deployment, get_rag_response and get_rag_context (when knowledge base is used).

In [3]:
## Please ensure you have all the requirements installed before proceeding
# !pip install -r requirements.py

import json
import requests

## Get response of a given question from the RAG pipeline endpoint
def get_rag_response(RAG_URL, question, use_knowledge_base, temperature=0.1, top_p=0.7, max_tokens=1024):
    """Query the ``generate`` endpoint and return the full streamed answer.

    Args:
        RAG_URL: Base URL of the chain server. The endpoint name is appended
            directly, so the URL must end with a slash.
        question: Text sent as the content of a single ``user`` message.
        use_knowledge_base: Forwarded unchanged in the request payload.
        temperature: Forwarded unchanged in the request payload.
        top_p: Forwarded unchanged in the request payload.
        max_tokens: Forwarded unchanged in the request payload.

    Returns:
        On HTTP 200, the concatenation of the ``content`` fragment of every
        streamed ``data: `` line. On any other status code the failure is
        printed and the raw response body is returned instead.
    """
    headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
    data = {
            "messages": [
                {
                    "role": "user",
                    "content": question
                }
            ],
            "use_knowledge_base": use_knowledge_base,
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
            "stop": []
        }
    prefix = "data: "
    fragments = []
    # The context manager releases the streamed connection even when parsing
    # one of the lines raises.
    with requests.post(RAG_URL+"generate", headers=headers, json=data, stream=True, timeout=60) as response:
        if response.status_code != 200:
            print(f"Request failed with status code: {response.status_code}")
            return response.text

        for raw_line in response.iter_lines():
            if not raw_line.strip():
                continue
            # Cut at the FIRST prefix only: the generated text may itself
            # contain "data: ", and cutting again would truncate the JSON.
            _, found, payload = raw_line.decode('utf-8').partition(prefix)
            if not found:
                # Not a data line (e.g. a stream comment or another field).
                continue
            if payload.strip() == "[DONE]":
                # Defensive: some OpenAI-style streams end with this sentinel.
                continue
            fragments.append(json.loads(payload)["choices"][0]["message"]['content'])
    return "".join(fragments)

## Get context retrieved for a given question from the RAG pipeline endpoint
def get_rag_context(RAG_URL, question, top_k=4):
    """Query the ``search`` endpoint and return the retrieved chunks as one string.

    Args:
        RAG_URL: Base URL of the chain server; ``"search"`` is appended directly.
        question: Sent as the ``query`` field of the request.
        top_k: Sent as the ``top_k`` field of the request.

    Returns:
        On HTTP 200, the ``content`` of every entry in the response's
        ``chunks`` list, each followed by a blank line. On any other status
        code the failure is printed and the raw response body is returned.
    """
    request_headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
    payload = {
        "query": question,
        "top_k": top_k,
    }
    response = requests.post(RAG_URL+"search", headers=request_headers, json=payload, stream=True, timeout=60)

    # Guard clause: anything but 200 is reported and handed back verbatim.
    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        return response.text

    # Reassemble the streamed body, then parse it as a single JSON document.
    body = "".join(raw.decode('utf-8') for raw in response.iter_lines())
    # Every chunk is terminated by a blank line, including the last one.
    return "".join(chunk['content'] + "\n\n" for chunk in json.loads(body)["chunks"])

In [4]:
from tqdm import tqdm 

## Generate all the responses and retrieved context (if use_knowledge_base is True) for all the questions in a given dataset
def generate_evals_dataset(DATASET, RAG_URL, use_knowledge_base=True):
    """Run every question of a JSONL dataset through the RAG pipeline.

    Each non-blank line of ``DATASET`` must be a JSON object with an
    ``input`` field of the form ``"<context> Question: <question>"`` and an
    ``output`` field holding the reference answer.

    Args:
        DATASET: Path of the JSONL file to read.
        RAG_URL: Base URL passed on to ``get_rag_response`` and
            ``get_rag_context``.
        use_knowledge_base: Forwarded to ``get_rag_response``. When truthy,
            the retrieved context is fetched as well; otherwise it is stored
            as an empty string.

    Returns:
        A list with one dict per record, with keys ``question``,
        ``rag_answer``, ``rag_context``, ``gt_answer`` and ``gt_context``.

    Raises:
        ValueError: If a record's ``input`` has no ``"Question: "`` marker.
    """
    marker = "Question: "

    # Read the file the caller asked for (not a module-level global) and close
    # it before the slow network loop starts. Blank lines are dropped so a
    # trailing newline does not break json.loads.
    with open(DATASET, 'r', encoding='utf-8') as file:
        records = [raw for raw in file if raw.strip()]

    evals_list = []
    for record_no, raw in enumerate(tqdm(records, total=len(records), desc="Processing", unit="questions"), start=1):
        record = json.loads(raw)

        ## Save gold truth from annotated test dataset
        # Split on the first marker only so the whole question is kept even
        # when its text repeats the marker.
        context_part, found, question_part = record['input'].partition(marker)
        if not found:
            raise ValueError(f"Record {record_no} of {DATASET} has no '{marker}' marker in its 'input' field")
        gold_truth_context = context_part.strip()
        question = question_part.strip()
        gold_truth_answer = record['output']

        ## Get RAG answers and retrieved context for the question
        retrieved_answer = get_rag_response(RAG_URL, question, use_knowledge_base)
        retrieved_context = get_rag_context(RAG_URL, question) if use_knowledge_base else ""

        evals_list.append({"question": question,
                           "rag_answer": retrieved_answer,
                           "rag_context": retrieved_context,
                           "gt_answer": gold_truth_answer,
                           "gt_context": gold_truth_context})
    return evals_list

To evaluate the different setups, we are using the data/test_set.jsonl file we created during the PEFT process.

Please update the variables below with the respective values, namely the NVIDIA_API_KEY, IP_ADDRESS and 
CHAIN_SERVER_PORT (8081 by default, and can be found in deploy/docker-compose.yaml file).

The NVIDIA_API_KEY is your [NGC personal API key](https://org.ngc.nvidia.com/setup/personal-keys).

In [5]:
# Location of the evaluation data. TEST_DATASET is the JSONL file handed to
# generate_evals_dataset below.
DATASET_DIRECTORY="../data/"
TEST_DATASET = DATASET_DIRECTORY+"test_set.jsonl"

## Fill in the values of NVIDIA_API_KEY, IP_ADDRESS and CHAIN_SERVER_PORT:
# NOTE: while IP_ADDRESS and CHAIN_SERVER_PORT are left empty, RAG_URL below
# evaluates to "http://:/", which is not a usable address.
NVIDIA_API_KEY = "nvapi--***"
IP_ADDRESS = ""
CHAIN_SERVER_PORT = ""

# Base URL of the chain server. The trailing slash is required because the
# request helpers append the endpoint name ("generate" / "search") directly.
RAG_URL = f"http://{IP_ADDRESS}:{CHAIN_SERVER_PORT}/"

## Generate answers without external RAG vectorDB
print("Generating answers without knowledge base")
no_rag_responses = generate_evals_dataset(TEST_DATASET, RAG_URL=RAG_URL, use_knowledge_base=False)

## Generate answers with external RAG vectorDB
print("Generating answers with knowledge base")
rag_responses = generate_evals_dataset(TEST_DATASET, RAG_URL=RAG_URL, use_knowledge_base=True)

Generating answers without knowledge base


FileNotFoundError: [Errno 2] No such file or directory: '../data/test_set.jsonl'

After we have saved our responses for the first two setups (base generator LLM with and without Knowledge Base), we will redo the same for our RAG deployment pipeline with PEFT weights.

In [None]:
## Fill in the values of PEFT_IP_ADDRESS and PEFT_CHAIN_SERVER_PORT:
# These point at the deployment that serves the PEFT weights; they are kept
# separate from IP_ADDRESS / CHAIN_SERVER_PORT of the base deployment.
PEFT_IP_ADDRESS = ""
PEFT_CHAIN_SERVER_PORT = ""

# Same shape as RAG_URL: the trailing slash is required because the endpoint
# name is appended directly by the request helpers.
PEFT_RAG_URL = f"http://{PEFT_IP_ADDRESS}:{PEFT_CHAIN_SERVER_PORT}/"

## Generate answers with external RAG vectorDB and PEFT weights
# Reuses the same test set so the three result lists line up by index.
rag_peft_responses = generate_evals_dataset(TEST_DATASET, RAG_URL=PEFT_RAG_URL, use_knowledge_base=True)

Now that we have saved the responses from different setups, we will use the following metrics to evaluate the responses:
1. ROUGE Score
2. BLEU Score
3. RAGAS (context precision, context relevancy, answer similarity, answer relevancy, faithfulness)
4. LLM-as-Judge (answer similarity, answer relevancy, answer correctness, faithfulness, overall score)


### ROUGE and BLEU scores

[ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) is a set of metrics primarily used for evaluating text generation. ROUGE-N refers to the overlap of n-grams between the generated text and a reference text. For example, ROUGE-1 and ROUGE-2 evaluate the overlap of unigrams and bigrams, respectively. On the other hand, ROUGE-L focuses on the longest common subsequence (LCS) between the generated text and the reference, capturing fluency and syntactical correctness.

[BLEU](https://huggingface.co/spaces/evaluate-metric/bleu) is another popular metric for evaluating the quality of generated text. It measures how many words or phrases in the generated text match the reference text, considering n-gram overlaps up to a certain length. BLEU typically uses a precision-based approach and includes a brevity penalty to discourage overly short translations.

In [None]:
import evaluate
import json
import pandas as pd

# Load the answers from different RAGs and references (gold truth answer)
# One prediction string per question for each of the three setups.
rag_peft_predictions = [data['rag_answer'] for data in rag_peft_responses]
rag_predictions = [data['rag_answer'] for data in rag_responses]
no_rag_predictions = [data['rag_answer'] for data in no_rag_responses]
# Each reference is wrapped in its own single-element list (one gold answer
# per question). The gold answers are taken from the no-RAG run and reused
# for all three setups.
references = [[data['gt_answer']] for data in no_rag_responses]

# Load and compute BLEU evaluation metric
bleu = evaluate.load("bleu")
rag_peft_bleu = bleu.compute(predictions=rag_peft_predictions, references=references)
rag_bleu = bleu.compute(predictions=rag_predictions, references=references)
no_rag_bleu = bleu.compute(predictions=no_rag_predictions, references=references)


# Load and compute ROUGE evaluation metric
rouge = evaluate.load('rouge')
rag_peft_rouge = rouge.compute(predictions=rag_peft_predictions, references=references)
rag_rouge = rouge.compute(predictions=rag_predictions, references=references)
no_rag_rouge = rouge.compute(predictions=no_rag_predictions, references=references)


## Visualize the BLEU and ROUGE evaluation in tabular form
# One row per metric, one column per setup; the row order follows 'Metrics'.
metrics_data = {
    'Metrics': ['BLEU Score', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-Lsum'],
    'With RAG+PEFT Weights': [rag_peft_bleu['bleu'], rag_peft_rouge['rouge1'], rag_peft_rouge['rouge2'], rag_peft_rouge['rougeL'], rag_peft_rouge['rougeLsum']],
    'With RAG': [rag_bleu['bleu'], rag_rouge['rouge1'], rag_rouge['rouge2'], rag_rouge['rougeL'], rag_rouge['rougeLsum']],
    'Without RAG': [no_rag_bleu['bleu'], no_rag_rouge['rouge1'], no_rag_rouge['rouge2'], no_rag_rouge['rougeL'], no_rag_rouge['rougeLsum']]
}
df_metrics = pd.DataFrame(metrics_data)
print(df_metrics)

While ROUGE and BLEU are widely used automated techniques for evaluating text generation, they focus on surface-level similarity through n-gram overlap, often missing the semantics, coherence, and quality of the generated text. They might also overlook relevance and factual accuracy, making them less suitable for detecting hallucinations.

### RAGAS

[RAGAS](https://docs.ragas.io/) (Retrieval-Augmented Generation Alignment Score) is a metric specifically designed to evaluate Retrieval-Augmented Generation (RAG) models by assessing both the quality of the generated response and its alignment with the retrieved context. Unlike ROUGE and BLEU, which focus mainly on n-gram overlap and surface-level similarity, RAGAS considers the relevance and coherence of the generated text in relation to the retrieved information. This makes it more suitable for evaluating RAG models, as it better captures the model's ability to generate contextually appropriate and accurate responses.

To judge the retrieval process, we will use the [context recall](https://docs.ragas.io/en/stable/concepts/metrics/context_recall.html) and [context precision](https://docs.ragas.io/en/stable/concepts/metrics/context_precision.html) metrics. For the generation process, we will use [answer similarity](https://docs.ragas.io/en/stable/concepts/metrics/semantic_similarity.html), [answer relevancy](https://docs.ragas.io/en/stable/concepts/metrics/answer_relevance.html) and [faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/faithfulness.html). 

In [None]:
import json
import pandas as pd
from datasets import Dataset
import os
import time
from ragas.llms import LangchainLLM
from ragas.metrics import (context_recall, context_precision, answer_relevancy, faithfulness, answer_correctness)
from ragas.metrics import (Faithfulness, AnswerCorrectness, ContextRelevancy, AnswerRelevancy, AnswerSimilarity, ContextRecall, ContextPrecision)
from ragas import evaluate

In this notebook, we will use NVIDIA AI Catalog's Llama3-70B-instruct LLM as a judge and eval model. NVIDIA AI Playground on NGC allows developers to experience state of the art LLMs accelerated on NVIDIA DGX Cloud with NVIDIA TensorRT and Triton Inference Server. 

Developers get free credits for 4000 requests to any of the available models. 

If you have not signed up yet, don't worry! You can sign up here: https://build.nvidia.com/explore/discover

In [None]:
## Set up RAGAS to use NVIDIA Catalog end points
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

# Judge/eval LLM and embedding model served through the NVIDIA API catalog.
LLM_MODEL = 'meta/llama3-70b-instruct'

EMBED_MODEL = "NV-Embed-QA"
# Also exported as an environment variable, in addition to being passed
# explicitly to every client below.
os.environ['NVIDIA_API_KEY'] = NVIDIA_API_KEY

## Initialize the NVIDIA LLM and embedder
llm = ChatNVIDIA(model= LLM_MODEL,  max_tokens = 4000, nvidia_api_key=NVIDIA_API_KEY)
# NOTE: nv_embedder and nv_document_embedder are not used by the metrics
# initialized in this cell; only nv_query_embedder is.
nv_embedder = NVIDIAEmbeddings(model=EMBED_MODEL, truncate="END", nvidia_api_key=NVIDIA_API_KEY)
# The API key is passed explicitly here as well, so that all three embedders
# are configured the same way.
nv_document_embedder = NVIDIAEmbeddings(model=EMBED_MODEL, model_type="passage", truncate="END", nvidia_api_key=NVIDIA_API_KEY)
nv_query_embedder = NVIDIAEmbeddings(model=EMBED_MODEL, model_type="query", truncate="END", nvidia_api_key=NVIDIA_API_KEY)
# RAGAS wrapper around the LangChain chat model; shared by all metrics below.
nvpl_llm = LangchainLLM(llm=llm)

## Initialize all RAGAS metrics with NVIDIA LLM and embedder
# The assignments below rebind the metric names imported from ragas.metrics
# to instances configured with the NVIDIA LLM / embedder.
answer_similarity = AnswerSimilarity(llm=nvpl_llm, embeddings=nv_query_embedder)
answer_similarity.init_model()

answer_relevancy = AnswerRelevancy(embeddings=nv_query_embedder,llm=nvpl_llm) #embeddings=nv_query_embedder,
answer_relevancy.init_model()

answer_correctness = AnswerCorrectness(llm=nvpl_llm, weights=[0.4,0.6])
answer_correctness.init_model()

faithfulness = Faithfulness(llm=nvpl_llm)
faithfulness.init_model()

context_recall = ContextRecall(llm=nvpl_llm)
context_recall.init_model()

context_precision = ContextPrecision(llm=nvpl_llm)
context_precision.init_model()

context_relevancy = ContextRelevancy(llm=nvpl_llm)
context_relevancy.init_model()

In [None]:
## Function to calculate RAGAS metric for given list of responses having question, response from RAG pipeline (rag_answer), 
#  retrieved context (rag_context) and ground truth answer (gt_answer) fields

def _split_rag_context(joined_context):
    """Split a stored context string back into its chunks.

    The stored context terminates every chunk with a blank line, so a plain
    split leaves a spurious empty chunk at the end; empty pieces are dropped.
    A context with no chunks at all is returned as ``[""]``, which is what a
    plain split of an empty string produces.
    """
    chunks = [piece for piece in joined_context.split("\n\n") if piece.strip()]
    return chunks or [""]


def get_ragas_results(evals_list):
    """Compute the RAGAS metrics for one list of stored pipeline responses.

    Args:
        evals_list: List of dicts with the keys ``question``, ``rag_answer``,
            ``rag_context`` (chunks separated by blank lines) and
            ``gt_answer``.

    Returns:
        The result object returned by ``ragas.evaluate`` for the metrics
        context precision, context relevancy, answer similarity, answer
        relevancy and faithfulness.
    """
    # Imported locally under an alias: the module-level name ``evaluate`` is
    # bound both by ``import evaluate`` (ROUGE/BLEU cell) and by
    # ``from ragas import evaluate``, so the global depends on which cell ran
    # last.
    from ragas import evaluate as ragas_evaluate

    data_samples = Dataset.from_dict({
        'question': [data["question"] for data in evals_list],
        'answer': [data["rag_answer"] for data in evals_list],
        'contexts': [_split_rag_context(data["rag_context"]) for data in evals_list],
        'ground_truths': [[data["gt_answer"]] for data in evals_list],
    })

    ## If you need to store the ratings for each datapoint separately, convert the ragas_results into dataframe and store it:
    ##    ragas_results = evaluate(data_samples, metrics=[x, y, z])
    ##    ragas_results = ragas_results.to_pandas()
    return ragas_evaluate(data_samples, metrics=[context_precision, context_relevancy, answer_similarity, answer_relevancy, faithfulness])

In [None]:
# Score each of the three stored response sets with the same RAGAS metrics.
# For the no-knowledge-base run the stored rag_context is an empty string.
no_rag_ragas = get_ragas_results(no_rag_responses)
rag_ragas = get_ragas_results(rag_responses)
rag_peft_ragas = get_ragas_results(rag_peft_responses)

In [None]:
## Visualize the RAGAS evaluation in tabular form
# The metric names are taken from the no-RAG result and used to index all
# three results, so every column lists the metrics in the same order.
Metrics = no_rag_ragas.keys()
metrics_data = {
    'Metrics': Metrics,
    'With RAG+PEFT Weights': [rag_peft_ragas[metric] for metric in Metrics],
    'With RAG': [rag_ragas[metric] for metric in Metrics],
    'Without RAG': [no_rag_ragas[metric] for metric in Metrics]
}
# NOTE: this rebinds metrics_data / df_metrics from the ROUGE/BLEU cell.
df_metrics = pd.DataFrame(metrics_data)
print(df_metrics)

### LLM-as-JUDGE

The LLM-as-Judge method involves using a large language model (LLM) to evaluate the quality of generated responses by comparing them against reference answers or assessing their relevance to the context. This approach can be particularly powerful when evaluating complex tasks, as it leverages the LLM's understanding of language, context, and nuanced meanings, potentially offering a more holistic evaluation than metrics like RAGAS, ROUGE or BLEU.

However, the LLM-as-Judge method may introduce biases inherent in the judging model and lacks explainability, making it less reliable as a standalone metric. This is an active area of research, and the effectiveness of the LLM-as-Judge method can significantly depend on the prompts used and your specific use case. There is ongoing exploration into optimizing prompts for more accurate and fair assessments. Developers should carefully research and experiment with different prompts to determine the most suitable approach for their specific use case.

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

## Get response of the LLM judge on the ratings of different RAG responses
def generate_llm_as_judge_response(prompt, llm_model='meta/llama3-70b-instruct', max_tokens=4000):
    """Send ``prompt`` to the judge LLM and return its complete text answer.

    Args:
        prompt: Text placed in the user message of the judge conversation.
        llm_model: Model name passed to ``ChatNVIDIA``.
        max_tokens: Generation limit passed to ``ChatNVIDIA``.

    Returns:
        The streamed pieces of the judge's answer, concatenated in order.
    """
    judge_llm = ChatNVIDIA(model=llm_model, max_tokens=max_tokens, nvidia_api_key=NVIDIA_API_KEY)
    system_message = "You are a fair evaluator of different Retrieval Augmented Generation (RAG) pipelines. You will be given a task to rank different answers from different RAG pipelines."
    judge_prompt = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("user", "{prompt}"),
    ])
    # prompt -> model -> plain-string output
    judge_chain = judge_prompt | judge_llm | StrOutputParser()
    return "".join(judge_chain.stream({"prompt": prompt}))

In [None]:
## Example prompt template for LLM-as-Judge to rate different RAG answers based on different user-defined metrics
## Note that we pass the ground truth context and answer for reference
# Placeholders filled with str.format: gt_context, question, gt_answer,
# rag_answer1, rag_answer2, rag_answer3.
# The metric names listed in the prompt (items 1-5) are the same strings the
# parsing cell searches for in the judge's answer, so keep the two in sync
# when editing either one.

ANSWER_EVALUATION_PROMPT = """###Task Description:
Here are the answers given by different RAG pipelines, along with the question, gold truth answer and gold truth context.

###Gold Truth context:
{gt_context}

### Question:
{question}

###Gold Truth Answer (Score 5):
{gt_answer}

###The answer given by RAG pipeline 1:
{rag_answer1}

###The answer given by RAG pipeline 2:
{rag_answer2}

###The answer given by RAG pipeline 3:
{rag_answer3}

You should rate each of the answers out of 5 with the format: x/5. Give your rating on each of these metrics:
1. Answer similarity: The answer given by the RAG pipeline is similar to the the gold truth answer. Note that the answer should be factually correct.
2. Faithfulness: The information in the answer should be present in the context.
3. Answer relevancy: The answer given is relevant to the question asked and the gold truth context.
4. Answer correctness: Gauge the accuracy of the generated answer when compared to the ground truth. Be careful of hallucinations.
5. Overall score of each answer

Choose the best answer out of all answers. 
"""

In [None]:
## Parse and store the responses of the LLM judge for each of the stored RAG responses

metrics = ["Answer similarity", "Faithfulness", "Answer relevancy", "Answer correctness", "Overall score"]
# Order matters: it matches "RAG pipeline 1/2/3" in ANSWER_EVALUATION_PROMPT.
_JUDGE_SETUPS = ("no_rag", "rag", "rag_peft")
llm_as_judge_evals = {setup: {metric: [] for metric in metrics} for setup in _JUDGE_SETUPS}
llm_as_judge_responses = []


def _parse_judge_scores(llm_eval, metric_names, n_answers):
    """Extract the ``x/5`` ratings from one judge answer.

    For every metric, the k-th occurrence of the metric name in ``llm_eval``
    is taken to belong to the k-th rated answer. The rating is the token
    after the first space that follows the name, up to the ``/``.

    Returns:
        ``{metric: [score_1, ..., score_n]}`` as floats.

    Raises:
        IndexError, ValueError: If the answer does not follow that layout.
    """
    scores = {}
    for metric in metric_names:
        sections = llm_eval.split(metric)
        scores[metric] = [
            float(sections[position].split(" ")[1].split("/")[0])
            for position in range(1, n_answers + 1)
        ]
    return scores


for idx in range(len(no_rag_responses)):
    llm_eval = None
    print("Generating evals ",idx+1, "/", len(no_rag_responses))
    try:
        ans_eval_prompt = ANSWER_EVALUATION_PROMPT.format(
                rag_answer1=no_rag_responses[idx]['rag_answer'],
                rag_answer2=rag_responses[idx]['rag_answer'],
                rag_answer3=rag_peft_responses[idx]['rag_answer'],
                gt_context=no_rag_responses[idx]['gt_context'],
                gt_answer=no_rag_responses[idx]['gt_answer'],
                question=no_rag_responses[idx]['question']
            )
        llm_eval = generate_llm_as_judge_response(prompt=ans_eval_prompt)
        # The raw answer is kept even when its scores cannot be parsed.
        llm_as_judge_responses.append(llm_eval)
        parsed_scores = _parse_judge_scores(llm_eval, metrics, len(_JUDGE_SETUPS))
    except Exception as err:
        # Best effort: report the question and move on. ``Exception`` (not a
        # bare except) so that Ctrl-C / kernel interrupt still stops the loop.
        print("Error in parsing LLM response", llm_eval, "-", repr(err))
        continue

    # Commit only after the whole answer parsed, so every score list keeps
    # the same length across setups and metrics.
    for metric in metrics:
        for position, setup in enumerate(_JUDGE_SETUPS):
            llm_as_judge_evals[setup][metric].append(parsed_scores[metric][position])


In [None]:
## Visualize the LLM-as-Judge evaluation in tabular form

from statistics import mean


def _mean_or_nan(scores):
    """Return the average of ``scores``, or NaN when the list is empty.

    ``statistics.mean`` raises on empty input, which would otherwise abort
    the whole table when no judge answer could be parsed.
    """
    return mean(scores) if scores else float("nan")


# One row per judge metric, one column per setup.
metrics_data = {
    'Metrics': metrics,
    'With RAG+PEFT Weights': [_mean_or_nan(llm_as_judge_evals['rag_peft'][metric]) for metric in metrics],
    'With RAG': [_mean_or_nan(llm_as_judge_evals['rag'][metric]) for metric in metrics],
    'Without RAG': [_mean_or_nan(llm_as_judge_evals['no_rag'][metric]) for metric in metrics]
}
df_metrics = pd.DataFrame(metrics_data)
print(df_metrics)