In [None]:
import os
os.environ['USER_AGENT'] = 'chrome'

import re
import json
import pandas as pd

from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from sklearn.metrics.pairwise import cosine_similarity

from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datetime import datetime

# Run timestamp in day-month-year_hour-minute-second form; reused below to name the exported CSV.
timestamp = f"{datetime.now():%d-%m-%Y_%H-%M-%S}"

In [3]:
from datasets import Dataset

In [None]:
# Evaluation-set CSVs available for judging:
#   3b baseline -> 'eval_set_03-11-2024_08-31-29.csv'
#   1b baseline -> 'eval_set_07-11-2024_16-24-25.csv'  (currently selected)
eval_set_path = 'eval_set_07-11-2024_16-24-25.csv'

# The first CSV column holds the saved row index, so it is restored as the frame index.
dataset = pd.read_csv(eval_set_path, index_col=0)

In [6]:
# Preview the first rows of the loaded evaluation set (question, answer, retrieved_context, reference).
dataset.head()

Unnamed: 0,question,answer,retrieved_context,reference
0,What is prompt engineering?,Prompt engineering is the process of designing...,OpenAI Cookbook has many in-depth examples for...,"Prompt Engineering, also known as In-Context P..."
1,What are the basic approaches for prompting a ...,The basic approaches for prompting a language ...,OpenAI Cookbook has many in-depth examples for...,Zero-shot and few-shot learning are two of the...
2,What are the issues in few-shot learning that ...,The issues in few-shot learning that lead to p...,Zero-Shot#\nZero-shot learning is to simply fe...,(1) Majority label bias exists if the distribu...
3,What is Chain-of-Thought (CoT) prompting?,Chain-of-Thought (CoT) prompting is a techniqu...,Definition: Determine which category the quest...,Chain-of-thought (CoT) prompting generates a s...
4,What are the types of Chain-of-Thought prompts?,"The question asks for the category ""Quantity"" ...",References#\n[1] Zhao et al. “Calibrate Before...,Two main types of CoT prompting:\n\nFew-shot C...


### Evaluation (LLM-as-a-Judge)

In [None]:
# Judge model: llama3.2:3b accessed through ChatOllama, with the sampling temperature pinned to 0.
judge_llm = ChatOllama(model="llama3.2:3b", temperature=0)




# Prompt asking the judge for context relevance: relevant statements / total statements in the context.
# Everything inside the triple-quoted template is sent to the judge verbatim (a leading '#' does NOT
# comment a line out inside a string), so the JSON example below must itself be valid JSON:
# no '#' lines, no trailing comma, no quotes wrapped around the object.
# The per-statement list fields (relevant_statements / statements_in_context) are not requested.
relevance_prompt = PromptTemplate(
    template="""Evaluate the relevance score of the provided context to the question shared here.
    
    Context: {documents}
    Question: {question}
    
    To calculate the relevance score, do the following step by step. Break the provided context into
    distinct statements. Find out how many of these statements provide relevant information to answering 
    the question. Then, calculate the relevance score as the no. of total statements that are relevant divided by the total
    no. of statements in the context.

    Provide the response strictly in the following JSON format:

    {{
      "relevance_score": <relevance score>,
      "total_relevant_statements": <total no. of relevant statements in context>,
      "total_statements_in_context": <total no. of statements in context>
    }}

    Confirm that the JSON format is correct with no extra or missing brackets.
    Confirm that the calculation for relevance score is correct and Respond only 
    with the JSON output without any additional commentary.
    """,
    input_variables=["documents","question"],
)

# Prompt asking the judge for context recall: reference statements attributable to the context,
# divided by the total number of statements in the reference.
# The template text is sent to the judge verbatim (a leading '#' does NOT comment a line out inside
# a string), so the JSON example must be valid JSON: no '#' lines, no trailing comma, no wrapping quotes.
# The per-statement list fields (attributed_statements / statements_in_reference) are not requested.
recall_prompt = PromptTemplate(
    template="""Evaluate the recall score of the provided reference to the context shared here.
    
    Reference: {reference}
    Context: {documents}

    To calculate the recall score, do the following step by step. Break the provided reference into
    distinct statements. Find out how many of 
    these statements in reference can be attributed to the information provided in the context. Then, calculate 
    the recall score as the no. of total statements that can be attributed to context divided by the total 
    no. of statements in the reference.

    Provide the response strictly in the following JSON format:

    {{
      "recall_score": <recall score>,
      "total_attributed_statements": <total no. of attributed statements in reference>,
      "total_statements_in_reference": <total no. of statements in reference>
    }}

    Confirm that the JSON format is correct with no extra or missing brackets.
    Confirm that the calculation for recall score is correct and Respond only 
    with the JSON output without any additional commentary.
    """,
    input_variables=["reference","documents"],
)



# Prompt asking the judge for answer faithfulness: claims inferable from the documents / total claims.
# The template text is sent to the judge verbatim (a leading '#' does NOT comment a line out inside
# a string), so the JSON example must be valid JSON: no '#' lines, no trailing comma, no wrapping quotes.
# The per-claim list fields (inferred_claims / claims_in_answer) are not requested.
faithfulness_prompt = PromptTemplate(
    template="""Evaluate the faithfulness of the generated answer to the question based on the provided documents.
    Faithfulness is calculated as the number of claims in the answer that can be inferred from the documents divided
    by the total no. of claims in the answer.
    
    Provide the response in the following JSON format:
     
    {{
      "faithfulness_score": <score>,
      "total_inferred_claims": <total inferred claims in generated answer>,
      "total_claims": <total claims in generated answer>
    }}

    Question: {question}
    Documents: {documents}
    Generated Answer: {answer}
    
    Confirm that the JSON format is correct with no extra or missing brackets. Respond only with the JSON output
    without any additional commentary.
    """,
    input_variables=["question", "documents", "answer"],
)

# Instruction text for the 0-10 answer-quality judgment, kept apart from the PromptTemplate wiring.
# The judge is told to reply with the bare score only, so the template ends on "Score:".
response_quality_template = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant for the question below. 
    Consider the correctness, relevance, and completeness of the assistant's answer compared to the reference answer.
    
    If the assistant's answer matches the quality of the reference answer in all respects, respond with "10" only.
    If the answer is partially correct or lacks completeness or relevance, assign an appropriate score between 1 and 9
    and respond with this score only.
    If the assistant's answer is incorrect or irrelevant, respond with "0" only. 
    Do not respond with additional commentary in your response.
    
    Question: {question}
    Reference Answer: {reference}
    Assistant's Answer: {answer}
    
    Score:"""

response_quality_prompt = PromptTemplate(
    input_variables=["question", "reference", "answer"],
    template=response_quality_template,
)


In [None]:
# Thin wrapper that routes each evaluation prompt to the judge model
class RAGEvaluator:
    """Formats the module-level evaluation prompts and sends them to a judge model.

    Every ``evaluate_*`` method returns whatever ``judge_llm.invoke`` returns for the
    formatted prompt; no parsing of the judge's reply happens here.
    """

    def __init__(self, judge_llm):
        # Any object exposing ``invoke(prompt_text)`` works as the judge.
        self.judge_llm = judge_llm

    def _ask_judge(self, prompt, **fields):
        """Fill ``prompt`` with ``fields`` and return the judge's raw reply."""
        return self.judge_llm.invoke(prompt.format(**fields))

    def evaluate_relevance(self, documents, question):
        """Ask the judge how relevant ``documents`` are to ``question``."""
        return self._ask_judge(relevance_prompt, documents=documents, question=question)

    def evaluate_recall(self, reference, documents):
        """Ask the judge how much of ``reference`` is attributable to ``documents``."""
        return self._ask_judge(recall_prompt, reference=reference, documents=documents)

    def evaluate_faithfulness(self, question, documents, answer):
        """Ask the judge how many claims in ``answer`` are supported by ``documents``."""
        return self._ask_judge(
            faithfulness_prompt,
            question=question,
            documents=documents,
            answer=answer,
        )

    def evaluate_quality(self, question, answer, reference):
        """Ask the judge to grade ``answer`` against ``reference`` for ``question``."""
        return self._ask_judge(
            response_quality_prompt,
            question=question,
            reference=reference,
            answer=answer,
        )
    
    
# Single evaluator instance used by all scoring loops below; it wraps the judge model defined earlier.
rag_evaluator = RAGEvaluator(judge_llm)

In [None]:

def extract_faithfulness_score(response):
    """
    Extracts the first decodable JSON object from the LLM's response content.

    The text is scanned from each '{' and decoded with ``json.JSONDecoder.raw_decode``,
    so surrounding commentary, code fences, or stray braces after the object do not
    break parsing (a greedy first-'{'-to-last-'}' match would fail on those).

    Parameters:
    response (AIMessage or str): The response from the LLM, containing a JSON-formatted faithfulness score.
        Objects with a ``content`` attribute are read through it; anything else is passed to ``str()``.

    Returns:
    dict: A dictionary containing the parsed JSON output, or None if parsing fails.
    """
    # Ensure response is a string
    response_content = response.content if hasattr(response, "content") else str(response)

    brace_index = response_content.find("{")
    if brace_index == -1:
        print("No JSON content found in the response.")
        return None

    decoder = json.JSONDecoder()
    while brace_index != -1:
        try:
            # raw_decode stops at the end of the object and ignores whatever follows it
            parsed, _ = decoder.raw_decode(response_content, brace_index)
            return parsed
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next opening brace, if any
            brace_index = response_content.find("{", brace_index + 1)

    print("Failed to decode JSON.")
    return None


In [None]:

def extract_relevance_score(response):
    """
    Extracts the first decodable JSON object from the judge's relevance response.

    The text is scanned from each '{' and decoded with ``json.JSONDecoder.raw_decode``,
    so surrounding commentary, code fences, or stray braces after the object do not
    break parsing (a greedy first-'{'-to-last-'}' match would fail on those).

    Parameters:
    response (AIMessage or str): The judge's reply. Objects with a ``content`` attribute
        are read through it; anything else is passed to ``str()``.

    Returns:
    dict: The parsed JSON object, or None if no JSON object could be decoded.
    """
    # Ensure response is a string
    response_content = response.content if hasattr(response, "content") else str(response)

    brace_index = response_content.find("{")
    if brace_index == -1:
        print("No JSON content found in the response.")
        return None

    decoder = json.JSONDecoder()
    while brace_index != -1:
        try:
            # raw_decode stops at the end of the object and ignores whatever follows it
            parsed, _ = decoder.raw_decode(response_content, brace_index)
            return parsed
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next opening brace, if any
            brace_index = response_content.find("{", brace_index + 1)

    print("Failed to decode JSON.")
    return None

In [None]:

def extract_recall_score(response):
    """
    Extracts the first decodable JSON object from the judge's recall response.

    The text is scanned from each '{' and decoded with ``json.JSONDecoder.raw_decode``,
    so surrounding commentary, code fences, or stray braces after the object do not
    break parsing (a greedy first-'{'-to-last-'}' match would fail on those).

    Parameters:
    response (AIMessage or str): The judge's reply. Objects with a ``content`` attribute
        are read through it; anything else is passed to ``str()``.

    Returns:
    dict: The parsed JSON object, or None if no JSON object could be decoded.
    """
    # Ensure response is a string
    response_content = response.content if hasattr(response, "content") else str(response)

    brace_index = response_content.find("{")
    if brace_index == -1:
        print("No JSON content found in the response.")
        return None

    decoder = json.JSONDecoder()
    while brace_index != -1:
        try:
            # raw_decode stops at the end of the object and ignores whatever follows it
            parsed, _ = decoder.raw_decode(response_content, brace_index)
            return parsed
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next opening brace, if any
            brace_index = response_content.find("{", brace_index + 1)

    print("Failed to decode JSON. Please check the response format.")
    return None

In [12]:
def extract_response_quality_score(response):
    """
    Extracts the 0-10 quality score from the judge's reply.

    Parameters:
    response (AIMessage or str): The judge's reply. Objects with a ``content`` attribute
        are read through it; anything else is passed to ``str()`` (same convention as the
        JSON extractors in this notebook).

    Returns:
    int: The first standalone integer in the range 0-10 found in the text, or None if there is none.
    """
    # StrOutputParser.parse() returns its input unchanged, so the text is read directly instead
    response_content = response.content if hasattr(response, "content") else str(response)
    # "10" is listed before the single digits so it is not matched as a bare "1"
    match = re.search(r"\b(10|[0-9])\b", response_content)
    return int(match.group()) if match else None  # Return score as integer, or None if not found

In [13]:
# binary_relevance_scores = []
# for _, row in dataset.iterrows():
#     response = rag_evaluator.evaluate_binary_relevance(row['question'], row['retrieved_context'])
#     parsed_response = extract_binary_relevance_score(response)
#     binary_relevance_scores.append(parsed_response)

In [14]:
# Judge how relevant each row's retrieved context is to its question.
# relevance_scores holds the numeric score per row; relevance_responses keeps the full parsed JSON.
relevance_scores = []
relevance_responses = [] 
for _, row in dataset.iterrows():
    response = rag_evaluator.evaluate_relevance(row['retrieved_context'], row['question'])
    parsed_response = extract_relevance_score(response)
    # extract_relevance_score returns None when the judge's reply has no decodable JSON;
    # record a missing score for that row instead of raising AttributeError on None.get(...)
    relevance_scores.append(parsed_response.get("relevance_score") if parsed_response else None)
    relevance_responses.append(parsed_response)

In [15]:
# Inspect the raw judge reply left in `response` by the final iteration of the preceding scoring loop.
response

AIMessage(content='{"relevance_score": 0.8, "total_relevant_statements": 4, "total_statements_in_context": 5}', additional_kwargs={}, response_metadata={'model': 'llama3.2:3b', 'created_at': '2024-11-07T12:29:37.5278089Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 27540304900, 'load_duration': 65772200, 'prompt_eval_count': 1420, 'prompt_eval_duration': 23227442000, 'eval_count': 31, 'eval_duration': 4242957000}, id='run-96a33431-e54e-4553-bb1a-7a63ff7fa074-0', usage_metadata={'input_tokens': 1420, 'output_tokens': 31, 'total_tokens': 1451})

In [16]:
# Inspect the per-row relevance scores; the prompt defines the score as a ratio of statement counts,
# so a value above 1 indicates a malformed judge reply.
relevance_scores

[0.5,
 0.8,
 0.5,
 0.5,
 4,
 0.8,
 0.8,
 0.8,
 0.67,
 0.5,
 0.5,
 0.8,
 0.5,
 0.4,
 0.2,
 0.8,
 0.8]

In [17]:
# Judge how much of each row's reference answer is attributable to its retrieved context.
# recall_scores holds the numeric score per row; recall_responses keeps the full parsed JSON.
recall_scores = []
recall_responses = [] 
for _, row in dataset.iterrows():
    response = rag_evaluator.evaluate_recall(row['reference'], row['retrieved_context'])
    parsed_response = extract_recall_score(response)
    # extract_recall_score returns None when the judge's reply has no decodable JSON;
    # record a missing score for that row instead of raising AttributeError on None.get(...)
    recall_scores.append(parsed_response.get("recall_score") if parsed_response else None)
    recall_responses.append(parsed_response)

In [18]:
# Inspect the per-row recall scores returned by the judge.
recall_scores

[0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5]

In [19]:
# Judge how many claims in each row's generated answer are supported by its retrieved context.
# faithfulness_scores holds the numeric score per row; faithfulness_responses keeps the full parsed JSON.
faithfulness_scores = []
faithfulness_responses = []
for _, row in dataset.iterrows():
    response = rag_evaluator.evaluate_faithfulness(row['question'],row['retrieved_context'], row['answer'])
    parsed_response = extract_faithfulness_score(response)
    # extract_faithfulness_score returns None when the judge's reply has no decodable JSON;
    # record a missing score for that row instead of raising AttributeError on None.get(...)
    faithfulness_scores.append(parsed_response.get("faithfulness_score") if parsed_response else None)
    faithfulness_responses.append(parsed_response)

In [20]:
# Inspect the per-row faithfulness scores returned by the judge.
faithfulness_scores

[0.5, 0.5, 0.75, 1, 1, 0.5, 0.5, 0.75, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 0.5, 1]

In [21]:
# Grade each generated answer against its reference; the extractor yields an int score or None.
response_quality_scores = []
for _, row in dataset.iterrows():
    response = rag_evaluator.evaluate_quality(
        question=row['question'],
        answer=row['answer'],
        reference=row['reference'],
    )
    parsed_response = extract_response_quality_score(response)
    response_quality_scores.append(parsed_response)

In [22]:
# Inspect the per-row answer-quality scores (integers parsed from the judge's reply, or None).
response_quality_scores

[8, 8, 6, 8, 6, 6, 6, 8, 8, 8, 6, 6, 9, 6, 8, 8, 6]

In [23]:
# Attach every judge metric, plus the parsed JSON replies it came from, to the evaluation frame.
# The mapping's insertion order fixes the order in which the new columns are added.
judge_outputs = {
    'relevance_score': relevance_scores,
    'relevance_response': relevance_responses,
    'recall_score': recall_scores,
    'recall_response': recall_responses,
    'faithfulness_score': faithfulness_scores,
    'faithfulness_response': faithfulness_responses,
    'response_quality_score': response_quality_scores,
}
for column_name, column_values in judge_outputs.items():
    dataset[column_name] = column_values

In [24]:
# Preview the evaluation set again, now including the judge score and response columns.
dataset.head()

Unnamed: 0,question,answer,retrieved_context,reference,relevance_score,relevance_response,recall_score,recall_response,faithfulness_score,faithfulness_response,response_quality_score
0,What is prompt engineering?,Prompt engineering is the process of designing...,OpenAI Cookbook has many in-depth examples for...,"Prompt Engineering, also known as In-Context P...",0.5,"{'relevance_score': 0.5, 'total_relevant_state...",0.5,"{'recall_score': 0.5, 'total_attributed_statem...",0.5,"{'faithfulness_score': 0.5, 'total_inferred_cl...",8
1,What are the basic approaches for prompting a ...,The basic approaches for prompting a language ...,OpenAI Cookbook has many in-depth examples for...,Zero-shot and few-shot learning are two of the...,0.8,"{'relevance_score': 0.8, 'total_relevant_state...",0.5,"{'recall_score': 0.5, 'total_attributed_statem...",0.5,"{'faithfulness_score': 0.5, 'total_inferred_cl...",8
2,What are the issues in few-shot learning that ...,The issues in few-shot learning that lead to p...,Zero-Shot#\nZero-shot learning is to simply fe...,(1) Majority label bias exists if the distribu...,0.5,"{'relevance_score': 0.5, 'total_relevant_state...",0.5,"{'recall_score': 0.5, 'total_attributed_statem...",0.75,"{'faithfulness_score': 0.75, 'total_inferred_c...",6
3,What is Chain-of-Thought (CoT) prompting?,Chain-of-Thought (CoT) prompting is a techniqu...,Definition: Determine which category the quest...,Chain-of-thought (CoT) prompting generates a s...,0.5,"{'relevance_score': 0.5, 'total_relevant_state...",0.5,"{'recall_score': 0.5, 'total_attributed_statem...",1.0,"{'faithfulness_score': 1, 'total_inferred_clai...",8
4,What are the types of Chain-of-Thought prompts?,"The question asks for the category ""Quantity"" ...",References#\n[1] Zhao et al. “Calibrate Before...,Two main types of CoT prompting:\n\nFew-shot C...,4.0,"{'relevance_score': 4, 'total_relevant_stateme...",0.5,"{'recall_score': 0.5, 'total_attributed_statem...",1.0,"{'faithfulness_score': 1, 'total_inferred_clai...",6


In [None]:
# Write the scored evaluation set to disk; the run timestamp in the name keeps exports from different runs apart.
filename = f"RAG_evaluation_{timestamp}.csv"
dataset.to_csv(path_or_buf=filename)


In [27]:
# Keep only the numeric judge metrics for aggregation.
score_columns = [
    'relevance_score',
    'recall_score',
    'faithfulness_score',
    'response_quality_score',
]
scores = dataset[score_columns]
scores

Unnamed: 0,relevance_score,recall_score,faithfulness_score,response_quality_score
0,0.5,0.5,0.5,8
1,0.8,0.5,0.5,8
2,0.5,0.5,0.75,6
3,0.5,0.5,1.0,8
4,4.0,0.5,1.0,6
5,0.8,0.5,0.5,6
6,0.8,0.5,0.5,6
7,0.8,0.5,0.75,8
8,0.67,0.5,0.5,8
9,0.5,0.5,0.5,8


In [28]:
# One-row summary frame: the column-wise mean of every metric, with each column renamed to 'mean_<metric>'.
# NOTE(review): a plain mean is sensitive to malformed judge outputs (e.g. a ratio score above 1),
# so check the score ranges before relying on these averages.
mean_scores = scores.mean().to_frame().T.add_prefix('mean_')
mean_scores

Unnamed: 0,mean_relevance_score,mean_recall_score,mean_faithfulness_score,mean_response_quality_score
0,0.815882,0.5,0.647059,7.117647


In [None]:
# Completion marker: printed once execution reaches this cell.
print('Done!')