### EVALUATE RAG RESULTS USING CUSTOM EVALUATION METRICS
This notebook evaluates the LLM results using our custom evaluation metrics.
To run it, you need a GCP credential file.
A test dataset is already prepared, so you can use it directly to evaluate the LLM results.

In [1]:
# Set up environment variables and load credentials
import os
import time
from google.auth import load_credentials_from_file
import pandas as pd
# Replace this with the path to your own GCP project IAM credential file.
# The path is defined once so that the explicit credential load and the
# GOOGLE_APPLICATION_CREDENTIALS environment variable always agree.
GCP_CREDENTIAL_FILE = "<Your GCP IAM credential file>"
credentials, project_id = load_credentials_from_file(GCP_CREDENTIAL_FILE)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GCP_CREDENTIAL_FILE

In [2]:
# Import required libraries and initialize Vertex AI models and embeddings
# Set up helper functions for:
# - Generating directory hash to track PDF changes
# - Extracting text from PDFs while preserving metadata

import hashlib
import os
from typing import List, Tuple, Union

import fitz
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Generation model plugged into the RAG chain.
llm = VertexAI(
    model_name="gemini-1.0-pro",
    temperature=0.3,
    max_output_tokens=8192,
    max_workers=2,
)

# Embedding model used to build and query the Chroma vector store.
embedding_model = VertexAIEmbeddings(model_name="textembedding-gecko")
def get_directory_hash(directory_path: str) -> str:
    """Return an MD5 hex digest over the contents of all PDFs in a directory.

    Files whose name ends in ``.pdf`` (case-insensitive) are visited in sorted
    filename order and their bytes are fed into a single hash, so the digest
    changes whenever a PDF is added, removed or modified. Only file contents
    are hashed; filenames themselves are not part of the digest. The digest is
    used for change detection, not for security.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        The hexadecimal MD5 digest of the concatenated PDF contents.
    """
    chunk_size = 1 << 20  # read 1 MiB at a time instead of whole files
    hasher = hashlib.md5()
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'rb') as f:
            # Streaming keeps memory flat for large PDFs; the resulting digest
            # is identical to hashing the whole file in one call.
            for chunk in iter(lambda: f.read(chunk_size), b''):
                hasher.update(chunk)
    return hasher.hexdigest()

def extract_text_from_pdf(pdf_path: str) -> List[Document]:
    """Extract the text of every non-empty page of a PDF as ``Document`` objects.

    Each returned document carries the page text plus metadata with the source
    path, the 1-based page number and the total page count. Pages whose text is
    empty or whitespace-only are skipped.

    Extraction is best-effort: any error is printed and the pages collected so
    far are returned, so one bad file does not abort a whole directory load.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One ``Document`` per page that contains text (possibly an empty list).
    """
    documents = []
    try:
        # The context manager closes the file even when a page fails mid-way,
        # which the previous explicit close() call did not guarantee.
        with fitz.open(pdf_path) as pdf_document:
            total_pages = len(pdf_document)
            for page_num in range(total_pages):
                page = pdf_document[page_num]
                text = page.get_text(
                    "text",
                    flags=fitz.TEXT_DEHYPHENATE | fitz.TEXT_PRESERVE_WHITESPACE,
                )
                if not text.strip():
                    continue
                metadata = {
                    "source": pdf_path,
                    "page": page_num + 1,
                    "total_pages": total_pages,
                }
                documents.append(Document(page_content=text, metadata=metadata))
    except Exception as e:  # deliberate best-effort: report and keep going
        print(f"Error processing {pdf_path}: {str(e)}")
    return documents

def load_pdfs_from_directory(directory_path: str) -> List[Document]:
    """Load the page documents of every PDF found directly in a directory.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted filename order, matching the order used by ``get_directory_hash``,
    so the resulting document list is deterministic across runs and platforms.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        The concatenated per-page documents of all PDFs; empty when the
        directory contains no readable PDF.
    """
    documents = []
    for filename in sorted(os.listdir(directory_path)):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(directory_path, filename)
            documents.extend(extract_text_from_pdf(file_path))
    return documents

class RAGPipeline:
    """Retrieval-augmented generation pipeline over a directory of PDFs.

    On construction the pipeline either re-opens the Chroma store persisted in
    ``persist_dir`` or, when the PDFs in ``data_dir`` differ from the ones the
    store was built from, re-embeds them. It then wires a retriever (top 4
    chunks), the ``rlm/rag-prompt`` hub prompt, the module-level ``llm`` and a
    string output parser into ``self.rag_chain``.

    Every retrieval, whether through ``retrieve`` or through the chain, appends
    the list of retrieved chunk texts to ``self.retrieve_docs``.
    """

    # File inside ``persist_dir`` recording the hash of the PDF set that the
    # persisted embeddings were built from.
    _HASH_FILENAME = "directory_hash.txt"

    def __init__(self, data_dir: str = "./data", persist_dir: str = "./chroma_db"):
        """Create the pipeline and build or load its vector store.

        Args:
            data_dir: Directory containing the source PDFs.
            persist_dir: Directory where the Chroma store and hash file live.
        """
        self.data_dir = data_dir
        self.persist_dir = persist_dir
        self.vectorstore = None
        self.rag_chain = None
        self.retrieve_docs = []
        self.initialize_pipeline()

    def _hash_file_path(self) -> str:
        """Return the path of the stored directory-hash file."""
        return os.path.join(self.persist_dir, self._HASH_FILENAME)

    def initialize_pipeline(self):
        """Load or rebuild the vector store and assemble the RAG chain."""
        # Decide before creating the directory, so that a missing persist_dir
        # is actually observable by the freshness check.
        should_update = self._should_update_embeddings()
        print(f'Should update embeddings: {should_update}')
        os.makedirs(self.persist_dir, exist_ok=True)
        if should_update:
            self._create_new_embeddings()
        else:
            self.vectorstore = Chroma(
                persist_directory=self.persist_dir,
                embedding_function=embedding_model
            )

        retriever = self.vectorstore.as_retriever(
            search_kwargs={"k": 4}
        )
        prompt = hub.pull("rlm/rag-prompt")

        def format_docs(docs):
            # Side effect: record what the chain retrieved so callers can
            # inspect the contexts that were handed to the LLM.
            doc_contents = [doc.page_content for doc in docs]
            self.retrieve_docs.append(doc_contents)
            return "\n\n".join(doc_contents)

        self.rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

    def _should_update_embeddings(self) -> bool:
        """Return True when the persisted embeddings are missing or stale.

        Embeddings are considered stale when no hash file exists (which also
        covers a missing ``persist_dir``) or when the stored hash differs from
        the current hash of the PDFs in ``data_dir``.
        """
        hash_file = self._hash_file_path()
        if not os.path.exists(hash_file):
            return True

        with open(hash_file, 'r') as f:
            stored_hash = f.read().strip()

        return get_directory_hash(self.data_dir) != stored_hash

    def _create_new_embeddings(self):
        """Embed all PDFs in ``data_dir`` into a fresh persisted Chroma store.

        Raises:
            ValueError: If no documents could be loaded from ``data_dir``.
        """
        docs = load_pdfs_from_directory(self.data_dir)
        if not docs:
            raise ValueError(f"No documents were loaded from {self.data_dir}")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            is_separator_regex=False,
        )
        splits = text_splitter.split_documents(docs)

        os.makedirs(self.persist_dir, exist_ok=True)
        hash_file = self._hash_file_path()
        # Invalidate the old hash first: if the rebuild fails half-way, the
        # next run must not mistake a partial store for an up-to-date one.
        try:
            os.remove(hash_file)
        except FileNotFoundError:
            pass

        # Drop any previously persisted collection. Without this,
        # from_documents() would add the new chunks next to the old ones,
        # leaving stale and duplicated chunks in the store.
        Chroma(
            persist_directory=self.persist_dir,
            embedding_function=embedding_model
        ).delete_collection()

        self.vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=embedding_model,
            persist_directory=self.persist_dir
        )
        current_hash = get_directory_hash(self.data_dir)
        with open(hash_file, 'w') as f:
            f.write(current_hash)

    def retrieve(self, query: str, k: int = 4) -> List[Document]:
        """Return the ``k`` most similar chunks and record them in the history."""
        docs = self.vectorstore.similarity_search(query, k=k)
        self.retrieve_docs.append([doc.page_content for doc in docs])
        return docs

    def query(self, question: str) -> str:
        """Answer ``question`` with the RAG chain (retrieval is recorded)."""
        return self.rag_chain.invoke(question)

    def get_retrieve_history(self) -> List[List[str]]:
        """Return the recorded retrievals, one list of chunk texts per retrieval."""
        return self.retrieve_docs

    def clear_retrieve_history(self):
        """Forget all recorded retrievals."""
        self.retrieve_docs = []

    def retrieve_and_query(self, query: str) -> Tuple[Tuple[str], List[str]]:
        """Answer ``query`` and return the contexts the chain retrieved for it.

        The retrieval history is cleared before and after the call, so the
        returned contexts always belong to this query and never to an earlier
        one. The chain performs the retrieval itself, so no separate
        similarity search is issued.

        Returns:
            A pair ``((answer,), contexts)``. The answer is wrapped in a
            1-tuple because existing callers read it as ``result[0][0]``;
            ``contexts`` is the list of retrieved chunk texts (empty if the
            chain recorded none).
        """
        self.clear_retrieve_history()
        answer = self.query(query)
        history = self.get_retrieve_history()
        retrieve_res = history[-1] if history else []
        self.clear_retrieve_history()
        return (answer,), retrieve_res
# Build (or load) the pipeline, then run one sample question through it.
rag = RAGPipeline(
    data_dir="./data",
    persist_dir="./chroma_db",
)
ans, retrieve_history = rag.retrieve_and_query(
    "How does the number of datasets and templates affect the performance of instruction tuning in the FLAN model?"
)


In [None]:
# Run the RAG pipeline on every question of the evaluation dataset and store
# the generated response together with the retrieved contexts.
import time

import pandas as pd

test_dataset = pd.read_json('./data/eval_dataset_rag_version_<your_version>.json')
test_dataset.columns = ['user_input', 'reference', 'response', 'retrieved_contexts']

responses = []
retrieved_contexts = []
for question in test_dataset['user_input']:
    res, retrieve_data = rag.retrieve_and_query(question)
    # retrieve_and_query returns the answer wrapped in a 1-tuple.
    responses.append(res[0])
    retrieved_contexts.append(retrieve_data)
    time.sleep(1)  # pause between requests

# Assign whole columns at once: writing a list into a single cell through
# .loc is fragile, and positional indexing assumed a 0..n-1 index.
test_dataset['response'] = responses
test_dataset['retrieved_contexts'] = pd.Series(
    retrieved_contexts, index=test_dataset.index, dtype=object
)
# NOTE(review): the evaluation cell reads
# './test_dataset_for_eval_<your_version>.json'; rename this output to match
# the version you evaluate.
test_dataset.to_json('./test_dataset_for_eval.json', orient='records', lines=True)

In [11]:
# This is to fix the issue that the ragas evaluator is not working because of the llm stop signal
# `t` must be the typing module itself: the parser below calls `t.cast(...)`,
# which fails with AttributeError when `t` is bound to the `cast` function.
import typing as t

import pandas as pd
from langchain_core.messages import BaseMessage
from langchain_core.outputs import ChatGeneration
from langchain_core.outputs import LLMResult
from ragas.run_config import RunConfig

# Shared run configuration for the ragas wrappers and the evaluation run.
my_run_config = RunConfig(max_workers=5, timeout=60)
def custom_is_finished_parser(response: LLMResult) -> bool:
    """Tell whether every generation in ``response`` finished normally.

    For each flattened result the first generation is inspected:

    * if it has ``generation_info``, a present ``finish_reason`` must equal
      ``"STOP"``;
    * otherwise, if it is a ``ChatGeneration`` with a message, a present
      ``finish_reason`` in the message's ``response_metadata`` must equal
      ``"STOP"``;
    * otherwise the generation is treated as finished.

    Generations that report no ``finish_reason`` at all do not affect the
    verdict.

    Args:
        response: The LLM result to inspect.

    Returns:
        True when no inspected generation reports a finish reason other than
        ``"STOP"`` (also True when nothing was inspected).
    """
    is_finished_list = []
    for g in response.flatten():
        resp = g.generations[0][0]
        if resp.generation_info is not None:
            finish_reason = resp.generation_info.get("finish_reason")
            if finish_reason is not None:
                is_finished_list.append(finish_reason == "STOP")
        # isinstance() already narrows the type, so no typing.cast is needed;
        # this also removes the dependency on how `t` was imported.
        elif isinstance(resp, ChatGeneration) and resp.message is not None:
            finish_reason = resp.message.response_metadata.get("finish_reason")
            if finish_reason is not None:
                is_finished_list.append(finish_reason == "STOP")
        else:
            is_finished_list.append(True)
    return all(is_finished_list)

In [14]:
# Evaluation metrics settings
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from factualcorrectness_revise import FactualCorrectnessReviseVer7
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas import EvaluationDataset
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity, BleuScore
# Judge model for the metrics. This rebinds the module-level `llm` and
# `embedding_model` names with evaluation settings.
llm = VertexAI(
    model_name="gemini-1.0-pro",
    temperature=0.01,
    max_output_tokens=8192,
    max_workers=1,
)
embedding_model = VertexAIEmbeddings(model_name="textembedding-gecko")

# Wrap both models for ragas; the LLM wrapper uses the custom finish parser.
evaluator_llm = LangchainLLMWrapper(
    llm,
    run_config=my_run_config,
    is_finished_parser=custom_is_finished_parser,
)
evaluator_embeddings = LangchainEmbeddingsWrapper(
    embedding_model,
    run_config=my_run_config,
)

# Load the generated responses and convert them to a ragas dataset.
eval_test_dataset = pd.read_json(
    './test_dataset_for_eval_<your_version>.json',
    orient='records',
    lines=True,
)
eval_dataset = EvaluationDataset.from_pandas(eval_test_dataset)


In [None]:
# Score the evaluation dataset with the LLM-based, embedding-based and
# BLEU metrics.
for i in range(1):
    llm_metrics = [
        metric_cls(llm=evaluator_llm)
        for metric_cls in (LLMContextRecall, FactualCorrectnessReviseVer7, Faithfulness)
    ]
    metrics = llm_metrics + [
        SemanticSimilarity(embeddings=evaluator_embeddings),
        BleuScore(),
    ]
    results = evaluate(
        dataset=eval_dataset,
        metrics=metrics,
        run_config=my_run_config,
    )

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

processed_data Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema and OpenAPI specification:
{'$defs': {'ContextRecallClassification': {'properties': {'statement': {'title': 'Statement', 'type': 'string'}, 'reason': {'title': 'Reason', 'type': 'string'}, 'attributed': {'title': 'Attributed', 'type': 'integer'}}, 'required': ['statement', 'reason', 'attributed'], 'title': 'ContextRecallClassification', 'type': 'object'}}, 'properties': {'classifications': {'items': {'$ref': '#/$defs/ContextRecallClassification'}, 'title': 'Classifications', 'type': 'array'}}, 'required': ['classifications'], 'title': 'ContextRecallClassifications', 'type': 'object'}
These are some examples to show how to perform the above

Exception raised in Job[5]: AttributeError('StringIO' object has no attribute 'decomposed_claims')


generations=[[GenerationChunk(text='```json\n{\n  "text": "```json\\n{\\n  \\"decomposed_claims\\": [\\n    [\\n      \\"The answer to your question cannot be found in the provided context.\\"\\n    ],\\n    [\\n      \\"The context discusses the Zero-shot-CoT method.\\"\\n    ],\\n    [\\n      \\"The Zero-shot-CoT method is a way to elicit chain of thought from large language models.\\"\\n    ],\\n    [\\n      \\"The context does not provide a definition of the Zero-shot-CoT method itself.\\"\\n    ]\\n  ]\\n}\\n```"\n}\n```', generation_info={'is_blocked': False, 'safety_ratings': [{'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability_label': 'NEGLIGIBLE', 'blocked': False, 'severity': 'HARM_SEVERITY_NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability_label': 'NEGLIGIBLE', 'blocked': False, 'severity': 'HARM_SEVERITY_NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability_label': 'NEGLIGIBLE', 'blocked': False, 'severity': 'HARM_SEVERITY_LOW'}, {

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt claim_decomposition_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[21]: RagasOutputParserException(The output parser failed to parse the output including retries.)


generations=[[GenerationChunk(text='```json\n{\'decomposed_claims\': [[\'PaLM was used in the experiment "Exploring Zero-Shot Learning in Neural Networks".\', \'PaLM has 540B parameters.\'], [\'Original GPT3 was used in the experiment "Exploring Zero-Shot Learning in Neural Networks".\', \'Original GPT3 has 175B parameters.\'], [\'Original GPT3 was used in the experiment "Exploring Zero-Shot Learning in Neural Networks".\', \'Original GPT3 has 6.7B parameters.\'], [\'Original GPT3 was used in the experiment "Exploring Zero-Shot Learning in Neural Networks".\', \'Original GPT3 has 1.3B parameters.\'], [\'Original GPT3 was used in the experiment "Exploring Zero-Shot Learning in Neural Networks".\', \'Original GPT3 has 0.3B parameters.\'], [\'Instruct GPT3 was used in the experiment "Exploring Zero-Shot Learning in Neural Networks".\', \'Instruct GPT3 has unknown parameters.\'], [\'OPT was used in the experiment "Exploring Zero-Shot Learning in Neural Networks".\', \'OPT has 13B parameter

Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt claim_decomposition_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[77]: RagasOutputParserException(The output parser failed to parse the output including retries.)


generations=[[GenerationChunk(text='```json\n{\n  "statements": [\n    {\n      "statement": "Genes are passed from one generation to the next through a process called reproduction.",\n      "reason": "The context explicitly mentions that genes are passed from parent to offspring, which is the same as passing them from one generation to the next. The process of passing genes is also referred to as reproduction.",\n      "verdict": 1\n    },\n    {\n      "statement": "During reproduction, two parents contribute their genetic material to create a new individual.",\n      "reason": "The context states that two parents contribute their genetic material to create a new individual. This is a key aspect of the reproduction process.",\n      "verdict": 1\n    },\n    {\n      "statement": "The genetic material from each parent is combined in the offspring.",\n      "reason": "The context mentions that the genetic material from each parent is combined in the offspring. This is a direct result 