<h2>RAG EVALUATION WITH RAGAS FRAMEWORK, LANGCHAIN, AZURE OPENAI</h2>

<h4> Import all necessary dependencies here </h4>

In [1]:
import os
from langchain_ollama import ChatOllama
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import AIMessage
from langchain_core.documents import Document
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.callbacks import BaseCallbackHandler

from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

  from .autonotebook import tqdm as notebook_tqdm


<h4> Azure OpenAI subscription details and keys are configured as environment variables </h4>
<h4> Define embedding model and Vector store (In-memory) </h4>

In [2]:
# Chat model used to answer the questions and, later, wrapped as the RAGAS
# evaluator. Only the deployment name is passed here; the remaining Azure
# OpenAI settings are expected to come from environment variables.
llm = AzureChatOpenAI(
        azure_deployment='gpt-4o',
    )

# os.getenv() returns None when the variable is unset. Check it up front so a
# missing setting produces an actionable message instead of an opaque failure
# inside the embedding loader.
embedding_model_path = os.getenv('EMBEDDING_MODEL_PATH')
if not embedding_model_path:
    raise RuntimeError(
        "EMBEDDING_MODEL_PATH is not set; export it with the Hugging Face "
        "embedding model to load before running this notebook."
    )

# Embedding model shared by the vector store and the RAGAS evaluator.
embed_model = HuggingFaceEmbeddings(model_name=embedding_model_path)

# In-memory index; it is empty until documents are added in a later cell.
vector_store = InMemoryVectorStore(embed_model)



<h4>Testing the LLM chat API</h4>

In [3]:
# Smoke test: send one fixed translation request to confirm the chat
# deployment is reachable before building the RAG pipeline.
system_instruction = "You are a helpful assistant that translates English to French. Translate the user sentence."
user_sentence = "I love programming."

messages = [("system", system_instruction), ("human", user_sentence)]

ai_msg = llm.invoke(messages)
# Bare last expression so the notebook renders the full response object.
ai_msg

AIMessage(content="J'aime programmer.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 31, 'total_tokens': 35, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_f3927aa00d', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_t

<h4>From the master datasource, create the documents for indexing</h4>

In [4]:
# Master data source: one short passage per CEO. Each passage becomes one
# document in the index.
content_list = [
    "Andrew Ng is the CEO of Landing AI and is known for his pioneering work in deep learning. He is also widely recognized for democratizing AI education through platforms like Coursera.",
    "Sam Altman is the CEO of OpenAI and has played a key role in advancing AI research and development. He is a strong advocate for creating safe and beneficial AI technologies.",
    "Demis Hassabis is the CEO of DeepMind and is celebrated for his innovative approach to artificial intelligence. He gained prominence for developing systems that can master complex games like AlphaGo.",
    "Sundar Pichai is the CEO of Google and Alphabet Inc., and he is praised for leading innovation across Google's vast product ecosystem. His leadership has significantly enhanced user experiences on a global scale.",
    "Arvind Krishna is the CEO of IBM and is recognized for transforming the company towards cloud computing and AI solutions. He focuses on providing cutting-edge technologies to address modern business challenges.",
]

# Wrap every passage in a LangChain Document (text only, no metadata).
langchain_documents = [Document(page_content=passage) for passage in content_list]

<h4>Store the input documents in the vector store, then create a retriever component from it</h4>

In [5]:
# Number of documents the retriever returns for each query.
RETRIEVER_TOP_K = 1

# Start from an empty store so that re-running this cell does not index the
# same documents a second time (add_documents appends to the existing store).
vector_store = InMemoryVectorStore(embed_model)

# The returned ids are not needed. This is not the cell's last expression, so
# nothing is displayed and there is no reason to assign the result to `_`
# (which would shadow IPython's last-output variable).
vector_store.add_documents(langchain_documents)

retriever = vector_store.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})

<h4>Define the RAG question-answering chain (prompt &rarr; LLM &rarr; string output)</h4>

In [6]:
# Prompt that restricts the model to the retrieved context. The two
# placeholders, {context} and {query}, are filled in when the chain is invoked.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {query}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Pipeline: fill the prompt -> call the chat model -> parse the reply to a string.
answer_parser = StrOutputParser()
qa_chain = prompt | llm | answer_parser

In [7]:
def format_docs(relevant_docs):
    """Merge retrieved documents into a single context string.

    Args:
        relevant_docs: Iterable of objects exposing a ``page_content``
            attribute holding text.

    Returns:
        The ``page_content`` values joined with newline characters, in input
        order. An empty input yields an empty string.
    """
    page_texts = [doc.page_content for doc in relevant_docs]
    return "\n".join(page_texts)

#query = "Who is the CEO of OpenAI?"

#relevant_docs = retriever.invoke(query)
#qa_chain.invoke({"context": format_docs(relevant_docs), "query": query})

<h4>Sample Validation data in the form of Questions and their expected responses</h4>

In [8]:
# Validation questions, one per indexed passage. Order matters: each query is
# paired by position with the entry at the same index in expected_responses
# (the dataset-building cell zips the two lists together).
# NOTE(review): the wording of these strings is sent to the LLM as-is, so
# editing them changes the evaluated inputs.
sample_queries = [
    "Which CEO is widely recognized for democratizing AI education through platforms like Coursera?",
    "Who is Sam Altman?",
    "Who is Demis Hassabis and how did he gained prominence?",
    "Who is the CEO of Google and Alphabet Inc., praised for leading innovation across Google's product ecosystem?",
    "How did Arvind Krishna transformed IBM?",
]

# Reference (ground-truth) answers, index-aligned with sample_queries. They are
# stored as the "reference" field of each evaluation record.
expected_responses = [
    "Andrew Ng is the CEO of Landing AI and is widely recognized for democratizing AI education through platforms like Coursera.",
    "Sam Altman is the CEO of OpenAI and has played a key role in advancing AI research and development. He strongly advocates for creating safe and beneficial AI technologies.",
    "Demis Hassabis is the CEO of DeepMind and is celebrated for his innovative approach to artificial intelligence. He gained prominence for developing systems like AlphaGo that can master complex games.",
    "Sundar Pichai is the CEO of Google and Alphabet Inc., praised for leading innovation across Google's vast product ecosystem. His leadership has significantly enhanced user experiences globally.",
    "Arvind Krishna is the CEO of IBM and has transformed the company towards cloud computing and AI solutions. He focuses on delivering cutting-edge technologies to address modern business challenges.",
]

<h4>Generate the RAGAS evaluation dataset from the sample validation data created above</h4>

In [9]:
# Fail fast on misaligned validation data: zip() stops at the shorter list, so
# a length mismatch would otherwise silently drop samples from the evaluation.
if len(sample_queries) != len(expected_responses):
    raise ValueError(
        f"sample_queries has {len(sample_queries)} entries but "
        f"expected_responses has {len(expected_responses)}; they must be index-aligned"
    )

# One record per validation question, using the field names passed to
# EvaluationDataset.from_list below.
dataset = []

for query, reference in zip(sample_queries, expected_responses):
    # Retrieve once and reuse the same documents for both the prompt context
    # and the recorded retrieved_contexts, so the evaluation sees exactly what
    # the LLM was given.
    relevant_docs = retriever.invoke(query)
    response = qa_chain.invoke({"context": format_docs(relevant_docs), "query": query})
    dataset.append(
        {
            "user_input": query,
            "retrieved_contexts": [rdoc.page_content for rdoc in relevant_docs],
            "response": response,
            "reference": reference,
        }
    )

evaluation_dataset = EvaluationDataset.from_list(dataset)

<h4>Evaluate the RAG retriever component against the prepared evaluation dataset using RAGAS.</h4>
<h4>Metrics evaluated: Faithfulness, Context Recall, and Factual Correctness</h4>

In [10]:
# Wrap the LangChain chat model and embedding model in the RAGAS adapter
# classes so they can be handed to evaluate().
evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embed = LangchainEmbeddingsWrapper(embed_model)

# Execution limits for the evaluation run, named so they are easy to tune.
EVAL_MAX_WORKERS = 5
EVAL_TIMEOUT = 60
EVAL_MAX_RETRIES = 3

my_run_config = RunConfig(
    max_workers=EVAL_MAX_WORKERS,
    timeout=EVAL_TIMEOUT,
    max_retries=EVAL_MAX_RETRIES,
)

class TestCallback(BaseCallbackHandler):
    """Callback hook for inspecting LLM traffic during evaluation.

    Silent by default, so ``TestCallback()`` behaves as a no-op handler.
    Pass ``verbose=True`` to print every prompt and response, which replaces
    the need to comment debug prints in and out.

    Args:
        verbose: When true, print the first prompt of each LLM call and the
            response object of each completed call.
    """

    def __init__(self, verbose=False):
        super().__init__()
        self.verbose = verbose

    def on_llm_start(self, serialized, prompts, **kwargs):
        """Print the first prompt of an LLM call when verbose is enabled."""
        # Guard against an empty prompt list before indexing into it.
        if self.verbose and prompts:
            print(f"**********Prompts*********:\n {prompts[0]}\n\n")

    def on_llm_end(self, response, **kwargs):
        """Print the response object of an LLM call when verbose is enabled."""
        if self.verbose:
            print(f"**********Response**********:\n {response}\n\n")

# Metrics to score, in the order named by the heading above this cell.
metrics_under_test = [Faithfulness(), LLMContextRecall(), FactualCorrectness()]

# Callback handlers attached to the evaluation run.
evaluation_callbacks = [TestCallback()]

# raise_exceptions=True is kept from the original call.
# NOTE(review): a batch_size=2 argument was previously left commented out here;
# confirm whether batching is still wanted.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=metrics_under_test,
    llm=evaluator_llm,
    embeddings=evaluator_embed,
    run_config=my_run_config,
    raise_exceptions=True,
    callbacks=evaluation_callbacks,
)

# Bare last expression so the notebook renders the aggregate scores.
result

Evaluating: 100%|██████████| 15/15 [00:30<00:00,  2.05s/it]


{'faithfulness': 1.0000, 'context_recall': 1.0000, 'factual_correctness': 0.8860}