### Import modules

In [1]:
import os

from datasets import load_dataset
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import HuggingFaceDatasetLoader

from dotenv import load_dotenv

import pandas as pd

In [2]:
# Move from the notebook's folder to its parent so that the relative paths used
# below ("chroma", "data/...") resolve. The guard makes the cell idempotent:
# re-running it in the same kernel no longer climbs one more directory level.
if "PROJECT_ROOT" not in globals():
    os.chdir("../")
    PROJECT_ROOT = os.getcwd()

### Load the existing Chroma instance

In [3]:
# Load variables from a local .env file (if one exists) before building the
# OpenAI client. `load_dotenv` was imported but never called, so a key stored
# in .env was never picked up on a fresh kernel. Existing environment variables
# are left untouched by default.
load_dotenv()

embeddings = OpenAIEmbeddings()
# Open the Chroma store persisted under the relative "chroma" directory,
# using `embeddings` as its embedding function.
vectorstore = Chroma(persist_directory="chroma", embedding_function=embeddings)

#### Create retriever

In [4]:
# Wrap the vector store in a retriever; k=3 is forwarded via search_kwargs
# (presumably the number of documents returned per query).
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=3),
)

In [5]:
# Smoke-test the retriever with a generic query; the cell displays whatever
# documents come back.
sample_query = "What is the article about?"
retriever.invoke(sample_query)

[Document(page_content='and I" (DW) This year, the jury chose two winners in the category Online: "Digital journalism has a lot to offer. The two prizewinners represent a different approach in an interesting way and show how journalism generally evolves with multimedia possibilities," explains the jury. In the first contribution, Christian Salewski und Felix Rohrbeck track the disposal of electronic scrap in Germany and find out that it isn\'t always legal and fair. In the second contribution, a group of Deutsche Welle trainees asked their grandmothers from Belarus, Brazil, Chile, China, Kenya and Germany about their'),
 Document(page_content='is, in journalism, if we gather the "facts," we can usually find the answers to what we\'re looking for.  When it comes to God, Jesus and the Holy Spirit, those answers rest in faith. As a journalist, I seek intellectual certainty. When it came to my faith, I felt intellectually embarrassed. There was so much I just couldn\'t explain. When I star

In [6]:
# Chat model that the RAG chain will call, configured with temperature 0.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-4o",
)

#### Create a RAG chain

In [7]:
# Assemble the question-answering chain from the chat model and the retriever.
# return_source_documents=True is needed later: the evaluation helper reads
# response["source_documents"] to collect the contexts.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

#### Test the RAG chain

In [8]:
# Sample question for a manual smoke test of the chain.
question = "Who bit Jon Huntsman in 2011"
# The call is left disabled; uncomment it to run the chain on `question`.
# result = rag_chain.invoke(question)

## Evaluating the RAG system using RAGAS

In [9]:
# contexts = [doc.page_content for doc in result["source_documents"]]
# formatted_context = pretty_print_docs(contexts)

In [10]:
from src.ragas.ragas_pipeline import get_context_and_answer
from src.ragas.ragas_utils import load_evaluation_data

In [11]:
# Evaluation set location, relative to the current working directory.
EVAL_SET_PATH = 'data/evaluation_set.csv'
eval_data = load_evaluation_data(EVAL_SET_PATH)

#### Pick one sample for testing from eval_data


In [24]:
from datasets import Dataset
from typing import Dict, List
def get_context_and_answer(
    evaluation_data: Dict[str, List[str]],
    rag_chain,
) -> "Dataset":
    """Run the RAG chain on every evaluation question and collect the results.

    Note: this notebook-level definition shadows the function of the same name
    imported from ``src.ragas.ragas_pipeline`` in an earlier cell.

    Args:
        evaluation_data (Dict[str, List[str]]): A dictionary containing:
            - "questions": A list of questions.
            - "ground_truths": A list of ground truth answers, aligned by
              position with "questions".
        rag_chain: Object whose ``invoke(question)`` returns a mapping with a
            "result" entry (the generated answer) and a "source_documents"
            entry (an iterable of objects exposing ``page_content``).

    Returns:
        Dataset: Built with ``Dataset.from_dict``; one row per question with
        the columns:
            - "question": The original question.
            - "contexts": List with the ``page_content`` of each source document.
            - "answer": The "result" value returned by the chain.
            - "ground_truth": The ground truth answer from the evaluation data.

    Raises:
        KeyError: If "questions" or "ground_truths" is missing.
        ValueError: If "questions" and "ground_truths" differ in length.
    """
    questions = list(evaluation_data["questions"])
    ground_truths = list(evaluation_data["ground_truths"])

    # zip() would silently drop the unmatched tail and misalign the evaluation,
    # so reject mismatched inputs before any chain call is made.
    if len(questions) != len(ground_truths):
        raise ValueError(
            "evaluation_data has "
            f"{len(questions)} questions but {len(ground_truths)} ground truths"
        )

    results = {
        "question": [],
        "contexts": [],
        "answer": [],
        "ground_truth": [],
    }

    for question, ground_truth in zip(questions, ground_truths):
        response = rag_chain.invoke(question)
        # Keep the contexts as a list of strings, one per retrieved document.
        contexts_list = [doc.page_content for doc in response["source_documents"]]

        results["question"].append(question)
        results["contexts"].append(contexts_list)
        results["answer"].append(response["result"])
        results["ground_truth"].append(ground_truth)

    return Dataset.from_dict(results)

In [25]:
# Build a one-row evaluation set from the first question / ground-truth pair
# and run it through the chain.
question, ground_truth = eval_data['questions'][0], eval_data['ground_truths'][0]
test_eval = dict(questions=[question], ground_truths=[ground_truth])

test_data = get_context_and_answer(test_eval, rag_chain)

In [26]:
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    faithfulness,
    answer_relevancy,
    context_precision,
)

# testset = get_context_and_answer(eval_data, rag_chain)

# Metrics on which the single-sample dataset is scored.
selected_metrics = [
    answer_correctness,
    faithfulness,
    answer_relevancy,
    context_precision,
]

result = evaluate(dataset=test_data, metrics=selected_metrics)




Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
# Convert the evaluation result to a DataFrame and show its first rows.
scores_df = result.to_pandas()
scores_df.head()

Unnamed: 0,question,contexts,answer,ground_truth,answer_correctness,faithfulness,answer_relevancy,context_precision
0,What upcoming animated project will feature Ad...,[(The Hollywood Reporter)The skies over Gotham...,The upcoming animated project that will featur...,Adam West and Burt Ward will be reprising thei...,0.744099,1.0,0.0,1.0
