## DeepEval Evaluation

python -m ipykernel install --user --name=myenv --display-name="Python (myenv)"

In [1]:
## Imports
import sys
from pathlib import Path

sys.path.append(str(Path().resolve().parent))

#DeepEval
import pandas as pd
import time
import os

from src.retrieval import format_docs_with_metadata, rerank_documents
from src.prompts import get_prompt_template
from src.core.models import get_retriever, get_generator, get_reranker
from langchain_community.retrievers import BM25Retriever
from langchain_mistralai import ChatMistralAI
from langchain_openai import AzureChatOpenAI
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval import evaluate
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval.metrics import AnswerRelevancyMetric, ContextualPrecisionMetric, FaithfulnessMetric, ContextualRecallMetric

import ast
import json
from dotenv import load_dotenv

load_dotenv()

True

## Creating Synthetic Dataset for RAGAS

#### **Specific vs. Abstract Queries in a RAG:<br>**

**Specific Query:** Focuses on clear, fact-based retrieval. The goal in RAG is to retrieve highly relevant information from one or more documents that directly address the specific question.

**Abstract Query:** Requires a broader, more interpretive response. In RAG, abstract queries challenge the retrieval system to pull from documents that contain higher-level reasoning, explanations, or opinions, rather than simple facts.

In [2]:
#Design the prompt to generate QA pair
os.environ['MISTRAL_API_KEY'] = os.getenv("MISTRAL_API")
llm_qa = ChatMistralAI(
    model=os.getenv("MODEL_QA_GENERATOR"),
    temperature=0,
    max_retries=2,
    )
def get_qa_pair(Content):
    messages = [{"role":"system","content":f"""You are an experienced Insurance Consultant. Your task is to generate difficult Question‚ÄìAnswer pairs from the given policy document.

            You are given:
            - A block of Content text from that section.

            An **Abstract Query** is a question that requires interpretation, reasoning, or combining multiple ideas from the text. It must NOT be answerable by copying a single sentence.

            Follow these rules strictly:
            1. You must create ONLY abstract queries as defined above.
            2. Each question must include at least two policy terms or phrases that appear verbatim in the provided content (e.g., defined terms, or key policy phrases).
            3. Use ONLY the provided content. Do NOT guess or use outside knowledge.
            4. Keep both questions and answers concise and precise.
            5. Each query must be at most 2 sentences and must not exceed 50 words.
            6. Return a JSON array of exactly two objects in this format:
            {json.dumps([{"query": "...", "answer": "..."}, {"query": "...", "answer": "..."}])}
            7. NEVER return more than two pairs. If more are possible, choose the two most important.

            Content:
            {Content}

            Generate exactly 2 abstract QA pairs based on the above.
            Return ONLY the JSON array and nothing else.
        """}
        ]
    
    ai_msg = llm_qa.invoke(messages)
    return ai_msg.content

In [None]:
chunks = pd.read_csv(os.getenv("CLEANED_DF"))
#Combine chunks for similar sub level headings to pass to QA generator

combined_chunks = {}
for index, row in chunks.iterrows():
    meta_dic = ast.literal_eval(row['Metadata'])
    part = meta_dic['PART_DETAILS']
    section = meta_dic['SECTION_DETAILS']
    article = meta_dic['ARTICLE_DETAILS']
    combined_chunks.setdefault(f"Part: {part}, Section: {section}, Article: {article}", []).append(row['Text'])

for key, value in combined_chunks.items():
    combined_chunks[key] = ''.join(combined_chunks[key])



In [4]:
#Let's generate the content one by one
all_qa_pairs = []
count = 20
for heading , content in combined_chunks.items():
    if count == 0:
        break
    response = get_qa_pair(content)
    all_qa_pairs.append(response)
    count-=1


In [10]:
#Post processing all the responses
all_qa_pairs_cleaned=[]
for response in all_qa_pairs:
    clean_response = response.replace("\n","").replace("json",'').replace("  ","").replace("```","").strip()
    try:
        list_qa = json.loads(clean_response)
        all_qa_pairs_cleaned.append(list_qa)
    except:
        print("Not able to load this pair",response)

In [None]:
qa_df = pd.DataFrame(columns = ['query','answer'])
for qa_pair in all_qa_pairs_cleaned:
    for dict_qa in qa_pair:
        qa_df = pd.concat([qa_df,pd.DataFrame(dict_qa,index=[len(qa_df)+1])])
qa_df = qa_df.rename(columns={'answer':'expected_output'})

In [None]:
qa_df.to_parquet(os.getenv("OUTPUT_QA"),index=False)
qa_df.head()

Unnamed: 0,query,expected_answer
1,How does the policy define a Member's ability ...,A Member is Actively at Work if able and avail...
2,What criteria must a Dependent Child meet to q...,A Dependent Child must be under 26 and meet sp...
3,How does the Group Policy define the contractu...,The Principal's obligations are solely defined...
4,What constitutes the entire contract under thi...,The entire contract consists of the Group Poli...
5,What conditions must be met for changes to the...,Changes to the Group Policy must be in Writing...


## Retrieve LLM Response + Retrievals

In [14]:
os.environ['MISTRAL_API_KEY'] = os.getenv("MISTRAL_API")
llm_gen = ChatMistralAI(
    model= os.getenv("MODEL_GENERATOR"),
    temperature=0,
    max_retries=2
    ).with_fallbacks(
        [
            ChatMistralAI(model = os.getenv("MODEL_BACKUP_1"), temperature = 0, max_retries=2),
            ChatMistralAI(model = os.getenv("MODEL_BACKUP_2"), temperature = 0, max_retries=2)
        ]
    )
def get_response(query):
    chroma_retriever = get_retriever()
    retrieved_docs =  chroma_retriever.invoke(query)
    bm25_docs_retrieved = BM25Retriever.from_documents(retrieved_docs, k=len(retrieved_docs)).invoke(query)
    results_df = format_docs_with_metadata(bm25_docs_retrieved)
    top_docs_and_context = rerank_documents(query,results_df)
    metadatas = top_docs_and_context['Documents']
    context = top_docs_and_context['Metadatas']
    prompt = get_prompt_template().format_messages(query=query, context=context, metadatas=metadatas)
    response = llm_gen.invoke(prompt)
    return top_docs_and_context['Documents'],response.content


In [None]:
qa_df = pd.read_parquet(os.getenv("OUTPUT_QA"))
qa_df['actual_output'] = ""
qa_df['retrieval_context'] = [[] for _ in range(len(qa_df))]

for idx, row in qa_df.iterrows():
    try:
        retrieved_context, response = get_response(row["query"])
        qa_df.at[idx, "retrieval_context"] = retrieved_context
        qa_df.at[idx, "actual_output"] = response
    except Exception as e:
        qa_df.at[idx, "retrieval_context"] = []
        qa_df.at[idx, "actual_output"] = f"ERROR: {str(e)}"

qa_df['retrieval_context'] = qa_df['retrieval_context'].apply(lambda x: x.to_list())
qa_df.to_parquet(os.getenv("OUTPUT_QA"))

2026-01-14 01:57:57 INFO MrHelpMateAI Chroma Retriever is Created!
2026-01-14 01:58:02 INFO MrHelpMateAI Reranker Created!
2026-01-14 01:58:02 INFO MrHelpMateAI Chunk Results fetche with size 3
2026-01-14 01:58:17 INFO MrHelpMateAI Chroma Retriever is Created!
2026-01-14 01:58:21 INFO MrHelpMateAI Reranker Created!
2026-01-14 01:58:21 INFO MrHelpMateAI Chunk Results fetche with size 3
2026-01-14 01:58:28 INFO MrHelpMateAI Chroma Retriever is Created!
2026-01-14 01:58:33 INFO MrHelpMateAI Reranker Created!
2026-01-14 01:58:33 INFO MrHelpMateAI Chunk Results fetche with size 3
2026-01-14 01:58:47 INFO MrHelpMateAI Chroma Retriever is Created!
2026-01-14 01:58:51 INFO MrHelpMateAI Reranker Created!
2026-01-14 01:58:52 INFO MrHelpMateAI Chunk Results fetche with size 3
2026-01-14 01:59:02 INFO MrHelpMateAI Chroma Retriever is Created!
2026-01-14 01:59:06 INFO MrHelpMateAI Reranker Created!
2026-01-14 01:59:06 INFO MrHelpMateAI Chunk Results fetche with size 3
2026-01-14 01:59:17 INFO MrHel

### Configuring Azure AI

In [None]:

class AzureOpenAI(DeepEvalBaseLLM):
    def __init__(
        self,
        model
    ):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        return "gpt-40"

# Replace these with real values
custom_model = AzureChatOpenAI(
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    azure_deployment=os.getenv("OPENAI_MODEL"),
    azure_endpoint=os.getenv("AZURE_ENDPOINT"),
    openai_api_key=os.getenv("OPENAI_KEY"),
)
azure_openai = AzureOpenAI(model=custom_model)
print(azure_openai.generate("Write me a joke"))

Sure! Here's a joke for you:

Why don‚Äôt skeletons fight each other?

Because they don‚Äôt have the guts!


## Mistral as a Judge

In [6]:
#Mistral as a Judge

# from mistralai import Mistral
# from deepeval.models.base_model import DeepEvalBaseLLM

# class MistralAPI(DeepEvalBaseLLM):
#     def __init__(self, api_key):
#         self.client = Mistral(api_key=api_key)

#     def load_model(self):
#         return self.client   # required by DeepEval

#     def generate(self, prompt: str) -> str:
#         resp = self.client.chat.complete(
#             model=os.getenv("MODEL_JUDGE"),
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.3
#         )
#         return resp.choices[0].message.content

#     async def a_generate(self, prompt: str) -> str:
#         return self.generate(prompt)

#     def get_model_name(self):
#         return "Mistral-API"


# mistral = MistralAPI(api_key = os.getenv("MODEL_API"))
# print(mistral.generate("Tell me a joke"))


In [None]:
all_test_cases_df = pd.read_parquet(os.getenv("OUTPUT_QA"))

contextual_precision  = ContextualPrecisionMetric(
    threshold=0.7,
    model=azure_openai,
    include_reason=True
)

answer_relevancy = AnswerRelevancyMetric(threshold=0.7,model=azure_openai,include_reason=True)

contextual_Recall = ContextualRecallMetric(
    threshold=0.7,
    model=azure_openai,
    include_reason=True
)

faithfullness = FaithfulnessMetric(
    threshold=0.7,
    model=azure_openai,
    include_reason=True
)

test_cases = []

for _, row in all_test_cases_df.iterrows():
    if row["contextual_precision"]["score"] == None:
        test_cases.append(
            LLMTestCase(
                input=row["query"],
                actual_output=row["actual_output"],
                expected_output=row["expected_output"],
                retrieval_context=list(row["retrieval_context"])
            )
        )

In [64]:
all_results = []
for i in range(2):
    subset_test = test_cases[i*4:i*4+4]
    results = evaluate(
        test_cases=subset_test,
        metrics=[
            contextual_precision,
            answer_relevancy,
            contextual_Recall,
            faithfullness
        ]
    )
    all_results.append(results)
    time.sleep(60)

Output()



Metrics Summary

  - ‚úÖ Contextual Precision (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-40, reason: The score is 1.00 because the first node directly addresses the input question with detailed information, while the irrelevant nodes, such as the second discussing timelines under ERISA and the third describing proof of loss, are appropriately ranked lower., error: None)
  - ‚úÖ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-40, reason: The score is 1.00 because the output directly addresses the input without any irrelevant statements, demonstrating complete relevance and clarity. Great job!, error: None)
  - ‚úÖ Contextual Recall (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-40, reason: The score is 1.00 because the expected output perfectly aligns with the node(s) in retrieval context, showing a clear and precise match. Great work‚Äîkeep it up!, error: None)
  - ‚úÖ Faithfulness (score: 1.0, threshold: 0.

Output()



Metrics Summary

  - ‚úÖ Contextual Precision (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-40, reason: The score is 1.00 because all relevant nodes are ranked higher than irrelevant nodes. The first node is highly relevant because it states, 'the Policyholder must: a. notify each Member of the effective date of the termination; and b. refund or otherwise account to each Member all contributions received or withheld from Members for premiums not actually paid to The Principal,' which directly addresses compliance with GC 6005 Section C. The irrelevant nodes, such as the second node, which focuses on policy renewal, and the third node, which discusses reasons for policy termination without addressing compliance details, are ranked lower, ensuring precision!, error: None)
  - ‚úÖ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-40, reason: The score is 1.00 because the response is completely relevant and effectively addresses the in

In [65]:
# all_test_cases_df['contextual_precision'] = [{} for _ in range(len(all_test_cases_df))]
# all_test_cases_df['contextual_recall'] = [{} for _ in range(len(all_test_cases_df))]
# all_test_cases_df['answer_relevancy'] = [{} for _ in range(len(all_test_cases_df))]
# all_test_cases_df['faithfulness'] = [{}for _ in range(len(all_test_cases_df))]

for results in all_results:
    for label, test_results in results:
        if test_results:
            for t in test_results:
                expected_output = t.expected_output
                for m in t.metrics_data:
                    if m.name == "Contextual Precision":
                        all_test_cases_df.loc[all_test_cases_df['expected_output']==expected_output,"contextual_precision"] = [{"score": m.score, "reason": m.reason}]
                    elif m.name == "Answer Relevancy":
                        all_test_cases_df.loc[all_test_cases_df['expected_output']==expected_output,"answer_relevancy"] = [{"score": m.score, "reason": m.reason}]

                    elif m.name == "Contextual Recall":
                        all_test_cases_df.loc[all_test_cases_df['expected_output']==expected_output,"contextual_recall"] =[{"score": m.score, "reason": m.reason}]

                    elif m.name == "Faithfulness":
                        all_test_cases_df.loc[all_test_cases_df['expected_output']==expected_output,"faithfulness"] =[{"score": m.score, "reason": m.reason}]


In [None]:
all_test_cases_df.to_parquet(os.getenv("OUTPUT_QA"),index=False)

In [None]:
import pandas as pd
all_test_cases_df = pd.read_parquet(os.getenv("OUTPUT_QA"))

In [17]:
#Getting all the scores

all_test_cases_df["contextual_precision_score"] = 0
all_test_cases_df["answer_relevancy_score"] = 0
all_test_cases_df["contextual_recall_score"] = 0
all_test_cases_df["faithfulness_score"] = 0

for i in range(0,40):
    all_test_cases_df.loc[i,"contextual_precision_score"] = all_test_cases_df.loc[i,"contextual_precision"]["score"]
    all_test_cases_df.loc[i,"answer_relevancy_score"] = all_test_cases_df.loc[i,"answer_relevancy"]["score"]
    all_test_cases_df.loc[i,"contextual_recall_score"] = all_test_cases_df.loc[i,"contextual_recall"]["score"]
    all_test_cases_df.loc[i,"faithfulness_score"] = all_test_cases_df.loc[i,"faithfulness"]["score"]

  all_test_cases_df.loc[i,"answer_relevancy_score"] = all_test_cases_df.loc[i,"answer_relevancy"]["score"]
  all_test_cases_df.loc[i,"faithfulness_score"] = all_test_cases_df.loc[i,"faithfulness"]["score"]
  all_test_cases_df.loc[i,"contextual_precision_score"] = all_test_cases_df.loc[i,"contextual_precision"]["score"]


In [None]:
all_test_cases_df.describe

Unnamed: 0,contextual_precision_score,answer_relevancy_score,contextual_recall_score,faithfulness_score
count,40.0,40.0,40.0,40.0
mean,0.9625,0.880828,1.0,0.807301
std,0.133373,0.127165,0.0,0.20677
min,0.5,0.454545,1.0,0.0
25%,1.0,0.809375,1.0,0.75
50%,1.0,0.897368,1.0,0.851648
75%,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0


In [20]:
all_test_cases_df['query'][0]

"How does the policy define a Member's ability to be considered Actively at Work, and what exceptions are made for short-term absences?"