In [13]:
from llama_index.core import Settings
from dotenv import load_dotenv
from typing import List
import os
import itertools
import numpy as np
import pandas as pd
import time

from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import VectorStoreIndex

from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer, PromptTemplate

from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.core.llms.llm import LLM
from pinecone import Pinecone
from deepeval.metrics import FaithfulnessMetric
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.metrics import ContextualRecallMetric
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

In [2]:
# Load variables from a local .env file, then read each API key from the
# process environment (a key that is not set comes back as None).
load_dotenv()
GOOGLE_API_KEY, COHERE_API_KEY, PINECONE_API_KEY, OPENAI_API_KEY = (
    os.environ.get(key_name)
    for key_name in (
        "GOOGLE_API_KEY",
        "COHERE_API_KEY",
        "PINECONE_API_KEY",
        "OPENAI_API_KEY",
    )
)

In [3]:
# Gemini serves as both the answering LLM and the embedding model; each is
# registered on the global LlamaIndex Settings right after it is built.
gemini_llm = Gemini(
    model="models/gemini-1.5-flash",
    api_key=os.getenv("GOOGLE_API_KEY"),
)
Settings.llm = gemini_llm

embed_model = GeminiEmbedding(model_name="models/embedding-001")
Settings.embed_model = embed_model

Settings.chunk_size = 768

In [4]:
# Open the existing "lorawan-rag" Pinecone index and wrap it as a LlamaIndex
# vector store.
# NOTE(review): the retriever later queries this store in "hybrid" mode; that
# presumably requires sparse vectors to be enabled on the store — confirm.
pinecone_client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
pinecone_index = pinecone_client.Index("lorawan-rag")
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [5]:
# Read the tab-separated question/answer pairs used as the evaluation set.
# na_filter=False turns off missing-value detection, so empty cells are kept
# as empty strings instead of NaN.
test_df = pd.read_table("data/faq/qa-test.tsv", na_filter=False)
test_df.head()

Unnamed: 0,question,answer,manual quote
0,How does LoRaWAN handle message fragmentation ...,LoRaWAN handles message fragmentation and reas...,
1,What is the significance of the LoRaWAN region...,LoRaWAN regional parameters define the frequen...,
2,How does LoRaWAN handle different data rates?,LoRaWAN handles different data rates by using ...,
3,How does LoRaWAN enable long-range communication?,LoRaWAN enables long-range communication using...,
4,What is the function of the network server in ...,The network server manages device registration...,


In [6]:
# Both metrics are configured identically: same pass threshold, same judge
# model, and a textual reason attached to every score.
_judge_kwargs = dict(threshold=0.7, model="gpt-4o-mini", include_reason=True)
relevancy = AnswerRelevancyMetric(**_judge_kwargs)
faithfulness = FaithfulnessMetric(**_judge_kwargs)

In [14]:
def construct_test_cases(
    model: LLM,
    prompt: str,
    similiarity_top_k: int = 3,
    sparse_top_k: int = 10,
    rerank_top_n: int = 2,
    request_delay: float = 10,
) -> List[LLMTestCase]:
    """Query the RAG pipeline with every row of ``test_df`` and build DeepEval test cases.

    The pipeline is: hybrid retrieval from the module-level ``vector_store``,
    Cohere reranking of the retrieved nodes, then answer synthesis with
    ``model`` using ``prompt`` as the text-QA template.

    Args:
        model: LLM used to synthesize the answers.
        prompt: Template string for the QA prompt (wrapped in ``PromptTemplate``).
        similiarity_top_k: Number of dense matches requested from the retriever.
            The parameter keeps its original (misspelled) name so existing
            callers continue to work.
        sparse_top_k: Number of sparse matches requested from the retriever.
        rerank_top_n: Number of nodes kept by the Cohere reranker.
        request_delay: Seconds to wait after each query; ``0`` disables the wait.

    Returns:
        One ``LLMTestCase`` per row of ``test_df``, in row order, carrying the
        question, the expected answer, the generated answer and the text of
        the source nodes returned with the response.
    """
    qa_prompt = PromptTemplate(prompt)
    cohere_rank = CohereRerank(api_key=COHERE_API_KEY, top_n=rerank_top_n)
    retriever = VectorIndexRetriever(
        index=VectorStoreIndex.from_vector_store(vector_store=vector_store),
        # VectorIndexRetriever names this argument ``similarity_top_k``; the
        # misspelled keyword used previously did not match it, so the
        # requested value never took effect.
        similarity_top_k=similiarity_top_k,
        sparse_top_k=sparse_top_k,
        vector_store_query_mode="hybrid",
    )
    response_synthesizer = get_response_synthesizer(
        llm=model, text_qa_template=qa_prompt, response_mode="compact"
    )
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=[cohere_rank],
    )

    deepeval_test_cases: List[LLMTestCase] = []
    for question, expected_output in zip(test_df["question"], test_df["answer"]):
        response = query_engine.query(question)
        deepeval_test_cases.append(
            LLMTestCase(
                input=question,
                expected_output=expected_output,
                actual_output=response.response,
                retrieval_context=[
                    node.get_content() for node in response.source_nodes
                ],
            )
        )
        # Pause between queries (presumably to stay under the API rate
        # limits — confirm before lowering the delay).
        if request_delay > 0:
            time.sleep(request_delay)
    return deepeval_test_cases

In [8]:
# QA prompt: an instruction line, the retrieved context under a separator,
# then the user query. {context_str} and {query_str} are filled in at query
# time.
# NOTE(review): the query is placed after "Answer:" — presumably a
# "Question:" label was intended; confirm before changing the prompt.
template = "\n".join(
    [
        "Given the context that I will provide you, answer the questions.",
        "Context:",
        "#####################################",
        "{context_str}",
        "Answer: {query_str}",
        "",  # keeps the trailing newline
    ]
)
prompt_template = [template]

# Hyper-parameter grid: each list holds the candidate values for one knob.
# Add more values to a list to evaluate additional combinations.
similiarity_top_k = [3]
sparse_top_k = [10]
rerank_top_n = [2]
llm = [gemini_llm]

In [9]:
# Cartesian product of the grid lists. Each entry is a tuple
# (prompt, similarity_top_k, sparse_top_k, rerank_top_n, llm); the rightmost
# knob varies fastest.
params = [
    (prompt_choice, dense_k, sparse_k, rerank_n, llm_choice)
    for prompt_choice in prompt_template
    for dense_k in similiarity_top_k
    for sparse_k in sparse_top_k
    for rerank_n in rerank_top_n
    for llm_choice in llm
]

In [15]:
# Build one batch of test cases per hyper-parameter combination.
# Each combination is unpacked into run_* names so the grid lists
# (prompt_template, similiarity_top_k, sparse_top_k, rerank_top_n, llm) are
# not overwritten with scalars and the cell that builds `params` stays
# re-runnable.
test_dataset = []
for param in params:
    run_prompt, run_similarity_k, run_sparse_k, run_rerank_n, run_llm = param
    param_test_cases = construct_test_cases(
        run_llm, run_prompt, run_similarity_k, run_sparse_k, run_rerank_n
    )
    test_dataset.append((param, param_test_cases))

In [16]:
# Score every batch of test cases with both metrics and record the
# hyper-parameters that produced it.
# The combination is unpacked into run_* names so the grid lists
# (prompt_template, similiarity_top_k, sparse_top_k, rerank_top_n, llm) are
# not overwritten with scalars.
for param, param_test_cases in test_dataset:
    # The fifth element of `param` (the LLM object) is not logged.
    run_prompt, run_similarity_k, run_sparse_k, run_rerank_n = param[:4]
    evaluate(
        test_cases=param_test_cases,
        metrics=[
            relevancy,
            faithfulness,
        ],
        hyperparameters={
            # NOTE(review): the model label is hard-coded and does not follow
            # the LLM stored in `param`; update it if other LLMs join the grid.
            "model": "Gemini 1.5 Flash",
            "prompt template": run_prompt,
            "similiarity_top_k": run_similarity_k,
            "sparse_top_k": run_sparse_k,
            "rerank_top_n": run_rerank_n,
        }
    )

Evaluating test cases...
Event loop is already running. Applying nest_asyncio patch to allow async execution...




Metrics Summary

  - ❌ Answer Relevancy (score: 0.6666666666666666, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: The score is 0.67 because while some relevant information may have been provided, several statements were irrelevant to the specific question about LoRaWAN's handling of message fragmentation and reassembly. These irrelevant statements suggested seeking outside resources instead of giving direct answers, which detracted from the overall relevance., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because there are no contradictions present, indicating perfect alignment between the actual output and the retrieval context., error: None)

For test case:

  - input: How does LoRaWAN handle message fragmentation and reassembly?
  - actual output: The provided context doesn't contain information about how LoRaWAN handles message fragmentation and reassembly. 

To answer y