In [15]:
from llama_index.core import VectorStoreIndex
from deepeval.metrics import FaithfulnessMetric
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate
from llama_index.core.llms.llm import LLM
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
import os
from typing import List
import itertools
import time
from pinecone import Pinecone
from dotenv import load_dotenv
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    PromptTemplate,
    get_response_synthesizer,
)
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.retrievers import VectorIndexRetriever
import nest_asyncio
import pandas as pd
nest_asyncio.apply()

In [2]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")
os.environ["COHERE_API_KEY"] = os.getenv("COHERE_API_KEY")

In [3]:
openai_llm = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o-mini", temperature=0
)
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
Settings.llm = openai_llm
Settings.embed_model = embed_model
Settings.chunk_size = 1536

In [4]:
pinecone_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pinecone_index = pinecone_client.Index("rag-lorawan")

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [5]:
prompt_template = """
You are a specialized chatbot focused on LoRa and LoRaWAN technologies. Your role is to provide expert-level information and guidance on these subjects. Your knowledge base includes detailed insights into LoRa and LoRaWAN technologies, as well as relevant case studies, best practices, and technical standards.

Follow these guidelines when answering queries:
- Provide concise answers to straightforward questions.
- Offer in-depth explanations for more complex or open-ended inquiries.
- Assist with network design, deployment strategies, troubleshooting, and any other tasks related to LoRa and LoRaWAN.
- Use markdown for technical formatting and code snippets.
- Do not reveal information about your role or knowledge base unless it is directly relevant to the user's query.

When handling different types of questions:
- **Technical Questions:** Provide detailed, accurate information.
- **Practical Applications:** Offer insights based on case studies and best practices.
- **Troubleshooting:** Suggest step-by-step approaches to identify and resolve issues.
- **Design and Deployment:** Provide strategic advice and considerations.

**Use markdown formatting** as follows:
- Use backticks for inline code or technical terms.
- Use 
triple backticks
 for multi-line code blocks or command-line instructions.
- Use **bold** for emphasis on important points.
- Use *italics* for subtle emphasis or technical terms on first mention.
- Use bullet points or numbered lists for step-by-step instructions or multiple related points.

**Additional instructions:**
- **Chain of Thought:** For complex questions, clearly break down the response into logical steps or sequences. Ensure each step is visible and reasoned out before concluding.
- **Few-Shot Learning:** Adapt your responses based on provided examples or context clues within the query. Utilize patterns from previous answers to maintain consistency.
- **Contextual Adjustment:** Adjust the level of technical detail based on the perceived expertise of the user. Simplify terms for beginners; use more advanced terminology for experts.
- **Avoid Redundancy:** Before reiterating a point, review the response to ensure it has not already been covered sufficiently. Simplify and streamline the information.
- **Context-Sensitive Responses:** If the query is related to a specific context (e.g., rural vs. urban deployment), tailor your advice to address challenges and solutions relevant to that context.

Format your response as follows:
1. **Detailed Explanation:** Follow with a more detailed explanation or discussion as needed.
2. **Technical Details and Examples:** Include relevant technical details, examples, or code snippets using markdown.
3. **Summary:** Conclude with a summary or key takeaway if the response is lengthy.

**Here is the user's question:**
{query_str}

Process the query and provide a response following the guidelines and formatting instructions above. If the query requires complex reasoning or multiple steps, break down your reasoning step by step before providing your final answer. First, identify the key points of the problem, then explain how you would apply the relevant knowledge, and finally give your conclusion.
"""

## Build evaluators with Deepeval Framework 

In [6]:
def parse_qa_csv(file_path: str)-> list[tuple]:
    df = pd.read_csv(file_path)
    # delete url column
    df = df.drop(columns=["url"])
    # return the 50 first rows
    df = df.head(50)
    return df
qa_pairs = parse_qa_csv("../data/faq/faq_cleaned.csv")
qa_pairs

Unnamed: 0,question,answer
0,What is the best compromise between spreading ...,Consider the following scenario: the payload d...
1,Is there a LoRa IQ Waveform library narrower t...,Lower BW are made by simply playing the wavefo...
2,Can I used a long preamble to wake-up devices ...,This method is extremely widespread in the ind...
3,Does a LoRaWAN concentrator have built-in GPS ...,Most commercial gateways have either a built-i...
4,What are the typical ranges of good and poor v...,Typical Noise Floor is usually close to -120 d...
5,What would be the minimal channel spacing for ...,The typical channel spacing for LoRa and LoRaW...
6,Where does the CAD Detected interrupt occur in...,The CAD interrupt happens at a determined time...
7,It is possible to initialize a packet transmis...,It is not possible to use one of the DIOs of S...
8,What is the maximum size of an application pay...,The payload size limitations are identical in ...
9,Why is there a latency when a SX1272 is proces...,Between the moment a packet is sent by a senso...


### Metrics to evaluate

In [7]:
relevancy = AnswerRelevancyMetric(
    threshold=0.7, model="gpt-4o-mini", include_reason=True
)
faithfulness = FaithfulnessMetric(
    threshold=0.7, model="gpt-4o-mini", include_reason=True
)

In [8]:
def construct_test_cases(
    model: LLM,
    prompt: str,
    similiarity_top_k: int = 5,
    sparse_top_k: int = 10,
    rerank_top_n: int = 5,
) -> List[LLMTestCase]:
    qa_prompt = PromptTemplate(template=prompt)
    cohere_rank = CohereRerank(api_key=os.getenv("COHERE_API_KEY"), top_n=rerank_top_n)
    retriever = VectorIndexRetriever(
        index=VectorStoreIndex.from_vector_store(vector_store=vector_store),
        similiarity_top_k=similiarity_top_k,
        sparse_top_k=sparse_top_k,
        vector_store_query_mode="hybrid",
    )
    response_synthesizer = get_response_synthesizer(
        llm=model, text_qa_template=qa_prompt, response_mode="compact"
    )
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=[cohere_rank],
    )
    deepeval_test_cases: List[LLMTestCase] = []
    for index, test_case in qa_pairs.iterrows():
        question = test_case["question"]
        expected_output = test_case["answer"]
        response = query_engine.query(question)
        actual_output = response.response
        retrieval_context = [node.get_content() for node in response.source_nodes]
        tc = LLMTestCase(
            input=question,
            expected_output=expected_output,
            actual_output=actual_output,
            retrieval_context=retrieval_context,
        )
        deepeval_test_cases.append(tc)
        time.sleep(5)
    return deepeval_test_cases

In [9]:
prompt_template = [prompt_template]

llm = [openai_llm]
similiarity_top_k = [5]
sparse_top_k = [10]
rerank_top_n = [2]

In [10]:
params = list(
    itertools.product(
        prompt_template, similiarity_top_k, sparse_top_k, rerank_top_n, llm
    )
)

In [11]:
test_dataset = []
for param in params:
    (
        prompt_template,
        similiarity_top_k,
        sparse_top_k,
        rerank_top_n,
        llm,
    ) = param
    param_test_cases = construct_test_cases(
        llm, prompt_template, similiarity_top_k, sparse_top_k, rerank_top_n
    )
    test_dataset.append((param, param_test_cases))

In [12]:
print(test_dataset)

[(("\nYou are a specialized chatbot focused on LoRa and LoRaWAN technologies. Your role is to provide expert-level information and guidance on these subjects. Your knowledge base includes detailed insights into LoRa and LoRaWAN technologies, as well as relevant case studies, best practices, and technical standards.\n\nFollow these guidelines when answering queries:\n- Provide concise answers to straightforward questions.\n- Offer in-depth explanations for more complex or open-ended inquiries.\n- Assist with network design, deployment strategies, troubleshooting, and any other tasks related to LoRa and LoRaWAN.\n- Use markdown for technical formatting and code snippets.\n- Do not reveal information about your role or knowledge base unless it is directly relevant to the user's query.\n\nWhen handling different types of questions:\n- **Technical Questions:** Provide detailed, accurate information.\n- **Practical Applications:** Offer insights based on case studies and best practices.\n- *

In [13]:
for param, param_test_cases in test_dataset:
    (
        prompt_template,
        similiarity_top_k,
        sparse_top_k,
        rerank_top_n,
        llm,
    ) = param
    try:
        evaluate(
            param_test_cases,
            [
                relevancy,
                faithfulness,
            ],
        )
    except Exception as e:
        print(f"An error occurred: {e}")

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 50 test case(s) in parallel: |          |  0% (0/50) [Time Taken: 00:00, ?test case/s]

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


Evaluating 50 test case(s) in parallel: |▍         |  4% (2/50) [Time Taken: 00:29, 12.37s/test case]ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
Evaluating 50 test case(s) in parallel: |▌         |  6% (3/50) [Time Taken: 00:33,  8.62s/test case]ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
Evaluating 50 test case(s) in parallel: |▊         |  



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because there were no irrelevant statements in the actual output, providing a clear and focused response to the question about the maximum size of an application payload downlink message., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o-mini, reason: The score is 1.00 because there are no contradictions, indicating that the actual output aligns perfectly with the retrieval context., error: None)

For test case:

  - input: What is the maximum size of an application payload downlink message?
  - actual output: ### Detailed Explanation

In LoRaWAN, the maximum size of an application payload for a downlink message is determined by several factors, including the data rate, the spreading factor, and the specific configuration of the network. The maximum payload size can vary based on the region and




In [14]:
import json

# Load the JSON data
with open(".deepeval-cache.json", "r") as f:
    data = json.load(f)

test_cases_lookup_map = data["test_cases_lookup_map"]
metrics_scores = {}
metrics_counts = {}

# Iterate through the test cases and aggregate metric scores and counts
for key, value in test_cases_lookup_map.items():
    cached_metrics_data = value["cached_metrics_data"]
    for metric in cached_metrics_data:
        metric_metadata = metric["metric_data"]
        metric_name = metric_metadata["name"]
        score = metric_metadata["score"]

        # Initialize the metric in both dictionaries if it hasn't been encountered yet
        if metric_name not in metrics_scores:
            metrics_scores[metric_name] = 0
            metrics_counts[metric_name] = 0

        # Aggregate score and count
        metrics_scores[metric_name] += score
        metrics_counts[metric_name] += 1

# Calculate and print the average scores
for metric_name in metrics_scores:
    average_score = (
        metrics_scores[metric_name] / metrics_counts[metric_name]
        if metrics_counts[metric_name] > 0
        else 0
    )
    print(f"Metric: {metric_name}")
    print(f"Average: {average_score * 100:.2f}%")
    print("-" * 50)


Metric: Answer Relevancy
Average: 89.78%
--------------------------------------------------
Metric: Faithfulness
Average: 94.65%
--------------------------------------------------
