In [1]:
from core.agent import agent_executor, agent_llm
from core.CustomTools import fetch_and_rerank

from langchain.agents.openai_functions_agent.agent_token_buffer_memory import (
    AgentTokenBufferMemory,
)


from datasets import Dataset

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

import regex as re
import glob

# ---

# Token-buffer memory constructed with the agent's LLM.
# NOTE(review): query_agent below sends an empty "history" list instead of
# agent_memory.buffer (that variant is commented out), so this object is not
# consumed by any code visible in this notebook.
agent_memory = AgentTokenBufferMemory(llm=agent_llm)

def query_agent(prompt):
    """Send a single prompt to the agent executor and return its response.

    The request always carries an empty ``history`` list; no previously
    accumulated conversation memory is passed along.

    Args:
        prompt: The user question forwarded as the ``input`` field.

    Returns:
        Whatever ``agent_executor`` returns when called with
        ``include_run_info=True`` (callers in this notebook read the
        ``"output"`` key from it).
    """
    payload = {"input": prompt, "history": []}
    return agent_executor(payload, include_run_info=True)



In [20]:
# source data

# Separator between the question part and the ground-truth part of each file.
DELIM = "__###__"


def load_qa_pairs(pattern, delim=DELIM):
    """Load question / ground-truth pairs from the files matching ``pattern``.

    Each file is split on ``delim``; the text before the first delimiter is
    the question and the text between the first and second delimiter (or the
    end of the file) is the ground truth. Any further segments are ignored.

    Args:
        pattern: Glob pattern selecting the source files.
        delim: Separator string between question and ground truth.

    Returns:
        A ``(paths, questions, ground_truths)`` tuple. ``paths`` is sorted so
        that the row order is reproducible across runs and machines;
        ``ground_truths`` holds one single-element list per question.

    Raises:
        ValueError: If a matched file does not contain ``delim``.
    """
    # glob.glob returns matches in arbitrary order; sort for reproducibility.
    paths = sorted(glob.glob(pattern))

    loaded_questions = []
    loaded_truths = []
    for path in paths:
        # Explicit encoding: the platform default differs between systems.
        with open(path, encoding="utf-8") as f:
            parts = f.read().split(delim)

        if len(parts) < 2:
            raise ValueError(f"{path}: delimiter {delim!r} not found")

        loaded_questions.append(parts[0])
        # Each ground truth is wrapped in its own list, one list per question.
        loaded_truths.append([parts[1]])

    return paths, loaded_questions, loaded_truths


filenames, questions, ground_truths = load_qa_pairs("ragas_data/*.txt")

print("Questions:", len(questions))
print("Grounded Truths:", len(ground_truths))
    

Questions: 2
Grounded Truths: 2


In [15]:
# Collections passed to fetch_and_rerank when retrieving evaluation contexts.
RERANK_COLLECTIONS = ["rhaetor.github.io_2", "rhaetor.github.io_components_2"]

answers = []
contexts = []

# Inference: one agent answer and one list of context passages per question.
for query in questions:
    # Retrieve an answer from the agent.
    response = query_agent(query)
    answers.append(response["output"])

    # Retrieve the context using the full query. This is a separate retrieval
    # call and is different to what the agent does internally.
    docs = fetch_and_rerank(query, RERANK_COLLECTIONS)
    contexts.append([d.page_content for d in docs])

# Report each list's own length (the first line previously printed
# len(contexts) under the "Answer len:" label).
print("Answer len:", len(answers))
print("Context len:", len(contexts))



Embedding ms:  0.3784139156341553
0.831 :  _faq_how-do-the-direct-event-seda-and-vm-endpoints-compare.html.txt
0.82 :  _eips_aggregate-eip.html.txt_10
0.822 :  _eips_scatter-gather.html.txt_2
0.829 :  _others_cloudevents.html.txt
0.829 :  _next_timer-component.html.txt_3
Embedding ms:  0.3508141040802002
0.83 :  _next_timer-component.html.txt_3
0.814 :  _next_jooq-component.html.txt_6
0.81 :  _manual_endpoint.html.txt
0.818 :  _manual_route-reload.html.txt_0
0.812 :  _faq_how-to-send-the-same-message-to-multiple-endpoints.html.txt
Embedding ms:  0.6066417694091797
0.86 :  _next_aws2-s3-component.html.txt_11
0.831 :  _next_aws-summary.html.txt
0.832 :  _next_aws2-s3-component.html.txt_20
0.758 :  _manual_camel-jbang.html.txt_3
0.817 :  _next_aws2-s3-component.html.txt_9
Embedding ms:  0.342710018157959
0.795 :  _next_aws2-s3-component.html.txt_11
0.726 :  _books_.txt
0.776 :  _next_azure-storage-datalake-component.html.txt_14
0.772 :  _next_ftp-component.html.txt_21
0.726 :  _manual_str

In [21]:
# Column-oriented records: every value is a list with one entry per question.
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

# Wrap the columns in a Dataset, the input handed to evaluate().
dataset = Dataset.from_dict(data)

# Metrics to score, in the order they are passed to evaluate().
ragas_metrics = [context_precision, context_recall, faithfulness, answer_relevancy]
result = evaluate(dataset=dataset, metrics=ragas_metrics)

# Convert the result to a DataFrame and display the first rows as the cell output.
df = result.to_pandas()
df.head()

evaluating with [context_precision]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.24s/it]


evaluating with [context_recall]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.38s/it]


evaluating with [faithfulness]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:25<00:00, 25.08s/it]


evaluating with [answer_relevancy]


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.45s/it]


Unnamed: 0,question,answer,contexts,ground_truths,context_precision,context_recall,faithfulness,answer_relevancy
0,I am having an issue while working on the requ...,To listen to the events generated by one route...,"[""Sample\n\n\nTo set up a route that generates...",[\nTo listen to the events of one route by ano...,0.0,1.0,1.0,0.767416
1,"I want to read a local file, create equal chun...","To read a local file, create equal chunks, and...","[""If we want to trust all certificates in case...","[\nTo read a local file, create equal chunks, ...",0.0,1.0,1.0,0.795805
