In [1]:
import os

import nest_asyncio
import pandas as pd
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    Response,
    get_response_synthesizer
)
from llama_index.embeddings import HuggingFaceEmbedding, SimilarityMode
from llama_index.evaluation import FaithfulnessEvaluator, RelevancyEvaluator, CorrectnessEvaluator, SemanticSimilarityEvaluator
from llama_index.evaluation import BatchEvalRunner
from llama_index.llms import OpenAI
from llama_index.retrievers import VectorIndexRetriever

from libs import configs
from libs.index import initialize_chroma_vector_store

In [2]:
def display_eval_df(response: "Response", eval_result: "EvaluationResult") -> None:
    """Render a one-row summary table for a single evaluated response.

    The table has three columns: the response text, the text of the first
    source node (cut to 1000 characters), and "Pass"/"Fail" derived from
    ``eval_result.passing``.

    Args:
        response: Object exposing ``source_nodes``; ``str(response)`` is shown
            as the answer. Only the first source node is displayed.
        eval_result: Object exposing a truthy/falsy ``passing`` attribute
            (an evaluation result, not a plain string).

    Returns:
        None. Prints ``"no response!"`` and returns early when the response
        has no source nodes; otherwise displays a styled DataFrame.

    Note:
        Annotations are strings so the definition does not require the
        annotated names to be importable. ``display`` is the IPython builtin,
        so the table path only works inside a notebook/IPython session.
    """
    # ``not`` covers both an empty list and ``None``; the earlier ``== []``
    # comparison let ``None`` through and failed on the indexing below.
    if not response.source_nodes:
        print("no response!")
        return

    max_source_chars = 1000
    source_text = response.source_nodes[0].node.text
    # Only add the ellipsis when text was actually cut off.
    if len(source_text) > max_source_chars:
        source_text = source_text[:max_source_chars] + "..."

    eval_df = pd.DataFrame(
        {
            "Response": str(response),
            "Source": source_text,
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
        },
        index=[0],
    )
    # Constrain the two long-text columns so they wrap instead of stretching
    # the rendered table.
    styled_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(styled_df)

In [3]:
QUESTION = "How does the hormone ghrelin affect hunger and appetite regulation?"

# Metadata fields hidden from the LLM when node content is rendered into the
# synthesis prompt.
EXCLUDED_LLM_METADATA_KEYS = (
    "episode_description",
    "timestamp_start",
    "timestamp_end",
    "timestamp_sentencepiece_token_length",
)

# The HuggingFace tokenizers library emits a "process just got forked" warning
# (and disables its own parallelism) when this cell runs. Setting the variable
# explicitly before the embedding model is created is the remedy the warning
# itself suggests; ``setdefault`` keeps any value already set in the environment.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Patch asyncio so async LlamaIndex calls can run inside the notebook's
# already-running event loop.
nest_asyncio.apply()

# LLM + embedding model bundled into the service context shared by the index,
# the synthesizer and (in later cells) the evaluators.
gpt_3_5_turbo = OpenAI(temperature=0, model="gpt-3.5-turbo")
embed_model = HuggingFaceEmbedding(model_name=configs.EMB_MODEL)
service_context = ServiceContext.from_defaults(llm=gpt_3_5_turbo, embed_model=embed_model)

# Wrap the existing Chroma store in an index (no re-ingestion happens here).
vector_store = initialize_chroma_vector_store()
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=service_context)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(
    service_context=service_context, verbose=True
)

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
    verbose=False,
)

# Retrieve the top matches, then strip noisy metadata from what the LLM sees.
nodes = retriever.retrieve(QUESTION)
for node in nodes:
    # A fresh list per node, so nodes never share one mutable list object.
    node.node.excluded_llm_metadata_keys = list(EXCLUDED_LLM_METADATA_KEYS)

response = response_synthesizer.synthesize(query=QUESTION, nodes=nodes)

  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# One evaluator per metric, all built on the same service context.
faithfulness = FaithfulnessEvaluator(service_context=service_context)
relevancy = RelevancyEvaluator(service_context=service_context)
correctness = CorrectnessEvaluator(service_context=service_context)
# Semantic similarity additionally takes a similarity mode and a pass threshold.
semanticsimilarity = SemanticSimilarityEvaluator(
    similarity_threshold=0.6,
    similarity_mode=SimilarityMode.DEFAULT,
    service_context=service_context,
)

# Keys here are the names used to look results up after a batch run.
evaluators = {
    "faithfulness": faithfulness,
    "relevancy": relevancy,
    "correctness": correctness,
    "semanticsimilarity": semanticsimilarity,
}

runner = BatchEvalRunner(evaluators, workers=8, show_progress=True)

In [7]:
eval_results = await runner.aevaluate_responses(responses=[response], queries=[QUESTION], reference=["test"])

print(eval_results["relevancy"][0].passing)
print(eval_results["faithfulness"][0].passing)
print(eval_results["correctness"][0].passing)
print(eval_results["semanticsimilarity"][0].passing)
print(eval_results["relevancy"][0])
print(eval_results["faithfulness"][0])
print(eval_results["correctness"][0])
print(eval_results["semanticsimilarity"][0])

100%|██████████| 4/4 [00:04<00:00,  1.14s/it]

True
True
True
False
query='How does the hormone ghrelin affect hunger and appetite regulation?' contexts=None response='The hormone ghrelin increases in our body depending on how long it has been since we last ate. When ghrelin levels are higher, it tends to make us feel hungry. Ghrelin interacts with specific neurons in the brain, such as the arcuate nucleus of the hypothalamus, to stimulate hunger. When we eat, ghrelin levels typically decrease, which helps to reduce our appetite.' passing=True feedback='YES' score=1.0 pairwise_source=None
query=None contexts=["to respond to those events. So what happens when we eat? Well, I've done an entire episode on metabolism. So if you're interested in the full cascade of hormonal and neural events that occurs when we eat, please check out that episode. But for the sake of today's discussion, let's just take a, what I call top contour view of the hormonal response to ingesting food. Now, anytime we eat, that is the consequence of a number of t


