# evaluation
Ragas reference documentation: https://docs.ragas.io/en/stable/index.html

In [1]:
import os
from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI


# Load environment variables from the local .env file (if one exists).
load_dotenv()

# Fail fast with a readable message when the key is missing. Re-assigning
# os.getenv(...) into os.environ is a no-op when the key is set and raises an
# opaque TypeError (str expected, not NoneType) when it is not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in the .env file."
    )

# Global LlamaIndex defaults: LLM (temperature 0) and embedding model.
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")

from src_rag import ConfigManager, LoggerManager, Retriever, Reranker, ResponseSynthesizer, RAGQueryEngine
from llama_index.core import StorageContext, load_index_from_storage

import pandas as pd
from IPython.display import Markdown, display
%load_ext autoreload
%autoreload 2



In [3]:
# Load the project configuration (its return value is later handed to the
# reranker as the API key), then switch on logging.
config = ConfigManager()
api_key = config.load_config()
LoggerManager.setup_logging()

# Restore the persisted index from ./persist.
storage_context = StorageContext.from_defaults(persist_dir="./persist")
index = load_index_from_storage(storage_context)

# Pipeline components shared by both query engines.
retriever = Retriever(index)
reranker = Reranker(api_key)
synthesizer = ResponseSynthesizer()

# Sizes passed to both engines: vector_top_k for retrieval, reranker_top_n for reranking.
vector_top_k = 30
reranker_top_n = 5

# Everything except the `with_reranker` switch is identical for the two engines.
# Per the original notes, the reranker and reranker_top_n are still passed to
# the non-reranking engine but are not used there.
shared_engine_kwargs = dict(
    retriever=retriever,
    reranker=reranker,
    synthesizer=synthesizer,
    vector_top_k=vector_top_k,
    reranker_top_n=reranker_top_n,
)

# One engine with reranking enabled, one with it disabled.
query_engine_with_reranker = RAGQueryEngine(**shared_engine_kwargs, with_reranker=True)
query_engine_without_reranker = RAGQueryEngine(**shared_engine_kwargs, with_reranker=False)

# Single demo question sent to both engines so the answers can be compared.
query = "What are potential CONSEQUENCES of Keytruda?"

# Engine with reranking: run the query, keep the answer text, render it as Markdown.
response_with_reranker = query_engine_with_reranker.query(query)
response_text_with_reranker = response_with_reranker.response
markdown_with_reranker = Markdown(f"**Response with Reranker:**\n{response_text_with_reranker}")
display(markdown_with_reranker)

# Engine without reranking: same steps, same question.
response_without_reranker = query_engine_without_reranker.query(query)
response_text_without_reranker = response_without_reranker.response
markdown_without_reranker = Markdown(f"**Response without Reranker:**\n{response_text_without_reranker}")
display(markdown_without_reranker)


INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
Loading all indices.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
After Reranking, new rank list for nodes: [0, 5, 1, 3, 2, 4, 24, 25, 20, 27, 18, 21, 28, 26, 22, 16, 9, 10, 14, 15, 7, 6, 8, 19, 17, 11, 12, 13, 23, 29]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


**Response with Reranker:**
Potential consequences of Keytruda include immune-related adverse effects such as colitis, hepatitis, pneumonitis, and the exacerbation of pre-existing autoimmune diseases. It can also lead to symptoms like cough, shortness of breath, chest pain, and changes in vision that should be reported immediately for optimal care and safety.

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


**Response without Reranker:**
Potential consequences of Keytruda include immune-related adverse effects such as colitis, hepatitis, pneumonitis, and changes in blood pressure. Additionally, interactions with steroids and certain immunosuppressants can affect its efficacy and safety. Close monitoring for symptoms like cough, chest pain, changes in vision, and neurological symptoms is essential to manage potential side effects effectively.

# Evaluation

In [11]:
# Load the evaluation dataframe from the processed-data folder.
eval_parquet_path = "../data/processed_data/df_eval.parquet"
df = pd.read_parquet(eval_parquet_path)


In [12]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
#from ragas.metrics.critique import harmfulness

# Ragas metrics scored in this run. context_precision, context_recall and
# harmfulness are not included at the moment.
metrics = [faithfulness, answer_relevancy]

# Same classes as imported in the first cell; repeated here so this cell can
# be run on its own.
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# LLM that judges the answers. GPT-3.5 is used here; GPT-4 / GPT-4-turbo
# should give better accuracy.
evaluator_llm = OpenAI(model="gpt-3.5-turbo")



In [14]:
# Keep only the augmented questions, expose them under the column name
# "question", and take the first two rows.
renamed_questions = df[["augmented_questions"]].rename(
    columns={"augmented_questions": "question"}
)
ds = renamed_questions[0:2]

# DataFrame.to_dict() nests the values as {"question": {row_label: text}};
# keep just the texts, in row order.
ds_dict = ds.to_dict()
ds_dict["question"] = list(ds_dict["question"].values())
ds_dict["question"]

['What are the frequently encountered adverse reactions associated with Keytruda?',
 'Which side effects are typically seen in patients receiving Keytruda therapy?']

In [15]:
from datasets import Dataset

# Flatten the single "question" column into a plain list of strings, the
# column-oriented dict shape passed to Dataset.from_dict.
question_by_row = ds.to_dict()["question"]
ds_dict = {"question": list(question_by_row.values())}

# Wrap the questions in a Hugging Face Dataset and show its summary.
dataset = Dataset.from_dict(ds_dict)
dataset

Dataset({
    features: ['question'],
    num_rows: 2
})

In [None]:


from ragas.integrations.llama_index import evaluate

# Score the engine WITHOUT reranking, matching the output file name below.
# To evaluate the reranked pipeline instead, pass query_engine_with_reranker
# and save to a different file so the two result sets are not mixed up.
result = evaluate(
    query_engine=query_engine_without_reranker,
    metrics=metrics,
    dataset=dataset,
    llm=evaluator_llm,
    # NOTE(review): this uses the default OpenAIEmbedding model rather than the
    # text-embedding-3-large model configured in Settings. Confirm it is intended.
    embeddings=OpenAIEmbedding(),
)

# Convert the scores to a dataframe and persist them.
df_result = result.to_pandas()
df_result.to_parquet("../data/processed_data/df_result_ragas_without_reranking_test.parquet")

df_result

In [57]:
# Summary statistics of the metric scores for the run WITHOUT reranking.
# NOTE(review): this reads the shared `df_result` variable, so the numbers are
# presumably only the no-reranker ones if the evaluation cell was last run with
# that engine. The execution counts in this notebook are out of order.
df_result.describe()

Unnamed: 0,faithfulness,answer_relevancy
count,218.0,220.0
mean,0.831684,0.859019
std,0.29759,0.301806
min,0.0,0.0
25%,0.666667,0.94917
50%,1.0,0.964384
75%,1.0,0.974852
max,1.0,1.0


In [45]:
# Summary statistics of the metric scores for the run WITH reranking.
# NOTE(review): this reads the same `df_result` variable as the cell above, so
# it presumably requires re-running the evaluation cell with the reranking
# engine first. Confirm before comparing the two tables.
df_result.describe()

Unnamed: 0,faithfulness,answer_relevancy
count,219.0,220.0
mean,0.793747,0.89062
std,0.314552,0.259305
min,0.0,0.0
25%,0.5,0.949299
50%,1.0,0.966063
75%,1.0,0.978517
max,1.0,1.0
