# RAG Triad of metrics

In [1]:
from dotenv import load_dotenv, find_dotenv

import os
import openai

# Load the .env file located by find_dotenv() into the process environment.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Pass the key to the openai module; the lookup yields None when the variable is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [2]:
from datasets import load_dataset
import pandas as pd

# Fetch XSum, then keep the first 1000 training rows as a pandas DataFrame.
xsum_dataset = load_dataset("xsum", version="1.2.0")
train_head = xsum_dataset["train"].select(range(1000))
xsum_sample = train_head.to_pandas()
xsum_sample.head(2)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035


In [3]:
# Join each article and its summary into one text: "Document: ...; Summary: ...".
document_text = xsum_sample["document"].str.strip()
summary_text = xsum_sample["summary"].str.strip()
xsum_sample["combined"] = "Document: " + document_text + "; Summary: " + summary_text

In [4]:
# Create the output folder from Python rather than shelling out to `mkdir -p`,
# so the cell does not depend on a POSIX shell being available.
os.makedirs("document", exist_ok=True)

# Environment flag for the HuggingFace tokenizers library — presumably silences
# its parallelism/fork warning; TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Write each combined text to its own file: document/document_1.txt, document_2.txt, ...
# NOTE: after the loop, `document` stays bound to the LAST text (a plain str);
# later cells must not mistake it for a llama_index Document.
for i, document in enumerate(xsum_sample["combined"]):
    file_name = f'document/document_{i+1}.txt'  # 1-based, unique per row
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(document)

In [5]:
from llama_index import SimpleDirectoryReader

# Read every file written to ./document/ back in as llama_index documents.
DOCUMENT_DIR = "./document/"
loader = SimpleDirectoryReader(input_dir=DOCUMENT_DIR)
documents = loader.load_data()

In [6]:
from llama_index import SimpleDirectoryReader

# `documents` is already populated by the preceding cell, which reads the same
# ./document/ directory; skip re-reading all the files and only verify that the
# load produced something.
assert documents, "no documents were loaded from ./document/"

In [7]:
# Quick look at what the reader returned: container type, count, and the first item.
first_document = documents[0]
print(f"{type(documents)} \n")
print(f"{len(documents)} \n")
print(type(first_document))
print(first_document)

<class 'list'> 

1000 

<class 'llama_index.schema.Document'>
Doc ID: 16e9f172-ea17-44ca-a223-b1bb14f8aaff
Text: Document: The full cost of damage in Newton Stewart, one of the
areas worst affected, is still being assessed. Repair work is ongoing
in Hawick and many roads in Peeblesshire remain badly affected by
standing water. Trains on the west coast mainline face disruption due
to damage at the Lamington Viaduct. Many businesses and householders
were aff...


## Sentence Window Retrieval Evaluation

In [8]:
# Environment note (shell command, not Python): conda activate llama-index3

In [9]:
from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage

In [10]:
def build_sentence_window_index(document, llm, embed_model, save_dir):
    """Build a sentence-window vector index, or reload it from ``save_dir``.

    Args:
        document: A single llama_index ``Document``, or a list/tuple of them.
        llm: LLM handed to ``ServiceContext.from_defaults``.
        embed_model: Embedding model spec handed to
            ``ServiceContext.from_defaults`` (e.g. ``"local:BAAI/bge-small-en-v1.5"``).
        save_dir: Directory the index is persisted to on first build and
            reloaded from on later calls.

    Returns:
        The index produced by ``VectorStoreIndex.from_documents`` (fresh build)
        or by ``load_index_from_storage`` (``save_dir`` already exists).
    """
    # Sentence-window parser: window of 3, stored under the "window" metadata key.
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        # Honour the caller's choice instead of a hard-coded model name.
        embed_model=embed_model,
        node_parser=node_parser,
    )
    if not os.path.exists(save_dir):
        # Accept either one document or a collection of documents.
        docs = list(document) if isinstance(document, (list, tuple)) else [document]
        sentence_index = VectorStoreIndex.from_documents(
            docs, service_context=sentence_context
        )
        # Persist where the existence check above looks, so the cache is found next time.
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )
    return sentence_index

In [11]:
from llama_index import Document
from llama_index.llms import OpenAI

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# The bare name `document` is only the leftover loop variable from the
# file-writing cell (a plain str holding the last text), not the loaded corpus.
# Merge the texts of all loaded `documents` into one Document and index that.
corpus_document = Document(text="\n\n".join(doc.text for doc in documents))

# NOTE: when the "sentence_index" directory already exists, the index is loaded
# from it and `corpus_document` is not re-indexed; delete the directory to rebuild.
sentence_index = build_sentence_window_index(
    corpus_document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index"
)

In [12]:
def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
):
    """Create a query engine over ``sentence_index`` with two node postprocessors.

    Args:
        sentence_index: Index whose ``as_query_engine`` method is called.
        similarity_top_k: Forwarded to ``as_query_engine``.
        rerank_top_n: ``top_n`` given to the ``SentenceTransformerRerank`` step.

    Returns:
        The query engine returned by ``sentence_index.as_query_engine``.
    """
    # Step 1: metadata replacement keyed on "window"; step 2: rerank with bge-reranker-base.
    window_replacer = MetadataReplacementPostProcessor(target_metadata_key="window")
    reranker = SentenceTransformerRerank(
        model="BAAI/bge-reranker-base",
        top_n=rerank_top_n,
    )
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[window_replacer, reranker],
    )

sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [13]:
# Smoke-test the engine with one question and show the answer text.
sample_question = "I'm looking for the information of Harry Potter. What could you suggest to me?"
result = sentence_window_engine.query(sample_question)
result.response

'You may want to explore the Harry Potter books, which have sold over 450 million copies worldwide since 1997 and have been adapted into eight films. Additionally, the script of Harry Potter and the Cursed Child was published recently and has received five-star reviews from critics, with one review calling it "a game-changing production."'

## Feedback functions

In [14]:
from trulens_eval import Tru

# Create the TruLens handle (the saved output reports sqlite:///default.sqlite)
# and reset its database — presumably so records from earlier runs do not mix
# into this evaluation; TODO confirm exactly what is cleared.
tru = Tru()
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [15]:
import nest_asyncio

# Apply nest_asyncio's patch to asyncio — presumably so async code in the
# evaluation libraries can run inside the notebook's already-running event loop
# (TODO confirm).
nest_asyncio.apply()

In [16]:
from trulens_eval import OpenAI as fOpenAI

# TruLens feedback provider; imported under the alias fOpenAI, which keeps it
# distinct from the llama_index `OpenAI` LLM class imported earlier.
provider = fOpenAI()

### 1. Answer Relevance

In [17]:
from trulens_eval import Feedback

# Answer relevance (with chain-of-thought reasons), evaluated on the record's
# main input together with its main output.
answer_relevance = Feedback(
    provider.relevance_with_cot_reasons, name="Answer Relevance"
)
f_qa_relevance = answer_relevance.on_input_output()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


### 2. Context Relevance

In [18]:
from trulens_eval import TruLlama

# Selector for the text of every retrieved source node.
source_nodes_selector = TruLlama.select_source_nodes()
context_selection = source_nodes_selector.node.text

In [19]:
import numpy as np

# Context relevance without reasons: score the question against each retrieved
# chunk, then average the scores with np.mean.
# NOTE: the next cell rebinds `f_qs_relevance` to the chain-of-thought variant,
# which is the one the recorder ends up using.
plain_context_relevance = Feedback(provider.qs_relevance, name="Context Relevance")
f_qs_relevance = (
    plain_context_relevance
    .on_input()
    .on(context_selection)
    .aggregate(np.mean)
)

✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .


In [20]:
import numpy as np

# Context relevance with chain-of-thought reasons: score the question against
# each retrieved chunk, then average the scores with np.mean.
cot_context_relevance = Feedback(
    provider.qs_relevance_with_cot_reasons, name="Context Relevance"
)
f_qs_relevance = (
    cot_context_relevance
    .on_input()
    .on(context_selection)
    .aggregate(np.mean)
)

✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .


### 3. Groundedness

In [21]:
from trulens_eval.feedback import Groundedness

# Groundedness helper, configured with the same `provider` as the other feedback functions.
grounded = Groundedness(groundedness_provider=provider)

In [22]:
# Groundedness: retrieved chunks are the source and the app's output is the
# statement; per-chunk results are combined by the helper's own aggregator.
groundedness_feedback = Feedback(
    grounded.groundedness_measure_with_cot_reasons, name="Groundedness"
)
f_groundedness = (
    groundedness_feedback
    .on(context_selection)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


### 4. Hate

In [23]:
# Hate score of the user's input (moderation-based); lower values are better.
hate_feedback = Feedback(
    provider.moderation_hate, name="Hate", higher_is_better=False
)
f_hate = hate_feedback.on_input()

✅ In Hate, input text will be set to __record__.main_input or `Select.RecordInput` .


### 5. Coherence

In [24]:
# Coherence of the app's output, judged by the LLM provider with reasons.
coherence_feedback = Feedback(
    provider.coherence_with_cot_reasons, name="Coherence"
)
f_coherence = coherence_feedback.on_output()

✅ In Coherence, input text will be set to __record__.main_output or `Select.RecordOutput` .


## Evaluation of the RAG application

In [25]:
from trulens_eval import TruLlama
from trulens_eval import FeedbackMode

# Wrap the query engine in a recorder that evaluates all five feedback functions.
feedback_functions = [
    f_qa_relevance,
    f_qs_relevance,
    f_groundedness,
    f_hate,
    f_coherence,
]
tru_recorder = TruLlama(
    sentence_window_engine,
    app_id="RAG_Evaluation",
    feedbacks=feedback_functions,
)

In [26]:
# Load the evaluation questions: one per line, surrounding whitespace removed.
# Blank lines are skipped so no empty query is sent to the engine, and the file
# is read as UTF-8 to match how this notebook writes its own text files.
with open('eval_questions.txt', 'r', encoding='utf-8') as file:
    eval_questions = [line.strip() for line in file if line.strip()]

In [27]:
# Show the loaded questions as the cell output.
eval_questions

["I'm looking for the information of Harry Potter. What could you suggest to me?"]

In [28]:
# Run every evaluation question through the engine inside the recorder context.
# `recording` is bound by the `with` statement but not used afterwards in this cell.
for question in eval_questions:
    with tru_recorder as recording:
        sentence_window_engine.query(question)

Could not find usage information in openai response:
ModerationCreateResponse(id='modr-9DzW8cjakiPaTTyWXL8t3YBjzA1Rp', model='text-moderation-007', results=[Moderation(categories=Categories(harassment=False, harassment_threatening=False, hate=False, hate_threatening=False, self_harm=False, self_harm_instructions=False, self_harm_intent=False, sexual=False, sexual_minors=False, violence=False, violence_graphic=False, self-harm=False, sexual/minors=False, hate/threatening=False, violence/graphic=False, self-harm/intent=False, self-harm/instructions=False, harassment/threatening=False), category_scores=CategoryScores(harassment=6.725809362251312e-05, harassment_threatening=6.265715001063654e-06, hate=1.1078228453698102e-05, hate_threatening=7.598943739139941e-06, self_harm=5.316969236446312e-06, self_harm_instructions=1.1291792816336965e-06, self_harm_intent=2.9899895253038267e-06, sexual=8.618633728474379e-06, sexual_minors=7.6213305874262e-05, violence=8.447197615168989e-06, violence_gr

In [29]:
# Fetch the recorded runs plus `feedback`, which a later cell appends to
# ["input", "output"] to pick columns (so it is used as a list of column names).
# app_ids=[] presumably means "no app filter" — TODO confirm against the TruLens docs.
records, feedback = tru.get_records_and_feedback(app_ids=[])

In [30]:
import pandas as pd

# Do not truncate long text cells, then show question, answer and any feedback columns.
pd.set_option("display.max_colwidth", None)
display_columns = ["input", "output", *feedback]
records[display_columns]

Unnamed: 0,input,output
0,"""I'm looking for the information of Harry Potter. What could you suggest to me?""","""I would suggest looking into the Harry Potter books, which have sold over 450 million copies since 1997 and have been adapted into eight films. Additionally, you may want to explore the script of Harry Potter and the Cursed Child, which has received five-star reviews from critics and is described as \""a game-changing production.\"""""


In [31]:
# Aggregate metrics for this app. In the saved output only latency and total_cost
# appear here, while the same call further down also shows the feedback columns —
# presumably because feedback results were still being computed at this point.
tru.get_leaderboard(app_ids=["RAG_Evaluation"])

Unnamed: 0_level_0,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1
RAG_Evaluation,2.0,0.00074


In [32]:
# Launch the TruLens dashboard; the saved output shows a `streamlit` subprocess
# being started and the dashboard URL being printed.
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.1.156:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

In [33]:
# Same leaderboard as a plain dict (column -> {app_id: value}), which shows unrounded values.
tru.get_leaderboard(app_ids=["RAG_Evaluation"]).to_dict()

{'Hate': {'RAG_Evaluation': 1.1078228453698102e-05},
 'latency': {'RAG_Evaluation': 2.0},
 'total_cost': {'RAG_Evaluation': 0.0007395000000000001}}

In [34]:
# Query the leaderboard again; in the saved output all five feedback columns are present by now.
tru.get_leaderboard(app_ids=["RAG_Evaluation"])

Unnamed: 0_level_0,Hate,Groundedness,Coherence,Answer Relevance,Context Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
RAG_Evaluation,1.1e-05,0.666667,0.9,1.0,0.1,2.0,0.00074
