# Lesson 1: Advanced RAG Pipeline

In [1]:
from dotenv import load_dotenv, find_dotenv

import os
import openai

# Load the .env file located by find_dotenv(); the return value is discarded.
_ = load_dotenv(find_dotenv())
# Hand the key to the openai module (None when the variable is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
from datasets import load_dataset
import pandas as pd

# Load XSum. `trust_remote_code=True` is passed explicitly: without it
# `datasets` emits a warning and states the flag becomes mandatory for this
# dataset from its next major release.
# NOTE(review): the flag allows code shipped with the dataset to run locally —
# only enable it for sources you trust.
xsum_dataset = load_dataset(
    "xsum", version="1.2.0", trust_remote_code=True
)
# Keep the first 1000 training rows as a pandas DataFrame.
xsum_sample = xsum_dataset["train"].select(range(1000)).to_pandas()
xsum_sample.head(2)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035


In [3]:
# One text field per row: "Document: <article>; Summary: <summary>", with
# leading/trailing whitespace stripped from both parts.
xsum_sample["combined"] = (
    "Document: "
    + xsum_sample["document"].str.strip()
    + "; Summary: "
    + xsum_sample["summary"].str.strip()
)

In [4]:
# Create the output folder with the standard library rather than a shell
# escape, so the cell does not depend on a POSIX `mkdir -p` being available.
os.makedirs("document", exist_ok=True)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Write every combined text to its own file: document/document_1.txt, ...
# NOTE(review): files left over from an earlier, larger run are not removed.
for i, document in enumerate(xsum_sample["combined"]):
    file_name = f'document/document_{i+1}.txt'  # 1-based, unique per row
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(document)

In [5]:
# Prerequisite: run this notebook from the `llama-index` conda environment (conda activate llama-index).

In [6]:
from llama_index import SimpleDirectoryReader

# Read the files under ./document/ (written by the earlier cell) into a list of documents.
loader = SimpleDirectoryReader(input_dir="./document/")
documents = loader.load_data()

In [7]:
# Sanity-check the load: container type, number of documents, element type,
# and a preview of the first document.
print(type(documents), "\n")
print(len(documents), "\n")
print(type(documents[0]))
print(documents[0])

<class 'list'> 

1000 

<class 'llama_index.schema.Document'>
Doc ID: ec0d3d30-1311-4197-870e-36b57c670d72
Text: Document: The full cost of damage in Newton Stewart, one of the
areas worst affected, is still being assessed. Repair work is ongoing
in Hawick and many roads in Peeblesshire remain badly affected by
standing water. Trains on the west coast mainline face disruption due
to damage at the Lamington Viaduct. Many businesses and householders
were aff...


## Basic RAG pipeline

In [8]:
from llama_index import Document

# Merge the text of every loaded document (separated by a blank line) into a
# single Document; the generator avoids building a temporary list.
document = Document(text="\n\n".join(doc.text for doc in documents))

In [9]:
from llama_index import VectorStoreIndex
from llama_index import ServiceContext
from llama_index.llms import OpenAI

# The LLM and the embedding model are handed to the index through the service context.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-small-en-v1.5"
)
# Build the vector index from the single merged document.
index = VectorStoreIndex.from_documents([document], service_context=service_context)

In [10]:
# Baseline query engine: no arguments are overridden, so library defaults apply.
query_engine = index.as_query_engine()

In [11]:
# Ask the baseline engine one question and show the answer text
# (print() already converts the response with str()).
response = query_engine.query(
    "I'm looking for the information of Harry Potter. What could you suggest to me?"
)
print(response)

You may want to explore the Harry Potter series of books, which have sold over 450 million copies worldwide since 1997. Additionally, there are eight film adaptations based on the books. Another interesting aspect to look into is "Harry Potter and the Cursed Child," a play that has received five-star reviews from critics and is considered a game-changing production.


### Creating an Evaluation Dataset

In [12]:
eval_questions = []
# Read one evaluation question per line. UTF-8 is requested explicitly so the
# result does not depend on the platform default encoding.
with open('eval_questions.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Drop the trailing newline and any surrounding whitespace.
        item = line.strip()
        if not item:
            # Skip blank lines so an empty string is never used as a question.
            continue
        print(item)
        eval_questions.append(item)

I'm looking for the information of Harry Potter. What could you suggest to me?


In [13]:
new_question = "What is the Harry Potter show for me?"
# Guard the append so that re-running this cell does not add the question twice.
if new_question not in eval_questions:
    eval_questions.append(new_question)

In [14]:
# Show the final list of evaluation questions.
print(eval_questions)

["I'm looking for the information of Harry Potter. What could you suggest to me?", 'What is the Harry Potter show for me?']


In [15]:
from trulens_eval import Tru
# Create the TruLens handle (the captured output reports db url sqlite:///default.sqlite).
tru = Tru()

# NOTE(review): presumably clears previously logged records so this run starts clean — confirm.
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [16]:
from trulens_eval import (
    Feedback,
    TruLlama,
    OpenAI as TruLensOpenAI,  # aliased so llama_index's `OpenAI` LLM class stays bound
)
from trulens_eval.feedback import Groundedness
import numpy as np

# TruLens feedback provider. Deliberately not named `openai`: that name is the
# `openai` module imported at the top of the notebook and must not be rebound.
feedback_provider = TruLensOpenAI()

# Answer Relevance
qa_relevance = (
    Feedback(feedback_provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

# Context Relevance
# on_input - pointer to the user query
# on - pointer to the retrieved contexts (intermediate results)
# aggregate - aggregate score across all retrieved contexts
qs_relevance = (
    Feedback(feedback_provider.relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

# Groundedness
grounded = Groundedness(groundedness_provider=feedback_provider)
groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(TruLlama.select_source_nodes().node.text)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)
# Feedback functions read by get_prebuilt_trulens_recorder for every recorder.
feedbacks = [qa_relevance, qs_relevance, groundedness]

def get_prebuilt_trulens_recorder(query_engine, app_id):
    """Wrap ``query_engine`` in a ``TruLlama`` recorder.

    Args:
        query_engine: engine whose calls should be recorded.
        app_id: label passed to ``TruLlama`` as ``app_id``.

    Returns:
        The ``TruLlama`` instance, configured with the notebook-level
        ``feedbacks`` list.
    """
    return TruLlama(query_engine, app_id=app_id, feedbacks=feedbacks)

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [17]:
# Recorder for the baseline pipeline, logged under the app id "Direct Query Engine".
tru_recorder = get_prebuilt_trulens_recorder(query_engine, app_id="Direct Query Engine")

In [18]:
# Run every evaluation question through the baseline engine inside the
# recorder context; the answers are not displayed in this cell.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

In [19]:
# Fetch the logged records; an empty app_ids list is passed
# (presumably meaning "no filter" — confirm against the TruLens docs).
records, feedback = tru.get_records_and_feedback(app_ids=[])

In [20]:
# Preview the first rows of the records table.
records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,Answer Relevance,Answer Relevance_calls,latency,total_tokens,total_cost
0,Direct Query Engine,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_dcf115d5e2c1cf827613a219da059462,"""I'm looking for the information of Harry Pott...","""You could explore the Harry Potter series of ...",-,"{""record_id"": ""record_hash_dcf115d5e2c1cf82761...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-04-08T18:48:37.876118"", ""...",2024-04-08T18:48:39.857270,1.0,[{'args': {'prompt': 'I'm looking for the info...,1,2197,0.003328
1,Direct Query Engine,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_6fa0ca56520f6a8430850b7c48b2dd6a,"""What is the Harry Potter show for me?""","""The Harry Potter show is described as a \""gam...",-,"{""record_id"": ""record_hash_6fa0ca56520f6a84308...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-04-08T18:48:39.989549"", ""...",2024-04-08T18:48:41.590913,,,1,2230,0.003392


In [21]:
# launches on http://localhost:8501/
# (the captured output also reports the network address of the dashboard)
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.1.156:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

## Advanced RAG pipeline

### 1. Sentence Window retrieval

In [22]:
from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage

def build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index",
    window_size=3,
):
    """Build a sentence-window ``VectorStoreIndex``, or reload it from ``save_dir``.

    Args:
        document: single Document to index (wrapped in a list internally).
        llm: LLM placed in the service context.
        embed_model: embedding model spec placed in the service context.
        save_dir: directory the index is persisted to / loaded from.
        window_size: forwarded to ``SentenceWindowNodeParser.from_defaults``;
            defaults to 3, the value that used to be hard-coded.

    Returns:
        The newly built or reloaded index.

    Note:
        When ``save_dir`` already exists the index is loaded from disk and
        ``document`` is not used; remove the directory to force a rebuild.
    """
    # Sentence-window parser: the window is stored under the "window" metadata
    # key and the original text under "original_text".
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )

    if os.path.exists(save_dir):
        # A persisted index exists: reuse it instead of indexing again.
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )

    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )
    sentence_index.storage_context.persist(persist_dir=save_dir)
    return sentence_index

In [23]:
from llama_index.llms import OpenAI

# Re-create the LLM with llama_index's OpenAI class (re-imported just above),
# using the same settings as the baseline pipeline.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [24]:
# Build the sentence-window index over the merged document; if the
# "sentence_index" directory already exists the persisted index is loaded instead.
sentence_index = build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index"
)

In [25]:
def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
):
    """Create a query engine over ``sentence_index`` with two postprocessors.

    Args:
        sentence_index: index to query.
        similarity_top_k: passed to ``as_query_engine`` as ``similarity_top_k``.
        rerank_top_n: passed to the reranker as ``top_n``.

    Returns:
        The query engine; its postprocessors run in the order
        metadata replacement (key "window"), then reranking.
    """
    window_replacer = MetadataReplacementPostProcessor(target_metadata_key="window")
    reranker = SentenceTransformerRerank(top_n=rerank_top_n, model="BAAI/bge-reranker-base")
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[window_replacer, reranker],
    )

# Engine with the function's default similarity_top_k and rerank_top_n.
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [26]:
# Same question as for the baseline engine, now answered by the
# sentence-window engine (print() already applies str()).
window_response = sentence_window_engine.query(
    "I'm looking for the information of Harry Potter. What could you suggest to me?"
)
print(window_response)

You might be interested in exploring the Harry Potter books, which have sold over 450 million copies worldwide since 1997 and have been adapted into eight films. Additionally, the script of Harry Potter and the Cursed Child has been published and has received five-star reviews from critics, with one review calling it "a game-changing production."


In [27]:
from trulens_eval import Tru
# A Tru handle is created again here; one already exists from the earlier evaluation cell.
tru = Tru()

# NOTE(review): the leaderboard captured after this cell lists only the sentence-window
# app, so this reset appears to discard the "Direct Query Engine" records — confirm
# that losing the baseline comparison is intended.
tru.reset_database()

# Recorder for the sentence-window pipeline.
tru_recorder_sentence_window = get_prebuilt_trulens_recorder(
    sentence_window_engine,
    app_id = "Sentence Window Query Engine"
)

In [28]:
# Record one query per evaluation question and echo the question/answer pair.
for question in eval_questions:
    with tru_recorder_sentence_window as recording:
        response = sentence_window_engine.query(question)
        print(question)
        print(response)  # print() converts the response with str()

I'm looking for the information of Harry Potter. What could you suggest to me?
I would suggest looking into the Harry Potter books, which have sold over 450 million copies since 1997 and have been adapted into eight films. Additionally, you may want to explore the script of Harry Potter and the Cursed Child, which is being published this weekend and has received five-star reviews from critics.
What is the Harry Potter show for me?
The Harry Potter show being referred to in the context is "Harry Potter and the Cursed Child."


In [29]:
# Per-app summary of feedback scores, latency and cost (columns as shown in the output).
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Groundedness,Context Relevance,Answer Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sentence Window Query Engine,1.0,0.3,1.0,11.5,0.000688


In [30]:
# launches on http://localhost:8501/
# (if a dashboard is already running, the captured output reports its existing URL)
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.1.156:8501



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

### 2. Auto-merging retrieval

In [31]:
from llama_index.node_parser import HierarchicalNodeParser

from llama_index.node_parser import get_leaf_nodes
from llama_index import StorageContext
from llama_index.retrievers import AutoMergingRetriever
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.query_engine import RetrieverQueryEngine

def build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index",
    chunk_sizes=None,
):
    """Build a hierarchical (auto-merging) index, or reload it from ``save_dir``.

    Args:
        documents: documents passed to the hierarchical node parser.
        llm: LLM placed in the service context.
        embed_model: embedding model spec placed in the service context.
        save_dir: directory the index is persisted to / loaded from.
        chunk_sizes: chunk sizes for ``HierarchicalNodeParser``; a falsy value
            selects ``[2048, 512, 128]``.

    Returns:
        The newly built or reloaded index.

    Note:
        When ``save_dir`` already exists the index is loaded from disk and
        ``documents`` / ``chunk_sizes`` are not used; remove the directory to
        force a rebuild.
    """
    merging_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
    )

    if os.path.exists(save_dir):
        # Reuse the persisted index. Parsing is skipped here because its
        # result would be thrown away on this path.
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=merging_context,
        )

    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)

    # All nodes go into the docstore; only the leaf nodes are indexed.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    automerging_index = VectorStoreIndex(
        leaf_nodes, storage_context=storage_context, service_context=merging_context
    )
    automerging_index.storage_context.persist(persist_dir=save_dir)
    return automerging_index

In [32]:
# Build (or load from "merging_index") the auto-merging index. Note this passes the
# full `documents` list, not the merged `document` used for the earlier indexes.
automerging_index = build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index"
)

In [33]:
def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=2,
):
    """Create a query engine that retrieves through ``AutoMergingRetriever`` and reranks.

    Args:
        automerging_index: index to retrieve from; its storage context is
            handed to the merging retriever.
        similarity_top_k: passed to ``as_retriever`` as ``similarity_top_k``.
        rerank_top_n: passed to the reranker as ``top_n``.

    Returns:
        A ``RetrieverQueryEngine`` built from the merging retriever, with the
        reranker as its only node postprocessor.
    """
    leaf_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
    # verbose=True: the retriever is asked to log its merge steps.
    merging_retriever = AutoMergingRetriever(
        leaf_retriever, automerging_index.storage_context, verbose=True
    )
    reranker = SentenceTransformerRerank(top_n=rerank_top_n, model="BAAI/bge-reranker-base")
    return RetrieverQueryEngine.from_args(merging_retriever, node_postprocessors=[reranker])

In [34]:
# Engine with the function's default similarity_top_k and rerank_top_n.
automerging_query_engine = get_automerging_query_engine(
    automerging_index,
)

In [35]:
# Same question as before, answered by the auto-merging engine; the
# "> Merging ..." lines in the output come from the verbose retriever.
auto_merging_response = automerging_query_engine.query(
    "I'm looking for the information of Harry Potter. What could you suggest to me?"
)
print(auto_merging_response)

> Merging 2 nodes into parent node.
> Parent node id: 5b6edb58-7060-4f3f-a7a7-33c84f589895.
> Parent node text: Chris Jones, in the Chicago Tribune, says that "heretical as this may sound", the play left him "...

> Merging 1 nodes into parent node.
> Parent node id: cf9dbb73-b23c-40cb-9ad1-6bdec1693e85.
> Parent node text: Document: Is there something you have seen or heard that you would like us to investigate?
It cou...

> Merging 1 nodes into parent node.
> Parent node id: 28a0112d-c22d-45cd-837f-4b9c63e032de.
> Parent node text: Document: Is there something you have seen or heard that you would like us to investigate?
It cou...

You could consider exploring the play "Harry Potter and the Cursed Child," written by Jack Thorne, which is set 19 years after the final book in the series by JK Rowling. The play features the adult characters from the wizarding saga in their mid-30s as their own children embark on their own adventures. It has received positive reviews from critics and is 

In [36]:
# NOTE(review): presumably discards the records logged for the previous pipeline,
# so earlier results cannot be compared on the leaderboard — confirm this is intended.
tru.reset_database()

# Recorder for the auto-merging pipeline.
tru_recorder_automerging = get_prebuilt_trulens_recorder(automerging_query_engine,
                                                         app_id="Automerging Query Engine")

In [None]:
# Record one query per evaluation question and echo the question/answer pair.
for question in eval_questions:
    with tru_recorder_automerging as recording:
        response = automerging_query_engine.query(question)
        print(question)
        print(response)

> Merging 2 nodes into parent node.
> Parent node id: 5b6edb58-7060-4f3f-a7a7-33c84f589895.
> Parent node text: Chris Jones, in the Chicago Tribune, says that "heretical as this may sound", the play left him "...

> Merging 1 nodes into parent node.
> Parent node id: cf9dbb73-b23c-40cb-9ad1-6bdec1693e85.
> Parent node text: Document: Is there something you have seen or heard that you would like us to investigate?
It cou...

> Merging 1 nodes into parent node.
> Parent node id: 28a0112d-c22d-45cd-837f-4b9c63e032de.
> Parent node text: Document: Is there something you have seen or heard that you would like us to investigate?
It cou...

I'm looking for the information of Harry Potter. What could you suggest to me?
You could consider exploring the play "Harry Potter and the Cursed Child," written by Jack Thorne, which is set 19 years after the final book in the series by JK Rowling. The play features the characters from the wizarding saga as adults in their mid-30s, with their own children

In [None]:
# Per-app summary for whatever records are currently in the TruLens database.
tru.get_leaderboard(app_ids=[])

In [None]:
# launches on http://localhost:8501/
# (same dashboard call as in the earlier sections)
tru.run_dashboard()