# Lesson 1: Advanced RAG Pipeline

In [4]:
# llama-index==v0.9.14.post3
# llama-index-readers-file==0.1.13
# langchain[openai]==0.0.343
# openai==1.3.5
# python-dotenv==1.0.1
# datasets==2.18.0
# sentence_transformers
# trulens==0.13.4
# trulens-eval==0.20.0
# Reference notebook: https://github.dev/openai/openai-cookbook/blob/main/examples/evaluation/Evaluate_RAG_with_LlamaIndex.ipynb

In [5]:
from datasets import load_dataset
import pandas as pd

# XSum is loaded through a script shipped with the dataset. The `datasets`
# library warns that `trust_remote_code=True` will become mandatory for it,
# so opt in explicitly instead of relying on the deprecated implicit path.
# NOTE(review): this executes code from the dataset repository -- only keep
# it enabled for sources you trust.
xsum_dataset = load_dataset(
    "xsum",
    version="1.2.0",
    trust_remote_code=True,
)

# Work on the first 1000 training rows only, as a pandas DataFrame.
xsum_sample = xsum_dataset["train"].select(range(1000)).to_pandas()
xsum_sample.head(2)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035


In [6]:
# Fuse each article with its reference summary into one text field so that
# both end up in the same file when the rows are written to disk.
document_text = xsum_sample.document.str.strip()
summary_text = xsum_sample.summary.str.strip()
xsum_sample["combined"] = "Document: " + document_text + "; Summary: " + summary_text

In [7]:
# `os` is imported in this cell because, in notebook order, this is its first
# use; without it a fresh kernel fails here with NameError.
import os

# Pure-Python equivalent of `mkdir -p document/` (no shell needed, portable).
os.makedirs("document", exist_ok=True)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Write every combined "Document ...; Summary ..." string to its own file,
# numbered from 1: document/document_1.txt, document/document_2.txt, ...
for i, document in enumerate(xsum_sample["combined"]):
    file_name = f'document/document_{i+1}.txt'  # Generate a unique filename for each document
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(document)  # Write each document to its own file

In [8]:
# conda activate llama-index

## RetrieverEvaluator

In [24]:
from llama_index import SimpleDirectoryReader

# Read the text files written to ./document/ back in as llama-index
# Document objects (the recorded run shows 1000 of them).
loader = SimpleDirectoryReader(
    input_dir="./document/",
)
documents = loader.load_data()

In [25]:
# Sanity-check what the reader returned: container type, number of documents,
# then the type and printed form of the first document.
for summary_value in (type(documents), len(documents)):
    print(summary_value, "\n")

first_document = documents[0]
print(type(first_document))
print(first_document)

<class 'list'> 

1000 

<class 'llama_index.schema.Document'>
Doc ID: 656cd4d1-5c4a-49f5-ad60-600629e6a8fd
Text: Document: The full cost of damage in Newton Stewart, one of the
areas worst affected, is still being assessed. Repair work is ongoing
in Hawick and many roads in Peeblesshire remain badly affected by
standing water. Trains on the west coast mainline face disruption due
to damage at the Lamington Viaduct. Many businesses and householders
were aff...


In [42]:
import os

import openai
from dotenv import load_dotenv, find_dotenv
# `VectorStoreIndex` and `OpenAI` are used below but were never imported in
# this notebook, so the cell raised NameError on a fresh kernel.
from llama_index import VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser

# Load variables from the nearest .env file and hand the key to the OpenAI
# client; the key itself is never written into the notebook.
_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY')

# Define an LLM
llm = OpenAI(model="gpt-4")

# Build index with a chunk_size of 512
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)
vector_index = VectorStoreIndex(nodes)

In [28]:
# Build a query engine on top of the vector index created above.
# The notebook never defines a variable called `index`; the index is
# `vector_index`, so the original line raised NameError on a fresh kernel.
query_engine = vector_index.as_query_engine()

In [29]:
# Smoke-test the query engine with one free-form question.
question = "I'm looking for the information of Harry Potter. What could you suggest to me?"
response = query_engine.query(question)

# print() applies str() to the response object itself.
print(response)

You may want to explore the Harry Potter series of books written by J.K. Rowling, which have sold over 450 million copies worldwide since 1997. Additionally, there are eight film adaptations based on the books. Another interesting aspect to look into is "Harry Potter and the Cursed Child," a play that has received five-star reviews from critics and is considered a game-changing production.


In [30]:
# Preview the first 1000 characters of the first source node attached to the
# response, to see which text the answer was based on.
top_source_node = response.source_nodes[0]
top_source_node.get_text()[:1000]

'Ben Brantley in the New York Times writes: "Like the novels that preceded it, The Cursed Child is stuffed with arcana-filled plots that defy diagrams and baldly wrought sentimental life lessons, along with anguished dives into the earnest, tortured solipsism of adolescence.\n"By rights, such a combination should try the patience of any grown-up. But like Ms Rowling\'s books, the play vanquishes resistance."\nThe Harry Potter books have sold more than 450 million copies since 1997 and been adapted into eight films.\nThe script of Harry Potter and the Cursed Child is published this weekend.\nFollow us on Twitter @BBCNewsEnts, on Instagram at bbcnewsents, or email entertainment.news@bbc.co.uk.; Summary: Harry Potter and the Cursed Child has won five-star reviews from critics, with one describing it as "a game-changing production".\n\nDocument: There were celebrations in Europe after Germany surrendered.\nBut on the other side of the world in the Pacific Ocean, Japan was still fighting ag

In [31]:
from llama_index.evaluation import generate_question_context_pairs

# Build a synthetic evaluation set: `llm` is asked for two questions per chunk
# in `nodes`. This cell is slow -- the recorded run's progress bar shows
# 1625 items taking about 1h25m.
qa_dataset = generate_question_context_pairs(nodes, llm=llm, num_questions_per_chunk=2)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1625/1625 [1:25:29<00:00,  3.16s/it]


### Retrieval Evaluation:

In [34]:
# Retriever over the vector index, configured with similarity_top_k=2.
retriever = vector_index.as_retriever(
    similarity_top_k=2,
)

In [37]:
# `RetrieverEvaluator` was used without ever being imported in this notebook,
# so import it where it is first needed.
from llama_index.evaluation import RetrieverEvaluator

# Evaluator that scores `retriever` on the two metrics summarised later in the
# notebook: MRR and hit rate.
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

In [38]:
# Evaluate
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [43]:
def display_results(name, eval_results):
    """Summarise retrieval evaluation results as mean hit rate and mean MRR.

    Despite the name, nothing is printed or rendered; the summary is returned.

    Args:
        name: Label for the retriever, echoed back under "Retriever Name".
        eval_results: Iterable of objects exposing a ``metric_vals_dict``
            mapping that contains "hit_rate" and "mrr" entries.

    Returns:
        A dict with the keys "Retriever Name", "Hit Rate" and "MRR", each
        mapped to a one-element list.
    """
    # One row per evaluated query, one column per metric.
    per_query_metrics = pd.DataFrame(
        [result.metric_vals_dict for result in eval_results]
    )

    return {
        "Retriever Name": [name],
        "Hit Rate": [per_query_metrics["hit_rate"].mean()],
        "MRR": [per_query_metrics["mrr"].mean()],
    }

In [44]:
# Summarise the evaluation run; the returned dict is shown as the cell output.
display_results(
    name="OpenAI Embedding Retriever",
    eval_results=eval_results,
)

{'Retriever Name': ['OpenAI Embedding Retriever'],
 'Hit Rate': [0.9273846153846154],
 'MRR': [0.8507692307692307]}