# Load Sampled Data

In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the XSum dataset (builder config version 1.2.0) via Hugging Face `datasets`.
# The library reports that this dataset needs `trust_remote_code=True` and that
# the flag will become mandatory in its next major release, so it is passed
# explicitly here to keep the cell non-interactive and warning-free.
xsum_dataset = load_dataset(
    "xsum",
    version="1.2.0",
    trust_remote_code=True,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
from dotenv import load_dotenv, find_dotenv
import os
import openai


# Load variables from the .env file located by find_dotenv(), then pass the
# OpenAI key (None if the variable is unset) to the openai module.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [4]:
import pandas as pd

In [5]:
# Work with the first 1,000 rows of the training split, as a pandas DataFrame.
train_split = xsum_dataset["train"]
xsum_sample = train_split.select(range(1000)).to_pandas()

# Show the first two rows as a quick sanity check.
xsum_sample.head(2)

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035


In [6]:
# Build one text field per row: the stripped article followed by its stripped
# summary, each with a label so the two parts stay distinguishable.
document_text = xsum_sample["document"].str.strip()
summary_text = xsum_sample["summary"].str.strip()
xsum_sample["combined"] = "Document: " + document_text + "; Summary: " + summary_text

## Build External Knowledge


In [7]:
# Package versions used for this notebook (output of `pip freeze | grep llama` in the `llm-rag` environment):
# llama-hub==0.0.79.post1
# llama-index==0.10.14
# llama-index-agent-openai==0.1.5
# llama-index-cli==0.1.6
# llama-index-core==0.10.14
# llama-index-embeddings-openai==0.1.6
# llama-index-indices-managed-llama-cloud==0.1.3
# llama-index-legacy==0.9.48
# llama-index-llms-openai==0.1.6
# llama-index-multi-modal-llms-openai==0.1.4
# llama-index-program-openai==0.1.4
# llama-index-question-gen-openai==0.1.3
# llama-index-readers-file==0.1.6
# llama-index-readers-github==0.1.7
# llama-index-readers-llama-parse==0.1.3
# llama-index-vector-stores-chroma==0.1.5
# llama-index-vector-stores-weaviate==0.1.3
# llama-parse==0.3.5
# llamaindex-py-client==0.1.13
# To set up the environment, run in a shell:
#   conda activate llm-rag
#   pip install llama-index llama-index-vector-stores-qdrant

In [8]:
# Write the sampled corpus to disk so it can be read back from the
# ./document/ directory in the next step.
# os.makedirs(..., exist_ok=True) replaces the `!mkdir -p` shell escape: it
# does the same thing without depending on a POSIX shell.
os.makedirs('document', exist_ok=True)

# One "Document: ...; Summary: ..." entry per sampled row, newline-separated.
# NOTE: `documents` is intentionally not assigned here. The name is reserved
# for the list produced by the directory reader, and binding a DataFrame to it
# would break ingestion if the cells are run out of order.
joined_documents = '\n'.join(xsum_sample["combined"])
with open('document/documents.txt', 'w', encoding='utf-8') as file:
    file.write(joined_documents)

In [7]:
from llama_index.core import SimpleDirectoryReader

# Point a directory reader at ./document/ and load its contents into `documents`.
loader = SimpleDirectoryReader(
    input_dir="./document/",
)
documents = loader.load_data()

In [8]:
# Preview the first 1,000 characters of the first loaded document's text.
documents[0].text[:1000]

'Document: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.\nRepair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.\nTrains on the west coast mainline face disruption due to damage at the Lamington Viaduct.\nMany businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.\nFirst Minister Nicola Sturgeon visited the area to inspect the damage.\nThe waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.\nJeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.\nHowever, she said more preventative work could have been carried out to ensure the retaining wall did not fail.\n"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate 

In [9]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
import qdrant_client

# Qdrant client created with location=":memory:", wrapped as a llama-index
# vector store under the collection name "test_store".
client = qdrant_client.QdrantClient(location=":memory:")
vector_store = QdrantVectorStore(collection_name="test_store", client=client)

# Ingestion steps, listed in order: sentence-based splitting, then OpenAI
# embeddings. The pipeline is also given the vector store defined above.
ingestion_steps = [SentenceSplitter(), OpenAIEmbedding()]
pipeline = IngestionPipeline(
    vector_store=vector_store,
    transformations=ingestion_steps,
)

# Run the pipeline over the loaded documents with 4 workers; keep the result
# for inspection.
_nodes = pipeline.run(num_workers=4, documents=documents)

In [10]:
# Number of nodes returned by the ingestion pipeline run.
len(_nodes)
# Uncomment to inspect the text of the first node:
# _nodes[0].text

616

In [11]:
from llama_index.core import VectorStoreIndex

# Build the index on top of the vector store that was passed to the ingestion pipeline.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

## Retrieve Against A Query

In [12]:
# Retrieval only (no answer generation): request the 2 most similar nodes for
# the question.
retriever = index.as_retriever(similarity_top_k=2)
question = "I'm looking for the information of Harry Potter. What could you suggest to me?"
retrieved_nodes = retriever.retrieve(question)

In [13]:
# Print the text of the first retrieved node.
print(retrieved_nodes[0].text)

The main battle in Tamaulipas is between the Zetas and the Gulf cartels, the AFP news agency reports.
Their capacity for violence and ability to pay huge bribes gives them considerable power to subvert the prison system and get their people out.
President Felipe Calderon came to power in 2006 promising a war on drugs.
More than 35,000 people have died in drug violence since he began his campaign, which has involved launching an army assault on drug gangs.; Summary: Seven prisoners have been killed and 59 others have escaped after a riot at a jail in northern Mexico near the US border, officials say.
Document: The play, written by Jack Thorne, is set 19 years after the seventh and final book in the series by JK Rowling.
It opens officially at the Palace Theatre, in London's West End, on Saturday.
Audiences have been urged to "keep the secrets" since the play began previews in early June.
Presented in two parts, the play - showing the stars of the wizarding saga as adults in their mid-30

## Generate Final Response

In [14]:
# Query engine over the index, created with default settings.
query_engine = index.as_query_engine()

In [18]:
# Create a default query engine and print the default template of its
# text-QA prompt.
query_engine = index.as_query_engine()

prompts = query_engine.get_prompts()
text_qa_prompt = prompts["response_synthesizer:text_qa_template"]
print(text_qa_prompt.default_template.template)

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


In [25]:
# Query engine created with similarity_top_k=2; ask the question and print the answer.
query_engine = index.as_query_engine(similarity_top_k=2)
question = "I'm looking for the information of Harry Potter. What could you suggest to me?"
response = query_engine.query(question)
print(response)

You may be interested in learning about "Harry Potter and the Cursed Child," a play set 19 years after the final book in the Harry Potter series. It has received rave reviews from critics and offers a new and captivating story within the wizarding world created by JK Rowling.


In [26]:
# Same question, this time with the "tree_summarize" response mode.
query_engine = index.as_query_engine(response_mode="tree_summarize")
question = "I'm looking for the information of Harry Potter. What could you suggest to me?"
response = query_engine.query(question)
print(response)

You may be interested in learning about "Harry Potter and the Cursed Child," a play set 19 years after the last book in the Harry Potter series. The play has received rave reviews from critics, with many praising its magical elements, storytelling, and performances. It is presented in two parts and offers a new and original experience for fans of the Harry Potter universe.


In [27]:
# Same question with streaming enabled; the answer is printed through the
# response's stream printer instead of print().
query_engine = index.as_query_engine(streaming=True)
question = "I'm looking for the information of Harry Potter. What could you suggest to me?"
response = query_engine.query(question)
response.print_response_stream()

You may be interested in learning about "Harry Potter and the Cursed Child," a play set 19 years after the final book in the Harry Potter series. It has received rave reviews from critics and offers a new storyline involving Harry Potter and his friends as adults. The play is presented in two parts and has been described as a magical and game-changing production.

In [28]:
# Conversational interface over the same index.
# The object returned by as_chat_engine() is a chat engine, so it gets its own
# name instead of reusing `query_engine`, and it is driven through the
# chat-engine entry point `.chat()`. Calling `.query()` on it only works when
# the returned engine also happens to implement the query-engine interface.
chat_engine = index.as_chat_engine()
response = chat_engine.chat("I'm looking for the information of Harry Potter. What could you suggest to me?")
print(response)

I found information about the play "Harry Potter and the Cursed Child." It is set 19 years after the seventh and final book in the series by JK Rowling. The play is presented in two parts, showing the stars of the wizarding saga as adults in their mid-30s as their own children head off to school. The play has received high praise from critics, with many giving it five-star reviews and describing it as a game-changing production. The script of the play has been published and it has been noted for its magical effects, moments of comedy, and its ability to captivate both fans of the series and new audiences.
