### Setup

In [3]:
from dotenv import load_dotenv, find_dotenv
import os
import openai

# Load variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())

# Fail fast with a readable message: assigning None into os.environ would
# otherwise raise an opaque "TypeError: str expected, not NoneType".
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

In [4]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Register the chat model and the embedding model on the global Settings object.
Settings.llm = OpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0.2,
)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)

### Load Data

In [5]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import pandas as pd

In [7]:
# Load the XSum dataset. `trust_remote_code=True` is passed explicitly because
# the notice printed for this call says the argument silences that message and
# will become mandatory for this dataset in the next major `datasets` release.
# NOTE(review): this presumably opts in to executing the dataset's loading
# script — keep it only for datasets you trust.
xsum_dataset = load_dataset(
    "xsum",
    version="1.2.0",
    trust_remote_code=True,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [8]:
# Take the first 1000 training examples as a pandas DataFrame.
train_split = xsum_dataset["train"]
xsum_sample = train_split.select(range(1000)).to_pandas()

# Build one text field per row: the whitespace-stripped article followed by
# its whitespace-stripped summary.
document_text = xsum_sample.document.str.strip()
summary_text = xsum_sample.summary.str.strip()
xsum_sample["combined"] = "Document: " + document_text + "; Summary: " + summary_text

# Preview the first two rows.
xsum_sample.head(2)

Unnamed: 0,document,summary,id,combined
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142,Document: The full cost of damage in Newton St...
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035,Document: A fire alarm went off at the Holiday...


In [9]:
# Rebuild the "combined" column from the stripped article and summary text.
# NOTE(review): the previous cell already assigns this exact column, so this
# cell looks redundant — confirm before removing it.
stripped_document = xsum_sample.document.str.strip()
stripped_summary = xsum_sample.summary.str.strip()
xsum_sample["combined"] = "Document: " + stripped_document + "; Summary: " + stripped_summary

In [10]:
# Create the output folder in pure Python instead of shelling out to
# `mkdir -p`, so the cell does not depend on a POSIX shell being available.
# `exist_ok=True` keeps re-runs from failing when the folder already exists.
os.makedirs("document", exist_ok=True)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Write each combined "Document: ...; Summary: ..." string to its own
# UTF-8 text file, numbered from 1.
for i, document in enumerate(xsum_sample["combined"]):
    file_name = f'document/document_{i+1}.txt'  # Generate a unique filename for each document
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(document)  # Write each document to its own file

In [11]:
from llama_index.core import SimpleDirectoryReader

# load documents
# Read the text files written to ./document/ into `documents`; the inspection
# cell that follows shows it is a list of llama_index Document objects.
loader = SimpleDirectoryReader(input_dir="./document/")
documents = loader.load_data()

In [12]:
# Show the container type and the number of loaded documents, each followed
# by a blank line.
for summary_value in (type(documents), len(documents)):
    print(summary_value, "\n")

# Show the type and the printed form of the first document.
first_document = documents[0]
print(type(first_document))
print(first_document)

<class 'list'> 

1000 

<class 'llama_index.core.schema.Document'>
Doc ID: 2772fd61-6ad7-47ec-9970-92f1699c9c00
Text: Document: The full cost of damage in Newton Stewart, one of the
areas worst affected, is still being assessed. Repair work is ongoing
in Hawick and many roads in Peeblesshire remain badly affected by
standing water. Trains on the west coast mainline face disruption due
to damage at the Lamington Viaduct. Many businesses and householders
were aff...


In [23]:
from llama_index.core.schema import MetadataMode

In [25]:
# Print the first document's content together with all of its metadata
# (MetadataMode.ALL), using the default templates.
print(documents[0].get_content(metadata_mode=MetadataMode.ALL))

file_path: /Users/linghuang/Git/NLP/notebook/document/document_1.txt
file_name: document_1.txt
file_type: text/plain
file_size: 2470
creation_date: 2024-05-05
last_modified_date: 2024-05-05

Document: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work c

In [27]:
# Show the raw metadata dictionary attached to the first document.
print(documents[0].metadata)

{'file_path': '/Users/linghuang/Git/NLP/notebook/document/document_1.txt', 'file_name': 'document_1.txt', 'file_type': 'text/plain', 'file_size': 2470, 'creation_date': '2024-05-05', 'last_modified_date': '2024-05-05'}


In [28]:
# Layout for a document's rendered text: a metadata section, then the content.
text_template = "Content Metadata:\n{metadata_str}\n\nContent:\n{content}"

# Each metadata entry is rendered as "key: value," and entries are joined by
# a single space.
metadata_template = "{key}: {value},"
# NOTE(review): "seperator" (sic) is the attribute name this notebook sets on
# each document; presumably it mirrors the library's own spelling — verify
# before renaming.
metadata_seperator = " "

# Apply the same formatting to every loaded document.
for doc in documents:
    doc.text_template, doc.metadata_template, doc.metadata_seperator = (
        text_template,
        metadata_template,
        metadata_seperator,
    )

In [29]:
# Print the first document again with MetadataMode.ALL to see the effect of
# the templates assigned in the previous cell.
print(documents[0].get_content(metadata_mode=MetadataMode.ALL))

Content Metadata:
file_path: /Users/linghuang/Git/NLP/notebook/document/document_1.txt, file_name: document_1.txt, file_type: text/plain, file_size: 2470, creation_date: 2024-05-05, last_modified_date: 2024-05-05,

Content:
Document: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, 

### Advanced Customization

In [31]:
# Mark "file_path" as excluded from the LLM view of the first document's
# metadata, then print that document's content in LLM mode to check the result.
documents[0].excluded_llm_metadata_keys = ["file_path"]
print(documents[0].get_content(metadata_mode=MetadataMode.LLM))

Content Metadata:
file_name: document_1.txt, file_type: text/plain, file_size: 2470, creation_date: 2024-05-05, last_modified_date: 2024-05-05,

Content:
Document: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure 

In [33]:
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage

In [35]:
# Reuse the persisted vector index when it exists; otherwise build it from
# `documents` and persist it so later runs can skip rebuilding it.
index_persist_dir = "document/index"
try:
    index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=index_persist_dir)
    )
except FileNotFoundError:
    # Nothing has been persisted yet. Catching only this error (instead of a
    # bare `except:`) keeps keyboard interrupts and genuine failures visible
    # rather than silently triggering a full rebuild.
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=index_persist_dir)

### Create Query Engine Tools

In [36]:
# Imported in this cell because the only other import of QueryEngineTool sits
# in a cell further down the notebook; without it a fresh top-to-bottom run
# stops here with a NameError.
from llama_index.core.tools import QueryEngineTool

# Wrap the vector index's default query engine as a named, described tool.
tool = QueryEngineTool.from_defaults(
    query_engine=index.as_query_engine(),
    name="Document Modules",
    description="Useful for answering questions about related BBC News."
)

In [38]:
# needed for notebooks
# (nest_asyncio is applied before the query engine below is created and run)
import nest_asyncio
nest_asyncio.apply()

from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.response_synthesizers import get_response_synthesizer

# Build a sub-question query engine over the single `tool` created earlier.
# `get_response_synthesizer` is only referenced by the commented-out
# streaming option below.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        tool,
    ],
    # enable this for streaming
    # response_synthesizer=get_response_synthesizer(streaming=True),
    verbose=False
)

In [39]:
# Ask the sub-question engine a sample question and print the answer text.
question = "I'm looking for the information of Harry Potter. What could you suggest to me?"
response = query_engine.query(question)
print(response)

You may want to explore recent developments related to the play "Harry Potter and the Cursed Child," including reviews and insights from critics.


In [13]:
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

# initialize settings (set chunk size)
Settings.chunk_size = 512
# Split the loaded documents into nodes with the node parser exposed by Settings.
nodes = Settings.node_parser.get_nodes_from_documents(documents)

In [14]:
from llama_index.core import StorageContext

# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
# Register the parsed nodes in the storage context's document store.
storage_context.docstore.add_documents(nodes)

### Define Summary Index and Vector Index over Same Data

In [15]:
from llama_index.core import SummaryIndex
from llama_index.core import VectorStoreIndex

# Build both indexes over the same `nodes`, sharing one storage context.
summary_index = SummaryIndex(nodes, storage_context=storage_context)
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

### Define Query Engines and Set Metadata

In [16]:
# needed for notebooks
import nest_asyncio
nest_asyncio.apply()

# Summary-index engine: tree-summarize response mode with async enabled.
# NOTE(review): the router queries later in this notebook end with
# "RuntimeError: asyncio.run() cannot be called from a running event loop" in
# their saved outputs; `use_async=True` here is a possible contributor —
# confirm the root cause.
list_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True,
)
# Vector-index engine with default settings.
vector_query_engine = vector_index.as_query_engine()

In [17]:
from llama_index.core.tools import QueryEngineTool


# Each tool's description is the only text that tells the two engines apart,
# so it must state what that engine is good at. Previously both tools carried
# the same summarization description, leaving a selector nothing to choose on.

# Summary-index engine: whole-corpus summarization questions.
list_tool = QueryEngineTool.from_defaults(
    query_engine=list_query_engine,
    description=(
        "Useful for summarization questions related to Harry Potter"
    ),
)

# Vector-index engine: lookups of specific passages.
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific context related to Harry Potter"
    ),
)

### Define Router Query Engine

#### PydanticSingleSelector

In [18]:
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector

from llama_index.core.selectors import (
    PydanticMultiSelector,
    PydanticSingleSelector,
)


# Router that chooses between the summary tool and the vector tool using the
# Pydantic-based single selector.
pydantic_selector = PydanticSingleSelector.from_defaults()
query_engine = RouterQueryEngine(
    selector=pydantic_selector,
    query_engine_tools=[list_tool, vector_tool],
)

In [19]:
# Send a sample question through the router and print the answer text.
# NOTE(review): the saved output of this cell is
# "RuntimeError: asyncio.run() cannot be called from a running event loop";
# the cause is not resolved here.
question = "I'm looking for the information of Harry Potter. What could you suggest to me?"
response = query_engine.query(question)
print(response)

RuntimeError: asyncio.run() cannot be called from a running event loop

In [20]:
# Same router as before, but routed by the LLM-based single selector.
llm_selector = LLMSingleSelector.from_defaults()
query_engine = RouterQueryEngine(
    selector=llm_selector,
    query_engine_tools=[list_tool, vector_tool],
)

In [21]:
# Query the LLM-selector router with a short keyword query and print the answer.
# NOTE(review): the saved output of this cell is
# "RuntimeError: asyncio.run() cannot be called from a running event loop";
# the cause is not resolved here.
question = "Harry Potter"
response = query_engine.query(question)
print(response)

  '|'.join(regex_opt_inner(list(group[1]), '')
  '|'.join(regex_opt_inner(list(group[1]), '')


RuntimeError: asyncio.run() cannot be called from a running event loop