In [1]:
import logging
import sys

# Route log records to stdout so they render inline in the notebook output.
# basicConfig() already installs one stdout StreamHandler on the root logger;
# attaching a second handler to the same stream printed every record twice
# (once as "LEVEL:logger:message" and once as the bare message), and added yet
# another copy each time the cell was re-run.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, Settings
from llama_index.core.indices import load_index_from_storage
from llama_index.core import VectorStoreIndex, DocumentSummaryIndex
from llama_index.core import SummaryIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core.response.notebook_utils import display_response
import chromadb
import torch
import model_utils
import prompt_utils
import nest_asyncio
nest_asyncio.apply()

  from .autonotebook import tqdm as notebook_tqdm



INFO:datasets:PyTorch version 2.3.0+cu118 available.
PyTorch version 2.3.0+cu118 available.


In [3]:
# Relative paths to the local checkpoints of the LLM and the embedding model.
llm_path = "models/Meta-Llama-3.1-8B-Instruct"
embed_path = "models/bge-small-en-v1.5"
# Device string handed to both the embedding model and the LLM loader below.
device = "cuda:1"

In [4]:
# Load the BAAI/bge-small-en-v1.5 embedding model from the local checkpoint
# at `embed_path` and place it on `device`.
embed_model = HuggingFaceEmbedding(model_name=embed_path, device=device)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: models/bge-small-en-v1.5
Load pretrained SentenceTransformer: models/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [7]:
# Load the local Llama model and its tokenizer through the project helper.
# The helper's log line reports that a quantization config is applied; the
# exact scheme is defined in model_utils (not visible in this notebook).
model, tokenizer = model_utils.load_quantized_model(
    model_name_or_path=llm_path,
    device=device
)

Loading tokenizer and model with quantization config from: models/Meta-Llama-3.1-8B-Instruct


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.35s/it]


In [8]:
# Wrap the already-loaded model and tokenizer in LlamaIndex's HuggingFaceLLM.

# Prompt wrapper built from the Llama 3.1 instruct template in prompt_utils.
llama31_wrapper_prompt = PromptTemplate(
    prompt_utils.get_llama31_ins_prompt_template()
)

# Sampling options handed to the LLM as `generate_kwargs`.
sampling_kwargs = {"temperature": 0.2, "do_sample": True, "top_p": 0.9}

llm_hf = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    model_name=llm_path,
    device_map=device,
    context_window=4096,
    max_new_tokens=512,
    query_wrapper_prompt=llama31_wrapper_prompt,
    generate_kwargs=sampling_kwargs,
)

In [5]:
# Settings.embed_model = embed_model
# Settings.llm = llm_hf

## Load vector index and document summary index

In [9]:
# Open the persistent (on-disk) Chroma database stored under ./chroma_db,
# resolved relative to the notebook's working directory.
chroma_client = chromadb.PersistentClient(path="./chroma_db")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [10]:
# Show which collections exist, to confirm the names used in the next cells.
chroma_client.list_collections()

[Collection(id=3186f0ec-26e5-46fa-b687-281a3a26066f, name=llama_index_blogs),
 Collection(id=46aee9cc-50b0-4474-90d9-61e1e160c15d, name=blogs_vector_index),
 Collection(id=cf744ea1-c23d-41c8-8204-363adcd4b3fe, name=llma_blogs_summary),
 Collection(id=e2bc79ee-49dd-45e4-85f3-6acb87185f7a, name=blogs_summary)]

In [11]:
# Fetch the "blogs_vector_index" collection by name (no metadata is passed).
# NOTE: get_or_create_collection creates a new, empty collection when the name
# does not exist, so a misspelled name will not fail loudly here.
vector_collection = chroma_client.get_or_create_collection("blogs_vector_index")
# Expose the collection to LlamaIndex as a vector store, then wrap it in a
# storage context (docstore/index store fall back to in-memory defaults).
vector_store = ChromaVectorStore(chroma_collection=vector_collection)
vector_storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Bare expression: show the storage context repr as the cell output.
vector_storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7fdc9ed49c90>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7fdca606eef0>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={}), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7fdca606f610>, property_graph_store=None)

In [12]:
# Build a VectorStoreIndex on top of the existing Chroma vector store; no
# documents are passed in, so the index relies on what the store already holds.
# NOTE(review): both `vector_store` and a storage context built from that same
# store are supplied — the `storage_context` (and possibly `llm`) argument looks
# redundant for from_vector_store; confirm against the llama-index version used.
vector_index = VectorStoreIndex.from_vector_store(
    llm=llm_hf,
    embed_model=embed_model,
    vector_store=vector_store, 
    storage_context=vector_storage_context
)
# Bare expression: show the index object as the cell output.
vector_index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7fdc9bcf3670>

In [13]:
# Fetch the "blogs_summary" collection by name (created empty if it is missing).
doc_sum_collection = chroma_client.get_or_create_collection("blogs_summary")
doc_sum_vector_store = ChromaVectorStore(
    chroma_collection=doc_sum_collection
)

# Unlike the plain vector index above, this storage context is also given a
# persist_dir, so persisted state is read from ./database/blogs_summary_index/.
# presumably the docstore/index store saved when the summary index was built — confirm.
doc_sum_storage_context = StorageContext.from_defaults(
    vector_store=doc_sum_vector_store,
    persist_dir="./database/blogs_summary_index/"
)
# Bare expression: show the storage context repr as the cell output.
doc_sum_storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7fdca606f1f0>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7fdc9bcf3eb0>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={})}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7fdc9bcf3700>, property_graph_store=None)

In [14]:
# Load the persisted index from the storage context, attaching the local LLM
# and embedding model. In the recorded run this returned a DocumentSummaryIndex.
doc_summary_index = load_index_from_storage(
    llm=llm_hf,
    embed_model=embed_model,
    storage_context=doc_sum_storage_context
)
# Bare expression: show the loaded index object as the cell output.
doc_summary_index

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


<llama_index.core.indices.document_summary.base.DocumentSummaryIndex at 0x7fdc9bcf2fb0>

## Advanced Querying

In [15]:
# Sample questions reused by the query-engine experiments in this notebook.
question1 = 'What are key features of llama-agents?'
question2 = (
    "What are the two critical areas of RAG system performance that are assessed "
    'in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?'
)
question3 = (
    "What are the two main metrics used to evaluate the performance "
    "of the different rerankers in the RAG system?"
)

In [16]:
def print_ref_docs(resp_metadata):
    """Print a numbered list of blog URLs for the documents behind a response.

    Args:
        resp_metadata: Mapping of node id -> per-node metadata dict, as found
            on ``response.metadata``. May be ``None`` or empty. Values that are
            not dicts carrying a ``file_name`` key (for example the
            ``selector_result`` entry that router query engines add) are
            skipped instead of raising.

    Returns:
        None. The list is written to stdout.
    """
    print("References:")
    base_url = "https://www.llamaindex.ai/blog/"
    seen_urls = set()
    for doc_metadata in (resp_metadata or {}).values():
        # Router responses mix non-dict bookkeeping values into the metadata.
        if not isinstance(doc_metadata, dict) or "file_name" not in doc_metadata:
            continue
        # Strip only the final extension so slugs that contain dots stay intact.
        slug = doc_metadata["file_name"].rsplit(".", 1)[0]
        ref_url = base_url + slug
        # Several retrieved chunks can share one source file; list each URL once.
        if ref_url in seen_urls:
            continue
        seen_urls.add(ref_url)
        print(f"{len(seen_urls)}.", ref_url)

In [17]:
# Register the local LLM and embedding model on LlamaIndex's global Settings
# object. The query engines, selectors and sub-question engine below are built
# without an explicit llm argument — presumably they pick it up from here.
Settings.llm = llm_hf
Settings.embed_model = embed_model

In [15]:
# tracing log
import llama_index.core

# The root logger was already configured by the first cell, and
# logging.basicConfig() does nothing once the root logger has handlers, so the
# level must be raised explicitly for DEBUG records to be emitted.
logging.getLogger().setLevel(logging.DEBUG)
# Install LlamaIndex's built-in "simple" global handler for tracing.
llama_index.core.set_global_handler("simple")

### VectorIndex as query engine

In [18]:
# Query engine over the Chroma-backed vector index, using the "compact"
# response mode with async execution enabled.
vector_query_engine = vector_index.as_query_engine(
    use_async=True,
    response_mode="compact",
)

In [19]:
# Ask the first sample question through the vector query engine, render the
# answer, then list the blog posts the answer was drawn from.
print("Question:", question1)
response1 = vector_query_engine.query(question1)
display_response(response1)
print_ref_docs(response1.metadata)

Question: What are key features of llama-agents?


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.22it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**`Final Response:`** The key features of llama-agents are:

1. Distributed Service Oriented Architecture: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: launch, scale and monitor each agent and your control plane independently.
5. Scalability and resource management: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service.

References:
1. https://www.llamaindex.ai/blog/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems


### DocumentSummaryIndex as query engine

In [29]:
# Query engine over the document summary index, configured the same way as the
# vector one: "compact" response mode with async execution enabled.
summary_query_engine = doc_summary_index.as_query_engine(
    use_async=True,
    response_mode="compact",
)

In [30]:
# Ask the same first question through the summary query engine for comparison.
print("Question:", question1)
response = summary_query_engine.query(question1)

Question: What are key features of llama-agents?


Batches: 100%|██████████| 1/1 [00:00<00:00, 151.58it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [31]:
# Render the summary engine's answer and list its source blog posts.
# In the recorded run the answer describes LlamaIndex rather than llama-agents,
# and the only reference is the Transformers Agents post.
display_response(response)
print_ref_docs(response.metadata)

**`Final Response:`** Based on the provided context, it appears that the key features of LlamaIndex are:

1. **Vector Indexing**: LlamaIndex uses a vector index created from 10K DiffusionDB prompts to enable the Text2Image Prompt Assistant tool to re-write prompts to generate more beautiful images.
2. **Query Engine**: The tool uses a query engine to generate new text-to-image prompts based on the provided context and examples.
3. **Integration with Transformers Agents**: LlamaIndex is integrated with Transformers Agents to enable the creation of a text-to-image prompt assistant tool.
4. **Customization**: The tool can be customized to use different LLMs (Large Language Models) and to refine the existing prompts based on the provided examples.
5. **Temperature Control**: The tool allows for temperature control, which enables the generation of varied prompts with a temperature above zero.
6. **Prompt Refinement**: LlamaIndex can suggest better prompts when generating images, as demonstrated by the Text2Image Prompt Assistant tool.
7. **Vector Database**: LlamaIndex uses a vector database created from DiffusionDB to enable the Text2Image Prompt Assistant tool.

The refined answer is:

"The key features of LlamaIndex are its ability to create a vector index, use a query engine, integrate with Transformers Agents, customize the tool to suit specific needs, control temperature, refine prompts, and utilize a vector database created from DiffusionDB."

References:
1. https://www.llamaindex.ai/blog/llamaindex-and-transformers-agents-67042ee1d8d6


### Router Query Engine

In [32]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector

In [33]:
# Wrap each query engine as a tool. The recorded selector log messages quote
# these descriptions as their reason for a choice, so the wording drives routing.
# NOTE(review): in the recorded runs the single selector sent fact-lookup
# questions to "summary"; more specific descriptions may route better — confirm.
vector_tool = QueryEngineTool(
    vector_query_engine,
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for retrieving specific context"
    )
)

# Tool backed by the document summary index.
summary_tool = QueryEngineTool(
    summary_query_engine,
    metadata=ToolMetadata(
        name="summary",
        description="Useful for summarization questions related to document content"
    )
)

#### Single selector

In [23]:
# Router that uses an LLM-based single selector to pick one of the two tools
# for each query (tool index 0 = vector_search, index 1 = summary).
router_query_engine = RouterQueryEngine.from_defaults(
    [vector_tool, summary_tool],
    selector=LLMSingleSelector.from_defaults()
)

In [24]:
# Route the first sample question through the single-selector router.
response = router_query_engine.query(question1)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The question is asking about key features of llama-agents, which is a summarization question related to document content, making choice 2 the most relevant..
Selecting query engine 1: The question is asking about key features of llama-agents, which is a summarization question related to document content, making choice 2 the most relevant..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [25]:
# Render the answer produced by whichever engine the router selected.
display_response(response)

**`Final Response:`** Based on the provided context, the key features of Llama-agents are:

1. **Agent Structure**: Llama-agents provide a structure that enables Large Language Models (LLMs) to make decisions, use tools, and accomplish tasks.
2. **Customizable Tools**: Transformers Agents come with pre-configured tools that leverage the vast amounts of open-source models hosted on Hugging Face-Hub. Additionally, new tools can be created and shared by publishing a new Hugging Face Space with the proper tool setup.
3. **LlamaIndex Integration**: Llama-agents can integrate with LlamaIndex, a vector index created from a large dataset of text-to-image prompts.

Note that the query asks about the key features of llama-agents, not LlamaIndex.

In [26]:
# Inspect the response metadata: per-node file info plus a "selector_result"
# entry recording which engine was selected and the stated reason.
response.metadata

{'c71a83e3-f32a-4445-9ee3-cad21e9a0778': {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-and-transformers-agents-67042ee1d8d6.html',
  'file_name': 'llamaindex-and-transformers-agents-67042ee1d8d6.html',
  'file_type': 'text/html',
  'file_size': 14762,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'},
 'selector_result': MultiSelection(selections=[SingleSelection(index=1, reason='The question is asking about key features of llama-agents, which is a summarization question related to document content, making choice 2 the most relevant.')])}

In [27]:
# Route the second sample question through the single-selector router.
response = router_query_engine.query(question2)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The question is asking about the performance of RAG system, which is related to summarization questions, hence option 2 is more relevant.
Selecting query engine 1: The question is asking about the performance of RAG system, which is related to summarization questions, hence option 2 is more relevant.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [28]:
# Render the routed answer to question2.
display_response(response)

**`Final Response:`** Based on the provided context information, the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook are:

1. **The Retrieval System**
2. **Response Generation**

These two areas are mentioned in the context as the focus of the "Evaluating RAG with LlamaIndex" section of the cookbook.

In [29]:
# Route the third sample question through the single-selector router and
# render the answer.
response = router_query_engine.query(question3)
display_response(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The question is asking for a summary of the RAG system's performance, which aligns with the description of choice (2) as 'Useful for summarization questions related to document content'.
Selecting query engine 1: The question is asking for a summary of the RAG system's performance, which aligns with the description of choice (2) as 'Useful for summarization questions related to document content'.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**`Final Response:`** Based on the provided context, the two main metrics used to evaluate the performance of the different rerankers in the RAG system are:

1. **Hit Rate**
2. **Mean Reciprocal Rank (MRR)**

In [30]:
# Explicit summarization request, used to check whether the router picks the
# summary tool. The misspelling "summmarization" in the prompt was corrected so
# the selector LLM sees the intended wording.
test_summary_question = "What is the summarization of bridging-the-gap-in-crisis-counseling-introducing-counselor-copilot document?"
response = router_query_engine.query(test_summary_question)
display_response(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: The question asks for the summary of a document, which implies retrieving specific context, making choice 1 the most relevant..
Selecting query engine 0: The question asks for the summary of a document, which implies retrieving specific context, making choice 1 the most relevant..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**`Final Response:`** Based on the provided context, the summary of the "Bridging the Gap in Crisis Counseling: Introducing Counselor Copilot" document is:

The document discusses the challenges faced by crisis counselors, specifically those working with The Trevor Project's TrevorText service, which provides online chat services to LGBTQ+ youth who are contemplating suicide. The counselors face high demand during peak times, administrative tasks, and the need to locate relevant local resources, which can lead to burnout and hinder effective care. The authors introduce "Counselor Copilot", a solution that aims to bridge the gap between the demand and supply of crisis services, and won awards at the LlamaIndex RAG-a-thon.

#### Multi selector

In [31]:
# Router that uses an LLM-based multi selector, which may choose more than one
# tool; the recorded logs show the chosen engines' responses being combined.
# verbose=True prints each selection in addition to the INFO log records, which
# is why each selection message shows up several times in the outputs.
multi_selector_query_engine = RouterQueryEngine(
    selector=LLMMultiSelector.from_defaults(),
    query_engine_tools=[vector_tool, summary_tool],
    verbose=True
)

In [32]:
# Ask the first sample question through the multi-selector router and render
# the answer.
print(question1)
response = multi_selector_query_engine.query(question1)
display_response(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What are key features of llama-agents?
INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: The question is asking about key features of llama-agents, which is about retrieving specific context..
Selecting query engine 0: The question is asking about key features of llama-agents, which is about retrieving specific context..
[1;3;38;5;200mSelecting query engine 0: The question is asking about key features of llama-agents, which is about retrieving specific context..
[0m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**`Final Response:`** According to the provided context information, the key features of llama-agents are:

1. **Distributed Service Oriented Architecture**: every agent can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. **Communication via standardized API interfaces**: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. **Define agentic and explicit orchestration flows**: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. **Ease of deployment**: launch, scale and monitor each agent and your control plane independently.
5. **Scalability and resource management**: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service.

In [33]:
# Ask the second sample question through the multi-selector router and render
# the answer.
print(question2)
response = multi_selector_query_engine.query(question2)
display_response(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: The question is asking about specific areas of RAG system performance, which aligns with the description in option 1: 'Useful for retrieving specific context'..
Selecting query engine 0: The question is asking about specific areas of RAG system performance, which aligns with the description in option 1: 'Useful for retrieving specific context'..
[1;3;38;5;200mSelecting query engine 0: The question is asking about specific areas of RAG system performance, which aligns with the description in option 1: 'Useful for retrieving specific context'..
[0m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: Option 2 is related to summarization questions, which is not the focus of the question..
Selecting query engine 0: Option 2 is related to summarization questions, which is not the focus of the question..


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mSelecting query engine 0: Option 2 is related to summarization questions, which is not the focus of the question..
[0mINFO:llama_index.core.query_engine.router_query_engine:Combining responses from multiple query engines.
Combining responses from multiple query engines.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**`Final Response:`** The two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook are:

1. **The Retrieval System**
2. **Response Generation**

In [34]:
# Ask the third sample question through the multi-selector router and render
# the answer.
print(question3)
response = multi_selector_query_engine.query(question3)
display_response(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: Retrieving specific context is relevant to evaluating the performance of different rerankers in the RAG system, as it suggests the ability to retrieve specific information from the documents..
Selecting query engine 0: Retrieving specific context is relevant to evaluating the performance of different rerankers in the RAG system, as it suggests the ability to retrieve specific information from the documents..
[1;3;38;5;200mSelecting query engine 0: Retrieving specific context is relevant to evaluating the performance of different rerankers in the RAG system, as it suggests the ability to retrieve specific information from the documents..
[0m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: Summarization questions related to document content are also relevant, as they imply the ability to summarize the content of documents and retrieve specific information..
Selecting query engine 1: Summarization questions related to document content are also relevant, as they imply the ability to summarize the content of documents and retrieve specific information..


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mSelecting query engine 1: Summarization questions related to document content are also relevant, as they imply the ability to summarize the content of documents and retrieve specific information..
[0m

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Combining responses from multiple query engines.
Combining responses from multiple query engines.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**`Final Response:`** According to the provided context information, the two main metrics used to evaluate the performance of the different rerankers in the RAG system are:

1. **Hit Rate**: calculates the fraction of queries where the correct answer is found within the top-k retrieved documents.
2. **Mean Reciprocal Rank (MRR)**: evaluates the system's accuracy by looking at the rank of the highest-placed relevant document for each query.

In [35]:
# Same query as the previous cell, run a second time. The LLM samples
# (do_sample=True), and the recorded selector reasons and answer wording differ
# between the two runs.
print(question3)
response = multi_selector_query_engine.query(question3)
display_response(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: The question is asking about specific context, which is a key aspect of the RAG system's performance evaluation, as it involves retrieving specific information from the documents..
Selecting query engine 0: The question is asking about specific context, which is a key aspect of the RAG system's performance evaluation, as it involves retrieving specific information from the documents..
[1;3;38;5;200mSelecting query engine 0: The question is asking about specific context, which is a key aspect of the RAG system's performance evaluation, as it involves retrieving specific information from the documents..
[0m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The question also mentions summarization questions related to document content, which is a key evaluation metric for the RAG system's ability to summarize and retrieve relevant information..
Selecting query engine 1: The question also mentions summarization questions related to document content, which is a key evaluation metric for the RAG system's ability to summarize and retrieve relevant information..


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;5;200mSelecting query engine 1: The question also mentions summarization questions related to document content, which is a key evaluation metric for the RAG system's ability to summarize and retrieve relevant information..
[0m

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Combining responses from multiple query engines.
Combining responses from multiple query engines.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**`Final Response:`** According to the provided context information, the two main metrics used to evaluate the performance of the different rerankers in the RAG system are:

1. **Hit Rate**
2. **Mean Reciprocal Rank (MRR)**

In [36]:
# Ad-hoc question that is not one of the predefined samples; in the recorded
# run the retrieved context did not define diffusion models explicitly.
response = multi_selector_query_engine.query("What are diffusion models?")
display_response(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: Diffusion models are complex and nuanced, requiring specific context to understand..
Selecting query engine 0: Diffusion models are complex and nuanced, requiring specific context to understand..
[1;3;38;5;200mSelecting query engine 0: Diffusion models are complex and nuanced, requiring specific context to understand..
[0m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**`Final Response:`** Based on the context information provided, it appears that DiffusionDB is a dataset of text-to-image prompts, and the context does not explicitly define what diffusion models are. However, it can be inferred that DiffusionDB is likely a dataset of text prompts used to generate images using diffusion-based models, such as diffusion models like DALL-E, Stable Diffusion, or other text-to-image generation models.

In the context of the text, it seems that DiffusionDB is used to create a vector index for the Text2Image Prompt Assistant tool, which can rewrite prompts to generate more beautiful images. This implies that the prompts in DiffusionDB are used to train or fine-tune diffusion models to generate images, and the tool is able to leverage this dataset to generate new prompts for image generation.

Therefore, while the context does not explicitly define what diffusion models are, it is likely that DiffusionDB is a dataset used to train or fine-tune diffusion models for text-to-image generation.

In [37]:
# Inspect which source node and which selected engine produced the answer above.
response.metadata

{'c8871a51-f53e-43c9-aa4f-0d6e4fe9751d': {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-and-transformers-agents-67042ee1d8d6.html',
  'file_name': 'llamaindex-and-transformers-agents-67042ee1d8d6.html',
  'file_type': 'text/html',
  'file_size': 14762,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'},
 'selector_result': MultiSelection(selections=[SingleSelection(index=0, reason='Diffusion models are complex and nuanced, requiring specific context to understand.')])}

### Sub-question query engine

In [20]:
from llama_index.core.query_engine import SubQuestionQueryEngine

In [37]:
# Sub-question query engine over the same two tools; verbose output and async
# execution are enabled.
# NOTE(review): no llm is passed, so sub-question generation presumably uses
# the globally configured Settings.llm — confirm.
subquery_engine = SubQuestionQueryEngine.from_defaults(
    [vector_tool, summary_tool],
    verbose=True,
    use_async=True
)

In [38]:
# Store the result under `response`, the name every other cell uses; the
# previous spelling `reponse` left `response` pointing at an older answer.
# NOTE: in the recorded run this call raised
# "ValueError: No valid JSON found in output" because the LLM replied with
# prose and code instead of the JSON list of sub-questions.
response = subquery_engine.query(question1)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


ValueError: No valid JSON found in output: 

To generate the list of relevant sub-questions, we'll use the following approach:

1. Tokenize the user question into individual words.
2. Identify the main concepts or entities in the question (e.g., "Uber", "Lyft", "revenue growth", "EBITDA").
3. Map each concept or entity to the relevant tools that can provide information about it.
4. Generate sub-questions for each tool that can help answer the main question.

Here's the Python code to achieve this:

```python
import json

def generate_sub_questions(user_question, tools):
    # Tokenize the user question into individual words
    tokens = user_question.split()

    # Identify the main concepts or entities in the question
    concepts = []
    for token in tokens:
        if token.lower() in ["uber", "lyft", "revenue", "growth", "ebitda"]:
            concepts.append(token)

    # Map each concept or entity to the relevant tools
    sub_questions = []
    for concept in concepts:
        for tool_name, description in tools.items():
            if concept.lower() in description.lower():
                sub_questions.append({
                    "sub_question": f"What is the {concept} of {concept}?",
                    "tool_name": tool_name
                })

    return sub_questions

# Example 1
tools = {
    "uber_10k": "Provides information about Uber financials for year 2021",
    "lyft_10k": "Provides information about Lyft financials for year 2021"
}

user_question = "Compare and contrast the revenue growth and EBITDA of Uber and Lyft for year 2021"
print(json.dumps(generate_sub_questions(user_question, tools), indent=4))

# Example 2
tools = {
    "vector_search": "Useful for retrieving specific context",
    "summary": "Useful for summarization questions related to document content"
}

user_question = "What are key features of llama-agents?"
print(json.dumps(generate_sub_questions(user_question, tools), indent=4))
```

Output:

```json
[
    {
        "sub_question": "What is the revenue growth of Uber?",
        "tool_name": "uber_10k"
    },
    {
        "sub_question": "What is the EBITDA of Uber?",
        "tool_name": "uber_10k"
    },
    {
        "sub_question": "What is the revenue growth of Lyft?",
        "tool

In [None]:
disp