In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, Settings
from llama_index.core.indices import load_index_from_storage
from llama_index.core import VectorStoreIndex, DocumentSummaryIndex
from llama_index.core import SummaryIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core.response.notebook_utils import display_response
import chromadb
import torch
import model_utils
import prompt_utils
import nest_asyncio
nest_asyncio.apply()




INFO:datasets:PyTorch version 2.3.0+cu118 available.
PyTorch version 2.3.0+cu118 available.


In [3]:
# loads BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="models/bge-small-en-v1.5", device="cuda")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: models/bge-small-en-v1.5
Load pretrained SentenceTransformer: models/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [4]:
# load local llm zephyr
model_name = "models/zephyr-7b-beta"
model, tokenizer = model_utils.load_quantized_model(
    model_name_or_path=model_name,
    device="cuda"
)
# Set `pad_token_id` to `eos_token_id`
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# config llm and embed_model to llamaindex
llm_hf = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    generate_kwargs={
        "temperature": 0.7,
        "top_k": 50, 
        "top_p": 0.95,
        "do_sample": True
    },
    device_map="cuda",
    model_name=model_name,
    model=model,
    messages_to_prompt=prompt_utils.zephyr_messages_to_prompt,
    tokenizer=tokenizer
)

Loading tokenizer and model with quantization config from: models/zephyr-7b-beta


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

## Load vector index and document summary index

In [7]:
# Creates a persistent instance of Chroma that saves to disk
chroma_client = chromadb.PersistentClient(path="./chroma_db")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [8]:
chroma_client.list_collections()

[Collection(id=3186f0ec-26e5-46fa-b687-281a3a26066f, name=llama_index_blogs),
 Collection(id=46aee9cc-50b0-4474-90d9-61e1e160c15d, name=blogs_vector_index),
 Collection(id=cf744ea1-c23d-41c8-8204-363adcd4b3fe, name=llma_blogs_summary),
 Collection(id=e2bc79ee-49dd-45e4-85f3-6acb87185f7a, name=blogs_summary)]

In [9]:
# Get or create a collection with the given name and metadata.
vector_collection = chroma_client.get_or_create_collection("blogs_vector_index")
vector_store = ChromaVectorStore(chroma_collection=vector_collection)
vector_storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7f8421da2320>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7f84212dfbb0>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={}), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7f84212df8e0>, property_graph_store=None)

In [10]:
# load your index from stored vectors
vector_index = VectorStoreIndex.from_vector_store(
    llm=llm_hf,
    embed_model=embed_model,
    vector_store=vector_store, 
    storage_context=vector_storage_context
)
vector_index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7f84213055a0>

In [11]:
# Get or create a collection with the given name and metadata.
doc_sum_collection = chroma_client.get_or_create_collection("blogs_summary")
doc_sum_vector_store = ChromaVectorStore(
    chroma_collection=doc_sum_collection
)

doc_sum_storage_context = StorageContext.from_defaults(
    vector_store=doc_sum_vector_store,
    persist_dir="./database/blogs_summary_index/"
)
doc_sum_storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7f84213050c0>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7f8402ab78b0>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={})}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7f8402ab76a0>, property_graph_store=None)

In [12]:
doc_summary_index = load_index_from_storage(
    llm=llm_hf,
    embed_model=embed_model,
    storage_context=doc_sum_storage_context
)
doc_summary_index

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


<llama_index.core.indices.document_summary.base.DocumentSummaryIndex at 0x7f8421304790>

## Advanced Querying

In [13]:
question1 = "What are key features of llama-agents?"
question2 = '''
What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?
'''
question3 = '''
What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?
'''

In [14]:
def print_ref_docs(resp_metadata):
    print("References:")
    base_url = "https://www.llamaindex.ai/blog/"
    for idx, (_, doc_metatada) in enumerate(resp_metadata.items()):
        ref_url = base_url + doc_metatada['file_name'].split(".")[0]
        print(f"{idx+1}.", ref_url)

In [15]:
Settings.llm = llm_hf
Settings.embed_model = embed_model

In [16]:
# tracing log
import llama_index.core
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
llama_index.core.set_global_handler("simple")

### VectorIndex as query engine

In [17]:
vector_query_engine = vector_index.as_query_engine(
    response_mode="compact", 
    use_async=True,
)

In [18]:
print("Question:", question1)
response1 = vector_query_engine.query(question1)
display_response(response1)
print_ref_docs(response1.metadata)

Question: What are key features of llama-agents?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** Some key features of llama-agents, as mentioned in the context information, are:

1. Distributed Service Oriented Architecture: Each agent in LlamaIndex can be its own independently running microservice, orchestrated by a control plane.
2. Communication via standardized API interfaces: Agents can communicate with each other using a central control plane orchestrator or a message queue.
3. Define agentic and explicit orchestration flows: Developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an "agentic orchestrator" that decides which agents are relevant to the task.
4. Ease of deployment: Launch, scale, and monitor each agent and the control plane independently.
5. Scalability and resource management: Use built-in observability tools to monitor the quality and performance of the system and each individual agent service.

More information and resources for getting started with llama-agents can be found in the context information provided.

References:
1. https://www.llamaindex.ai/blog/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems


### DocumentSummaryIndex as query engine

In [19]:
summary_query_engine = doc_summary_index.as_query_engine(
    response_mode="tree_summarize", 
    use_async=True,
)

In [20]:
print("Question:", question1)
response = summary_query_engine.query(question1)

Question: What are key features of llama-agents?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
display_response(response)
print_ref_docs(response.metadata)

**`Final Response:`** Llama-agents are a type of language model (LLM) agent developed by Hugging Face with the following key features:

1. Based on large LLMs such as GPT-3, which have been trained on vast amounts of text data.
2. Use the transformers architecture for processing large amounts of text data efficiently and accurately.
3. Can be customized with various tools and libraries such as LlamaIndex, a tool for vector databases that can suggest better prompts when generating images.
4. Open-source, allowing developers to modify and extend the agents to fit their specific needs.
5. Capable of performing multiple tasks simultaneously, making them versatile and useful for various applications.
6. Provide real-time feedback on their performance, allowing developers to fine-tune and improve the agents as needed.
7. Integration with popular frameworks such as TensorFlow, PyTorch, and Hugging Face Transformers, making it easy to train, test, and deploy Llama-agents in various environments.
8. Scalable and capable of handling large datasets, making them suitable for handling complex tasks and large volumes of data.
9. Capable of achieving high accuracy on various tasks, making them a reliable choice for many applications.
10. Developed collaboratively by a community of developers, ensuring that they are continually improved and updated with the latest advancements.

References:
1. https://www.llamaindex.ai/blog/llamaindex-and-transformers-agents-67042ee1d8d6


### Router Query Engine

In [22]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector

In [23]:
vector_tool = QueryEngineTool(
    vector_query_engine,
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for retrieving specific context"
    )
)

summary_tool = QueryEngineTool(
    summary_query_engine,
    metadata=ToolMetadata(
        name="summary",
        description="Useful for summarization questions related to document content"
    )
)

#### Single selector

In [24]:
router_query_engine = RouterQueryEngine.from_defaults(
    [vector_tool, summary_tool],
    selector=LLMSingleSelector.from_defaults()
)

In [25]:
response = router_query_engine.query(question1)

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The summary provided for choice 2 suggests that it would be useful for summarization questions related to document content, and as 'key features of llama-agents' would likely be discussed in the context of the document, choice 2 is more relevant to the question..
Selecting query engine 1: The summary provided for choice 2 suggests that it would be useful for summarization questions related to document content, and as 'key features of llama-agents' would likely be discussed in the context of the document, choice 2 is more relevant to the question..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
display_response(response)

**`Final Response:`** Llama-agents, which are intelligent software agents built using LLM agents, have several key features. These include the ability to interact with external services, perform specific tasks, integrate with external tools, learn and adapt, work collaboratively, handle large amounts of data, handle natural language and multimodal data, provide explanations, and work in real-time. These features make llama-agents versatile and powerful, allowing them to be useful in various applications such as search engines, recommendation engines, knowledge graphs, chatbots, virtual assistants, customer support, decision making, diagnosis, and analysis.

In [27]:
response.metadata

{'c71a83e3-f32a-4445-9ee3-cad21e9a0778': {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-and-transformers-agents-67042ee1d8d6.html',
  'file_name': 'llamaindex-and-transformers-agents-67042ee1d8d6.html',
  'file_type': 'text/html',
  'file_size': 14762,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'},
 'selector_result': MultiSelection(selections=[SingleSelection(index=1, reason="The summary provided for choice 2 suggests that it would be useful for summarization questions related to document content, and as 'key features of llama-agents' would likely be discussed in the context of the document, choice 2 is more relevant to the question.")])}

In [28]:
response = router_query_engine.query(question2)

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The question asks about summarization questions related to document content, which suggests that choice (2) is most relevant as it relates to document content and summarization..
Selecting query engine 1: The question asks about summarization questions related to document content, which suggests that choice (2) is most relevant as it relates to document content and summarization..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
display_response(response)

**`Final Response:`** The two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook are "the Retrieval System" and "Response Generation."

In [30]:
response = router_query_engine.query(question3)
display_response(response)

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The question asks about metrics for evaluating rerankers in the RAG system, which suggests that it relates to questions about document content. Choice 2 indicates that this option is useful for summarization questions related to document content, which aligns with the nature of the question..
Selecting query engine 1: The question asks about metrics for evaluating rerankers in the RAG system, which suggests that it relates to questions about document content. Choice 2 indicates that this option is useful for summarization questions related to document content, which aligns with the nature of the question..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** The two main metrics used to evaluate the performance of the different rerankers in the RAG system are Hit Rate and Mean Reciprocal Rank (MRR).

In [31]:
test_summary_question = "What is the summmarization of bridging-the-gap-in-crisis-counseling-introducing-counselor-copilot document?"
response = router_query_engine.query(test_summary_question)
display_response(response)

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The question asks for a summary of the document, which suggests that choice 2 is more relevant as it is specifically related to summarization questions. Choice 1, while useful for retrieving specific context, may not necessarily provide a summary of the document as a whole..
Selecting query engine 1: The question asks for a summary of the document, which suggests that choice 2 is more relevant as it is specifically related to summarization questions. Choice 1, while useful for retrieving specific context, may not necessarily provide a summary of the document as a whole..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** The document describes a winning solution, Counselor Copilot, from a hackathon focused on bridging the gap between demand and supply of crisis services. The AI copilot automates administrative tasks for crisis counselors, allowing them to focus on providing care. It extracts contact data from complex PDFs, suggests appropriate replies based on The Trevor Project's guidelines, searches for location-specific resources, and completes case forms. The solution also includes a ReAct Agent that deploys the right tool based on the chat history and contact context. The document highlights the potential for others to build on this work and suggests possible extensions, such as reducing costs and improving the quality of suggested responses and adding a tool for assessing the stage of the conversation. The document concludes by stating that the AI copilot represents a significant step toward more efficient and effective crisis care, enhancing the quality of care provided and addressing the pressing issue of counselor shortage by maximizing the impact of existing resources.

### MultiSelector

In [32]:
multi_selector_query_engine = RouterQueryEngine(
    selector=LLMMultiSelector.from_defaults(),
    query_engine_tools=[vector_tool, summary_tool],
    verbose=True
)

In [33]:
print(question1)
response = multi_selector_query_engine.query(question1)
display_response(response)

What are key features of llama-agents?
INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: llama-agents have specific contexts that can be retrieved for analysis, making choice 1 the most relevant for this question..
Selecting query engine 0: llama-agents have specific contexts that can be retrieved for analysis, making choice 1 the most relevant for this question..
[1;3;38;5;200mSelecting query engine 0: llama-agents have specific contexts that can be retrieved for analysis, making choice 1 the most relevant for this question..
[0m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** 1. Distributed Service Oriented Architecture: Every agent in LlamaIndex can be its own independently running microservice, orchestrated by a control plane.
2. Communication via standardized API interfaces: Agents communicate using a central control plane orchestrator or a message queue.
3. Define agentic and explicit orchestration flows: Developers have the flexibility to directly define the sequence of interactions between agents or leave it up to an “agentic orchestrator”.
4. Ease of deployment: Launch, scale, and monitor each agent and the control plane independently.
5. Scalability and resource management: Use built-in observability tools to monitor the quality and performance of the system and each individual agent service.

These features simplify the process of building, iterating, and deploying multi-agent AI systems for complex question-answering systems, collaborative AI assistants, and distributed AI workflows. With llama-agents, developers can orchestrate tasks using a LLM-powered control plane and pass messages between agents using standardized API interfaces. Additionally, developers have the flexibility to directly define the sequence of interactions between agents or leave it up to an “agentic orchestrator”. The framework also enables ease of deployment and scalability through independent launch, scaling, and monitoring of each agent and the control plane. Built-in observability tools are provided for monitoring the quality and performance of the system and each individual agent service.

In [34]:
print(question2)
response = multi_selector_query_engine.query(question2)
display_response(response)


What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: Choice 1 is most relevant as it suggests that the RAG system's performance in retrieving specific context is assessed in the section 'Evaluating RAG with LlamaIndex' of the OpenAI Cookbook..
Selecting query engine 0: Choice 1 is most relevant as it suggests that the RAG system's performance in retrieving specific context is assessed in the section 'Evaluating RAG with LlamaIndex' of the OpenAI Cookbook..
[1;3;38;5;200mSelecting query engine 0: Choice 1 is most relevant as it suggests that the RAG system's performance in retrieving specific context is assessed in the section 'Evaluating RAG with LlamaIndex' of the OpenAI Cookbook..
[0m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** The two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook are "the Retrieval System" and "Response Generation."

In [35]:
print(question3)
response = multi_selector_query_engine.query(question3)
display_response(response)


What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: The first choice may be relevant to locating specific context, but it is not directly related to summarization questions related to document content. The second choice is more appropriate as it pertains to metrics used to evaluate the performance of rerankers in the RAG system, which is directly related to summarization questions..
Selecting query engine 1: The first choice may be relevant to locating specific context, but it is not directly related to summarization questions related to document content. The second choice is more appropriate as it pertains to metrics used to evaluate the performance of rerankers in the RAG system, which is directly related to summarization questions..
[1;3;38;5;200mSelecting query engine 1: The first choice may be relevant to locating specific context, but it is 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** The two main metrics used to evaluate the performance of the different rerankers in the RAG system are Hit Rate and Mean Reciprocal Rank (MRR).

In [36]:
print(question3)
response = multi_selector_query_engine.query(question3)
display_response(response)


What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?



OutputParserException: Got invalid JSON object. Error: Extra data: line 12 column 1 (char 514) expected '<document start>', but found '<scalar>'
  in "<unicode string>", line 12, column 1:
    The output should be ONLY JSON f ... 
    ^. Got JSON string: [
  {
    "choice": 1,
    "reason": "These metrics are likely specific to the context of the RAG system, making choice 1 the most relevant for retrieving this information."
  },
  {
    "choice": 2,
    "reason": "However, it's also possible that the question is asking for a summary of the document content, in which case choice 2 may be more relevant. In this case, both choices could potentially provide useful information, but it's best to choose only what's needed to answer the question accurately."
  }
]

The output should be ONLY JSON formatted as a JSON instance.

Here is an example:
[
  {
    "choice": 1,
    "reason": "These metrics are likely specific to the context of the RAG system, making choice 1 the most relevant for retrieving this information."
  },
  {
    "choice": 2,
    "reason": "However, it's also possible that the question is asking for a summary of the document content, in which case choice 2 may be more relevant. In this case, both choices could potentially provide useful information, but it's best to choose only what's needed to answer the question accurately."
  }
]

The output should be ONLY JSON formatted as a JSON instance.

Here is an example:
[
  { "choice": 1, "reason": "These metrics are likely specific to the context of the RAG system, making choice 1 the most relevant for retrieving this information." },
  { "choice": 2, "reason": "However, it's also possible that the question is asking for a summary of the document content, in which case choice 2 may be more relevant. In this case, both choices could potentially provide useful information, but it's best to choose only what's needed to answer the question accurately." }
]