In [1]:
import logging
import sys

# Send log records to stdout at INFO level through a single root handler.
# basicConfig() already installs a StreamHandler for `stream`; attaching a
# second StreamHandler to the same stream printed every record twice.
# force=True (Python 3.8+) replaces handlers left over from an earlier run of
# this cell, so re-executing it does not stack handlers either.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [2]:
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, Settings
from llama_index.core.indices import load_index_from_storage
from llama_index.core import VectorStoreIndex, DocumentSummaryIndex
from llama_index.core import SummaryIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core.response.notebook_utils import display_response
import chromadb
import torch
import model_utils
import prompt_utils
import nest_asyncio
# Allow nested asyncio event loops (presumably needed because the notebook
# kernel already runs a loop and the query engines below set use_async=True).
nest_asyncio.apply()




INFO:datasets:PyTorch version 2.3.0+cu118 available.
PyTorch version 2.3.0+cu118 available.


In [3]:
# Embedding model read from the local directory models/bge-small-en-v1.5
# (a copy of BAAI/bge-small-en-v1.5) and placed on the GPU.
embed_model = HuggingFaceEmbedding(
    model_name="models/bge-small-en-v1.5",
    device="cuda",
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: models/bge-small-en-v1.5
Load pretrained SentenceTransformer: models/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [4]:
# Load the local Llama-2 chat checkpoint through the project's model_utils
# helper, which returns the model together with its tokenizer.
model_name = "models/Llama-2-7b-chat-hf"
model, tokenizer = model_utils.load_quantized_model(
    model_name_or_path=model_name,
    device="cuda",
)

# Wrap the already-loaded model and tokenizer as a LlamaIndex LLM.
llm_hf = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    model_name=model_name,
    device_map="cuda",
    context_window=4096,
    max_new_tokens=512,
    # Llama-2 chat instruction wrapper placed around every query.
    query_wrapper_prompt=PromptTemplate("<s> [INST] {query_str} [/INST] "),
    # Sampling is enabled, so answers can differ from run to run.
    generate_kwargs={"temperature": 0.7, "do_sample": True},
)

Loading tokenizer and model with quantization config from: models/Llama-2-7b-chat-hf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Settings.embed_model = embed_model
# Settings.llm = llm_hf

## Load vector index and document summary index

In [6]:
# Open the on-disk Chroma database stored under ./chroma_db (the path is
# relative to the notebook's working directory).
chroma_client = chromadb.PersistentClient(path="./chroma_db")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [7]:
# Show which collections exist in the local Chroma database.
chroma_client.list_collections()

[Collection(id=3186f0ec-26e5-46fa-b687-281a3a26066f, name=llama_index_blogs),
 Collection(id=46aee9cc-50b0-4474-90d9-61e1e160c15d, name=blogs_vector_index),
 Collection(id=cf744ea1-c23d-41c8-8204-363adcd4b3fe, name=llma_blogs_summary),
 Collection(id=e2bc79ee-49dd-45e4-85f3-6acb87185f7a, name=blogs_summary)]

In [8]:
# Open the existing collection that backs the vector index. get_collection()
# raises if the name is missing, whereas get_or_create_collection() would
# silently create an empty collection (for example on a misspelled name) and
# every later query would then run against no data.
vector_collection = chroma_client.get_collection("blogs_vector_index")
vector_store = ChromaVectorStore(chroma_collection=vector_collection)
vector_storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7fb93167fee0>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7fb933a79a20>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={}), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7fb93169d480>, property_graph_store=None)

In [9]:
# Build the index object on top of the existing Chroma-backed vector store.
# NOTE(review): from_vector_store may derive its own storage context from
# `vector_store` and may not use `llm` at construction time — confirm against
# the installed llama-index version before relying on either argument.
vector_index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=vector_storage_context,
    embed_model=embed_model,
    llm=llm_hf,
)
vector_index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7fb93169da50>

In [10]:
# Open the existing collection that backs the document-summary index.
# get_collection() raises if the name is missing; get_or_create_collection()
# would instead create an empty collection and hide the mistake.
doc_sum_collection = chroma_client.get_collection("blogs_summary")
doc_sum_vector_store = ChromaVectorStore(
    chroma_collection=doc_sum_collection
)

# The vectors come from Chroma, while the remaining stores are read from the
# persisted directory below.
doc_sum_storage_context = StorageContext.from_defaults(
    vector_store=doc_sum_vector_store,
    persist_dir="./database/blogs_summary_index/"
)
doc_sum_storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7fb93169d150>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7fb90e11b910>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={})}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7fb90e11b6a0>, property_graph_store=None)

In [11]:
# Restore the persisted index from the storage context built above, wiring in
# the local embedding model and LLM. The recorded output shows that this
# yields a DocumentSummaryIndex.
doc_summary_index = load_index_from_storage(
    storage_context=doc_sum_storage_context,
    embed_model=embed_model,
    llm=llm_hf,
)
doc_summary_index

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


<llama_index.core.indices.document_summary.base.DocumentSummaryIndex at 0x7fb93169cf70>

## Advanced Querying

In [12]:
# Sample questions used throughout the notebook. The long ones are written as
# parenthesised string literals so that no leading or trailing newline ends up
# in the query text that is embedded and placed into the prompt.
question1 = "What are key features of llama-agents?"
question2 = (
    "What are the two critical areas of RAG system performance that are "
    'assessed in the "Evaluating RAG with LlamaIndex" section of the '
    "OpenAI Cookbook?"
)
question3 = (
    "What are the two main metrics used to evaluate the performance of the "
    "different rerankers in the RAG system?"
)

In [13]:
def print_ref_docs(resp_metadata):
    """Print a numbered list of blog URLs for the documents behind a response.

    Args:
        resp_metadata: Mapping of node id -> metadata dict, as found on
            ``response.metadata``. May be ``None`` or empty. Entries that are
            not dicts or have no ``'file_name'`` (for example the
            ``'selector_result'`` entry added by a router query engine) are
            skipped instead of raising.

    Returns:
        None. The list is written to stdout.
    """
    print("References:")
    base_url = "https://www.llamaindex.ai/blog/"
    file_names = [
        doc_metadata["file_name"]
        for doc_metadata in (resp_metadata or {}).values()
        if isinstance(doc_metadata, dict) and doc_metadata.get("file_name")
    ]
    for position, file_name in enumerate(file_names, start=1):
        # Strip only the final extension so names containing dots stay intact.
        slug = file_name.rsplit(".", 1)[0]
        print(f"{position}.", base_url + slug)

In [14]:
# Register the local models as the LlamaIndex global defaults.
Settings.llm = llm_hf
Settings.embed_model = embed_model
# Count tokens with the LLM's own tokenizer when context is packed into
# prompts. The recorded run warned that a generation call would exceed the
# model's 4096-token limit, which is the likely result of sizing prompts with
# a tokenizer that does not match the model.
# NOTE(review): assumes `tokenizer` (returned by
# model_utils.load_quantized_model) exposes a Hugging Face style
# .encode(text) -> list of token ids — confirm.
Settings.tokenizer = tokenizer.encode

In [15]:
# tracing log
import llama_index.core

# Raise root logging to DEBUG for tracing. A plain basicConfig() call does
# nothing once the root logger already has handlers (the first cell installs
# them), so the level would stay at INFO; force=True re-applies the config
# with a single stdout handler.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
# Enable LlamaIndex's "simple" global handler.
llama_index.core.set_global_handler("simple")

### VectorIndex as query engine

In [16]:
# Query engine over the vector index, using the "compact" response mode.
vector_query_engine = vector_index.as_query_engine(
    use_async=True,
    response_mode="compact",
)

In [17]:
# Ask question1 through the vector query engine, render the answer, then list
# the source blog URLs taken from the response metadata.
print("Question:", question1)
response1 = vector_query_engine.query(question1)
display_response(response1)
print_ref_docs(response1.metadata)

Question: What are key features of llama-agents?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** Based on the provided context information, the key features of llama-agents are:

1. Distributed Service Oriented Architecture: LlamaIndex allows each agent to be its own independently running microservice, with a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: Agents can communicate with each other using a central control plane orchestrator, and pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: Developers have the flexibility to directly define the sequence of interactions between agents or leave it up to an "agentic orchestrator" that decides which agents are relevant to the task.
4. Ease of deployment: LlamaIndex allows developers to launch, scale, and monitor each agent and the control plane independently.
5. Scalability and resource management: LlamaIndex provides built-in observability tools to monitor the quality and performance of the system and each individual agent service.

These are the main features of llama-agents, based on the context information provided.

References:
1. https://www.llamaindex.ai/blog/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems


### DocumentSummaryIndex as query engine

In [18]:
# Query engine over the document-summary index, using the "tree_summarize"
# response mode.
summary_query_engine = doc_summary_index.as_query_engine(
    use_async=True,
    response_mode="tree_summarize",
)

In [19]:
# Ask the same question1 through the document-summary query engine; the answer
# is displayed in the next cell.
print("Question:", question1)
response = summary_query_engine.query(question1)

Question: What are key features of llama-agents?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


In [20]:
# Render the document-summary answer and list its source blog URLs.
display_response(response)
print_ref_docs(response.metadata)

**`Final Response:`** Based on the information provided in the article, the key features of LlmaAgents are:

1. Vector database: LlmaAgents use a vector database created from DiffusionDB, which is a large-scale knowledge graph that contains billions of vectors. This database is used to suggest better prompts when generating images.
2. Text-to-image prompts: LlmaAgents can generate text-to-image prompts using the transformers model. This allows for the generation of images from text descriptions.
3. Temperature variable: The temperature variable allows for controlling the variation in the generated prompts. With a temperature above zero, each prompt generated by LlmaIndex with the same agent prompt will be brand new.
4. Custom tools: LlmaAgents can be used to distribute and share custom tools in Transformers Agents using Hugging Face Spaces.
5. Easy to use: The article mentions that the tool is easy to use, and the author provides an example of how to use the tool in the article.
6. Improved image generation: The article claims that the tool can generate more stylized and varied images compared to the existing image-generator tool.

Overall, LlmaAgents are a powerful tool for generating text-to-image prompts using transformers models, and they offer a range of features that make them easy to use and customize.

References:
1. https://www.llamaindex.ai/blog/llamaindex-and-transformers-agents-67042ee1d8d6


### Router Query Engine

In [21]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector

In [22]:
# Wrap each query engine as a tool. The description text is what the router's
# selector sees when choosing between them (the recorded log echoes it).
vector_tool = QueryEngineTool(
    query_engine=vector_query_engine,
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for retrieving specific context",
    ),
)

summary_tool = QueryEngineTool(
    query_engine=summary_query_engine,
    metadata=ToolMetadata(
        name="summary",
        description="Useful for summarization questions related to document content",
    ),
)

#### Single selector

In [23]:
# Router that uses an LLM-based single selector to choose between the two
# tools for each query.
router_query_engine = RouterQueryEngine.from_defaults(
    query_engine_tools=[vector_tool, summary_tool],
    selector=LLMSingleSelector.from_defaults(),
)

In [24]:
# Route question1; the recorded log shows that query engine 0 (the vector
# tool) was selected.
response = router_query_engine.query(question1)

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: Useful for retrieving specific context.
Selecting query engine 0: Useful for retrieving specific context.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
# Render the routed answer for question1.
display_response(response)

**`Final Response:`** The key features of llama-agents are:

1. Distributed Service Oriented Architecture: Every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Communication via standardized API interfaces: Interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. Define agentic and explicit orchestration flows: Developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. Ease of deployment: Launch, scale, and monitor each agent and your control plane independently.
5. Scalability and resource management: Use built-in observability tools to monitor the quality and performance of the system and each individual agent service.

In [26]:
# Inspect the response metadata: source-node entries plus the router's
# 'selector_result' entry.
response.metadata

{'05e03730-b3db-464d-831f-b87fd9c5e3b7': {'file_path': '/workspace/projects/LlamindexHelper/data/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html',
  'file_name': 'introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html',
  'file_type': 'text/html',
  'file_size': 18790,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'},
 'selector_result': MultiSelection(selections=[SingleSelection(index=0, reason='Useful for retrieving specific context')])}

In [27]:
# NOTE(review): in the recorded run this call raised OutputParserException —
# the selector's LLM output used unquoted JSON keys and could not be parsed.
# When that happens `response` is not reassigned by this cell.
response = router_query_engine.query(question2)

OutputParserException: Got invalid JSON object. Error: Expecting property name enclosed in double quotes: line 2 column 5 (char 6) expected '<document start>', but found '<block mapping start>'
  in "<unicode string>", line 6, column 1:
    Here is the JSON output:
    ^. Got JSON string: {
    choice: 1,
    reason: "The question is asking for specific context, so choice 1 is the most relevant. The RAG system is used to evaluate and monitor the performance of a system, and choice 1 is the most relevant because it is the most useful for retrieving specific context related to the system's performance."
}

Here is the JSON output:
[
    {
        choice: 1,
        reason: "The question is asking for specific context"
    }

In [None]:
# NOTE(review): if the preceding query cell raised, `response` still holds the
# earlier question1 result, so this would display a stale answer.
display_response(response)

In [None]:
# Route question3 with the single-selector router and render the answer.
response = router_query_engine.query(question3)
display_response(response)

In [None]:
# Summary-style question about one specific blog post, used to check whether
# the router picks the summary tool. The misspelling "summmarization" in the
# query text is corrected.
test_summary_question = "What is the summarization of bridging-the-gap-in-crisis-counseling-introducing-counselor-copilot document?"
response = router_query_engine.query(test_summary_question)
display_response(response)

#### Multi selector

In [None]:
# Router built with an LLM-based multi selector over the same two tools;
# verbose=True is passed through to the engine.
multi_selector_query_engine = RouterQueryEngine(
    query_engine_tools=[vector_tool, summary_tool],
    selector=LLMMultiSelector.from_defaults(),
    verbose=True,
)

In [None]:
# Route question1 with the multi-selector engine and render the answer.
print(question1)
response = multi_selector_query_engine.query(question1)
display_response(response)

In [None]:
# Route question2 with the multi-selector engine and render the answer.
print(question2)
response = multi_selector_query_engine.query(question2)
display_response(response)

In [None]:
# Route question3 with the multi-selector engine and render the answer.
print(question3)
response = multi_selector_query_engine.query(question3)
display_response(response)

In [None]:
# NOTE(review): this cell repeats the previous question3 cell verbatim. It may
# have been meant for a different question (e.g. test_summary_question) —
# confirm the intent or remove the duplicate.
print(question3)
response = multi_selector_query_engine.query(question3)
display_response(response)

In [None]:
# Ad-hoc question routed through the multi-selector engine.
response = multi_selector_query_engine.query("What are diffusion models?")
display_response(response)

In [None]:
# Inspect the metadata of the last multi-selector response.
response.metadata