In [3]:
import nest_asyncio

# Patch asyncio so event loops can be nested (the Jupyter kernel already runs
# one); the query engine further down is created with use_async=True.
nest_asyncio.apply()

In [4]:
import logging
import sys

# Send log records to stdout so they appear under the cell that produced them.
# basicConfig() already installs one stdout handler on the root logger, so no
# second StreamHandler is attached (that would print every record twice).
# force=True replaces any existing root handlers, which keeps this cell
# idempotent when it is re-run in the same kernel.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [5]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.response.notebook_utils import display_response, display_metadata
import model_utils, prompt_utils, db_utils




In [6]:
# open the on-disk Qdrant collection through the project helper
qdrant_location = {
    "local_path": "./qdrant_db",
    "coll_name": "llamaindex-blogs",
}
vector_store, storage_context = db_utils.load_qdrant_db(**qdrant_location)
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7fdc3cc31690>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7fdc3cc6b580>, vector_stores={'default': QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='llamaindex-blogs', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False, index_doc_id=True, fastembed_sparse_model=None), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7fdc3cc6b640>, property_graph_store=None)

In [5]:
# register the local BGE checkpoint as the global LlamaIndex embedding model
embedding_model = HuggingFaceEmbedding(model_name="models/bge-small-en-v1.5/", device="cuda")
Settings.embed_model = embedding_model

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: models/bge-small-en-v1.5/
Load pretrained SentenceTransformer: models/bge-small-en-v1.5/
Load pretrained SentenceTransformer: models/bge-small-en-v1.5/
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [6]:
# build the index object on top of the vector store loaded above
index_options = {
    "storage_context": storage_context,
    "show_progress": True,
}
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, **index_options)

In [7]:
# load the quantized model and its tokenizer from the local checkpoint directory
model_name = "models/Meta-Llama-3-8B-Instruct"
model, tokenizer = model_utils.load_quantized_model(model_name_or_path=model_name, device="cuda")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading tokenizer and model with quantization config from: models/Meta-Llama-3-8B-Instruct
DEBUG:bitsandbytes.cextension:Loading bitsandbytes native library from: /opt/conda/envs/dev/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
Loading bitsandbytes native library from: /opt/conda/envs/dev/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
Loading bitsandbytes native library from: /opt/conda/envs/dev/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# wrap the already-loaded model/tokenizer for LlamaIndex and make it the global LLM
llama3_prompt = PromptTemplate(prompt_utils.get_llama3_prompt_template())
sampling_kwargs = {"temperature": 0.7, "do_sample": True}

llm_hf = HuggingFaceLLM(
    model_name=model_name,
    model=model,
    tokenizer=tokenizer,
    device_map="cuda",
    context_window=4096,
    max_new_tokens=512,
    query_wrapper_prompt=llama3_prompt,
    generate_kwargs=sampling_kwargs,
)
Settings.llm = llm_hf

## Basic Query

In [15]:
# sample questions for the query cells
question1 = "What are key features of llama-agents?"
question2 = (
    "What are the two critical areas of RAG system performance that are assessed "
    'in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?'
)
question3 = (
    "What are the two main metrics used to evaluate the performance "
    "of the different rerankers in the RAG system?"
)

In [16]:
# Basic query: echo the question, build a query engine from the index
# (async mode enabled), and keep the answer in `response` for the display
# cells that follow.
print("Question:", question1)
query_engine = index.as_query_engine(use_async=True)
response = query_engine.query(question1)

Question: What are key features of llama-agents?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.core.indices.utils:> Top 2 nodes:
> [Node ccb2800f-1560-404a-ae19-fd4e729ab440] [Similarity score:             0.763726] <div class="BlogPost_htmlPost__Z5oDL">
 <p class="Text_text__zPO0D Text_text-size-16__PkjFu">
  W...
> [Node 067e700a-84e7-482f-a7aa-d573faeb1e27] [Similarity score:             0.720235] """</span>
    <span class="hljs-keyword">return</span> <span class="hljs-string">"The secret fac...
> Top 2 nodes:
> [Node ccb2800f-1560-404a-ae19-fd4e729ab440] [Similarity score:             0.763726] <div class="BlogPost_htmlPost__Z5oDL">
 <p class="Text_text__zPO0D Text_text-size-16__PkjFu">
  W...
> [Node 067e700a-84e7-482f-a7aa-d573faeb1e27] [Similarity score:             0.720235] """</span>
    <span class="hljs-keyword">return</span> <span class="hljs-string">"The secret fac...
> Top 2 nodes:
> [Node ccb2800f-1560-404a-ae19-fd4e729ab440] [Similarity score:             0.763726] <div class="BlogPost_htmlPost__Z5oDL">
 <p class="Text_text__zPO0D Text_text-si

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [18]:
# render the answer stored in `response` by the query cell
display_response(response)

**`Final Response:`** According to the provided context information, the key features of llama-agents are:

1. **Distributed Service Oriented Architecture**: every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. **Communication via standardized API interfaces**: interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.
3. **Define agentic and explicit orchestration flows**: developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.
4. **Ease of deployment**: launch, scale and monitor each agent and your control plane independently.
5. **Scalability and resource management**: use our built-in observability tools to monitor the quality and performance of the system and each individual agent service.

In [19]:
# show the metadata attached to `response` (keyed by node id in the recorded output)
display_metadata(response.metadata)

{'ccb2800f-1560-404a-ae19-fd4e729ab440': {'file_path': '/workspace/projects/LlamindexHelper/data/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html',
  'file_name': 'introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html',
  'file_type': 'text/html',
  'file_size': 18790,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'},
 '067e700a-84e7-482f-a7aa-d573faeb1e27': {'file_path': '/workspace/projects/LlamindexHelper/data/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html',
  'file_name': 'introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html',
  'file_type': 'text/html',
  'file_size': 18790,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'}}

## Two-stage query

### Retrieval

In [1]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core import QueryBundle
import pandas as pd
from IPython.display import display, HTML

In [2]:
# sentence-transformers based reranker loaded from a local model directory
rerank_postprocessor = SentenceTransformerRerank(
    model="models/mxbai-rerank-xsmall-v1",
    top_n=2,  # number of nodes kept after re-ranking
    keep_retrieval_score=True,
)

  from tqdm.autonotebook import tqdm, trange


In [None]:
# cap the width of text columns in rendered DataFrames at 100 characters
pd.options.display.max_colwidth = 100

def get_retrieved_nodes(
    query_str, vector_top_k=10, reranker_top_n=3, with_reranker=False, reranker=None
):
    """Retrieve nodes for ``query_str`` from the notebook-level ``index``.

    Args:
        query_str: Query text; wrapped in a ``QueryBundle``.
        vector_top_k: ``similarity_top_k`` given to the vector retriever.
        reranker_top_n: ``top_n`` given to the default ``LLMRerank``; not used
            when a custom ``reranker`` is supplied.
        with_reranker: When true, pass the retrieved nodes through a reranker.
        reranker: Optional postprocessor exposing
            ``postprocess_nodes(nodes, query_bundle)``, e.g. the notebook's
            ``rerank_postprocessor``. Defaults to
            ``LLMRerank(choice_batch_size=5, top_n=reranker_top_n)``.

    Returns:
        The retriever's nodes, or the reranker's output when
        ``with_reranker`` is true.
    """
    query_bundle = QueryBundle(query_str)

    # First stage: vector similarity search over the global index.
    # (The original referenced the misspelled name `indreex`, a NameError.)
    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=vector_top_k,
    )
    retrieved_nodes = retriever.retrieve(query_bundle)

    if not with_reranker:
        return retrieved_nodes

    # Second stage: rerank the candidates.
    # NOTE(review): the recorded LLMRerank run in this notebook ended in
    # "IndexError: list index out of range"; presumably the local LLM's reply
    # could not be parsed — confirm, or pass `reranker=` to use another one.
    if reranker is None:
        reranker = LLMRerank(
            choice_batch_size=5,
            top_n=reranker_top_n,
        )
    return reranker.postprocess_nodes(retrieved_nodes, query_bundle)

def pretty_print(df):
    """Display ``df`` as an HTML table.

    Every literal backslash-n sequence in the generated markup is replaced
    with a ``<br>`` tag before rendering. Returns whatever ``display`` returns.
    """
    table_markup = df.to_html()
    table_markup = table_markup.replace("\\n", "<br>")
    return display(HTML(table_markup))


def visualize_retrieved_nodes(nodes) -> None:
    """Show the score and text of each retrieved node as a two-column table."""
    rows = [
        {"Score": hit.score, "Text": hit.node.get_text()}
        for hit in nodes
    ]
    pretty_print(pd.DataFrame(rows))

In [23]:
# first stage only: the 3 nearest nodes by vector similarity, no reranking
new_nodes = get_retrieved_nodes(question1, vector_top_k=3, with_reranker=False)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.core.indices.utils:> Top 3 nodes:
> [Node ccb2800f-1560-404a-ae19-fd4e729ab440] [Similarity score:             0.763726] <div class="BlogPost_htmlPost__Z5oDL">
 <p class="Text_text__zPO0D Text_text-size-16__PkjFu">
  W...
> [Node 067e700a-84e7-482f-a7aa-d573faeb1e27] [Similarity score:             0.720235] """</span>
    <span class="hljs-keyword">return</span> <span class="hljs-string">"The secret fac...
> [Node 0544da4e-b27c-40ae-b51d-f576d370d26c] [Similarity score:             0.718015] <div class="BlogPost_htmlPost__Z5oDL">
 <p class="Text_text__zPO0D Text_text-size-16__PkjFu">
  H...
> Top 3 nodes:
> [Node ccb2800f-1560-404a-ae19-fd4e729ab440] [Similarity score:             0.763726] <div class="BlogPost_htmlPost__Z5oDL">
 <p class="Text_text__zPO0D Text_text-size-16__PkjFu">
  W...
> [Node 067e700a-84e7-482f-a7aa-d573faeb1e27] [Similarity score:             0.720235] """</span>
    <span class="hljs-keyword">return</span> <span class="hljs-string">"The secret 

In [26]:
# tabulate score and text of the nodes retrieved in the previous cell
visualize_retrieved_nodes(new_nodes)

Unnamed: 0,Score,Text
0,0.763726,"<div class=""BlogPost_htmlPost__Z5oDL"">  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  We're excited to announce the alpha release of  <code class=""SanityPortableText_inlineCode__cI85z"">  llama-agents  </code>  , a new open-source framework designed to simplify the process of building, iterating, and deploying multi-agent AI systems and turn your agents into production microservices. Whether you're working on complex question-answering systems, collaborative AI assistants, or distributed AI workflows, llama-agents provides the tools and structure you need to bring your ideas to life.  </p>  <h2 class=""Text_text__zPO0D Text_text-size-48__A2f8Q"">  Key Features of llama-agents  </h2>  <ol>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  <strong>  Distributed Service Oriented Architecture:  </strong>  every agent in LlamaIndex can be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.  </li>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  <strong>  Communication via standardized API interfaces:  </strong>  interface between agents using a central control plane orchestrator. Pass messages between agents using a message queue.  </li>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  <strong>  Define agentic and explicit orchestration flows:  </strong>  developers have the flexibility to directly define the sequence of interactions between agents, or leave it up to an “agentic orchestrator” that decides which agents are relevant to the task.  </li>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  <strong>  Ease of deployment:  </strong>  launch, scale and monitor each agent and your control plane independently.  
</li>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  <strong>  Scalability and resource management:  </strong>  use our built-in observability tools to monitor the quality and performance of the system and each individual agent service  </li>  </ol>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  Let's dive into how you can start using llama-agents to build your own multi-agent systems.  </p>  <h2 class=""Text_text__zPO0D Text_text-size-48__A2f8Q"">  Getting Started with llama-agents  </h2>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  First, install the framework using pip:  </p>  <pre><code>pip install llama-agents llama-index-agent-openai</code></pre>  <h3 class=""Text_text__zPO0D Text_text-size-40__fIyvA"">  Basic System Setup  </h3>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  Here's a simple example of how to set up a basic multi-agent system using llama-agents."
1,0.720235,"""""""</span>  <span class=""hljs-keyword"">return</span> <span class=""hljs-string"">""The secret fact is: A baby llama is called a 'Cria'.""</span> tool = FunctionTool.from_defaults(fn=get_the_secret_fact) <span class=""hljs-comment""># create our agents</span> worker1 = FunctionCallingAgentWorker.from_tools([tool], llm=OpenAI()) worker2 = FunctionCallingAgentWorker.from_tools([], llm=OpenAI()) agent1 = worker1.as_agent() agent2 = worker2.as_agent()</code></pre>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  We turn those agents into services:  </p>  <pre><code>agent_server_1 = AgentService(  agent=agent1,  message_queue=message_queue,  description=<span class=""hljs-string"">""Useful for getting the secret fact.""</span>,  service_name=<span class=""hljs-string"">""secret_fact_agent""</span>,  host=<span class=""hljs-string"">""localhost""</span>,  port=<span class=""hljs-number"">8003</span> ) agent_server_2 = AgentService(  agent=agent2,  message_queue=message_queue,  description=<span class=""hljs-string"">""Useful for getting random dumb facts.""</span>,  service_name=<span class=""hljs-string"">""dumb_fact_agent""</span>,  host=<span class=""hljs-string"">""localhost""</span>,  port=<span class=""hljs-number"">8004</span> )</code></pre>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  And finally we launch each service as an independent agent. 
Here we’re doing them all from a single script, but each of these could be a totally separate service, launched and scaled independently:  </p>  <pre><code><span class=""hljs-keyword"">from</span> llama_agents <span class=""hljs-keyword"">import</span> ServerLauncher, CallableMessageConsumer <span class=""hljs-comment""># Additional human consumer</span> <span class=""hljs-keyword"">def</span> <span class=""hljs-title function_"">handle_result</span>(<span class=""hljs-params"">message</span>) -&gt; <span class=""hljs-literal"">None</span>:  <span class=""hljs-built_in"">print</span>(<span class=""hljs-string"">f""Got result:""</span>, message.data) <span class=""hljs-comment""># the final result is published to a ""human"" consumer</span> <span class=""hljs-comment""># so we define one to handle it!</span> human_consumer = CallableMessageConsumer(  handler=handle_result, message_type=<span class=""hljs-string"">""human""</span> ) <span class=""hljs-comment""># Define Launcher</span> launcher = ServerLauncher(  [agent_server_1, agent_server_2],  control_plane,  message_queue,  additional_consumers=[human_consumer] ) launcher.launch_servers()</code></pre>  <h2 class=""Text_text__zPO0D Text_text-size-48__A2f8Q"">  Real-time monitoring  </h2>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  One of the coolest debugging features of our multi-agent system is our agent monitor, which is built right in. You launch it like this:  </p>  <pre><code>llama-agents monitor --control-plane-url http://127.0.0.1:8000</code></pre>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  Once launched, you get an intuitive, point-and-click terminal application. 
You can see both of the agents running, and at the bottom you can inject a task like the query “What is the secret fact?” You’ll get a job ID which you can then click on to see your results:  </p>  <h2 class=""Text_text__zPO0D Text_text-size-48__A2f8Q"">  Building a Query Rewriting RAG System  </h2>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  Let's look at a more complex example: a Query Rewriting RAG system. This system will rewrite user queries to improve retrieval, then use the rewritten query to perform RAG over a document."
2,0.718015,"<div class=""BlogPost_htmlPost__Z5oDL"">  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  Hello, Llama enthusiasts! 🦙  </p>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  Welcome to this week’s edition of the LlamaIndex newsletter! In this issue, we’re excited to bring you exciting updates about  <code class=""SanityPortableText_inlineCode__cI85z"">  llama-agents  </code>  , live demos, extensive guides, and in-depth tutorials to enhance your understanding of our tools.  </p>  <p class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  Before moving into our newsletter, we have an exciting update on our enterprise offerings. We are thrilled to announce the waitlist release of LlamaCloud, our fully-managed ingestion service.  <a class=""SanityPortableText_link__QA4Ze"" href=""http://bit.ly/llamacloud"" rel=""noreferrer noopener"">  Sign up  </a>  now if you’re eager to collaborate and build LLM applications with LlamaCloud.  </p>  <h2 class=""Text_text__zPO0D Text_text-size-48__A2f8Q"">  🤩  <strong>  The highlights:  </strong>  </h2>  <ul>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  <strong>  Launched Llama-Agents Framework:  </strong>  Our new alpha-release, llama-agents, enables multi-agent AI systems for production with a distributed architecture, HTTP API communication, and agentic orchestration. It’s designed for easy deployment, scalability, and observability.  <a class=""SanityPortableText_link__QA4Ze"" href=""https://www.llamaindex.ai/blog/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems"" rel=""noreferrer noopener"">  Blogpost  </a>  ,  <a class=""SanityPortableText_link__QA4Ze"" href=""https://x.com/llama_index/status/1806116419995844947"" rel=""noreferrer noopener"">  Tweet  </a>  .  
</li>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  <strong>  <code class=""SanityPortableText_inlineCode__cI85z"">  create-llama  </code>  Integrated with LlamaCloud:  </strong>  Streamline your LLM application data pipelines with create-llama, now integrated with LlamaCloud for faster setup and efficient system maintenance.  <a class=""SanityPortableText_link__QA4Ze"" href=""https://x.com/MarcusSchiesser/status/1806960577299767767"" rel=""noreferrer noopener"">  Tweet  </a>  .  </li>  </ul>  <h2 class=""Text_text__zPO0D Text_text-size-48__A2f8Q"">  <strong>  ✨ Feature Releases and Enhancements:  </strong>  </h2>  <ol>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  We have launched llama-agents - new alpha-release framework that enables multi-agent AI systems to go into production. It features a distributed, service-oriented architecture, communication through standard HTTP APIs, agentic orchestration of flows, and is designed for easy deployment, scalability, and observability.  <a class=""SanityPortableText_link__QA4Ze"" href=""https://www.llamaindex.ai/blog/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems"" rel=""noreferrer noopener"">  Blogpost  </a>  ,  <a class=""SanityPortableText_link__QA4Ze"" href=""https://x.com/llama_index/status/1806116419995844947"" rel=""noreferrer noopener"">  Tweet  </a>  .  </li>  <li class=""Text_text__zPO0D Text_text-size-16__PkjFu"">  create-llama is now integrated with LlamaCloud to streamline the setup and management of data pipelines for LLM applications, providing a fast and efficient way to deploy and maintain these systems."


In [27]:
# Two-stage retrieval: fetch 10 candidates by vector similarity, then let the
# reranker built inside get_retrieved_nodes (LLMRerank) keep the top 3.
# NOTE(review): the recorded run of this cell stops with
# "IndexError: list index out of range" right after the LLM generation step —
# presumably the rerank reply could not be parsed; confirm before relying on it.
new_nodes = get_retrieved_nodes(
    question1,
    vector_top_k=10,
    reranker_top_n=3,
    with_reranker=True,
)
visualize_retrieved_nodes(new_nodes)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.core.indices.utils:> Top 10 nodes:
> [Node ccb2800f-1560-404a-ae19-fd4e729ab440] [Similarity score:             0.763726] <div class="BlogPost_htmlPost__Z5oDL">
 <p class="Text_text__zPO0D Text_text-size-16__PkjFu">
  W...
> [Node 067e700a-84e7-482f-a7aa-d573faeb1e27] [Similarity score:             0.720235] """</span>
    <span class="hljs-keyword">return</span> <span class="hljs-string">"The secret fac...
> [Node 0544da4e-b27c-40ae-b51d-f576d370d26c] [Similarity score:             0.718015] <div class="BlogPost_htmlPost__Z5oDL">
 <p class="Text_text__zPO0D Text_text-size-16__PkjFu">
  H...
> [Node f481b7c0-d747-4981-a0d6-305f91ed89f9] [Similarity score:             0.717277] <div class="BlogPost_htmlPost__Z5oDL">
 <p>
  Agents are autonomous systems that can execute end-...
> [Node 7237c7e9-ca0c-4222-b4d4-f807fb80a71f] [Similarity score:             0.714119] </li>
  <li>
   <strong>
    New lower-level agent API:
   </strong>
   For enhanced transparency...
> [Node

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


IndexError: list index out of range