In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [3]:
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, Settings
from llama_index.core import load_index_from_storage
from llama_index.core import VectorStoreIndex, DocumentSummaryIndex
from llama_index.core.prompts import PromptTemplate
from llama_index.core.response.notebook_utils import display_response
import chromadb
import torch
import model_utils
import prompt_utils
import nest_asyncio
nest_asyncio.apply()

In [4]:
# loads BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="models/bge-small-en-v1.5", device="cuda")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: models/bge-small-en-v1.5
Load pretrained SentenceTransformer: models/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [5]:
# load local llm
model_name = "models/zephyr-7b-beta"
model, tokenizer = model_utils.load_quantized_model(
    model_name_or_path=model_name,
    device="cuda"
)
# Set `pad_token_id` to `eos_token_id`
model.generation_config.pad_token_id = model.generation_config.eos_token_id

Loading tokenizer and model with quantization config from: models/zephyr-7b-beta


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# config llm and embed_model to llamaindex
llm_hf = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    generate_kwargs={
        "temperature": 0.7,
        "top_k": 50, 
        "top_p": 0.95,
        "do_sample": True
    },
    device_map="cuda",
    model_name=model_name,
    model=model,
    messages_to_prompt=prompt_utils.zephyr_messages_to_prompt,
    tokenizer=tokenizer
)

In [14]:
# Settings.embed_model = embed_model
# Settings.llm = llm_hf

## Load vector index and document summary index

In [7]:
# Creates a persistent instance of Chroma that saves to disk
chroma_client = chromadb.PersistentClient(path="./chroma_db")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [13]:
chroma_client.list_collections()

[Collection(id=3186f0ec-26e5-46fa-b687-281a3a26066f, name=llama_index_blogs),
 Collection(id=cf744ea1-c23d-41c8-8204-363adcd4b3fe, name=llma_blogs_summary),
 Collection(id=e2bc79ee-49dd-45e4-85f3-6acb87185f7a, name=blogs_summary)]

In [14]:
# Get or create a collection with the given name and metadata.
vector_collection = chroma_client.get_or_create_collection("llama_index_blogs")
vector_store = ChromaVectorStore(chroma_collection=vector_collection)
vector_storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [17]:
# load your index from stored vectors
vector_index = VectorStoreIndex.from_vector_store(
    llm=llm_hf,
    embed_model=embed_model,
    vector_store=vector_store, 
    storage_context=vector_storage_context
)

In [20]:
# Get or create a collection with the given name and metadata.
doc_sum_collection = chroma_client.get_or_create_collection("blogs_summary")
doc_sum_vector_store = ChromaVectorStore(
    chroma_collection=vector_collection
)

doc_sum_storage_context = StorageContext.from_defaults(
    vector_store=doc_sum_vector_store,
    persist_dir="./database/blogs_summary"
)
doc_sum_storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7f341a611cc0>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7f341a61b1f0>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={})}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7f341a47f670>, property_graph_store=None)

In [22]:
doc_summary_index = load_index_from_storage(
    llm=llm_hf,
    embed_model=embed_model,
    storage_context=doc_sum_storage_context
)
doc_summary_index

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


<llama_index.core.indices.document_summary.base.DocumentSummaryIndex at 0x7f341a4c5660>

In [41]:
doc_sum_index_struct = doc_sum_storage_context.index_store.get_index_struct()

In [44]:
doc_sum_index_struct

IndexDocumentSummary(index_id='c7d3a64a-b28e-4e0b-9288-827c67a7b980', summary=None, summary_id_to_node_ids={'83001b14-fb2d-400e-b31b-3a48501408ce': ['d7be752a-0e06-424d-8aee-3a98e5e85a5c'], '90d0dad3-83e4-48ff-969a-41a450639f14': ['fd4001fc-71fa-4139-a074-30a1e7623240'], '8459bd37-a59b-4eb1-b346-49c451ffd64b': ['c8449782-e685-40ae-b9e3-7444396543c4'], '78e2927e-d1d5-46d0-9f7d-a83732e7369b': ['e0af1e9c-54d5-4fc6-b29c-200c841ef247'], '77ba3a80-549e-47e9-bbf3-f70fbaa39686': ['2d74700e-cd3e-4388-a062-02ab00aef449'], 'c07eb866-c51d-4e7a-b9af-1df89655b2d6': ['b21d0b46-68a2-4661-a1d0-00ed2b425b11'], '6c94c9c1-8eaa-45d4-befa-54e84f2ecebe': ['7dafcb22-17ae-4dd3-8487-0b01dcb3f669'], 'f32cd777-600a-4a8e-91f1-c5340cfd85ea': ['542e4fa9-6577-4c98-a8c7-72ca69e69993'], '51cf26f6-53c6-47e1-8ff7-3b845c7744cc': ['4f6ca503-a707-49a5-8bc7-a5e0d6a923be'], '2e1d8446-49c6-4183-ad1a-6f5dfc6bc6da': ['721d9c47-002c-446a-b063-8586ab40dfe4'], '71360fb6-350f-4905-aef8-19f79ee01ff5': ['ab682f7b-a39a-4347-a397-ea949f

## Advanced Querying

In [34]:
question1 = "What are key features of llama-agents?"
question2 = '''
What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?
'''
question3 = '''
What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?
'''

In [23]:
def print_ref_docs(resp_metadata):
    print("References:")
    base_url = "https://www.llamaindex.ai/"
    for idx, (_, doc_metatada) in enumerate(resp_metadata.items()):
        ref_url = base_url + doc_metatada["file_name"]
        print(f"{idx+1}.", ref_url)

### Router Query Engine

In [31]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

In [27]:
Settings.llm = llm_hf
Settings.embed_model = embed_model

In [28]:
vector_tool = QueryEngineTool(
    vector_index.as_query_engine(use_async=True),
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for searching for specific facts."
    )
)

summary_tool = QueryEngineTool(
    doc_summary_index.as_query_engine(response_mode="tree_summarize", use_async=True),
    metadata=ToolMetadata(
        name="summary",
        description="Useful for summarizing an entire document."
    )
)

### Single selector

In [38]:
query_engine = RouterQueryEngine.from_defaults(
    [vector_tool, summary_tool],
    # selector=LLMSingleSelector.from_defaults(),
    select_multi=False
)

In [39]:
response = query_engine.query(question1)

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: Key features of llama-agents can be identified through specific facts that can be searched for using this tool..
Selecting query engine 0: Key features of llama-agents can be identified through specific facts that can be searched for using this tool..


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: A summary of an entire document about llama-agents may provide an overview of their key features, but this is less direct than searching for specific facts..
Selecting query engine 1: A summary of an entire document about llama-agents may provide an overview of their key features, but this is less direct than searching for specific facts..


KeyError: '9b674962-37bb-43af-8ef0-a1398491ca21'

In [None]:
response