In [1]:
import logging
import sys

# Route INFO-and-above log records to stdout so library logs appear in the cell output.
# basicConfig() already attaches a stdout StreamHandler to the root logger; attaching a
# second one on top of it makes every record print twice, plus one more copy each time
# this cell is re-run, so only basicConfig() is used here.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered; later cells request async
# execution (use_async=True) from inside the notebook kernel.
nest_asyncio.apply()

In [12]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex, StorageContext, Settings
from llama_index.core import load_index_from_storage
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.prompts import PromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import chromadb
import model_utils
import prompt_utils

In [4]:
# Embedding model: the BGE-small checkpoint referenced by a relative path, placed on the GPU.
embed_model = HuggingFaceEmbedding(
    device="cuda",
    model_name="models/bge-small-en-v1.5",
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: models/bge-small-en-v1.5
Load pretrained SentenceTransformer: models/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [5]:
# Load the quantized Zephyr-7B model together with its tokenizer.
model_name = "models/zephyr-7b-beta"
model, tokenizer = model_utils.load_quantized_model(
    device="cuda",
    model_name_or_path=model_name,
)
# Reuse the end-of-sequence id as the padding id in the generation config.
model.generation_config.pad_token_id = model.generation_config.eos_token_id

Loading tokenizer and model with quantization config from: models/zephyr-7b-beta


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
# Wrap the already-loaded model and tokenizer in LlamaIndex's HuggingFaceLLM adapter.
llm_hf = HuggingFaceLLM(
    model_name=model_name,
    model=model,
    tokenizer=tokenizer,
    device_map="cuda",
    context_window=4096,
    max_new_tokens=512,
    # Sampling settings handed to the LLM wrapper as generation kwargs.
    generate_kwargs=dict(temperature=0.7, top_k=50, top_p=0.95, do_sample=True),
    # Prompt layout for plain queries: empty system turn, the user turn, then the
    # opening of the assistant turn for the model to complete.
    query_wrapper_prompt=PromptTemplate(
        "<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"
    ),
    messages_to_prompt=prompt_utils.zephyr_messages_to_prompt,
)

In [None]:
# Settings.embed_model = embed_model
# Settings.llm = llm_hf

## Load documents

In [7]:
# Read the files in ./data; filename_as_id=True asks the reader to derive document ids
# from the file names.
documents = SimpleDirectoryReader(input_dir="./data", filename_as_id=True).load_data()
# Number of documents loaded.
len(documents)

159

In [8]:
# On-disk Chroma client; its data lives under ./chroma_db.
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Reuse the "blogs_summary" collection if it already exists, otherwise create it.
vector_collection = chroma_client.get_or_create_collection("blogs_summary")

# The collection object is handed over directly, so where the vectors are stored is
# determined by the PersistentClient above. No persist_dir is passed to the vector
# store: the previous value ("./chroma_db/blogs_summary") did not match the client's
# path, and the loading cell further down builds the same store without it.
vector_store = ChromaVectorStore(chroma_collection=vector_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [9]:
# SentenceSplitter invokes its `tokenizer` argument as a plain function (text -> list of
# tokens) and takes the length of the result as the token count. Calling a Hugging Face
# tokenizer object directly returns an encoding mapping rather than a token list, so its
# length is not a token count and chunk_size would not be honoured. Pass the bound
# `encode` method, which returns the list of token ids.
# NOTE(review): assumes `tokenizer` is a Hugging Face tokenizer, as returned by
# model_utils.load_quantized_model — confirm.
splitter = SentenceSplitter(
    tokenizer=tokenizer.encode,
    chunk_size=1024,
)

In [11]:
# Synthesizer that produces the per-document summaries with the local LLM,
# using the "tree_summarize" response mode.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
    llm=llm_hf,
    verbose=True,
    use_async=True,
)

In [None]:
# Smoke test: build the summary index from the first three documents only.
doc_summary_index = DocumentSummaryIndex.from_documents(
    documents=documents[:3],
    transformations=[splitter],
    llm=llm_hf,
    embed_model=embed_model,
    response_synthesizer=response_synthesizer,
    storage_context=storage_context,
    show_progress=True,
)

In [None]:
# doc_summary_index.storage_context.persist("summary")

In [None]:
# visualize
# doc_summary_index.get_document_summary("07a29a81-1d9f-445e-b1a3-cec315ffcd79")

## Load doc summary index

### Load by chromadb

In [18]:
# Reopen the on-disk Chroma database and fetch the summary collection.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
test_vector_collection = chroma_client.get_or_create_collection(name="blogs_summary")
# Display the collection together with the number of entries it holds.
(test_vector_collection, test_vector_collection.count())

(Collection(id=e2bc79ee-49dd-45e4-85f3-6acb87185f7a, name=blogs_summary), 159)

In [64]:
# Vector store backed by the existing Chroma collection.
chroma_vector_store = ChromaVectorStore(chroma_collection=test_vector_collection)
# Storage context created with the backup directory as persist_dir and the Chroma
# store supplied as its vector store.
chroma_storage_context = StorageContext.from_defaults(
    persist_dir="./backup/blogs_summary/",
    vector_store=chroma_vector_store,
)

In [70]:
# Pull the index struct out of the storage context's index store (no struct id is given).
index_struct = chroma_storage_context.index_store.get_index_struct()

In [78]:
# Dump the document-id -> summary-id mapping, one value per line.
for doc_id, summary_id in index_struct.doc_id_to_summary_id.items():
    print(doc_id, summary_id, sep="\n")

/workspace/projects/LlamindexHelper/data/a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html
83001b14-fb2d-400e-b31b-3a48501408ce
/workspace/projects/LlamindexHelper/data/a-new-document-summary-index-for-llm-powered-qa-systems-9a32ece2f9ec.html
90d0dad3-83e4-48ff-969a-41a450639f14
/workspace/projects/LlamindexHelper/data/agentic-rag-with-llamaindex-2721b8a49ff6.html
8459bd37-a59b-4eb1-b346-49c451ffd64b
/workspace/projects/LlamindexHelper/data/ai-voice-assistant-enhancing-accessibility-in-ai-with-llamaindex-and-gpt3-5-f5509d296f4a.html
78e2927e-d1d5-46d0-9f7d-a83732e7369b
/workspace/projects/LlamindexHelper/data/announcing-llamaindex-0-9-719f03282945.html
77ba3a80-549e-47e9-bbf3-f70fbaa39686
/workspace/projects/LlamindexHelper/data/arize-ai-and-llamaindex-roll-out-joint-platform-for-evaluating-llm-applications.html
c07eb866-c51d-4e7a-b9af-1df89655b2d6
/workspace/projects/LlamindexHelper/data/automate-online-tasks-with-multion-and-llamaindex.html
6c94c9c1-8eaa-45d4

In [66]:
# Rebuild the index object from the storage context, attaching the local LLM and
# embedding model.
test_doc_sum_index = load_index_from_storage(
    chroma_storage_context,
    embed_model=embed_model,
    llm=llm_hf,
)

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


In [82]:
# Show the stored summary for one document, looked up by its document id.
print(
    test_doc_sum_index.get_document_summary(
        "/workspace/projects/LlamindexHelper/data/a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html"
    )
)

The provided text is about using the llama-index library to evaluate the performance of RAG systems based on seven measurement aspects outlined in a survey paper by Gao et al. The text highlights the evaluation notebook guides provided by the llama-index library and explains the concept of faithfulness, which is further explained in a Notion document. This text can answer questions related to the evaluation capabilities of the llama-index library, the measurement aspects outlined by Gao et al., and how to use the library to assess the performance of RAG systems in relation to these aspects.

Some potential questions that this text can answer include:
- What is the llama-index library and how can it be used to evaluate RAG systems?
- What measurement aspects are outlined in the survey paper by Gao et al.?
- How can the evaluation notebook guides provided by the llama-index library be used to assess the performance of RAG systems in relation to these measurement aspects?
- What is faithf

## Querying

In [84]:
# Query engine over the reloaded index: "tree_summarize" response mode with streaming
# and async execution enabled.
query_engine = test_doc_sum_index.as_query_engine(
    streaming=True,
    use_async=True,
    response_mode="tree_summarize",
    llm=llm_hf,
)

In [85]:
# Sample question used to exercise the query engine.
question_1 = (
    "What is the llama-index library "
    "and how can it be used to evaluate RAG systems?"
)

In [86]:
# Run the sample question through the query engine (created with streaming=True).
resp1 = query_engine.query(question_1)
# Write the streamed answer to the cell output.
resp1.print_response_stream()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The llama-index library is a Python framework for building and evaluating Retrieval-and-Generation (RAG) systems, which combine the strengths of both retrieval and generation to efficiently and accurately answer user queries. It offers tools for document indexing, query formulation, and query evaluation, as well as integrations with RAGAs like FAISS, RAGA, and FLARE. The library provides advanced techniques for addressing success requirements of both retrieval and generation, such as information compression, document re-ranking, and iterative retrieval-generator cycles. It also offers several evaluation abstractions and integrations to RAGAs to help builders measure the level to which their RAG systems achieve success requirements through various measurement aspects, such as answer relevancy, context relevancy, faithfulness, and retrieval evaluation. By utilizing the library's evaluation notebook guides, developers can assess the answer and context relevancy, faithfulness, retrieval ev

In [87]:
# List the metadata of each source node attached to the response.
for source_node in resp1.source_nodes:
    print(source_node.metadata)

{'file_path': '/workspace/projects/LlamindexHelper/data/a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html', 'file_name': 'a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html', 'file_type': 'text/html', 'file_size': 24708, 'creation_date': '2024-07-21', 'last_modified_date': '2024-07-21'}


### LLM-based Retrieval