In [1]:
import logging
import sys
import os

import qdrant_client
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

# from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core import Settings
from llama_index.core.response.notebook_utils import display_response
# from fastembed import TextEmbedding
import model_utils
import prompt_utils

  from .autonotebook import tqdm as notebook_tqdm



In [2]:
# llama-index asyncio config: patch asyncio so async calls can run inside the
# notebook's already-running event loop.
import nest_asyncio
nest_asyncio.apply()

# logging config: basicConfig already attaches one stdout StreamHandler to the
# root logger. Adding a second StreamHandler on top of it makes every log record
# print twice, so only basicConfig is used here.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [3]:
# Target device plus the local checkpoint paths for the embedding model and the LLM.
device_map = "cuda:1"
embed_model_name = "models/bge-small-en-v1.5"
llm_name = "models/Meta-Llama-3.1-8B-Instruct"

In [4]:
# Build the local embedding model and register it on the global LlamaIndex Settings.
embedding_model = HuggingFaceEmbedding(model_name=embed_model_name, device=device_map)
Settings.embed_model = embedding_model

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: models/bge-small-en-v1.5
Load pretrained SentenceTransformer: models/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [5]:
# Load the local Llama checkpoint and its tokenizer through the project helper.
quantized_load_kwargs = {"model_name_or_path": llm_name, "device": device_map}
model, tokenizer = model_utils.load_quantized_model(**quantized_load_kwargs)

Loading tokenizer and model with quantization config from: models/Meta-Llama-3.1-8B-Instruct


Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.22s/it]


In [6]:
# Wrap the already-loaded model and tokenizer in a LlamaIndex HuggingFaceLLM and
# register it on the global Settings.
llm_hf = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    query_wrapper_prompt=PromptTemplate(prompt_utils.get_llama31_prompt_template()),
    # Sampling is enabled, so generated answers can differ between runs.
    generate_kwargs={
        "temperature": 0.5,
        "do_sample": True
    },
    # Use the notebook-wide device setting (the same one given to the embedding
    # model and to the checkpoint loader) instead of a separate hard-coded "cuda".
    device_map=device_map,
    model_name=llm_name,
    model=model,
    tokenizer=tokenizer
)

Settings.llm = llm_hf

### Load documents

In [7]:
# Read everything under ./data; filename_as_id=True is passed so ids follow file names.
reader = SimpleDirectoryReader(input_dir="./data", filename_as_id=True)
documents = reader.load_data()

print(f"Loaded {len(documents)} documents")

Loaded 159 documents


### Build/Load the VectorStoreIndex

In [8]:
# Open the local (path-based) Qdrant database stored under ./qdrant_db/.
qdrant_path = "./qdrant_db/"
client = qdrant_client.QdrantClient(path=qdrant_path)

In [9]:
# Hybrid-enabled Qdrant vector store; the sparse side uses the FastEmbed model named below.
vector_store_kwargs = dict(
    client=client,
    collection_name="llamaindex-blogs-hybrid-search",
    enable_hybrid=True,
    fastembed_sparse_model="Qdrant/bm42-all-minilm-l6-v2-attentions",
)
vector_store = QdrantVectorStore(**vector_store_kwargs)

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 94965.37it/s]
2024-08-07 15:19:36.602350110 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2024-08-07 15:19:36.602368583 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 89877.94it/s]
2024-08-07 15:19:37.046097667 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2024-08-07 15:19:37.046115130 [W:onnxruntime:, sess

In [10]:
# Storage context backed by the Qdrant vector store; the bare name on the last
# line displays its repr as the cell output.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7f53ec9a6c20>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7f53eca61f90>, vector_stores={'default': QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='llamaindex-blogs-hybrid-search', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=True, index_doc_id=True, fastembed_sparse_model='Qdrant/bm42-all-minilm-l6-v2-attentions'), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7f53eca62860>, property_graph_store=None)

In [11]:
# Build the index only when the on-disk Qdrant collection does not exist yet.
# The collection lives under ./qdrant_db/ and survives kernel restarts, so calling
# from_documents on every run would re-embed all documents and add their nodes to
# the collection again. When the collection is already there, attach to it instead.
# NOTE: delete the collection (or ./qdrant_db/) to force a rebuild after ./data changes.
if client.collection_exists(vector_store.collection_name):
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        show_progress=True,
    )
else:
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,
    )

Parsing nodes: 100%|██████████| 159/159 [00:01<00:00, 143.84it/s]
Generating embeddings:   0%|          | 0/874 [00:00<?, ?it/s]
Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.22it/s]
Generating embeddings:   1%|          | 10/874 [00:00<00:23, 36.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.91it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 23.05it/s]
Generating embeddings:   3%|▎         | 30/874 [00:00<00:10, 81.73it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.41it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 21.56it/s]
Generating embeddings:   6%|▌         | 50/874 [00:00<00:07, 111.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.48it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 31.08it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 29.46it/s]
Generating embeddings:   9%|▉         | 80/874 [00:00<00:05, 148.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.44it/s]

Batches: 100%|██████████| 1/

Payload indexes have no effect in the local Qdrant. Please use server Qdrant if you need payload indexes.


In [36]:
# index = VectorStoreIndex.from_vector_store(
#     vector_store=vector_store,
#     storage_context=storage_context,
#     show_progress=True,
# )
# index

### Query Index

In [12]:
# First test question, written as adjacent literals that join into one line.
question = (
    "What are the two critical areas of RAG system performance that are assessed "
    'in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?'
)
print(question)

What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?


In [13]:
# Tip: set logging to DEBUG for more detailed output.
# Query the index with the "refine" response mode.
refine_engine_kwargs = {"use_async": True, "response_mode": "refine"}
query_engine = index.as_query_engine(**refine_engine_kwargs)
response = query_engine.query(question)

Batches: 100%|██████████| 1/1 [00:00<00:00, 75.71it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [14]:
# Render the answer in the notebook output.
display_response(response)

**`Final Response:`** The two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook are:

1. The Retrieval System
2. Response Generation

The context provided does not offer any additional information that would alter the original answer. The provided text discusses various guides, demos, and tutorials related to RAG (Retrieval-Augmented Generation) and its applications, but it does not provide information on the specific areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section. Therefore, the refined answer remains the same as the original one.

In [15]:
# Show the metadata attached to the response (in the saved output: one entry per
# source id with file path, name, type, size and dates).
response.metadata

{'a2742526-5479-4dad-b533-1c5c04d33912': {'file_path': '/workspace/projects/LlamindexHelper/data/openai-cookbook-evaluating-rag-systems-fe393c61fb93.html',
  'file_name': 'openai-cookbook-evaluating-rag-systems-fe393c61fb93.html',
  'file_type': 'text/html',
  'file_size': 2220,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'},
 '2b66388c-fb02-4509-92f6-1c6b0a69cf08': {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-newsletter-2024-06-11.html',
  'file_name': 'llamaindex-newsletter-2024-06-11.html',
  'file_type': 'text/html',
  'file_size': 11257,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'}}

### Combine DocumentSummaryIndex and VectorIndex

In [16]:
# Display the current storage context.
# NOTE(review): the saved output for this cell reports collection_name='llamaindex-blogs'
# with enable_hybrid=False, but the store is created earlier with
# 'llamaindex-blogs-hybrid-search' and enable_hybrid=True — the output looks stale; re-run.
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7fe41e188a30>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7fe41e188730>, vector_stores={'default': QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='llamaindex-blogs', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False, index_doc_id=True, fastembed_sparse_model=None), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7fe41e188190>, property_graph_store=None)

In [17]:
# Display the current vector store.
# NOTE(review): the saved output for this cell reports collection_name='llamaindex-blogs'
# with enable_hybrid=False, which does not match how vector_store is constructed
# earlier in the notebook — the output looks stale; re-run.
vector_store

QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='llamaindex-blogs', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False, index_doc_id=True, fastembed_sparse_model=None)

In [19]:
# Dump the storage context's index store via to_dict().
# NOTE(review): `idex_mapping` looks like a typo for `index_mapping`, and the name is
# not used again in the visible part of the notebook — confirm before renaming.
idex_mapping = storage_context.index_store.to_dict()

In [22]:
# Display the id of the index object.
index.index_id

'6089ce48-849d-4b4f-95e9-d84b8ac98b18'

### Hybrid Search

In [23]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import QueryBundle

In [17]:
# Second test question (rebinds `question`), also wrapped in a QueryBundle.
question = "What is latest LlamaIndex Newsletter?"
query_bundle = QueryBundle(query_str=question)

#### Normal Query

In [37]:
# Baseline (non-hybrid) query engine using the "refine" response mode.
query_engine = index.as_query_engine(
    use_async=True,
    response_mode="refine",
)
response = query_engine.query(question)

Batches: 100%|██████████| 1/1 [00:00<00:00, 166.94it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [38]:
# Render the baseline answer in the notebook output.
display_response(response)

**`Final Response:`** The latest LlamaIndex newsletter is the one from October 31, 2023.

In [39]:
# Print each retrieved source node's score next to its metadata.
for source_node in response.source_nodes:
    print(source_node.score, source_node.metadata)

0.7741100641613752 {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-newsletter-2024-01-02-f349db8c1842.html', 'file_name': 'llamaindex-newsletter-2024-01-02-f349db8c1842.html', 'file_type': 'text/html', 'file_size': 17293, 'creation_date': '2024-07-21', 'last_modified_date': '2024-07-21'}
0.7739661334334822 {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-newsletter-2023-10-31-36244e2b3f0c.html', 'file_name': 'llamaindex-newsletter-2023-10-31-36244e2b3f0c.html', 'file_type': 'text/html', 'file_size': 11836, 'creation_date': '2024-07-21', 'last_modified_date': '2024-07-21'}



#### Hybrid Queries

In [43]:
# Same question through a query engine configured for hybrid vector-store queries.
hybrid_engine_kwargs = dict(
    use_async=True,
    response_mode="refine",
    vector_store_query_mode="hybrid",
    similarity_top_k=2,
    sparse_top_k=12,
)
hybrid_query_engine = index.as_query_engine(**hybrid_engine_kwargs)
hybrid_response = hybrid_query_engine.query(question)

Batches: 100%|██████████| 1/1 [00:00<00:00, 162.52it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [44]:
# Render the hybrid-search answer in the notebook output.
display_response(hybrid_response)

**`Final Response:`** The latest LlamaIndex Newsletter is the special edition for the last two weeks of 2023, which is packed with updates on the latest features, community demos, courses, insightful tutorials, guides, and webinars curated by LlamaIndex.

However, based on the provided context, it seems that the latest newsletter is not the special edition for the last two weeks of 2023, but rather the one available on June 18, 2024. This newsletter includes updates on the following topics:

- A tutorial by Arkiti on building a dynamic text-to-SQL solution using Llama 3 and GroqInc, highlighting the scalable and fast capabilities of SingleStoreDB Helios for multi-cloud deployments.
- A tutorial by Kingzzm on Advanced RAG Patterns detailing effective strategies for handling documents with embedded tables, utilizing tools like LlamaParse and Nougat for enhanced QA performance.
- A webinar on The Future of Web Agents with MultiOn, where Div Garg provided a full demo walkthrough and discussed the agentification of the internet.

The newsletter is available at the following path: /workspace/projects/LlamindexHelper/data/llamaindex-newsletter-2024-06-18.html.

In [45]:
# Print each source node returned by the hybrid query: score, then metadata.
for source_node in hybrid_response.source_nodes:
    print(source_node.score, source_node.metadata)

0.5 {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-newsletter-2024-01-02-f349db8c1842.html', 'file_name': 'llamaindex-newsletter-2024-01-02-f349db8c1842.html', 'file_type': 'text/html', 'file_size': 17293, 'creation_date': '2024-07-21', 'last_modified_date': '2024-07-21'}
0.5 {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-newsletter-2024-06-18.html', 'file_name': 'llamaindex-newsletter-2024-06-18.html', 'file_type': 'text/html', 'file_size': 12216, 'creation_date': '2024-07-21', 'last_modified_date': '2024-07-21'}
