In [2]:
import logging
import sys
import os

import qdrant_client
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
# from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core import Settings
from fastembed import TextEmbedding
import model_utils
import prompt_utils




In [3]:
# llama index ascyncio config
import nest_asyncio
nest_asyncio.apply()

In [4]:
# Settings.embed_model = TextEmbedding(
#     model_name="BAAI/bge-base-en-v1.5",
#     cache_dir="models/embed_models/",
#     providers=["CUDAExecutionProvider"]
# )

Settings.embed_model = HuggingFaceEmbedding(
    model_name="models/bge-small-en-v1.5", device="cuda"
)

In [12]:
# load local llm llama
model_name = "models/Meta-Llama-3-8B-Instruct"
model, tokenizer = model_utils.load_quantized_model(
    model_name_or_path=model_name,
    device="cuda"
)

Loading tokenizer and model with quantization config from: models/Meta-Llama-3-8B-Instruct


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DEBUG:bitsandbytes.cextension:Loading bitsandbytes native library from: /opt/conda/envs/dev/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
Loading bitsandbytes native library from: /opt/conda/envs/dev/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
llama3_prompt_template = '''<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{query_str}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
'''

In [4]:
print(llama3_prompt_template)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{query_str}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>



In [None]:
# config llm and embed_model to llamaindex
llm_hf = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    query_wrapper_prompt=PromptTemplate(llama3_prompt_template),
    generate_kwargs={
        "temperature": 0.7,
        "do_sample": True
    },
    device_map="cuda",
    model_name=model_name,
    model=model,
    tokenizer=tokenizer
)

Settings.llm = llm_hf

### Load documents

In [12]:
documents = SimpleDirectoryReader(
    input_dir="./data",
    filename_as_id=True,
).load_data()

print(f"Loaded {len(documents)} documents")

Loaded 159 documents


### Build the VectorStoreIndex

In [5]:
client = qdrant_client.QdrantClient(
    path="./qdrant_db/"
)

In [14]:
vector_store = QdrantVectorStore(client=client, collection_name="llamaindex-blogs")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7fe41e188a30>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7fe41e188730>, vector_stores={'default': QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='llamaindex-blogs', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False, index_doc_id=True, fastembed_sparse_model=None), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7fe41e188190>, property_graph_store=None)

In [15]:
# index = VectorStoreIndex.from_documents(
#     documents,
#     storage_context=storage_context,
#     show_progress=True
# )

Parsing nodes:   0%|          | 0/159 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/874 [00:00<?, ?it/s]



In [8]:
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
    show_progress=True
)
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7fe435e3c760>

### Query Index¶

In [9]:
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [10]:
question = '''What are the two critical areas of RAG system performance that are assessed \
in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?'''
print(question)

What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?


In [15]:
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine(use_async=True, streaming=True)
streaming_response = query_engine.query(question)
streaming_response.print_response_stream()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.core.indices.utils:> Top 2 nodes:
> [Node 22acfe61-e268-4876-82c3-773243408439] [Similarity score:             0.849098] <div class="BlogPost_htmlPost__Z5oDL">
 <p>
  We’re excited to unveil our
  <a href="https://gith...
> [Node e4da31d2-778b-4d0a-b41d-4d1a0ad0f53e] [Similarity score:             0.754972] </li>
  <li class="Text_text__zPO0D Text_text-size-16__PkjFu">
   <a class="SanityPortableText_li...
> Top 2 nodes:
> [Node 22acfe61-e268-4876-82c3-773243408439] [Similarity score:             0.849098] <div class="BlogPost_htmlPost__Z5oDL">
 <p>
  We’re excited to unveil our
  <a href="https://gith...
> [Node e4da31d2-778b-4d0a-b41d-4d1a0ad0f53e] [Similarity score:             0.754972] </li>
  <li class="Text_text__zPO0D Text_text-size-16__PkjFu">
   <a class="SanityPortableText_li...


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


According to the provided context information, the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook are:

1. **The Retrieval System**: This area focuses on assessing the performance of the RAG system in retrieving relevant information from the dataset.
2. **Response Generation**: This area focuses on assessing the performance of the RAG system in generating responses based on the retrieved information.

These two areas are crucial in evaluating the effectiveness of a RAG system, as they ensure that the system can not only retrieve relevant information but also generate coherent and meaningful responses.

In [54]:
streaming_response.metadata

{'22acfe61-e268-4876-82c3-773243408439': {'file_path': '/workspace/projects/LlamindexHelper/data/openai-cookbook-evaluating-rag-systems-fe393c61fb93.html',
  'file_name': 'openai-cookbook-evaluating-rag-systems-fe393c61fb93.html',
  'file_type': 'text/html',
  'file_size': 2220,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'},
 'e4da31d2-778b-4d0a-b41d-4d1a0ad0f53e': {'file_path': '/workspace/projects/LlamindexHelper/data/llamaindex-newsletter-2024-06-11.html',
  'file_name': 'llamaindex-newsletter-2024-06-11.html',
  'file_type': 'text/html',
  'file_size': 11257,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'}}

### Combine DocumentSummaryIndex and VectorIndex

In [16]:
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7fe41e188a30>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7fe41e188730>, vector_stores={'default': QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='llamaindex-blogs', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False, index_doc_id=True, fastembed_sparse_model=None), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7fe41e188190>, property_graph_store=None)

In [17]:
vector_store

QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='llamaindex-blogs', url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False, index_doc_id=True, fastembed_sparse_model=None)

In [19]:
# load document index
idex_mapping = storage_context.index_store.to_dict()

In [22]:
index.index_id

'6089ce48-849d-4b4f-95e9-d84b8ac98b18'