## load data

In [1]:
from datasets import load_dataset
dataset = load_dataset('squad')

In [2]:
contexts = [item['context'] for item in dataset['train']]
u_contexts = list(set(contexts))
truncated_contexts = u_contexts[:12800]

In [3]:
# utility functions
def chunky_iterate(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield data[i: i + chunk_size]


## embeddings

In [4]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


"""
    optimizations:
        * use a cache folder to cache the model
        * generate embeddings in a batch at a time.
"""
class Embedder:
    def __init__(self, model_name = 'nomic-ai/nomic-embed-text-v1.5',
                batch_size = 32):
        self.batch_size = batch_size
        self.model_name = model_name
        self.embed_model = self._load_embed_model()

    def _load_embed_model(self):
        return HuggingFaceEmbedding(model_name= self.model_name,
            trust_remote_code= True, cache_folder= './hf_cache')

    
    def generate_batch_embeddings(self, context_batch):
        return self.embed_model.get_text_embedding_batch(context_batch)
    

    def generate_embeddings(self, contexts):
        embeddings = []
        for batch_context in chunky_iterate(contexts, self.batch_size):
            embeddings.extend(self.generate_batch_embeddings(batch_context))
        return embeddings

    def generate_query_embeddings(self, query):
        return self.embed_model.get_query_embedding(query)

In [5]:
embedder = Embedder()
embeddings = embedder.generate_embeddings(truncated_contexts)

modules.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/95.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
<All keys matched successfully>


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

## vector db

In [6]:
from qdrant_client import QdrantClient, models
"""
    Optimizations:
        * use grpc
        * collection on disk for larger dataset
        * upload collection batches upload automatically.
        * set indexing to false / 0 when uploading large data for the first time
        
"""
class QdrantVectorDB:
    def __init__(self, collection_name = "contexts_collection",
                 vector_dim = 768, batch_size = 512,
                 host = 'localhost', port = 6333):
        
        self.batch_size = batch_size
        self.collection_name = collection_name
        self.client = self._setup_client(host, port)
        self._setup_collection(vector_dim)

    def _setup_client(self, host, port):
        return QdrantClient(host= host, port= port, prefer_grpc=True) 

    def _setup_collection(self, vector_dim):
        if not self.client.collection_exists(collection_name= self.collection_name):
            self.client.create_collection(
                collection_name= self.collection_name,
                vectors_config= models.VectorParams(size = vector_dim,
                    distance = models.Distance.DOT, on_disk=True
                ),
                optimizers_config=models.OptimizersConfigDiff(
                    default_segment_number = 5, indexing_threshold=0)
            )
    def ingest_data(self, embeddings, contexts):
             
        self.client.upload_collection(collection_name= self.collection_name,
            vectors= embeddings, batch_size= self.batch_size,
            payload=[{"context": context} for context in contexts]
        )
        
        self.client.update_collection(collection_name=self.collection_name,
            optimizers_config=models.OptimizersConfigDiff(indexing_threshold=20000))
            
    def query_collection(self, query_embeddings):
        result = self.client.query_points(
            collection_name = self.collection_name,
            query= query_embeddings,
            search_params = models.SearchParams(
                    quantization = models.QuantizationSearchParams(
                        ignore = True,
                        rescore = True,
                        oversampling = 2.0
                    )
            ),
            timeout = 1000
        )
        return result
        

In [7]:
vectordb = QdrantVectorDB()
vectordb.ingest_data(embeddings, truncated_contexts)

## retriever

In [8]:
import time

class Retriever:
    def __init__(self, vectordb, embedder):
        self.vectordb = vectordb
        self.embedder = embedder

    def search(self, query):
        query_embeddings = self.embedder.generate_query_embeddings(query)
        start_time = time.time()

        result = self.vectordb.query_collection(query_embeddings)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f'Execution of the query took : {elapsed_time} seconds')
        return result

        

In [9]:
retriever = Retriever(vectordb=vectordb, embedder=embedder)
result = retriever.search("What is warsaw famous for?")

Execution of the query took : 0.005831241607666016 seconds


In [20]:
result.points[0:5]

[ScoredPoint(id='1991c122-4bbb-400f-bd93-016416da756d', version=62, score=0.6409937739372253, payload={'context': "Chopin's music remains very popular and is regularly performed, recorded and broadcast worldwide. The world's oldest monographic music competition, the International Chopin Piano Competition, founded in 1927, is held every five years in Warsaw. The Fryderyk Chopin Institute of Poland lists on its website over eighty societies world-wide devoted to the composer and his music. The Institute site also lists nearly 1,500 performances of Chopin works on YouTube as of January 2014."}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='a7053592-0076-4e05-a0d5-529a31abe4ab', version=37, score=0.6409937739372253, payload={'context': "Chopin's music remains very popular and is regularly performed, recorded and broadcast worldwide. The world's oldest monographic music competition, the International Chopin Piano Competition, founded in 1927, is held every five years in W