In [1]:
!pip install transformers datasets torch sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [2]:
import nltk

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Chunk and Index data into DB ##

In [3]:
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer

from datasets import load_dataset
datasets = ['hagrid', 'hotpotqa', 'msmarco']

retrieval_model = "BAAI/LLM-Embedder"
# retrieval_model = "sentence-transformers/all-miniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(retrieval_model)

TOKEN_LIMIT = 512
SLIDING_WINDOW_OVERLAP = 100

# Function for chunking with token limit and sliding window
def chunk_with_token_limit(text, token_limit, overlap):
    sentences = sent_tokenize(text)  # Split text into sentences
    chunks = []  # Store resulting chunks
    current_chunk = []  # Temporarily hold sentences for the current chunk
    current_chunk_tokens = 0  # Token count for the current chunk

    for sentence in sentences:
        # Tokenize the sentence and calculate its token count
        sentence_tokens = tokenizer.tokenize(sentence)
        num_tokens = len(sentence_tokens)

        # print(f"Tokens: {sentence_tokens[0]}")

        # If adding this sentence exceeds the token limit
        if current_chunk_tokens + num_tokens > token_limit:
            # Save the current chunk
            chunk_text = " ".join(current_chunk)
            chunks.append(chunk_text)

            # Prepare the next chunk with overlap
            overlap_tokens = tokenizer.tokenize(" ".join(current_chunk[-1:]))
            current_chunk = [sentence for sentence in current_chunk[-(overlap // len(overlap_tokens)) :]] if current_chunk else []
            current_chunk_tokens = sum(len(tokenizer.tokenize(sent)) for sent in current_chunk)

        # Add the sentence to the current chunk
        current_chunk.append(sentence)
        current_chunk_tokens += num_tokens

    # Add the last chunk if it exists
    if current_chunk:
        chunk_text = " ".join(current_chunk)
        chunks.append(chunk_text)

    return chunks

def process_document_with_identifiers(document):
    processed_data = []
    title_count = -1  # to start from 0
    # print("document>>>>>>>",document)
    for section in document:
        section_chunks = []
        passage_count = [ord('a')]  # Passage identifier as a list to handle nested increments
        title_count += 1  # Increment title count

        # Tokenize the section into sentences
        sentences = sent_tokenize(section)
        for sentence in sentences:
            if sentence.startswith("Title:"):
                # New document detected
                identifier = f"{title_count}{''.join(chr(c) for c in passage_count)}"  # Identifier for the title
                chunked_texts = chunk_with_token_limit(sentence, TOKEN_LIMIT, SLIDING_WINDOW_OVERLAP)
                for chunk in chunked_texts:
                    section_chunks.append([identifier, chunk])
                passage_count = [ord('a')]  # Reset passage count for the new document
            else:
                # Sentence under the current document
                identifier = f"{title_count}{''.join(chr(c) for c in passage_count)}"
                chunked_texts = chunk_with_token_limit(sentence, TOKEN_LIMIT, SLIDING_WINDOW_OVERLAP)
                #print("chunked_texts>>>>process_document_with_identifiers>>>>> "+ "".join(chunked_texts))
                for chunk in chunked_texts:
                    section_chunks.append([identifier, chunk])

                # Increment passage_count intelligently
                i = len(passage_count) - 1
                while i >= 0:
                    passage_count[i] += 1
                    if passage_count[i] > ord('z'):
                        passage_count[i] = ord('a')
                        if i == 0:
                            passage_count.insert(0, ord('a'))  # Add a new character to the identifier
                        i -= 1
                    else:
                        break


        # print("section_chunks>>>>>>>",section_chunks)
        processed_data.append(section_chunks)

    return processed_data

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

## **Check uniqueness of data before insertion** ##

In [4]:
import hashlib

# Function to generate a hash based on content and key metadata
def generate_hash(content, metadata):
    """Generate a unique hash for the document content and key metadata."""
    key_fields = f"{content}|{metadata.get('item_index')}|{metadata.get('prefix')}"
    return hashlib.md5(key_fields.encode('utf-8')).hexdigest()

# Function to retrieve existing hashes from the database
def get_existing_hashes(collection):
    """Retrieve all existing hashes (IDs) currently in the database."""
    all_records = collection.get(include=["documents", "metadatas"])  # Fetch documents and metadata
    existing_hashes = set()
    for doc, metadata in zip(all_records["documents"], all_records["metadatas"]):
        doc_hash = generate_hash(doc, metadata)
        existing_hashes.add(doc_hash)
    return existing_hashes

# Function to retrieve existing hashes from the database
def get_existing_hashes_milvus(all_records):
    """Retrieve all existing hashes (IDs) currently in the database."""
    existing_hashes = set()
    print(f"all records >>> {len(all_records)}")    
    if all_records == None or len(all_records) == 0:
        return existing_hashes
        
    for doc, metadata in zip(all_records["documents"], all_records["metadata"]):
        doc_hash = generate_hash(doc, metadata)
        existing_hashes.add(doc_hash)
    return existing_hashes

## Store and retrieve data from Milvus** ##

In [5]:
!pip install pymilvus pymilvus[model]

Collecting pymilvus
  Downloading pymilvus-2.5.2-py3-none-any.whl.metadata (5.7 kB)
Collecting python-dotenv<2.0.0,>=1.0.1 (from pymilvus)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.4.11-py3-none-manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting milvus-model>=0.1.0 (from pymilvus[model])
  Downloading milvus_model-0.2.11-py3-none-any.whl.metadata (1.6 kB)
Collecting onnxruntime (from milvus-model>=0.1.0->pymilvus[model])
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime->milvus-model>=0.1.0->pymilvus[model])
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->milvus-model>=0.1.0->pymilvus[model])
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading pymilvus-2.5.2-py3-none-an

In [11]:
import time
import numpy as np
from pymilvus import connections
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection
from pymilvus import MilvusClient
from pymilvus import utility

class VectorDataStore:
    db_url = "http://localhost:19530"
    #description = f"collection created for {self.name}"

    def __init__(self, path="/content/ragbench.db"):
        self.client = MilvusClient(path)



    def create_collection(self, name, vec_dim=128):
        if self.client.has_collection(name):
            self.default_collection_name = name

        self.description = f"collection to store {name}"

        index_params = self.client.prepare_index_params()
        index_params.add_index(
            field_name="embedding",
            index_type="AUTOINDEX",
            # params={
            #     "M": 16, # Number of bidirectional links created for each element
            #     "efConstruction": 200 # Size of the dynamic list for the nearest neighbours during indexing
            # },
            metric_type="COSINE"
        )
        schema = self.client.create_schema(
            auto_id=False,
            enable_dynamic_fields=True,
        )
        schema.add_field(field_name="pk", datatype=DataType.VARCHAR, max_length=64, is_primary=True)
        schema.add_field(field_name="metadata", datatype=DataType.JSON)
        schema.add_field(field_name="documents", datatype=DataType.VARCHAR, max_length=512)
        schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=vec_dim)
        schema.add_field(field_name="timestamp", datatype=DataType.INT64)
        
        collection = self.client.create_collection(collection_name=name,
                                       schema=schema,
                                       index_params=index_params)
        self.current_collection = collection
        return collection


    def get_collection(self, name):
        if not self.client.has_collection(name):
            raise ValueError(f"Collection '{name}' does not exist.")
        self.current_collection = Collection(name)
        return self.current_collection

    def get_all_records(self, collection):
        all_records = self.client.query(
            collection_name=collection,
            filter=None,
            output_fields=["documents", "metadata"],
            limit=10000
        )
        if all_records == None:
            all_records = []

        return all_records

    def has_entities(self, name):
        if not self.client.has_collection(name):
            raise ValueError(f"Collection '{name}' does not exists.")
        self.default_collection = name
        collection_stats = self.client.get_collection_stats(collection_name)
        count = collection_stats.get("row_count", 0)  # Retrieve the number of entities
        return count

    def insert(self, collection_name: str, metadata: list[dict[str, any]],
                documents: list[str], embeddings: np.ndarray, ids: list[int]):

        if not self.client.has_collection(collection_name):
            raise ValueError(f"Collection '{collection_name}' does not exist. Create it first.")

        if len(metadata) != len(embeddings) != len(documents) != len(ids):
           raise ValueError("Metadata, documnets, ids and embeddings must have the same length.")

        data = []
        for meta, doc, emb, id in zip(metadata, documents, embeddings, ids):
          datum = {
              "pk": id,
              "metadata": meta,
              "documents": doc,
              "embedding": emb.tolist(),
              "timestamp": int(time.time()),
          }
          data.append(datum)

        self.client.insert(collection_name, data)
        print(f"Inserted {len(metadata)} records into collection '{collection_name}'.")

    def drop_collection(self, collection_name: str):
        if not self.client.has_collection(collection_name):
            raise ValueError(f"Collection '{collection_name}' does not exist.")
        self.client.drop_collection(collection_name)
        print(f"Dropped collection '{collection_name}'.")

    def delete_all(self, collection_name: str):
        if not self.client.has_collection(collection_name):
            raise ValueError(f"Collection '{collection_name}' does not exist.")
        self.client.delete(collection_name, expr="pk >= 0")
        self.client.flush([collection_name])

    def search(self, query_embedding: np.ndarray, top_k: int = 10) -> list[dict[str, any]]:
        """
        Search across all collections for the top-k closest embeddings.
        :param query_embedding: The embedding vector to search for.
        :param top_k: Number of top results to retrieve.
        :return: A list of dictionaries containing collection name, id, metadata, and distance.
        """
        results = []
        #collections = self.client.list_collections()
        collections = ["ragbench_collection_techqa_v09"]
        start_time = time.time()
        for collection_name in collections:
            if not self.client.has_collection(collection_name):
                continue

            # Set params to COSINE to match chromadb
            search_params = {
                "metric_type": "COSINE", 
                "params": {
                    "ef": 64
                }
            }

            search_results = self.client.search(
                collection_name=collection_name,
                data=[query_embedding],
                anns_field="embedding",
                search_params=search_params,
                limit=top_k,
                output_fields=["metadata", "documents"]
            )

            for hits in search_results:
                for hit in hits:
                    print(f"Collection: {collection_name}, data: {str(hit)}")
                    results.append({
                        "collection": collection_name,
                        "id": hit["id"],
                        "metadata": hit["entity"]["metadata"],
                        "distance": hit["distance"],
                        "documents": hit["entity"]["documents"]
                      })

        results = sorted(results, key=lambda x: x["distance"])[:top_k]
        end_time = time.time()
        print(f"Search completed. Found {len(results)} results. in {end_time - start_time} secs")
        return results

    def extract_documents(self, search_results: list[dict[str, any]]) -> list[np.ndarray]:
      """
      Extract embedding values from search results.
      :param search_results: List of dictionaries containing search results.
      :return: List of embedding vectors as NumPy arrays.
      """
      return [np.array(result["documents"]) for result in search_results if "documents" in result]

In [6]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

datasets = ['hagrid', 'hotpotqa', 'msmarco']

all_documents = []
all_ids = []
all_metadatas = []

# Process each dataset
doc_idx = 0  # Global document index for unique IDs
for dataset in datasets:
    data = load_dataset("rungalileo/ragbench", dataset, split="train")
    # #only select first 5 records for debugging duplicate records. **PLEASE REMOVE THIS AFTER DEBUGGING**
    # data = data.select(range(2))
    for idx, row in tqdm(enumerate(data), desc=f"Processing {dataset}"):
        # Extract document text
        doc_text = row.get('documents', '')

        # Skip if no documents found
        if not doc_text:
            continue

        # Process the document
        processed_output = process_document_with_identifiers(doc_text)
        added_item_idxs = set()

        # Populate the lists
        for section_idx, section in enumerate(processed_output):
            for item_idx, (prefix, content) in enumerate(section):
                # Skip if this item_idx has already been processed
                if item_idx in added_item_idxs:
                    continue

                # Add the item_idx to the set to track it
                added_item_idxs.add(item_idx)

                # Add the document
                document = f"[{prefix}] {content}"
                all_documents.append(document)

                # Construct a globally unique ID
                doc_id = f"{dataset}_{doc_idx}_{section_idx}_{item_idx}"
                all_ids.append(doc_id)

                # Construct metadata
                metadata = {
                    "dataset": dataset,
                    "global_index": doc_idx,
                    "section_index": section_idx,
                    "item_index": item_idx,
                    "prefix": prefix,
                    "type": "Title" if prefix.endswith("a") else "Passage",
                }
                all_metadatas.append(metadata)

        doc_idx += 1  # Increment global document index

# Step 4: Generate Embeddings
#embedder = SentenceTransformer(retrieval_model)  # Pretrained sentence transformer
embedder = SentenceTransformer(retrieval_model)  # Pretrained sentence transformer
batch_size = 2500  # Adjust based on available memory

# Generate embeddings in batches
all_embeddings = []
for i in tqdm(range(0, len(all_documents), batch_size), desc="Generating embeddings"):
    batch_docs = all_documents[i:i + batch_size]
    batch_embeddings = embedder.encode(batch_docs, show_progress_bar=True)
    all_embeddings.extend(batch_embeddings)

README.md:   0%|          | 0.00/24.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/9.42M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.97M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2892 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/322 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1318 [00:00<?, ? examples/s]

Processing hagrid: 2892it [00:10, 269.97it/s]


train-00000-of-00001.parquet:   0%|          | 0.00/6.37M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.45M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1883 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/424 [00:00<?, ? examples/s]

Processing hotpotqa: 1883it [00:07, 259.50it/s]


train-00000-of-00001.parquet:   0%|          | 0.00/9.13M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/2.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1870 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/423 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/397 [00:00<?, ? examples/s]

Processing msmarco: 1870it [00:13, 136.80it/s]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:   6%|▌         | 1/18 [00:06<01:53,  6.65s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  11%|█         | 2/18 [00:12<01:40,  6.25s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  17%|█▋        | 3/18 [00:18<01:30,  6.03s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  22%|██▏       | 4/18 [00:24<01:23,  5.96s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  28%|██▊       | 5/18 [00:30<01:18,  6.06s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  33%|███▎      | 6/18 [00:36<01:12,  6.06s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  39%|███▉      | 7/18 [00:42<01:07,  6.11s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  44%|████▍     | 8/18 [00:48<01:01,  6.12s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  50%|█████     | 9/18 [00:55<00:55,  6.13s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  56%|█████▌    | 10/18 [01:01<00:49,  6.16s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  61%|██████    | 11/18 [01:07<00:44,  6.32s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  67%|██████▋   | 12/18 [01:14<00:37,  6.31s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  72%|███████▏  | 13/18 [01:19<00:30,  6.08s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  78%|███████▊  | 14/18 [01:24<00:23,  5.80s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  83%|████████▎ | 15/18 [01:30<00:16,  5.61s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  89%|████████▉ | 16/18 [01:35<00:11,  5.52s/it]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generating embeddings:  94%|█████████▍| 17/18 [01:40<00:05,  5.52s/it]

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Generating embeddings: 100%|██████████| 18/18 [01:43<00:00,  5.74s/it]


In [12]:
collection_name = "ragbench_hhm_v1"
datastor = VectorDataStore()

insert_data = False
store_client = "Milvus"
num_records = 0

if datastor.client.has_collection(collection_name):
  num_records = datastor.has_entities(collection_name)
  if num_records == 0:
    insert_data = True
else:
  datastor.create_collection(collection_name, embedder.get_sentence_embedding_dimension())
  insert_data = True

print(f"count >>> {num_records} insert_data >>> {insert_data}")

count >>> 0 insert_data >>> True


In [13]:
# Adding data to Milvus with enhanced duplicate check
if not insert_data:
    existing_hashes = set()
else:
    existing_hashes = get_existing_hashes_milvus(datastor.get_all_records(collection_name))

for i in tqdm(range(0, len(all_documents), batch_size), desc="Adding data to Milvus"):
    batch_embeddings = all_embeddings[i:i + batch_size]
    batch_metadatas = all_metadatas[i:i + batch_size]
    batch_documents = all_documents[i:i + batch_size]
    batch_ids = []

    # Generate hashes for each document in the batch
    for doc, metadata in zip(batch_documents, batch_metadatas):
        doc_hash = generate_hash(doc, metadata)
        if doc_hash not in existing_hashes:
            batch_ids.append(doc_hash)
            existing_hashes.add(doc_hash)  # Add hash to local set to avoid duplicates in the same batch
        else:
            print(f"Skipping duplicate document: {doc[:50]}...")  # Print a preview of the duplicate doc

    # Add non-duplicate documents to the database
    if batch_ids:  # Ensure there are non-duplicate documents to add
        # Add the batch to the Milvus collection
        if store_client == "Milvus" and insert_data:
            datastor.insert(collection_name,
                metadata=batch_metadatas,
                documents=batch_documents,
                embeddings=np.array(batch_embeddings),
                ids=batch_ids
            )

all records >>> 0


Adding data to Milvus:   0%|          | 0/18 [00:00<?, ?it/s]

Skipping duplicate document: [0a] The Worst Witch is a series of children's boo...
Skipping duplicate document: [0b] The series are primarily boarding school and ...
Skipping duplicate document: [0c] The first one, "The Worst Witch", was publish...
Skipping duplicate document: [0d] The books have become some of the most succes...
Skipping duplicate document: [0a] Organophosphate pesticides are one of the top...
Skipping duplicate document: [1b] Organophosphates are used as insecticides, me...
Skipping duplicate document: [1c] Symptoms include increased saliva and tear pr...
Skipping duplicate document: [1d] While onset of symptoms is often within minut...
Skipping duplicate document: [1e] Symptoms can last for days to weeks....
Skipping duplicate document: [1f] Organophosphate poisoning occurs most commonl...
Skipping duplicate document: [1g] Exposure can be from drinking, breathing in t...
Skipping duplicate document: [1h] The underlying mechanism involves the inhibit...
Skipping dupl

Adding data to Milvus:   6%|▌         | 1/18 [00:02<00:35,  2.08s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] Nevertheless, much of Samus' media reception ...
Skipping duplicate document: [0b] GameTrailers named Samus number one on a 2006...
Skipping duplicate document: [0c] GameDaily ranked Samus seventh on a list of t...
Skipping duplicate document: [0d] She was further listed on GameDaily's list of...
Skipping duplicate document: [0e] In 2008, Spike placed Samus on the top of the...
Skipping duplicate document: [0f] In 2009, UGO.com ranked Samus as 11th on a li...
Skipping duplicate document: [0a] Dallas, the third largest city in the U.S. st...
Skipping duplicate document: [0b] The tallest building in the city is the Bank ...
Skipping duplicate document: [0c] It also stands as the 3rd-tallest building in...
Skipping duplicate document: [0d] The second-tallest skyscraper in the city is ...
Skipping duplicate document: [0e] The Comerica Bank Tower, completed in 1987 an...
Skipping duplicate document: [

Adding data to Milvus:  11%|█         | 2/18 [00:03<00:29,  1.82s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] Rikers Island () is New York City's main jail...
Skipping duplicate document: [0b] Supposedly named after Abraham Rycken who bou...
Skipping duplicate document: [0c] The island was originally under in size, but ...
Skipping duplicate document: [0d] Much of the first stages of expansion was acc...
Skipping duplicate document: [0e] The island itself is politically part of the ...
Skipping duplicate document: [0a] Oxygen saturation is the fraction of oxygen-s...
Skipping duplicate document: [0b] The human body requires and regulates a very ...
Skipping duplicate document: [0c] Normal blood oxygen levels in humans are cons...
Skipping duplicate document: [0d] If the level is below 90 percent, it is consi...
Skipping duplicate document: [0e] Blood oxygen levels below 80 percent may comp...
Skipping duplicate document: [0f] Continued low oxygen levels may lead to respi...
Skipping duplicate document: [

Adding data to Milvus:  17%|█▋        | 3/18 [00:04<00:23,  1.57s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] Sergey Mikhaylovich Prokudin-Gorsky (, ;  – S...
Skipping duplicate document: [0b] He is best known for his pioneering work in c...
Skipping duplicate document: [0a] The two top stars of the Attitude Era, Stone ...
Skipping duplicate document: [0b] Triple H would also be featured prominently d...
Skipping duplicate document: [0c] Eddie Guerrero, Rey Mysterio, Kurt Angle, Chr...
Skipping duplicate document: [0d] From mid 2002 to 2003, WWE brought several pr...
Skipping duplicate document: [0e] The Great American Bash, originally a WCW pay...
Skipping duplicate document: [1f] The match came to an end when Lesnar stopped ...
Skipping duplicate document: [1g] Lesnar remained the youngest world champion i...
Skipping duplicate document: [0a] Between 1996 and 1997, the inside of the stad...
Skipping duplicate document: [0b] The stadium was imploded on August 2, 1997; t...
Skipping duplicate document: [

Adding data to Milvus:  22%|██▏       | 4/18 [00:06<00:19,  1.42s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] In "Final Crisis: Rage of the Red Lanterns", ...
Skipping duplicate document: [0b] The battery stands before a great lake of blo...
Skipping duplicate document: [0c] Harnessing the red light of rage, he sends hi...
Skipping duplicate document: [0d] Their blood spoils from within, forcing them ...
Skipping duplicate document: [0e] Additionally, the Red Lanterns are reduced to...
Skipping duplicate document: [0f] Once Atrocitus assembles a sufficient force, ...
Skipping duplicate document: [0g] Coincidentally, the Sinestro Corps have simil...
Skipping duplicate document: [0h] In turn, both groups are then ambushed by the...
Skipping duplicate document: [0i] Among the many Red Lanterns being seen by rea...
Skipping duplicate document: [0j] After being tried and found guilty for the mu...
Skipping duplicate document: [0k] While being escorted away from Oa, her ship i...
Skipping duplicate document: [

Adding data to Milvus:  28%|██▊       | 5/18 [00:07<00:16,  1.24s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] New mixes were added regularly to "DJ Hero" t...
Skipping duplicate document: [0b] Unlike "Guitar Hero" games' downloadable cont...
Skipping duplicate document: [0c] Furthermore, mixes are provided only as bundl...
Skipping duplicate document: [0d] Critics viewed the lack of individual song se...
Skipping duplicate document: [0e] The long time between the second downloadable...
Skipping duplicate document: [0f] All of the DJ Hero DLC packs (along with the ...
Skipping duplicate document: [0g] However, they can be reinstalled if the playe...
Skipping duplicate document: [0h] While the base game contained 93 mixes in tot...
Skipping duplicate document: [0i] These mixes either were at one point planned ...
Skipping duplicate document: [0j] These mixes included various songs from "Nirv...
Skipping duplicate document: [0a] The presidential election ("Reichspräsidenten...
Skipping duplicate document: [

Adding data to Milvus:  33%|███▎      | 6/18 [00:07<00:12,  1.07s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] Some of Perry's views are in opposition to Co...
Skipping duplicate document: [0b] In his 2010 book "Fed Up!...
Skipping duplicate document: [0c] ", he takes issue with the Federal government...
Skipping duplicate document: [0d] He also criticizes the 17th Amendment, which ...
Skipping duplicate document: [0e] Senators....
Skipping duplicate document: [0f] According to Perry, the 16th and 17th Amendme...
Skipping duplicate document: [0g] Congress was free to tax and spend to its hea...
Skipping duplicate document: [1h] In 1920, Benjamin Gitlow was convicted under ...
Skipping duplicate document: [1i] Prior to that time, the Bill of Rights was co...
Skipping duplicate document: [0a] Yahoo!...
Skipping duplicate document: [0b] made a deal with the online communications co...
Skipping duplicate document: [0c] Marvin Gavin, who worked at Four11 as the Dir...
Skipping duplicate document: [0d] They wer

Adding data to Milvus:  39%|███▉      | 7/18 [00:08<00:10,  1.10it/s]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0c] He referred the specimen to the genus "Felis"...
Skipping duplicate document: [0d] The species name means "fate" or "destiny", b...
Skipping duplicate document: [0e] In an 1880 article about extinct American cat...
Skipping duplicate document: [0f] Most North American finds were scanty until e...
Skipping duplicate document: [0g] "S. fatalis" has junior synonyms such as "S. ...
Skipping duplicate document: [0h] American paleontologist Annalisa Berta consid...
Skipping duplicate document: [0i] Swedish paleontologists Björn Kurtén and Lars...
Skipping duplicate document: [2j] Though some later authors used Lund's origina...
Skipping duplicate document: [2k] Some South American specimens have been refer...
Skipping duplicate document: [2k] Among those who survived the first several da...
Skipping duplicate document: [2l] It has been argued that countless numbers of ...
Skipping duplicate document: [

Adding data to Milvus:  44%|████▍     | 8/18 [00:09<00:08,  1.13it/s]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] No....
Skipping duplicate document: [0a] The Todd Bertuzzi–Steve Moore incident (also ...
Skipping duplicate document: [0b] In the first period, Steve Moore fought Vanco...
Skipping duplicate document: [0c] The Avalanche would go on to build up a large...
Skipping duplicate document: [0d] Late in the third period, Todd Bertuzzi was s...
Skipping duplicate document: [0e] After failing to instigate Moore to fight, Be...
Skipping duplicate document: [0f] Bertuzzi landed on top of him, driving Moore ...
Skipping duplicate document: [0g] Moore was knocked out and lay motionless for ...
Skipping duplicate document: [0h] The combination of the hit, fall, and piling-...
Skipping duplicate document: [0i] The incident ended Moore's professional hocke...
Skipping duplicate document: [0j] On August 19, 2014, it was reported the civil...


Adding data to Milvus:  50%|█████     | 9/18 [00:22<00:43,  4.80s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [1f] He has appeared in more films than any other ...


Adding data to Milvus:  56%|█████▌    | 10/18 [00:36<00:59,  7.44s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [1h] Throughout his life and political career, he ...


Adding data to Milvus:  61%|██████    | 11/18 [00:49<01:04,  9.28s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] Liz Rose (born in Dallas, Texas) is an Americ...
Skipping duplicate document: [0b] She has co-written twenty of Swift's official...
Skipping duplicate document: [0a] County Antrim (named after the town of Antrim...
Skipping duplicate document: [0b] Adjoined to the north-east shore of Lough Nea...
Skipping duplicate document: [0c] County Antrim has a population density of 203...
Skipping duplicate document: [0d] It is also one of the thirty-two traditional ...
Skipping duplicate document: [2g] The building was designed by Henry N. Cobb of...
Skipping duplicate document: [2h] It is one of the most recognizable buildings ...
Skipping duplicate document: [0a] "Chim Chim Cher-ee" is a song from "Mary Popp...
Skipping duplicate document: [0b] It was originally sung by Dick Van Dyke and J...
Skipping duplicate document: [0c] The song can be heard in the "Mary Poppins" s...


Adding data to Milvus:  67%|██████▋   | 12/18 [01:02<01:03, 10.57s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [1f] When the Cardinals reopened the team Hall of ...
Skipping duplicate document: [0a] The 1976 NBA draft was the 30th annual draft ...
Skipping duplicate document: [0b] The draft was held on June 8, 1976, before th...
Skipping duplicate document: [0c] In this draft, 18 NBA teams took turns select...
Skipping duplicate document: [0d] The first two picks in the draft belonged to ...
Skipping duplicate document: [0e] The Atlanta Hawks won the coin flip and were ...
Skipping duplicate document: [0f] The Hawks then traded the first pick to the H...
Skipping duplicate document: [0g] The remaining first-round picks and the subse...
Skipping duplicate document: [0h] The New York Knicks forfeited their first-rou...
Skipping duplicate document: [0i] The 76ers, the Golden State Warriors and the ...
Skipping duplicate document: [0j] A player who had finished his four-year colle...
Skipping duplicate document: [

Adding data to Milvus:  72%|███████▏  | 13/18 [01:16<00:56, 11.36s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0a] Please try again later....
Skipping duplicate document: [0a] Overview....
Skipping duplicate document: [0d] 2....
Skipping duplicate document: [0a] Making the world better, one answer at a time...
Skipping duplicate document: [0b] (2000 U.S. Census)....
Skipping duplicate document: [0f] 3...
Skipping duplicate document: [1f] ADVERTISEMENT....
Skipping duplicate document: [0b] (United States)....
Skipping duplicate document: [0a] Making the world better, one answer at a time...


Adding data to Milvus:  78%|███████▊  | 14/18 [01:29<00:47, 11.97s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0c] 1....
Skipping duplicate document: [0a] Save....
Skipping duplicate document: [0a] [ 2 syll....
Skipping duplicate document: [0b] (2000 U.S. Census)....
Skipping duplicate document: [0a] Noun....
Skipping duplicate document: [0a] Definition....
Skipping duplicate document: [0a] Yes....
Skipping duplicate document: [0f] 2....
Skipping duplicate document: [0a] Definition....
Skipping duplicate document: [0e] Supplement....
Skipping duplicate document: [0e] ADVERTISEMENT....
Skipping duplicate document: [0d] 3....
Skipping duplicate document: [0a] Save....
Skipping duplicate document: [0f] A....


Adding data to Milvus:  83%|████████▎ | 15/18 [01:42<00:37, 12.37s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0b] Source: HR Reported data as of October 2015....
Skipping duplicate document: [0a] 1....
Skipping duplicate document: [0a] 1....
Skipping duplicate document: [0c] 2....
Skipping duplicate document: [1e] 3....
Skipping duplicate document: [0a] Canada is unusual among developed countries i...
Skipping duplicate document: [0b] Canada also has a sizable manufacturing secto...
Skipping duplicate document: [0a] Noun....
Skipping duplicate document: [0d] Translations....
Skipping duplicate document: [0a] Noun....
Skipping duplicate document: [0b] Source: HR Reported data as of October 2015....
Skipping duplicate document: [0c] 1  Salary....
Skipping duplicate document: [0a] Please try again later....
Skipping duplicate document: [0a] Rating Newest Oldest....
Skipping duplicate document: [1g] 1....
Skipping duplicate document: [0a] Causes....
Skipping duplicate document: [0f] 2....
Skipping duplicate docu

Adding data to Milvus:  89%|████████▉ | 16/18 [01:56<00:25, 12.64s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0d] ADVERTISEMENT....
Skipping duplicate document: [0c] 2....
Skipping duplicate document: [1e] 3....
Skipping duplicate document: [1e] !...
Skipping duplicate document: [0a] Rating Newest Oldest....
Skipping duplicate document: [2g] 3....
Skipping duplicate document: [0b] ....
Skipping duplicate document: [0a] noun....
Skipping duplicate document: [1d] To estimate costs for your project: 1....
Skipping duplicate document: [1f] 2....
Skipping duplicate document: [0a] 6....
Skipping duplicate document: [0c] 2....
Skipping duplicate document: [0a] 2....
Skipping duplicate document: [0f] 2....
Skipping duplicate document: [0a] Definition....
Skipping duplicate document: [1g] 4....
Skipping duplicate document: [0a] Overview....
Skipping duplicate document: [0a] Report Abuse....
Skipping duplicate document: [1e] 4....
Skipping duplicate document: [0a] Report Abuse....
Skipping duplicate document: [0a] Def

Adding data to Milvus:  94%|█████████▍| 17/18 [02:09<00:12, 12.81s/it]

Inserted 2500 records into collection 'ragbench_hhm_v1'.
Skipping duplicate document: [0c] 1....
Skipping duplicate document: [0e] 2....
Skipping duplicate document: [0a] Uses....
Skipping duplicate document: [1f] ADVERTISEMENT....
Skipping duplicate document: [0h] ....
Skipping duplicate document: [0d] 1....
Skipping duplicate document: [0b] n. 1....
Skipping duplicate document: [0d] Each salary is associated with a real job pos...
Skipping duplicate document: [0b] n. 1....
Skipping duplicate document: [0c] SoulUrge Number: 1....
Skipping duplicate document: [0e] People with this name have a deep inner desir...
Skipping duplicate document: [0d] 2....
Skipping duplicate document: [0h] 3....
Skipping duplicate document: [0a] In this article....
Skipping duplicate document: [0a] Definition....


Adding data to Milvus: 100%|██████████| 18/18 [02:09<00:00,  7.22s/it]

Inserted 1159 records into collection 'ragbench_hhm_v1'.





In [None]:
questions = ['When was Rolex founded?', 'How large is the region of Macedonia?', 
             'Where is GMT Games headquartered?', 'What state is directly north of North Carolina?', 
             'When was Brown v. Board of Education?',
             
             'What star of Parks and Recreation appeared in November?', 
             'What is the capacity of the Stadium, other than Kauffman Stadium, designed by Charles Deaton ?', 
             'What was the island, on which Marinelli Glacier is located, formerly known as?', 
             'The American Sweetgum is the hostplant of what kind of bug?', 
             'The name of the Japanese rock band T-Bolan was inspired by the name of an English rock band formed in what year?',
             
             'symptoms of pregnancy before a missed period', 'monoclonal antibodies biology definition', 
             'what is iron sulfate', "who sang one day i'll fly away", 
             'describe the antebellum reform movement period'
            ]

## **Retrieve Candidates from DB **##