<a href="https://colab.research.google.com/github/harjeet88/llm-course/blob/main/data_engg/spark_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U -q "pymilvus[model]" sentence-transformers numpy

In [5]:
!pip install -q pymilvus[milvus_lite]

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
# Import libraries (the Milvus Lite connection is opened in a later cell)
from pymilvus import MilvusClient, DataType, model
import numpy as np

In [7]:
# --- Notebook configuration -------------------------------------------------
# Name of the collection that holds the demo documents.
COLLECTION_NAME = "text_search_collection"

# Vector size produced by the 'all-MiniLM-L6-v2' embedding model.
DIMENSION = 384

# Passing a local file path as the URI makes the client use Milvus Lite;
# the database is stored in 'milvus_demo.db' in the working directory.
DB_FILE = "./milvus_demo.db"

In [8]:
# Initialize the Milvus client against the local Milvus Lite database file.
# On failure the error is reported AND re-raised: every later cell needs
# `client`, so swallowing the exception would only defer the failure to a
# confusing NameError further down the notebook.
try:
    client = MilvusClient(DB_FILE)
except Exception as e:
    print(f"❌ Failed to connect: {e}")
    raise
else:
    print(f"✅ Connected to Milvus Lite at: {DB_FILE}")

✅ Connected to Milvus Lite at: ./milvus_demo.db


In [10]:
# 1. Initialize the embedding function (PyMilvus's sentence-transformers wrapper).
# 'all-MiniLM-L6-v2' is a small, fast model; DIMENSION is expected to match its output size.
embedding_fn = model.dense.SentenceTransformerEmbeddingFunction(
    model_name='all-MiniLM-L6-v2',
    device='cpu' # Use CPU for Colab environment stability
)

# Fail fast if the configured vector size does not match the model. Otherwise
# the mismatch would only surface later, as an insert error against the schema.
if embedding_fn.dim != DIMENSION:
    raise ValueError(
        f"Embedding model produces {embedding_fn.dim}-dimensional vectors, "
        f"but DIMENSION is set to {DIMENSION}."
    )

# 2. Drop the collection if it already exists so the cell can be re-run cleanly
if client.has_collection(COLLECTION_NAME):
    client.drop_collection(COLLECTION_NAME)
    print(f"Dropped existing collection: {COLLECTION_NAME}")

# 3. Create the collection
# Explicit schema: caller-supplied integer primary key, the embedding vector,
# and the original text. Dynamic fields stay enabled so rows may carry
# additional keys that are not declared below.
schema = client.create_schema(
    auto_id=False,
    enable_dynamic_field=True
)

# Declared fields
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=DIMENSION)
schema.add_field(field_name="text_content", datatype=DataType.VARCHAR, max_length=512)

# Index parameters: AUTOINDEX leaves the index configuration to Milvus;
# COSINE is the similarity metric used for the sentence embeddings.
index_params = client.prepare_index_params()
index_params.add_index(
    field_name="vector",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)

# Create the collection with the schema and index defined above
client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema,
    index_params=index_params
)
print(f"✅ Collection '{COLLECTION_NAME}' created with dimension {DIMENSION}.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Collection 'text_search_collection' created with dimension 384.


In [11]:
# 1. Sample documents to index
docs = [
    "The Milvus vector database is open-source and specializes in massive-scale vector search.",
    "Milvus Lite allows you to run a full Milvus instance locally within a Python environment like Colab.",
    "Vector embeddings are numerical representations of text, images, or audio.",
    "A typical RAG pipeline uses Milvus to retrieve relevant context for an LLM."
]

# 2. Generate one embedding per document
print("Generating embeddings...")
vectors = embedding_fn.encode_documents(docs)

# 3. Build the rows to insert: ids start at 1 and follow document order
data_to_insert = [
    {"id": doc_id, "vector": vector, "text_content": doc}
    for doc_id, (vector, doc) in enumerate(zip(vectors, docs), start=1)
]

# 4. Insert the rows into the collection
result = client.insert(
    collection_name=COLLECTION_NAME,
    data=data_to_insert
)

# Load the collection into memory for searching
client.load_collection(COLLECTION_NAME)

# Report what the server says it inserted, not how many rows we *tried* to
# insert, so a partial or failed insert is visible in the output.
print(f"✅ Successfully inserted {result['insert_count']} vectors.")
print(f"Total entities in collection: {client.get_collection_stats(COLLECTION_NAME)['row_count']}")

Generating embeddings...
✅ Successfully inserted 4 vectors.
Total entities in collection: 4


In [12]:
# --- Semantic search over the indexed documents ---

# The natural-language question to look up
query_text = "How does Milvus help with large-scale applications?"

# Embed the question; a one-element list goes in, so one query vector comes out
query_vector = embedding_fn.encode_queries([query_text])

# Run the vector search and ask for the stored text of the two best matches
print(f"\nSearching for: '{query_text}'")
search_res = client.search(
    collection_name=COLLECTION_NAME,
    data=query_vector,
    search_params={"metric_type": "COSINE", "params": {}},
    output_fields=["text_content"],
    limit=2
)

# Show the hits for the single query (index 0 of the result list).
# NOTE(review): with the COSINE metric the 'distance' value appears to be a
# similarity score (higher = closer match), which is consistent with the
# descending values in the saved output — confirm against the Milvus metric docs.
print("--- Search Results (Top 2) ---")
for hit in search_res[0]:
    distance, text = hit['distance'], hit['entity']['text_content']
    print(f"Distance: {distance:.4f}\nResult: {text}\n")

# Release the collection now that the demo is finished
client.release_collection(COLLECTION_NAME)
print("✅ Demo complete and collection released.")


Searching for: 'How does Milvus help with large-scale applications?'
--- Search Results (Top 2) ---
Distance: 0.5466
Result: A typical RAG pipeline uses Milvus to retrieve relevant context for an LLM.

Distance: 0.4939
Result: Milvus Lite allows you to run a full Milvus instance locally within a Python environment like Colab.

✅ Demo complete and collection released.
