In [2]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pretrained dual encoder model; the same model embeds both the
# documents and the queries later in the notebook.
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L-6-v3')

# Setup ChromaDB
# Create a persistent ChromaDB client backed by the ./chroma_store folder
client = chromadb.PersistentClient(path="./chroma_store")  # Your desired folder

# Create or load a collection
collection = client.get_or_create_collection("docs")

# Reset the collection so re-running the notebook starts from an empty index
# instead of colliding with IDs left in the persistent store by an earlier run.
# Only the IDs are needed for the delete, so ask Chroma not to load documents
# or metadatas (IDs are always part of the result).
existing = collection.get(include=[])
all_ids = existing["ids"]

if all_ids:
    collection.delete(ids=all_ids)
    print(f"Deleted {len(all_ids)} documents.")
else:
    print("Collection is already empty. Nothing to delete.")


Collection is already empty. Nothing to delete.


In [4]:
# Small demo corpus: one short sentence per unrelated topic
documents = [
    "How to grow tomatoes in pots.",
    "What is a transformer model in machine learning?",
    "Best practices for Docker security.",
    "Understanding climate change impact.",
    "Guide to UK tourist visas.",
]

# One positional ID per document: doc_0, doc_1, ...
doc_ids = [f"doc_{position}" for position, _ in enumerate(documents)]

# Embed the whole corpus in one call; row i corresponds to documents[i]
embeddings = model.encode(documents)


In [5]:
# Add to ChromaDB in one batched call instead of one call per document.
# ids, documents and embeddings are aligned by position; each embedding is
# converted to a plain Python list before being handed to Chroma.
collection.add(
    ids=doc_ids,
    documents=documents,
    embeddings=[emb.tolist() for emb in embeddings],
)

print(f"Indexed {len(documents)} documents.")


Indexed 5 documents.


In [6]:
# Embed the search text with the same model that embedded the documents
query = "How to get a UK visa"
query_embedding = model.encode(query).tolist()

# Retrieve the three nearest documents for the single query embedding
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3,
)

# results["documents"] holds one list per query; show the hits for ours
top_matches = results["documents"][0]
print(top_matches)

['Guide to UK tourist visas.', 'How to grow tomatoes in pots.', 'Best practices for Docker security.']
