In [None]:
# One-time setup: uncomment to install dependencies into the kernel's environment.
# %pip install chromadb
# %pip install langchain
# %pip install sentence-transformers

In [2]:
import chromadb
from chromadb import PersistentClient
from sentence_transformers import SentenceTransformer

# Multilingual semantic-search demo: embed a few greetings with a
# sentence-transformers model, store them in a persistent ChromaDB
# collection, then look up the nearest neighbours of a query string.

# Open (or create) the on-disk ChromaDB store in ./chroma_db/ so the data
# outlives the kernel.
client = PersistentClient(path="./chroma_db")

# Create the collection on first run; reuse it on later runs.
collection = client.get_or_create_collection("my_docs")

# Load the embedding model (LaBSE = multilingual).
# NOTE(review): "offline" presumably holds only once the model weights are
# cached locally -- confirm for your environment.
model = SentenceTransformer("LaBSE")  # or 'all-MiniLM-L6-v2'

# Sample texts (in multiple languages)
texts = [
    "Hello world",
    "Bonjour le monde",
    "Hola mundo",
    "ہیلو دنیا",        # Urdu
    "سلام دنیا",         # Persian
]
# Deterministic ids (doc0, doc1, ...) so each text maps to the same record
# on every run.
ids = [f"doc{i}" for i in range(len(texts))]
# Embeddings are computed here and passed explicitly; .tolist() converts the
# encoder output into plain Python lists for Chroma.
embeddings = model.encode(texts).tolist()

# Store in Chroma.
# upsert (not add): the store is persistent and the ids are fixed, so
# re-running this cell with add() would try to insert ids that already
# exist. upsert inserts on the first run and overwrites on later runs,
# which keeps the cell idempotent under Restart & Run All.
collection.upsert(
    documents=texts,
    embeddings=embeddings,
    ids=ids,
    metadatas=[{"lang": "auto"} for _ in texts],
)

# Search with a new query, embedded with the same model as the documents.
query = "سلام"  # Same meaning as 'hello'
query_vec = model.encode([query]).tolist()[0]

results = collection.query(
    query_embeddings=[query_vec],
    n_results=3,
    include=["documents", "distances"],
)

# Show results. Each result field holds one list per query embedding, so
# index 0 selects the hits for our single query. Smaller distance = closer.
for doc, dist in zip(results["documents"][0], results["distances"][0]):
    print(f"Result: {doc} | Distance: {dist:.4f}")


Result: سلام دنیا | Distance: 0.5351
Result: Hello world | Distance: 0.6217
Result: ہیلو دنیا | Distance: 0.6236
