<a href="https://colab.research.google.com/github/kukretinishtha/medium_blog/blob/medium/caching_in_vectordb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install chromadb sentence-transformers

Collecting chromadb
  Downloading chromadb-1.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.25.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentele

In [2]:
import chromadb
from chromadb.config import Settings

# Initialize a ChromaDB client with default settings and fetch (or create) the demo collection.
client = chromadb.Client(Settings())
collection = client.get_or_create_collection("semantic_cache_demo")

# Seed the collection with a few documents. `upsert` (rather than `add`) keeps this
# cell idempotent: because the collection is obtained with get_or_create_collection,
# re-running the cell would otherwise try to insert ids "1"-"3" a second time.
collection.upsert(
    documents=["How can I help you?", "What's your name?", "Goodbye!"],
    ids=["1", "2", "3"]
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 103MiB/s]


In [3]:
from sentence_transformers import SentenceTransformer
import hashlib
import numpy as np

# Load the "all-MiniLM-L6-v2" sentence-transformers model; embed_cache_key uses it
# to encode each query into an embedding.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embedding-based cache key
def embed_cache_key(query: str, precision=2):
    """Embed ``query`` and derive a cache key from the rounded embedding.

    Args:
        query: Text to encode with the module-level ``model``.
        precision: Number of decimals the embedding is rounded to before hashing.

    Returns:
        A ``(key, embedding)`` tuple: the MD5 hex digest of the rounded
        embedding's bytes, and the unrounded embedding converted to a list.
    """
    embedding = model.encode(query)
    # Adding 0.0 turns any -0.0 produced by rounding small negative components
    # into +0.0. The two compare equal but have different byte patterns, so
    # without this step vectors that round to the same values could still hash
    # to different keys.
    rounded = np.round(embedding, decimals=precision) + 0.0
    return hashlib.md5(rounded.tobytes()).hexdigest(), embedding.tolist()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
import json

# In-memory stand-in for a Redis-style store: string cache key -> JSON-encoded query result.
semantic_cache = dict()

# Semantic cache-aware vector search
def semantic_vector_search(query: str, top_k=2):
    """Run a vector search for ``query``, reusing a cached result when one exists.

    The cache key combines the hash returned by ``embed_cache_key`` with
    ``top_k``, so a result stored for one ``top_k`` is never served to a call
    that asks for a different number of results.

    Args:
        query: Text to search for.
        top_k: Number of results requested from the collection.

    Returns:
        The query result. On a cache hit it is decoded from the JSON stored in
        ``semantic_cache``; on a miss it is the value returned by
        ``collection.query``.
    """
    embedding_hash, embedding = embed_cache_key(query)
    # The cached payload depends on how many results were requested, so top_k
    # has to be part of the key.
    key = f"{embedding_hash}:{top_k}"

    cached = semantic_cache.get(key)
    if cached is not None:
        print(f"Cache hit for '{query}'")
        return json.loads(cached)

    print(f"Cache miss for '{query}' → computing results")
    results = collection.query(query_embeddings=[embedding], n_results=top_k)

    semantic_cache[key] = json.dumps(results)
    return results

In [5]:
# Differently worded prompts with similar intent, used to exercise the cache.
queries = ["How are you?", "What's up?", "What's going on?"]

for query_text in queries:
    print(f"\n🔍 Query: {query_text}")
    search_result = semantic_vector_search(query_text)
    # Index as in the result layout: documents for the first query, then its first hit.
    top_document = search_result['documents'][0][0]
    print(f"📄 Top result: {top_document}")


🔍 Query: How are you?
Cache miss for 'How are you?' → computing results
📄 Top result: What's your name?

🔍 Query: What's up?
Cache miss for 'What's up?' → computing results
📄 Top result: What's your name?

🔍 Query: What's going on?
Cache miss for 'What's going on?' → computing results
📄 Top result: What's your name?
