## 1. Setup and Configuration

In [1]:
import chromadb
import pickle
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [2]:
# --- IMPORTANT: CONFIGURE THIS PATH ---
# Point this to the latest run directory created by the training script.
# The directory must contain `documents.pkl` and `document_embeddings.npy`,
# which the "Load Artifacts" step reads. Relative paths are resolved against
# the notebook kernel's working directory.
# Example: ARTIFACTS_PATH = "../artifacts/run-20240101_120000"
# NOTE(review): the training script may instead write to
# "../backend/artifacts/run_<timestamp>" — confirm the layout it produces.
ARTIFACTS_PATH = "../artifacts/run-20250619_212044" # 👈 CHANGE THIS

# Directory handed to chromadb.PersistentClient as its on-disk store.
CHROMA_STORE_PATH = "./chroma_store"
# Name of the collection to (re)build; an existing collection with this name
# is deleted before indexing.
COLLECTION_NAME = "docs"

## 2. Load Artifacts

In [3]:
artifacts = Path(ARTIFACTS_PATH)

# Fail early with an actionable message. `is_dir()` also rejects a path that
# exists but points at a regular file, which would otherwise surface later as
# a confusing error from `open()`.
if not artifacts.is_dir():
    raise FileNotFoundError(f"Artifacts directory not found at {ARTIFACTS_PATH}. Please run the backend training script first.")

# Load documents.
# NOTE(security): unpickling executes arbitrary code embedded in the file, so
# only point ARTIFACTS_PATH at run directories you produced yourself.
with open(artifacts / 'documents.pkl', 'rb') as f:
    documents = pickle.load(f)

# Load embeddings.
# Assumes row i is the embedding of documents[i] — TODO confirm against the
# training script that wrote these artifacts.
embeddings = np.load(artifacts / 'document_embeddings.npy')

# The indexing step pairs documents[i] with embeddings[i]. A count mismatch
# would either drop trailing embeddings silently or fail midway through
# indexing, so stop here instead.
if len(documents) != embeddings.shape[0]:
    raise ValueError(
        f"Artifact mismatch in {ARTIFACTS_PATH}: {len(documents)} documents "
        f"but {embeddings.shape[0]} embeddings."
    )

print(f"Loaded {len(documents)} documents and {embeddings.shape[0]} embeddings.")

Loaded 18679 documents and 18679 embeddings.


## 3. Setup ChromaDB and Index Documents

In [4]:
# Open (or create) the on-disk Chroma store.
client = chromadb.PersistentClient(path=CHROMA_STORE_PATH)

# Delete the collection if it already exists to ensure a fresh start.
# Depending on the installed chromadb release, list_collections() yields
# either Collection objects (with a `.name`) or plain name strings (0.6.x),
# so normalise both shapes to a set of names.
existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if COLLECTION_NAME in existing_names:
    print(f"Collection '{COLLECTION_NAME}' already exists. Deleting it.")
    client.delete_collection(name=COLLECTION_NAME)

# NOTE(review): no metadata/configuration is passed, so the collection uses
# Chroma's default distance metric (L2 per the Chroma docs) — confirm this
# matches the similarity the embeddings were trained for.
collection = client.get_or_create_collection(name=COLLECTION_NAME)
print(f"Created new collection: '{COLLECTION_NAME}'")

Created new collection: 'docs'


In [5]:
batch_size = 500
num_docs = len(documents)

# Walk the corpus in fixed-size windows so every `add` call carries at most
# `batch_size` records; the final window is clipped to the corpus length.
for start in tqdm(range(0, num_docs, batch_size)):
    stop = min(start + batch_size, num_docs)
    window = slice(start, stop)

    collection.add(
        # Ids are derived from each document's position in `documents`.
        ids=[f"doc_{position}" for position in range(start, stop)],
        documents=documents[window],
        # Convert the NumPy rows to plain nested lists before handing them over.
        embeddings=embeddings[window].tolist(),
    )

print(f"\n✅ Successfully indexed {collection.count()} documents into ChromaDB.")

100%|██████████| 38/38 [00:07<00:00,  4.95it/s]


✅ Successfully indexed 18679 documents into ChromaDB.





## 4. (Optional) Test Query

In [6]:
# Smoke test only: the query vector is the stored embedding of the first
# document, not the output of the trained query encoder, so this checks that
# the index answers nearest-neighbour lookups rather than retrieval quality.
probe_vector = embeddings[0].tolist()
results = collection.query(query_embeddings=[probe_vector], n_results=5)

print("Querying with the first document as an example:")
print(documents[0])
print("\nResults:")

# One query was submitted, so its hits are the first entry of the result lists.
nearest_docs = results['documents'][0]
for hit in nearest_docs:
    # Show only the first 100 characters of each hit to keep the output short.
    preview = hit[:100]
    print(f"  - {preview}...")

Querying with the first document as an example:
This is what pigments do. The light they absorb contains' just the right amount' of energy necessary to push them into the next level. Any light that does not have enough or has too much energy can not be absorbed and is reflected. The electron in the higher energy level, however, does not 'want' to stay there(i.e.

Results:
  - This is what pigments do. The light they absorb contains' just the right amount' of energy necessary...
  - It is very important to note that grinding the beans instantly raises the surface area to enable a m...
  - What should I eat if I have acid reflux? Gastroesophageal reflux disease (commonly referred to as GE...
  - There is no doubt yeast infections are unpleasant. The severity of symptoms like itching, burning, r...
  - “Hair will typically grow within a 4 week period, that being said, brows definitely can and will gro...
