## 1. Setup and Configuration

In [2]:
import chromadb
import pickle
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [1]:
# --- Notebook configuration: review before running ---

# Name of the ChromaDB collection the documents are indexed into.
COLLECTION_NAME = "docs"

# Location passed to the persistent ChromaDB client as its on-disk store.
CHROMA_STORE_PATH = "./chroma_store"

# REQUIRED 👈 Set this to the latest run directory created by the training script,
# e.g. ARTIFACTS_PATH = "../backend/artifacts/run_20240101_120000"
ARTIFACTS_PATH = "../artifacts/run-20250619_232020"

## 2. Load Artifacts

In [3]:
# Resolve the run directory that holds the artifacts written by training.
artifacts = Path(ARTIFACTS_PATH)

# Require a real directory: a regular file at this path would only fail later
# with a less helpful error when the artifact files are opened.
if not artifacts.is_dir():
    raise FileNotFoundError(f"Artifacts directory not found at {ARTIFACTS_PATH}. Please run the backend training script first.")

documents_file = artifacts / 'documents.pkl'
embeddings_file = artifacts / 'document_embeddings.npy'

# Report every missing artifact at once instead of failing on the first open().
missing_files = [p.name for p in (documents_file, embeddings_file) if not p.is_file()]
if missing_files:
    raise FileNotFoundError(f"Missing artifact file(s) in {artifacts}: {', '.join(missing_files)}")

# Load documents
# SECURITY: pickle.load can execute arbitrary code while deserializing.
# Only point ARTIFACTS_PATH at run directories you produced yourself.
with open(documents_file, 'rb') as f:
    documents = pickle.load(f)

# Load embeddings
embeddings = np.load(embeddings_file)

# The indexing step slices `documents` and `embeddings` with the same row
# ranges, so the array must be 2-D with exactly one row per document.
if embeddings.ndim != 2:
    raise ValueError(f"Expected a 2-D embedding matrix, got an array with shape {embeddings.shape}.")
if len(documents) != embeddings.shape[0]:
    raise ValueError(
        f"Artifact mismatch: {len(documents)} documents but {embeddings.shape[0]} embeddings. "
        "Both files must come from the same training run."
    )

print(f"Loaded {len(documents)} documents and {embeddings.shape[0]} embeddings.")

Loaded 185828 documents and 185828 embeddings.


## 3. Setup ChromaDB and Index Documents

In [4]:
# Open the on-disk ChromaDB store located at CHROMA_STORE_PATH.
client = chromadb.PersistentClient(path=CHROMA_STORE_PATH)

# Delete the collection if it already exists to ensure a fresh start.
# Depending on the installed chromadb release, list_collections() yields either
# Collection objects or plain collection-name strings, so accept both forms.
existing_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
}
if COLLECTION_NAME in existing_names:
    print(f"Collection '{COLLECTION_NAME}' already exists. Deleting it.")
    client.delete_collection(name=COLLECTION_NAME)

# NOTE(review): no distance metric is configured here, so Chroma's default
# applies — confirm it matches the similarity the embeddings were trained with.
collection = client.get_or_create_collection(name=COLLECTION_NAME)
print(f"Created new collection: '{COLLECTION_NAME}'")

Collection 'docs' already exists. Deleting it.
Created new collection: 'docs'


In [5]:
# Feed the documents and their precomputed embeddings to the collection in
# fixed-size chunks. Ids are "doc_<row index>", so each id maps back to the
# same position in `documents` and `embeddings`.
batch_size = 500
num_docs = len(documents)

batch_starts = range(0, num_docs, batch_size)
for start in tqdm(batch_starts):
    # The final chunk may hold fewer than batch_size rows.
    stop = min(start + batch_size, num_docs)
    rows = slice(start, stop)

    collection.add(
        ids=[f"doc_{row}" for row in range(start, stop)],
        documents=documents[rows],
        embeddings=embeddings[rows].tolist()
    )

print(f"\n✅ Successfully indexed {collection.count()} documents into ChromaDB.")

100%|██████████| 372/372 [01:57<00:00,  3.17it/s]


✅ Successfully indexed 185828 documents into ChromaDB.





## 4. (Optional) Test Query

In [6]:
# This is a simple test and does not use the trained query encoder.
# It finds documents similar to the embedding of another document.
results = collection.query(
    query_embeddings=[embeddings[0].tolist()],
    n_results=5
)

print("Querying with the first document as an example:")
print(documents[0])
print("\nResults:")
for doc in results['documents'][0]:
    print(f"  - {doc[:100]}...")

Querying with the first document as an example:
For example-The place value of 8 in 80 is tens, while the place value of 5 and 6 in 576 are hundreds and ones. In other words, in the number 80, the position of 8 is tens.t is by using these ten digits that numbers are formed. The value of ten digit in a number is determined by its place in the number and is know as the place value. For example, to write the numeral for two thousand four hundred and eighty seven, we write 2487.

Results:
  - For example-The place value of 8 in 80 is tens, while the place value of 5 and 6 in 576 are hundreds...
  - Historically, all currencies used to be gold backed – the value of money was fixed to the price of g...
  - U.S. Circulated Silver Coins. Silver coin values below are based on live silver prices at the CME. T...
  - In Roman numerals, M = 1000, D = 500, C = 100, L = 50, X = 10, V = 5, and I = 1. A single Roman nume...
  - One (1) page of text in Times New Roman with 1 margins and default line sp