## 1. Setup and Configuration

In [1]:
import chromadb
import pickle
import numpy as np
from tqdm import tqdm
from pathlib import Path
import json

def load_config(path: str):
    """Load a JSON config file and return its parsed contents.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; pin the encoding so decoding does
    # not depend on the platform's locale default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Parsed frontend configuration; later cells read settings such as
# 'ARTIFACTS_PATH' from it. The relative path is resolved against the current
# working directory — presumably the notebook's own directory; confirm.
config = load_config('../frontend/config.json')

In [None]:
# --- IMPORTANT: CONFIGURE THIS PATH ---
# ARTIFACTS_PATH is read from the 'ARTIFACTS_PATH' key of the JSON config
# loaded above, so change it in that config file rather than in this cell.
# It should point to the latest run directory created by the training script.
# NOTE(review): the "Load Artifacts" cell prepends '../' to this value, so a
# value that itself starts with '../' (like the historical example
# "../backend/artifacts/run_20240101_120000") would resolve two levels up —
# confirm which form the config is expected to hold.

ARTIFACTS_PATH = config['ARTIFACTS_PATH']
print("Loading artifacts from: ", ARTIFACTS_PATH)
# Directory handed to chromadb.PersistentClient; relative to the working directory.
CHROMA_STORE_PATH = "./chroma_store"
# Name of the Chroma collection that this notebook (re)creates and fills.
COLLECTION_NAME = "docs"

## 2. Load Artifacts

In [None]:
# Locate the run directory. ARTIFACTS_PATH is interpreted relative to the
# parent of the current working directory.
artifacts = Path('../' + ARTIFACTS_PATH)

if not artifacts.exists():
    # Report the location that was actually checked as well as the raw config
    # value, so a wrong working directory is easy to tell from a wrong setting.
    raise FileNotFoundError(
        f"Artifacts directory not found at {artifacts} "
        f"(ARTIFACTS_PATH={ARTIFACTS_PATH!r}). "
        "Please run the backend training script first."
    )

# Load documents.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load artifacts produced by a training run you trust.
with open(artifacts / 'documents.pkl', 'rb') as f:
    documents = pickle.load(f)

# Load embeddings (assumed to hold one row per document — checked below).
embeddings = np.load(artifacts / 'document_embeddings.npy')

print(f"Loaded {len(documents)} documents and {embeddings.shape[0]} embeddings.")

# Indexing pairs documents[i] with embeddings[i]; fail early if the two
# artifacts disagree instead of indexing misaligned data.
if len(documents) != embeddings.shape[0]:
    raise ValueError(
        f"Artifact mismatch in {artifacts}: {len(documents)} documents but "
        f"{embeddings.shape[0]} embeddings."
    )

## 3. Setup ChromaDB and Index Documents

In [None]:
# Open (or create) the on-disk Chroma store.
client = chromadb.PersistentClient(path=CHROMA_STORE_PATH)

# Delete the collection if it already exists to ensure a fresh start.
# Depending on the installed chromadb release, list_collections() yields either
# Collection objects or plain name strings; getattr() copes with both, where
# a bare `c.name` would raise AttributeError on strings.
existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if COLLECTION_NAME in existing_names:
    print(f"Collection '{COLLECTION_NAME}' already exists. Deleting it.")
    client.delete_collection(name=COLLECTION_NAME)

collection = client.get_or_create_collection(name=COLLECTION_NAME)
print(f"Created new collection: '{COLLECTION_NAME}'")

In [None]:
# Feed the documents and their precomputed embeddings into the collection in
# fixed-size slices instead of one large add() call.
batch_size = 500
num_docs = len(documents)

for start in tqdm(range(0, num_docs, batch_size)):
    # Clamp the last slice so it ends at the final document.
    stop = min(start + batch_size, num_docs)

    collection.add(
        # IDs are derived from the document's position in the list.
        ids=[f"doc_{j}" for j in range(start, stop)],
        documents=documents[start:stop],
        embeddings=embeddings[start:stop].tolist(),
    )

print(f"\n✅ Successfully indexed {collection.count()} documents into ChromaDB.")

## 4. (Optional) Test Query

In [None]:
# Smoke test only: the trained query encoder is not involved here.
# The stored embedding of the first document is reused as the query vector,
# so the search returns the documents closest to that document.
probe_vector = embeddings[0].tolist()
results = collection.query(query_embeddings=[probe_vector], n_results=5)

print("Querying with the first document as an example:")
print(documents[0])
print("\nResults:")

# A single query was sent, so its matches are the first entry of 'documents'.
top_matches = results['documents'][0]
for doc in top_matches:
    # Show at most the first 100 characters of each hit.
    print(f"  - {doc[:100]}...")