# Data Persistence

- Ensures business continuity
- Cost Effective
- Allows for Horizontal Scaling

In [1]:
import chromadb
from chromadb.utils import embedding_functions
import time
import numpy as np

print("\n=== PERSISTENT STORAGE ===")

# Configuration a reader may want to tune, kept in one place.
# The path is relative, so it resolves against the kernel's working directory.
CHROMA_DB_PATH = "../scratch/chroma_db"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Initialize a persistent client backed by the directory above
print("Creating a persistent Chroma client...")
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# Create the embedding function passed to the collections created later on
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)


=== PERSISTENT STORAGE ===
Creating a persistent Chroma client...


In [2]:
# Display results helper
def display_results(results):
    """Print the hits for the first query contained in a query result.

    Args:
        results: Mapping shaped like the value returned by
            ``collection.query``: each field holds one list per query text.
            ``results['ids']`` is required. ``'documents'``, ``'metadatas'``
            and ``'distances'`` may be absent or ``None`` (e.g. when the
            query's ``include`` list left them out); ``None`` is then printed
            for that field instead of raising.

    Only index 0 of each field (the first query text) is displayed.
    """
    hit_ids = results['ids'][0]

    def first_query_values(field):
        # A field that was not returned becomes a column of None so the
        # zip below still yields one row per hit.
        values = results.get(field)
        return values[0] if values is not None else [None] * len(hit_ids)

    print("\nResults:")
    rows = zip(
        first_query_values('documents'),
        hit_ids,
        first_query_values('metadatas'),
        first_query_values('distances'),
    )
    for rank, (doc, doc_id, metadata, distance) in enumerate(rows, start=1):
        print(f"{rank}. Document: {doc}")
        print(f"   ID: {doc_id}")
        print(f"   Metadata: {metadata}")
        print(f"   Distance: {distance}")
        print()

In [3]:
# Create the collection, or reopen it if an earlier run already persisted it.
# create_collection fails when the name is already taken, which would break
# "Restart & Run All" against an on-disk database.
print("Creating a persistent collection...")
persistent_collection = client.get_or_create_collection(
    name="persistent_docs",
    embedding_function=embedding_function
)

# Add some documents
documents = [
    "This is a document that will be stored persistently",
    "Vector databases need to persist data for production use",
    "Data persistence ensures your embeddings survive restarts"
]

ids = ["pdoc1", "pdoc2", "pdoc3"]

# upsert (rather than add) keeps the cell idempotent: re-running it leaves
# exactly one record per id instead of colliding with the stored ones.
persistent_collection.upsert(
    documents=documents,
    ids=ids
)

# Show that we can query the persistent collection
results = persistent_collection.query(
    query_texts=["persistent data storage"],
    n_results=1
)

display_results(results)

Creating a persistent collection...

Results:
1. Document: This is a document that will be stored persistently
   ID: pdoc1
   Metadata: None
   Distance: 0.7845402441433474



# Why Performance Matters

- User Experience: Slow query responses lead to poor user experiences in search applications
- Resource Utilization: Inefficient operations can consume excessive computational resources
- Scaling Challenges: Performance problems compound as your data grows
- Cost Implications: In cloud environments, inefficient operations directly impact your operating costs


In [4]:
print("\n=== PERFORMANCE CONSIDERATIONS ===")

PERF_COLLECTION_NAME = "performance_test"
PERF_SEED = 42  # fixed seed so the synthetic corpus and queries are reproducible
perf_rng = np.random.default_rng(PERF_SEED)

# Create a larger collection for performance testing.
# The client is persistent, so a collection left over from a previous run
# would make create_collection fail. Ensure it exists, drop it, then
# recreate it so every run times inserts into an empty collection.
client.get_or_create_collection(
    name=PERF_COLLECTION_NAME,
    embedding_function=embedding_function
)
client.delete_collection(name=PERF_COLLECTION_NAME)
collection = client.create_collection(
    name=PERF_COLLECTION_NAME,
    embedding_function=embedding_function
)

# Generate synthetic documents
print("Generating synthetic documents for performance testing...")
words = ["AI", "machine", "learning", "vector", "database", "embedding", "neural",
         "network", "transformer", "data", "science", "engineering", "model",
         "algorithm", "optimization", "natural", "language", "processing"]

num_docs = 1000

# Each document is 10-20 random words; endpoint=True makes 20 reachable.
documents = [
    " ".join(perf_rng.choice(words, size=perf_rng.integers(10, 20, endpoint=True)))
    for _ in range(num_docs)
]

ids = [f"perf_doc_{i}" for i in range(num_docs)]

# Time the addition of documents
print(f"Adding {num_docs} documents to collection...")
# perf_counter is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()

# Add in batches (slicing clamps at the end of the list, so no min() is needed)
batch_size = 100
for batch_start in range(0, num_docs, batch_size):
    batch_end = batch_start + batch_size
    collection.add(
        documents=documents[batch_start:batch_end],
        ids=ids[batch_start:batch_end]
    )

add_time = time.perf_counter() - start_time
print(f"Time to add {num_docs} documents: {add_time:.2f} seconds")

# Time query performance
print("\nTesting query performance...")
query_times = []
num_queries = 5

for query_number in range(1, num_queries + 1):
    query = " ".join(perf_rng.choice(words, size=5))
    start_time = time.perf_counter()
    collection.query(
        query_texts=[query],
        n_results=10
    )
    query_time = time.perf_counter() - start_time
    query_times.append(query_time)
    print(f"Query {query_number}: {query_time:.4f} seconds")

print(f"Average query time: {np.mean(query_times):.4f} seconds")



=== PERFORMANCE CONSIDERATIONS ===
Generating synthetic documents for performance testing...
Adding 1000 documents to collection...
Time to add 1000 documents: 4.58 seconds

Testing query performance...
Query 1: 0.0177 seconds
Query 2: 0.0162 seconds
Query 3: 0.0133 seconds
Query 4: 0.0130 seconds
Query 5: 0.0131 seconds
Average query time: 0.0147 seconds
