### Practice notebook: everything learned so far

In [None]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if any) before reading the key.
load_dotenv()

# Resolves to None when OPENAI_API_KEY is not set in the environment.
# Deliberately not printed: the key is a secret.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

### Load the PDF file into page-level documents

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the guidebook PDF; the count printed below is labelled as the number
# of pages, i.e. one loaded document per page.
__loader = PyPDFLoader(file_path="data/pdf/AI Agents guidebook.pdf")
__documents = __loader.load()

print(f"Total Pages in PDF: {len(__documents)}")

# Bare expression so the notebook renders the loaded documents.
__documents

### Split the documents into chunks

In [None]:
# Chunking policy: chunk_size=1000 and chunk_overlap=200, both measured with
# the built-in len (i.e. in characters).
_splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
}
text_spiltter = RecursiveCharacterTextSplitter(**_splitter_settings)

# Split the loaded documents into chunks used by the rest of the notebook.
docs = text_spiltter.split_documents(__documents)

# Quick report: chunk type, page count versus chunk count.
print(type(docs[0]))
print(f"Total Pages in PDF: {len(__documents)}")
print('-------------------------------')
print(f"Total Chunks created: {len(docs)}")

# Bare expression so the notebook renders the first chunk's text.
docs[0].page_content

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding client used for every vector in this notebook.
openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# embed_documents() takes plain strings, so pull the text out of each chunk
# rather than passing the Document objects themselves.
texts = [chunk.page_content for chunk in docs]

# One embedding vector per chunk, in the same order as `texts`.
openai_vector = openai_embeddings.embed_documents(texts)

# --- Summary of what was produced -----------------------------------------
first_vector, second_vector = openai_vector[0], openai_vector[1]
print("✅ OpenAI Document Embeddings created successfully!")
print(f'Number of documents embedded: {len(openai_vector)}')
print(f'Embedding dimension: {len(first_vector)}')
print(f'First 5 values of first document embedding: {first_vector[:5]}')
print('---------------------------')
print(f'First 5 values of second document embedding: {second_vector[:5]}')
print('\nOriginal document sample:')
print(f'First doc content preview: {texts[0][:100]}...')

# --- Sanity check on the types involved -----------------------------------
first_chunk = docs[0]
print('\nData type verification:')
print(f'Type of docs[0]: {type(first_chunk)}')
print(f'Type of texts[0]: {type(texts[0])}')
print(f'docs[0] has page_content: {hasattr(first_chunk, "page_content")}')

### Embed the chunks and create the vectorstore

In [None]:
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma

def _show_top_hits(scored_hits):
    """Print a numbered, 100-character preview with its score for each (document, score) pair."""
    for position, (hit, hit_score) in enumerate(scored_hits):
        print(f"\nDocument {position+1} (Score: {hit_score}):\n{hit.page_content[:100]}...")


# FAISS: build the store from the chunks and run the sample query.
vectorstore = FAISS.from_documents(docs, openai_embeddings)
query = "What is an AI agent?"
print(f"Top 2 most similar documents to '{query}':")
results = vectorstore.similarity_search_with_score(query, k=2)
_show_top_hits(results)

# Chroma: same chunks, same query, stored in a named collection.
chromastore = Chroma.from_documents(docs, openai_embeddings, collection_name="ai-agents-guidebook")
results = chromastore.similarity_search_with_score(query, k=2)
_show_top_hits(results)

In [None]:
# Directory on disk that will hold the Chroma data.
persist_directory = "data/chroma"

# Build a Chroma store from the chunks, backed by that directory.
chromastore = Chroma.from_documents(
    documents=docs,
    embedding=openai_embeddings,
    persist_directory=persist_directory,
)

# Explicitly write the store to disk.
# NOTE(review): newer Chroma releases are reported to persist automatically,
# which may make this call redundant or deprecated - confirm for the installed version.
chromastore.persist()

## 🔥 Reusing Pre-computed Embeddings with Vector Stores

Now that we have our embeddings computed, let's see how to **reuse** them with different vector stores.

**💡 Key Insight:** We already spent time and money computing embeddings using `embed_documents()`.
Instead of letting FAISS/Chroma recompute embeddings (which costs time and API calls), we'll reuse our existing embeddings!

**Benefits:**
- 💰 Save money on API calls
- ⚡ Faster vector store creation
- 🔄 Consistent embeddings across different vector stores
- 🧪 Easy experimentation with different vector DBs

In [None]:
# 🔥 METHOD 1: FAISS with Pre-computed Embeddings
# ================================================

import numpy as np
from langchain_community.vectorstores import FAISS

print('🚀 Creating FAISS vector store with PRE-COMPUTED embeddings...')
print(f'📊 We have {len(openai_vector)} embeddings of dimension {len(openai_vector[0])}')
print()

# float32 matrix of the vectors; in this cell it is only used to report
# shape and dtype (the store below is built from the Python lists).
embedding_matrix = np.asarray(openai_vector, dtype=np.float32)
print(f'📊 Embedding matrix shape: {embedding_matrix.shape}')
print(f'📊 Data type: {embedding_matrix.dtype}')
print()

# Pair each chunk's text with its already-computed vector so FAISS does not
# need to embed the chunks again; the embedding model is still supplied so
# that later queries can be embedded.
text_vector_pairs = list(zip(texts, openai_vector))
chunk_metadata = [chunk.metadata for chunk in docs]
faiss_vectorstore = FAISS.from_embeddings(
    text_embeddings=text_vector_pairs,
    embedding=openai_embeddings,
    metadatas=chunk_metadata,
)

print('✅ FAISS vectorstore created using PRE-COMPUTED embeddings!')
print(f'📊 Number of documents in FAISS index: {faiss_vectorstore.index.ntotal}')
print()

# Smoke-test the store with a sample question.
query = "What are AI agents and how do they work?"
print(f'🔍 Testing FAISS search with query: "{query}"')

similar_docs = faiss_vectorstore.similarity_search(query, k=3)

print('🔍 Top 3 similar documents from FAISS:')
for rank, hit in enumerate(similar_docs, start=1):
    hit_meta = hit.metadata
    print(f'  {rank}. {hit.page_content[:120]}...')
    print(f'     📄 Source: {hit_meta.get("source", "N/A")}')
    print(f'     📖 Page: {hit_meta.get("page", "N/A")}')
    print()

In [None]:
# 🔥 METHOD 2: Chroma with Pre-computed Embeddings
# ===============================================

from langchain_community.vectorstores import Chroma
import tempfile
import os

print('🚀 Creating Chroma vector store with PRE-COMPUTED embeddings...')
print()

# Directory where the Chroma collection is stored.
chroma_db_path = './chroma_db_precomputed'

# LangChain's Chroma wrapper has no `from_embeddings()` constructor (unlike
# FAISS), so the store is created empty and the pre-computed vectors are
# written straight into its underlying chromadb collection. The embedding
# model is still attached so that future *queries* can be embedded.
chroma_vectorstore_precomputed = Chroma(
    collection_name="ai_agents_precomputed",
    embedding_function=openai_embeddings,
    persist_directory=chroma_db_path,
)

# Deterministic ids + upsert: re-running this cell overwrites the same
# records instead of inserting duplicates into the persisted collection.
chunk_ids = [f"chunk-{position}" for position in range(len(texts))]
# NOTE(review): chromadb limits how many records one call may carry; a much
# larger corpus would need to be upserted in slices - confirm the limit for
# the installed version.
chroma_vectorstore_precomputed._collection.upsert(
    ids=chunk_ids,
    embeddings=openai_vector,  # Our pre-computed embeddings!
    documents=texts,
    metadatas=[doc.metadata for doc in docs],  # Include metadata
)

print('✅ Chroma vectorstore created using PRE-COMPUTED embeddings!')
print(f'📊 Collection name: ai_agents_precomputed')
print(f'📁 Persist directory: {chroma_db_path}')
print()

# Test similarity search with Chroma
query = "What are the building blocks of AI agents?"
print(f'🔍 Testing Chroma search with query: "{query}"')

chroma_results = chroma_vectorstore_precomputed.similarity_search(query, k=3)

print('🔍 Top 3 similar documents from Chroma:')
for i, doc in enumerate(chroma_results, 1):
    print(f'  {i}. {doc.page_content[:120]}...')
    print(f'     📄 Source: {doc.metadata.get("source", "N/A")}')
    print(f'     📖 Page: {doc.metadata.get("page", "N/A")}')
    print()

In [None]:
# 📊 PERFORMANCE COMPARISON & BENEFITS ANALYSIS
# ============================================

import time

print('🚀 PERFORMANCE BENEFITS OF REUSING EMBEDDINGS')
print('=' * 55)
print()

# Sizes taken from the data actually produced earlier in the notebook.
num_documents = len(docs)
embedding_dim = len(openai_vector[0])

# Rough cost model (approximate OpenAI pricing for text-embedding-ada-002).
cost_per_1k_tokens = 0.0001
avg_tokens_per_doc = 250  # Rough estimate
# Note: despite its name, total_tokens is expressed in THOUSANDS of tokens,
# which is why it is printed with a "k" suffix and multiplied by a per-1k price.
total_tokens = (num_documents * avg_tokens_per_doc) / 1000
estimated_cost = total_tokens * cost_per_1k_tokens

print('💰 COST ANALYSIS:')
cost_report = [
    f'   📊 Documents processed: {num_documents}',
    f'   📊 Embedding dimension: {embedding_dim}',
    f'   📊 Estimated tokens: {total_tokens:.1f}k',
    f'   💸 Cost to embed once: ${estimated_cost:.4f}',
    f'   💸 Cost if we re-embed for each vector store: ${estimated_cost:.4f}',
    f'   ✅ Money saved by reusing: ${estimated_cost:.4f} per additional vector store',
]
for report_line in cost_report:
    print(report_line)
print()

# The timings below are fixed illustrative figures, not measurements.
print('⚡ TIME ANALYSIS:')
time_report = [
    f'   🕐 Time to embed {num_documents} docs: ~30-60 seconds (API calls)',
    '   ⚡ Time to reuse embeddings: ~1-2 seconds (no API calls)',
    '   ✅ Time saved: ~28-58 seconds per vector store',
]
for report_line in time_report:
    print(report_line)
print()

print('🎯 KEY ADVANTAGES:')
advantages = [
    'No repeated API calls to OpenAI',
    'Consistent embeddings across different vector stores',
    'Faster experimentation with different vector DBs',
    'Cost-effective for multiple vector store comparisons',
    'Better for production workflows with caching',
    'Reduced rate limiting issues',
    'Offline capability once embeddings are cached',
]
print('\n'.join(f'   ✅ {item}' for item in advantages))
print()

In [None]:
# 🔍 SEARCH PERFORMANCE TEST
# =========================

import time

# Test query
test_query = "AI agent tools and capabilities"

print(f'🔍 SEARCH PERFORMANCE TEST')
print(f'Query: "{test_query}"')
print('-' * 50)
print()

# Embed the query ONCE, outside the timed regions. similarity_search() would
# embed the query on every call, so timing it mostly measures the embedding
# request rather than the vector store lookup being compared here.
query_vector = openai_embeddings.embed_query(test_query)

# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and can jump if the system clock changes.

# Test FAISS search performance
print('🚀 Testing FAISS search...')
start_time = time.perf_counter()
faiss_results = faiss_vectorstore.similarity_search_by_vector(query_vector, k=2)
faiss_time = time.perf_counter() - start_time

# Test Chroma search performance
print('🚀 Testing Chroma search...')
start_time = time.perf_counter()
chroma_results = chroma_vectorstore_precomputed.similarity_search_by_vector(query_vector, k=2)
chroma_time = time.perf_counter() - start_time

print('⚡ PERFORMANCE RESULTS:')
print(f'   FAISS search time: {faiss_time:.4f} seconds')
print(f'   Chroma search time: {chroma_time:.4f} seconds')
print()

print('🔍 FAISS Results:')
for i, doc in enumerate(faiss_results, 1):
    print(f'  {i}. {doc.page_content[:100]}...')

print('🔍 Chroma Results:')
for i, doc in enumerate(chroma_results, 1):
    print(f'  {i}. {doc.page_content[:100]}...')

print('🎉 SUCCESS! Both vector stores are using the SAME pre-computed embeddings!')
print('🎯 Notice how both return semantically similar results because they use identical embeddings.')

## 🎯 Summary: Reusing Pre-computed Embeddings

We've successfully demonstrated methods to reuse pre-computed embeddings:

### 🔑 Key Methods:

#### 1. **FAISS with Pre-computed Embeddings**
```python
FAISS.from_embeddings(
    text_embeddings=list(zip(texts, your_embeddings)),
    embedding=embedding_model,
    metadatas=metadata_list
)
```

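#### 2. **Chroma with Pre-computed Embeddings**
LangChain's `Chroma` wrapper does not provide a `from_embeddings()` constructor, so create the store and write the vectors into its underlying collection:
```python
store = Chroma(
    collection_name=collection_name,
    embedding_function=embedding_model
)
store._collection.upsert(
    ids=ids,
    embeddings=your_embeddings,
    documents=texts,
    metadatas=metadata_list
)
```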

### 💰 Benefits Achieved:

- **💸 Cost Savings**: No repeated API calls to OpenAI
- **⚡ Speed**: Vector store creation in seconds vs minutes
- **🔄 Consistency**: Same embeddings across different vector stores
- **🧪 Easy Experimentation**: Test different vector DBs without re-embedding

### 🎯 When to Use This Approach:

- ✅ **Comparing vector stores** (FAISS vs Chroma vs Pinecone)
- ✅ **Production workflows** where you cache embeddings
- ✅ **Cost-sensitive applications**
- ✅ **Development/experimentation** with different vector DBs
- ✅ **When you need consistent embedding versions across deployments**

### 🚀 Next Steps:

Now you can:
1. Save your `openai_vector` embeddings to disk for future use
2. Load them in different notebooks/applications
3. Create multiple vector stores instantly
4. Compare search performance across different vector databases
5. Deploy to production with cached embeddings for better performance!