# Document Embedding Pipeline

This notebook processes documents (products, images, and localizations) and generates embeddings for them using CLIP. The embeddings are stored in the Qdrant vector database for similarity search.

## Environment Setup
Configure environment variables and import required dependencies.

In [None]:
import os

# Set in the first cell, i.e. before the library imports in the next cell run.
# NOTE(review): presumably a workaround for the OpenMP runtime aborting when
# more than one copy of it gets loaded — confirm it is still required.
os.environ.update(KMP_DUPLICATE_LIB_OK="TRUE")

In [None]:
import sys
sys.path.append(r'c:\Users\ice\projects\iris')

from tqdm.notebook import tqdm

from iris.config.data_pipeline_config_manager import DataPipelineConfigManager
from iris.data_pipeline.mongodb_manager import MongoDBManager
from iris.data_pipeline.image_store_manager import ImageStoreManager
from iris.config.embedding_pipeline_config_manager import EmbeddingPipelineConfigManager
from iris.embedding_pipeline.embedder import Embedder
from iris.data_pipeline.qdrant_manager import QdrantManager
from iris.embedding_pipeline.embedding_handler import EmbeddingHandler
from iris.models.product import Product

from iris.models.document import Document
from iris.protocols.context_protocols import HasMongoDBContext, HasImageContext
from pathlib import Path
from PIL import Image as PILImage

from iris.utils.log import logger

## Initialize Pipeline Components

Set up necessary managers and handlers:
1. Data pipeline configuration
2. MongoDB connection
3. Image store for accessing image data
4. Embedding pipeline with CLIP model
5. Qdrant vector database connection

In [None]:
# --- Data pipeline configuration (source of the MongoDB, image store and Qdrant settings) ---
config_manager = DataPipelineConfigManager()

# --- MongoDB ---
mongodb_config = config_manager.mongodb_config
mongodb_manager = MongoDBManager(mongodb_config)

# --- Image store ---
image_store_config = config_manager.image_store_config
image_store_manager = ImageStoreManager(image_store_config)

# --- CLIP embedder (configured from the embedding pipeline settings) ---
embedding_config_manager = EmbeddingPipelineConfigManager()
clip_config = embedding_config_manager.clip_config
embedder = Embedder(clip_config)

# --- Qdrant ---
qdrant_config = config_manager.qdrant_config
qdrant_manager = QdrantManager(qdrant_config)

# --- Embedding handler: ties the embedder to the Qdrant manager ---
embedding_handler = EmbeddingHandler(
    embedder=embedder,
    qdrant_manager=qdrant_manager,
)

## Context Setup

Create a combined context that provides access to both MongoDB and image storage functionality.
This allows for seamless document and image processing throughout the pipeline.

In [None]:
class FullContext:
    """Single context object bundling MongoDB access and image-store access.

    Calls are forwarded unchanged to the two wrapped contexts, and ``config``
    is re-exported from the MongoDB context.
    """

    def __init__(self, mongodb_context: HasMongoDBContext, image_context: HasImageContext):
        # Keep both wrapped contexts and surface the MongoDB config directly.
        self.image_store_context = image_context
        self.mongodb_context = mongodb_context
        self.config = self.mongodb_context.config

    def find_all(self, collection: str, document_hashes: list[str]) -> list["Document"]:
        """Forward the lookup to the MongoDB context and return its result."""
        mongo = self.mongodb_context
        return mongo.find_all(collection, document_hashes)

    def get_pil_image(
        self,
        image_id: str | None = None,
        path: Path | None = None,
        url: str | None = None,
    ) -> tuple[PILImage.Image, Path]:
        """Forward the image lookup to the image-store context and return its result."""
        lookup = {"image_id": image_id, "path": path, "url": url}
        return self.image_store_context.get_pil_image(**lookup)
    
# One shared context backed by the MongoDB manager and the image store manager.
full_context = FullContext(image_context=image_store_manager, mongodb_context=mongodb_manager)

## Process Product Localizations

Load products from MongoDB and update their localization references:
1. Fetch all products from the database
2. For each product, load associated localizations from their product images
3. Update the product record with new localization information

In [None]:
# Refresh the localization references of every product and write each one back.
with mongodb_manager as mongodb, embedding_handler as emb_handl:
    # The collection name is the same for the read and for every write.
    product_collection = mongodb.config.product_collection
    products: list[Product] = mongodb.find_all(product_collection)

    progress = tqdm(products, desc="Loading product localizations")
    for product in progress:
        # Presumably fills product.localization_hashes, which the log line reads.
        product.load_localization_hashes(emb_handl, full_context)
        logger.info(f"Product: {product.metadata['title']} identified {len(product.localization_hashes)} localization(s) from {len(product.image_hashes)} image(s).")

        # Persist the updated product record.
        mongodb.upsert(product_collection, product)

## Generate Embeddings

Process each document type and generate its embeddings:
1. Localizations
2. Products
3. Image metadata

The embeddings are stored in Qdrant for vector similarity search.

In [None]:
# Each MongoDB collection is listed next to the Qdrant collection it is paired with.
qdrant_cfg = qdrant_manager.qdrant_config
collection_pairs = [
    (mongodb_config.localization_collection, qdrant_cfg.localization_collection),
    (mongodb_config.product_collection, qdrant_cfg.product_collection),
    (mongodb_config.image_metadata_collection, qdrant_cfg.image_collection),
]

# Index-aligned lists derived from the pairs above.
document_collections = [mongo_name for mongo_name, _ in collection_pairs]
embedding_collections = [qdrant_name for _, qdrant_name in collection_pairs]

# Request an embedding for every document of each MongoDB collection, passing
# along the name of the Qdrant collection paired with it.
# NOTE(review): mongodb_manager is used here (directly, and through
# full_context) outside a `with mongodb_manager` block, unlike the product
# localization cell — confirm the manager does not need an open context.
with embedding_handler as emb_handl:
    # The two lists are index-aligned: doc_col is the MongoDB collection name,
    # emb_col the Qdrant collection name.
    for doc_col, emb_col in zip(document_collections, embedding_collections):
        logger.info(f" ----- Processing collection: {doc_col} ----- ")
        
        # Fetch documents from MongoDB
        documents = mongodb_manager.find_all(doc_col)
        
        for document in tqdm(documents, desc=f"Processing {doc_col}"):
            # NOTE(review): the returned embedding is not used in the code
            # visible here; presumably get_embedding also stores it under
            # emb_col — confirm.
            embedding = emb_handl.get_embedding(document, full_context, emb_col)