## Advanced RAG - Data Ingestion Pipeline
### Load Extracted Data into Qdrant with Multimodal Embeddings

**Learning Objectives:**
- Load extracted markdown, tables, and images from 06-01
- Create hybrid embeddings (dense + sparse)
- Store in Qdrant with rich metadata
- Support multimodal search (text + images)

**Prerequisites:**
- Run the 06-01 notebook first to extract the PDFs into markdown, images, and tables
- Qdrant server running on localhost:6333

**What This Notebook Does:**
1. Load markdown files (split by page breaks)
2. Load tables with context
3. Load images with multimodal embeddings
4. Store everything in a single Qdrant collection, tagged with `content_type` metadata
5. Enable hybrid retrieval with deduplication

### Setup and Imports

In [None]:
from dotenv import load_dotenv
load_dotenv()

import hashlib
from pathlib import Path

from langchain_google_vertexai import VertexAIEmbeddings
from langchain_qdrant import QdrantVectorStore, RetrievalMode, FastEmbedSparse
from langchain_core.documents import Document

### Configuration

In [None]:
# Input folders: one sub-folder per content type under a shared data root
_DATA_ROOT = "data/rag-data"
MARKDOWN_DIR = _DATA_ROOT + "/rag-markdown"
TABLES_DIR = _DATA_ROOT + "/rag-tables"
IMAGES_DIR = _DATA_ROOT + "/rag-images"

# Qdrant collection that receives every content type
COLLECTION_NAME = "financial_docs"

# Vertex AI multimodal model used for the dense vectors
EMBEDDING_MODEL = "multimodalembedding@001"

### Initialize Embeddings and Vector Store

**Hybrid Retrieval**: Combines dense (semantic) and sparse (keyword) search for better results

In [None]:
# Dense embeddings (Vertex AI multimodal model) - used for text here and for
# images in the image-ingestion function further down.
embeddings = VertexAIEmbeddings(model_name=EMBEDDING_MODEL)

# Sparse embeddings (BM25) for the keyword side of hybrid retrieval
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Set to True only when you want to wipe the collection and re-ingest from
# scratch. It has to stay False for the "Track Processed Files" step to find
# anything: recreating the collection on every run deletes the stored file
# hashes, so every file would be re-embedded each time the notebook runs.
RECREATE_COLLECTION = False

# Connect in hybrid mode. No documents are added here (the list is empty);
# the ingestion functions add them later.
vector_store = QdrantVectorStore.from_documents(
    documents=[],
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    collection_name=COLLECTION_NAME,
    url="http://localhost:6333",
    retrieval_mode=RetrievalMode.HYBRID,
    force_recreate=RECREATE_COLLECTION,
)

### Helper Functions

In [None]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from markdown filename.

    Expected format: {company} {doc_type} {quarter} {year}.md
    (the quarter part is optional)
    Examples:
    - amazon 10-k 2024.md
    - amazon 10-q q1 2024.md

    Args:
        filename: File name only (no directory part), with or without the
            trailing ".md".

    Returns:
        dict with company_name, doc_type, fiscal_quarter, fiscal_year.
        fiscal_quarter is None unless the name has exactly four parts.

    Raises:
        ValueError: if the name has fewer than three whitespace-separated
            parts, or the year part is not an integer.
    """
    # Strip only a trailing ".md"; str.replace would also delete ".md"
    # appearing in the middle of the name.
    suffix = '.md'
    name = filename[:-len(suffix)] if filename.endswith(suffix) else filename
    parts = name.split()

    if len(parts) < 3:
        raise ValueError(
            f"Cannot parse metadata from {filename!r}: expected "
            "'{company} {doc_type} [{quarter}] {year}.md'"
        )

    # Exactly four parts means a quarter is present; otherwise the third
    # part is taken as the year.
    if len(parts) == 4:
        fiscal_quarter, year_text = parts[2], parts[3]
    else:
        fiscal_quarter, year_text = None, parts[2]

    try:
        fiscal_year = int(year_text)
    except ValueError as err:
        raise ValueError(
            f"Cannot parse fiscal year {year_text!r} from {filename!r}"
        ) from err

    return {
        'company_name': parts[0],
        'doc_type': parts[1],
        'fiscal_quarter': fiscal_quarter,
        'fiscal_year': fiscal_year,
    }


def compute_file_hash(file_path: str) -> str:
    """Return the hex SHA-256 digest of a file's bytes (the deduplication key)."""
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        # Feed the file through in 4 KiB pieces instead of loading it whole
        while True:
            chunk = stream.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()

### Track Processed Files

In [None]:
# Collect the file hashes already stored in Qdrant so unchanged files can be
# skipped by the ingestion functions.
processed_hashes = set()
next_offset = None  # None asks scroll() to start from the beginning

while True:
    # scroll() returns one page plus the offset of the next page; a single
    # call with a large limit would silently miss everything past that limit.
    points, next_offset = vector_store.client.scroll(
        collection_name=COLLECTION_NAME,
        limit=1000,
        offset=next_offset,
        with_payload=True,
        with_vectors=False,
    )

    for point in points:
        payload = point.payload or {}
        # QdrantVectorStore nests Document.metadata under the "metadata"
        # payload key by default; the top-level lookup is kept as a fallback
        # for points written with a flat payload.
        nested = payload.get('metadata') or {}
        file_hash = nested.get('file_hash') or payload.get('file_hash')
        if file_hash:
            processed_hashes.add(file_hash)

    if next_offset is None:
        break

print(f"Already processed: {len(processed_hashes)} files")

### Ingestion Functions

In [None]:
def ingest_markdown_to_vectordb(md_path: Path):
    """
    Split one markdown file into pages and add them to the vector store.

    The file is skipped when its content hash is already in
    ``processed_hashes``. Otherwise every non-blank page (pages are separated
    by ``<!-- page break -->``) becomes one Document carrying the filename
    metadata plus content_type, page, file_hash and source_file.
    """
    print(f"Processing: {md_path.name}")

    digest = compute_file_hash(md_path)
    if digest in processed_hashes:
        print("  [SKIP] Already processed")
        return

    raw_pages = md_path.read_text(encoding='utf-8').split("<!-- page break -->")
    base_metadata = extract_metadata_from_filename(md_path.name)

    # Number pages before filtering so blank pages still consume a page number
    stripped_pages = (raw.strip() for raw in raw_pages)
    documents = [
        Document(
            page_content=body,
            metadata={
                **base_metadata,
                'content_type': 'text',
                'page': number,
                'file_hash': digest,
                'source_file': md_path.name,
            },
        )
        for number, body in enumerate(stripped_pages, start=1)
        if body
    ]

    vector_store.add_documents(documents=documents)
    # Record the hash only after the upload call has returned
    processed_hashes.add(digest)

    print(f"  [DONE] Ingested {len(documents)} pages")


def ingest_tables_to_vectordb(company_dir: Path):
    """
    Add every ``table_*.md`` file below a company directory to the vector store.

    Each sub-directory of ``company_dir`` is treated as one document; its name
    (with ".md" appended) is parsed for the filename metadata. Files whose
    content hash is already in ``processed_hashes`` are skipped.
    """
    ingested = 0

    for doc_dir in company_dir.iterdir():
        if not doc_dir.is_dir():
            continue

        for table_file in doc_dir.glob("table_*.md"):
            digest = compute_file_hash(table_file)
            if digest in processed_hashes:
                continue

            body = table_file.read_text(encoding='utf-8')
            base_metadata = extract_metadata_from_filename(doc_dir.name + '.md')

            table_doc = Document(
                page_content=body,
                metadata={
                    **base_metadata,
                    'content_type': 'table',
                    # The table number is the token after the first underscore
                    'table_number': int(table_file.stem.split('_')[1]),
                    'file_hash': digest,
                    'source_file': table_file.name,
                },
            )

            vector_store.add_documents([table_doc])
            processed_hashes.add(digest)
            ingested += 1

    if ingested > 0:
        print(f"  [DONE] Ingested {ingested} tables")


def ingest_images_to_vectordb(company_dir: Path):
    """
    Embed every ``page_*.png`` below a company directory and store it.

    Each sub-directory of ``company_dir`` is treated as one document; its name
    (with ".md" appended) is parsed for the filename metadata. The image itself
    is embedded, while the stored text is a short placeholder naming the page.
    Files whose content hash is already in ``processed_hashes`` are skipped.
    """
    ingested = 0

    for doc_dir in company_dir.iterdir():
        if not doc_dir.is_dir():
            continue

        for image_file in doc_dir.glob("page_*.png"):
            digest = compute_file_hash(image_file)
            if digest in processed_hashes:
                continue

            # The page number is the token after the first underscore
            page_num = int(image_file.stem.split('_')[1])
            base_metadata = extract_metadata_from_filename(doc_dir.name + '.md')

            image_doc = Document(
                page_content=f"Visual content from page {page_num}",
                metadata={
                    **base_metadata,
                    'content_type': 'image',
                    'page': page_num,
                    'image_path': str(image_file),
                    'file_hash': digest,
                },
            )

            # Dense vector comes from the image, not from the placeholder text
            image_vector = embeddings.embed_image(uri=str(image_file))

            # NOTE(review): confirm that the installed QdrantVectorStore really
            # provides add_embeddings; it does not appear among the methods in
            # the langchain_qdrant API reference - TODO verify before relying
            # on this cell.
            vector_store.add_embeddings([(image_doc, image_vector)])
            processed_hashes.add(digest)
            ingested += 1

    if ingested > 0:
        print(f"  [DONE] Ingested {ingested} images")

### Process All Extracted Data

In [None]:
# Process markdown files
print("=== Ingesting Markdown Files ===")
markdown_path = Path(MARKDOWN_DIR)
# Sorted so that repeated runs visit files in the same order
md_files = sorted(markdown_path.rglob("*.md"))
print(f"Found {len(md_files)} markdown files\n")

for md_path in md_files:
    ingest_markdown_to_vectordb(md_path)

# Process tables
print("\n=== Ingesting Tables ===")
tables_path = Path(TABLES_DIR)
# The extraction step may not have produced this folder; iterdir() on a
# missing directory would abort the run after markdown was already ingested.
if tables_path.is_dir():
    for company_dir in sorted(tables_path.iterdir()):
        if company_dir.is_dir():
            print(f"Processing {company_dir.name}...")
            ingest_tables_to_vectordb(company_dir)
else:
    print(f"  [SKIP] Directory not found: {tables_path}")

# Process images
print("\n=== Ingesting Images ===")
images_path = Path(IMAGES_DIR)
if images_path.is_dir():
    for company_dir in sorted(images_path.iterdir()):
        if company_dir.is_dir():
            print(f"Processing {company_dir.name}...")
            ingest_images_to_vectordb(company_dir)
else:
    print(f"  [SKIP] Directory not found: {images_path}")

print("\n[ALL DONE]")

### Verify Ingestion

In [None]:
# Fetch the collection summary from Qdrant and print its point count.
# NOTE(review): presumably one point per ingested page, table, or image rather
# than one per source file - confirm before reading this as a file count.
collection_info = vector_store.client.get_collection(COLLECTION_NAME)
print(f"Total documents in Qdrant: {collection_info.points_count}")

### Test Hybrid Search

In [None]:
# Smoke-test retrieval: run one query and show the five best hits
query = "What is Amazon's revenue?"
results = vector_store.similarity_search(query, k=5)

for i, doc in enumerate(results, 1):
    meta = doc.metadata
    # One print call, three lines: hit header, provenance, content preview
    print(
        f"\n{i}. Type: {meta.get('content_type')} | Page: {meta.get('page')}",
        f"   Company: {meta.get('company_name')} | Year: {meta.get('fiscal_year')}",
        f"   Content: {doc.page_content[:200]}...",
        sep="\n",
    )