## Data Ingestion for Deep RAG

In this notebook, we load the extracted data into a Qdrant vector database:

- **Markdown**: Page-level chunks with metadata
- **Tables**: Separate documents with context and page numbers
- **Images**: Multimodal embeddings for visual content
- **Hybrid Search**: Dense (semantic) + Sparse (keyword) embeddings

**Prerequisites:**
- Run notebook 06-01 first to extract PDFs
- Qdrant server running on localhost:6333

**Output:**
- Single Qdrant collection with all content types
- Rich metadata for filtering (company, year, quarter, doc_type, page)
- Deduplication using file hashes

### 1. Setup and Imports

In [None]:
from dotenv import load_dotenv
load_dotenv()

import hashlib
from pathlib import Path

from langchain_google_vertexai import VertexAIEmbeddings
from langchain_qdrant import QdrantVectorStore, RetrievalMode, FastEmbedSparse
from langchain_core.documents import Document

### 2. Configuration

In [None]:
# Paths (relative, so they resolve against the notebook's working directory)
# Searched recursively for *.md files by the markdown ingestion cell
MARKDOWN_DIR = "data/rag-data/markdown"
# Walked as <company>/<document>/table_*.md by the table ingestion cell
TABLES_DIR = "data/rag-data/tables"
# Walked as <company>/<document>/page_*.png by the image ingestion cell
IMAGES_DIR = "data/rag-data/images"

# Qdrant Configuration
# Single collection that receives text, table and image documents
COLLECTION_NAME = "financial_docs"
# Model name handed to VertexAIEmbeddings for the dense vectors
EMBEDDING_MODEL = "multimodalembedding@001"

### 3. Initialize Vector Store

In [None]:
# Multimodal embeddings (Vertex AI) - works for text AND images
# Used for the dense vectors of every document and, via embed_image, for images
embeddings = VertexAIEmbeddings(model_name=EMBEDDING_MODEL)

# Sparse embeddings (BM25 for keyword matching)
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Initialize vector store with hybrid retrieval
# An empty document list is passed, so this call only sets up the collection;
# the actual content is added by the ingestion functions below.
# NOTE(review): force_recreate=True presumably drops and rebuilds an existing
# collection every time this cell runs, which would leave nothing for the
# hash-based deduplication to find - confirm this reset is intended.
vector_store = QdrantVectorStore.from_documents(
    documents=[],
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    collection_name=COLLECTION_NAME,
    url="http://localhost:6333",
    retrieval_mode=RetrievalMode.HYBRID,
    force_recreate=True
)

print(f"✓ Vector store initialized: {COLLECTION_NAME}")

### 4. Helper Functions

In [None]:
def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract document metadata from a filename.

    Expected format: CompanyName DocType [Quarter] Year.md
    Examples:
        - Amazon 10-K 2024.md
        - Amazon 10-Q Q1 2024.md
        - Meta Platforms 10-Q Q2 2023.pdf

    The name is parsed from the right (year, optional quarter, document
    type) and everything that remains is the company name, so company
    names made of several words are kept intact.

    Args:
        filename: File name (not a path), optionally ending in .md or .pdf.

    Returns:
        dict with keys 'company_name' (str), 'doc_type' (str),
        'fiscal_quarter' (str such as 'Q1', or None) and 'fiscal_year' (int).

    Raises:
        ValueError: If the name has fewer than three space-separated parts
            or its last part is not an integer year.
    """
    name = filename.strip()
    # Strip only a trailing extension; str.replace would also remove
    # '.md' / '.pdf' occurring in the middle of the name.
    for suffix in ('.md', '.pdf'):
        if name.lower().endswith(suffix):
            name = name[:-len(suffix)]
            break

    tokens = name.split()
    if len(tokens) < 3:
        raise ValueError(
            f"Cannot parse {filename!r}: expected 'CompanyName DocType [Quarter] Year'"
        )

    try:
        fiscal_year = int(tokens[-1])
    except ValueError:
        raise ValueError(
            f"Cannot parse {filename!r}: last part {tokens[-1]!r} is not a year"
        ) from None

    remaining = tokens[:-1]

    # A quarter is only recognised when a company name and a document type
    # would still be left after removing it.
    fiscal_quarter = None
    candidate = remaining[-1]
    if (
        len(remaining) >= 3
        and len(candidate) == 2
        and candidate[0] in 'Qq'
        and candidate[1] in '1234'
    ):
        fiscal_quarter = candidate
        remaining = remaining[:-1]

    return {
        'company_name': ' '.join(remaining[:-1]),
        'doc_type': remaining[-1],
        'fiscal_quarter': fiscal_quarter,
        'fiscal_year': fiscal_year
    }

In [None]:
def compute_file_hash(file_path: Path) -> str:
    """Return the hex SHA-256 digest of a file's bytes (used as the dedup key)."""
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        # Feed the file through in 4 KiB pieces instead of loading it whole
        while True:
            chunk = stream.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()

In [None]:
def get_processed_hashes() -> set:
    """
    Collect the file hashes of everything already stored in the collection.

    Pages through the whole collection with the Qdrant scroll API, so
    collections larger than a single page are covered as well. The hash is
    looked up both at the top level of each payload and inside the nested
    metadata object, because QdrantVectorStore stores Document.metadata
    under its metadata payload key ("metadata" by default).

    Returns:
        Set of 'file_hash' strings found in the collection. The count is
        also printed.
    """
    metadata_key = getattr(vector_store, 'metadata_payload_key', 'metadata')

    hashes = set()
    next_offset = None
    while True:
        # scroll returns one page of points plus the offset of the next page
        points, next_offset = vector_store.client.scroll(
            collection_name=COLLECTION_NAME,
            limit=1000,
            offset=next_offset,
            with_payload=True,
            with_vectors=False
        )

        for point in points:
            payload = point.payload or {}
            file_hash = payload.get('file_hash')
            if not file_hash:
                nested = payload.get(metadata_key)
                if isinstance(nested, dict):
                    file_hash = nested.get('file_hash')
            if file_hash:
                hashes.add(file_hash)

        # A missing next offset marks the last page
        if next_offset is None:
            break

    print(f"Already processed: {len(hashes)} files")
    return hashes

### 5. Ingestion Functions

In [None]:
def ingest_markdown_file(md_path: Path, processed_hashes: set):
    """
    Add one markdown file to the vector store, one document per page.

    Args:
        md_path: Markdown file whose name carries the document metadata
        processed_hashes: Hashes of files already ingested; updated in place

    Returns:
        Number of page documents added (0 when the file is skipped or has
        no non-blank pages).
    """
    digest = compute_file_hash(md_path)
    if digest in processed_hashes:
        print(f"  [SKIP] {md_path.name}")
        return 0

    # Pages are separated by an HTML comment marker in the markdown text
    raw_pages = md_path.read_text(encoding='utf-8').split("<!-- page break -->")
    base_metadata = extract_metadata_from_filename(md_path.name)

    # Numbering follows the position in the file, so blank pages are
    # skipped without shifting the page numbers of later pages.
    documents = [
        Document(
            page_content=body,
            metadata={
                **base_metadata,
                'content_type': 'text',
                'page': number,
                'file_hash': digest,
                'source_file': md_path.name,
            },
        )
        for number, body in enumerate((raw.strip() for raw in raw_pages), start=1)
        if body
    ]

    if not documents:
        return 0

    vector_store.add_documents(documents=documents)
    processed_hashes.add(digest)
    print(f"  ✓ {md_path.name} ({len(documents)} pages)")
    return len(documents)

In [None]:
def ingest_table_file(table_path: Path, doc_name: str, processed_hashes: set):
    """
    Add one extracted table to the vector store as its own document.

    Args:
        table_path: Table file named like table_1_page_5.md
        doc_name: Name of the parent document; parsed for company/year metadata
        processed_hashes: Hashes of files already ingested; updated in place

    Returns:
        1 when the table was added, 0 when its hash was already known.
    """
    digest = compute_file_hash(table_path)
    if digest in processed_hashes:
        return 0

    table_markdown = table_path.read_text(encoding='utf-8')
    parent_metadata = extract_metadata_from_filename(doc_name + '.md')

    # table_<n>_page_<p>: the table number is always read, the page number
    # only when the name has at least four underscore-separated parts
    name_tokens = table_path.stem.split('_')
    table_number = int(name_tokens[1])
    page_number = None
    if len(name_tokens) >= 4:
        page_number = int(name_tokens[3])

    metadata = {
        **parent_metadata,
        'content_type': 'table',
        'table_number': table_number,
        'page': page_number,
        'file_hash': digest,
        'source_file': table_path.name,
    }

    vector_store.add_documents([Document(page_content=table_markdown, metadata=metadata)])
    processed_hashes.add(digest)

    return 1

In [None]:
def ingest_image_file(image_path: Path, doc_name: str, processed_hashes: set):
    """
    Ingest a single image file with multimodal embeddings.

    The image itself is embedded; the stored document text is only a short
    placeholder naming the page.

    Args:
        image_path: Path to image file (e.g., page_5.png)
        doc_name: Parent document name for metadata
        processed_hashes: Set of already processed file hashes; updated in place

    Returns:
        1 when the image was added, 0 when its hash was already known.
    """
    file_hash = compute_file_hash(image_path)
    if file_hash in processed_hashes:
        return 0

    # Extract page number from filename (page_5.png)
    page_num = int(image_path.stem.split('_')[1])

    # Extract metadata from parent document name
    file_metadata = extract_metadata_from_filename(doc_name + '.md')

    # Create metadata
    metadata = file_metadata.copy()
    metadata['content_type'] = 'image'
    metadata['page'] = page_num
    metadata['image_path'] = str(image_path)
    metadata['file_hash'] = file_hash
    # Same key the markdown and table ingesters write, so every content
    # type can be filtered by source file in the same way
    metadata['source_file'] = image_path.name

    # Embed image using multimodal embeddings
    image_embedding = embeddings.embed_image(uri=str(image_path))

    # Create document
    doc = Document(
        page_content=f"Visual content from page {page_num}",
        metadata=metadata
    )

    # Add with custom embedding
    # NOTE(review): confirm that the installed langchain-qdrant
    # QdrantVectorStore provides add_embeddings; if it does not, the point
    # has to be written through vector_store.client instead.
    vector_store.add_embeddings([(doc, image_embedding)])
    processed_hashes.add(file_hash)

    return 1

In [None]:
def ingest_company_tables(company_dir: Path, processed_hashes: set) -> int:
    """
    Ingest every table_*.md file found one level below a company directory.

    Each sub-directory is treated as one document and its name is passed on
    as the parent document name. Returns the number of tables added.
    """
    return sum(
        ingest_table_file(table_file, doc_dir.name, processed_hashes)
        for doc_dir in company_dir.iterdir()
        if doc_dir.is_dir()
        for table_file in doc_dir.glob("table_*.md")
    )

In [None]:
def ingest_company_images(company_dir: Path, processed_hashes: set) -> int:
    """
    Ingest every page_*.png file found one level below a company directory.

    Each sub-directory is treated as one document and its name is passed on
    as the parent document name. Returns the number of images added.
    """
    added = 0

    for entry in company_dir.iterdir():
        # Plain files at company level are not documents
        if not entry.is_dir():
            continue
        for png_path in entry.glob("page_*.png"):
            added += ingest_image_file(png_path, entry.name, processed_hashes)

    return added

### 6. Process All Data

In [None]:
# Get already processed files
# This set is shared with the table and image cells below and is updated
# in place by every ingest_* call.
processed_hashes = get_processed_hashes()

# Process markdown files
print("\n=== Ingesting Markdown Files ===")
markdown_path = Path(MARKDOWN_DIR)
# Recursive search, so markdown files in nested folders are included
md_files = list(markdown_path.rglob("*.md"))
print(f"Found {len(md_files)} markdown files\n")

total_pages = 0
for idx, md_path in enumerate(md_files, 1):
    # Progress prefix stays on the same line; ingest_markdown_file prints
    # the file name (or a [SKIP] marker) right after it
    print(f"[{idx}/{len(md_files)}]", end=" ")
    total_pages += ingest_markdown_file(md_path, processed_hashes)

print(f"\nTotal pages ingested: {total_pages}")

In [None]:
# Process tables
# Relies on processed_hashes created by the markdown cell above.
print("\n=== Ingesting Tables ===")
tables_path = Path(TABLES_DIR)
# One sub-directory per company; plain files at this level are ignored
company_dirs = [d for d in tables_path.iterdir() if d.is_dir()]
print(f"Found {len(company_dirs)} companies\n")

total_tables = 0
for idx, company_dir in enumerate(company_dirs, 1):
    # Progress prefix and the per-company count share one output line
    print(f"[{idx}/{len(company_dirs)}] {company_dir.name}...", end=" ")
    count = ingest_company_tables(company_dir, processed_hashes)
    total_tables += count
    print(f"✓ {count} tables")

print(f"\nTotal tables ingested: {total_tables}")

In [None]:
# Process images
# Relies on processed_hashes created by the markdown cell above.
print("\n=== Ingesting Images ===")
images_path = Path(IMAGES_DIR)
# One sub-directory per company; note this rebinds company_dirs from the
# table cell to the image directories
company_dirs = [d for d in images_path.iterdir() if d.is_dir()]
print(f"Found {len(company_dirs)} companies\n")

total_images = 0
for idx, company_dir in enumerate(company_dirs, 1):
    # Progress prefix and the per-company count share one output line
    print(f"[{idx}/{len(company_dirs)}] {company_dir.name}...", end=" ")
    count = ingest_company_images(company_dir, processed_hashes)
    total_images += count
    print(f"✓ {count} images")

print(f"\nTotal images ingested: {total_images}")

### 7. Verify Ingestion

In [None]:
# Ask Qdrant for the collection description to confirm what was stored
collection_info = vector_store.client.get_collection(COLLECTION_NAME)
print(f"\n=== Collection Summary ===")
# points_count is reported as the document total: text pages, tables and
# images are each stored as separate entries in the same collection
print(f"Total documents: {collection_info.points_count}")
# NOTE(review): assumes `vectors` is a single vector config exposing .size;
# if the collection reports named vectors as a mapping, this needs a key
# lookup first - confirm against the running Qdrant version.
print(f"Vector size: {collection_info.config.params.vectors.size}")

### 8. Test Hybrid Search

In [None]:
# Smoke-test retrieval: run one query and show where each hit came from
query = "What is Amazon's revenue?"
results = vector_store.similarity_search(query, k=5)

print(f"Query: {query}\n")
for rank, hit in enumerate(results, start=1):
    meta = hit.metadata
    content_type, page = meta.get('content_type'), meta.get('page')
    company, year = meta.get('company_name'), meta.get('fiscal_year')
    # Only the first 150 characters of the content are shown as a preview
    preview = hit.page_content[:150]

    print(f"{rank}. Type: {content_type} | Page: {page}")
    print(f"   Company: {company} | Year: {year}")
    print(f"   Content: {preview}...\n")