## Data Ingestion for Deep RAG

In this notebook, we'll load the extracted data into a Qdrant vector database:

- **Markdown**: Page-level chunks with metadata
- **Tables**: Separate documents with context and page numbers
- **Images**: Text descriptions embedded (generated in notebook 06-01b)
- **Hybrid Search**: Dense (semantic) + Sparse (keyword) embeddings

**Prerequisites:**
- Run notebook 06-01 first to extract PDFs
- Run notebook 06-01b to generate image descriptions
- Qdrant server running on localhost:6333
- Google API key set in .env file

**Output:**
- Single Qdrant collection with all content types
- Rich metadata for filtering (company, year, quarter, doc_type, page)
- Deduplication using file hashes

**Make sure your Qdrant vector database Docker container is running before you continue.**

https://qdrant.tech/

### 1. Setup and Imports

In [None]:
from dotenv import load_dotenv
load_dotenv()

import hashlib
from pathlib import Path

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_qdrant import QdrantVectorStore, RetrievalMode, FastEmbedSparse
from langchain_core.documents import Document
from qdrant_client import QdrantClient

### 2. Configuration

In [None]:
# Paths to the extracted content (markdown pages, tables, image descriptions).
# NOTE(review): these three constants are not referenced by the other visible
# cells; the ingestion cell globs "data/rag-data" directly.
MARKDOWN_DIR = "data/rag-data/markdown"
TABLES_DIR = "data/rag-data/tables"
IMAGES_DESC_DIR = "data/rag-data/images_desc"

# Qdrant configuration: the single collection that holds every content type,
# and the name of the dense embedding model passed to the embeddings client.
COLLECTION_NAME = "financial_docs"
EMBEDDING_MODEL = "models/gemini-embedding-001"

### 3. Initialize Embeddings and Client

In [None]:
# Dense (semantic) embeddings, using the model named in EMBEDDING_MODEL.
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)
# Sparse (keyword) embeddings from the "Qdrant/bm25" FastEmbed model; combined
# with the dense embeddings for hybrid retrieval.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

### 4. Create or Recreate Collection

In [None]:
# Create the vector store handle for COLLECTION_NAME on the local Qdrant
# server, configured for hybrid retrieval (dense + sparse embeddings).
# No documents are passed here; the actual ingestion happens in a later cell.
vector_store = QdrantVectorStore.from_documents(
    documents=[],  # nothing to add yet; this call only sets up the store
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    url="http://localhost:6333",
    collection_name=COLLECTION_NAME,
    retrieval_mode=RetrievalMode.HYBRID,
    force_recreate=False  # presumably keeps an existing collection rather than wiping it — confirm in the langchain-qdrant docs
)

In [None]:
# Display the underlying Qdrant client via the public `client` property (the
# accessor every other cell in this notebook uses) instead of the private
# `_client` attribute.
vector_store.client

### 5. Helper Functions

In [None]:
# Reuse from 06-01
def extract_metadata_from_filename(filename: str):
    """Parse a document name into company, document type, quarter and year.

    Any ``.pdf`` / ``.md`` text is removed and the remainder is split on
    whitespace. The first token is the company name, the second the document
    type and the last the fiscal year. The fiscal quarter is the third token,
    but only when the name has exactly four tokens; otherwise it is ``None``.
    """
    stem = filename.replace('.pdf', '').replace('.md', '')
    tokens = stem.split()

    company, doc_type = tokens[0], tokens[1]
    # A quarter is only present in the four-token form of the name.
    quarter = tokens[2] if len(tokens) == 4 else None
    year = tokens[-1]

    return {
        'company_name': company,
        'doc_type': doc_type,
        'fiscal_quarter': quarter,
        'fiscal_year': year,
    }

In [None]:
def compute_file_hash(file_path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of a file, used for deduplication.

    The file is read in 4096-byte chunks, so it is never loaded into memory
    all at once.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:  # empty bytes means end of file
                break
            digest.update(chunk)
    return digest.hexdigest()

In [None]:
# Peek at one stored point to see where the file hash lives in the payload.
# scroll() returns a (points, next_offset) pair, so all_points[0] is the list
# of points on the first page.
all_points = vector_store.client.scroll(
    collection_name=COLLECTION_NAME,
    limit=10000,
    with_payload=True,
)

# On a fresh (empty) collection there is no first point; show None instead of
# raising IndexError.
all_points[0][0].payload['metadata']['file_hash'] if all_points[0] else None

In [None]:
def get_processed_hashes():
    """Collect the file hashes of everything already stored in the collection.

    Scrolls through COLLECTION_NAME page by page and gathers the
    ``metadata.file_hash`` value of each point. Points with no payload, no
    ``metadata`` entry, or no ``file_hash`` are ignored, so the result holds
    only real hashes (never ``None``) and one malformed point cannot abort
    the scan.

    Returns:
        set: The distinct file hashes found in the collection.
    """
    processed_hashes = set()
    offset = None

    while True:
        points, offset = vector_store.client.scroll(
            collection_name=COLLECTION_NAME,
            limit=10000,
            offset=offset,
            with_payload=True,
        )

        if not points:
            break

        for point in points:
            metadata = (point.payload or {}).get('metadata') or {}
            file_hash = metadata.get('file_hash')
            if file_hash is not None:
                processed_hashes.add(file_hash)

        # scroll() hands back None as the next offset once the last page is read.
        if offset is None:
            break

    return processed_hashes

In [None]:
# Load the hashes of files already in the collection; ingest_file() consults
# (and extends) this set to skip files that were ingested before.
processed_hashes = get_processed_hashes()
# Number of distinct hashes found.
len(processed_hashes)

In [None]:
import re
def extract_page_number(file_path: Path) -> int | None:
    """Extract page number from filename (e.g., table_1_page_5.md -> 5)."""
    match = re.search(r'_page_(\d+)', file_path.stem)
    return int(match.group(1)) if match else None

### 6. Ingestion Function

In [None]:
def ingest_file(file_path: Path, processed_hashes: set) -> None:
    """Ingest one markdown file (page text, table, or image description).

    The content type is inferred from the path: a path containing
    ``markdown`` is page text, ``tables`` is a table and ``images_desc`` is an
    image description; anything else is skipped. Page-text files are split on
    ``<!-- page break -->`` into one document per non-empty page (numbered
    from 1). Table and image files become a single document, tagged with the
    page number parsed from the file name when there is one.

    Args:
        file_path: Path of the ``.md`` file to ingest.
        processed_hashes: Hashes of files already ingested. Files whose hash
            is in the set are skipped; the hash of a newly handled file is
            added to it in place.
    """
    # Skip if already processed
    file_hash = compute_file_hash(file_path)
    if file_hash in processed_hashes:
        return

    # Determine content type and document name from path. Page-text files carry
    # the document name themselves; tables and image descriptions sit in a
    # directory named after their document.
    path_str = str(file_path)
    if 'markdown' in path_str:
        content_type = 'text'
        doc_name = file_path.name
    elif 'tables' in path_str:
        content_type = 'table'
        doc_name = file_path.parent.name
    elif 'images_desc' in path_str:
        content_type = 'image'
        doc_name = file_path.parent.name
    else:
        return  # Skip unknown types

    # Read content
    content = file_path.read_text(encoding='utf-8')

    # Build base metadata shared by every document created from this file
    base_metadata = extract_metadata_from_filename(doc_name)
    base_metadata.update({
        'content_type': content_type,
        'file_hash': file_hash,
        'source_file': file_path.name
    })

    if content_type == 'text':
        # Split markdown by page breaks; blank pages are dropped but still
        # counted, so page numbers match positions in the file.
        pages = content.split("<!-- page break -->")
        documents = [
            Document(page_content=page_text.strip(), metadata={**base_metadata, 'page': i})
            for i, page_text in enumerate(pages, start=1)
            if page_text.strip()
        ]
    else:  # table or image
        page_num = extract_page_number(file_path)
        # Compare against None explicitly so a legitimate page 0 is kept.
        metadata = {**base_metadata, 'page': page_num} if page_num is not None else base_metadata
        # A whitespace-only file has nothing worth embedding.
        documents = [Document(page_content=content, metadata=metadata)] if content.strip() else []

    # Only call the vector store when there is something to add.
    if documents:
        vector_store.add_documents(documents)

    processed_hashes.add(file_hash)

In [None]:
### 7. Ingest All Data

In [None]:
# Recursively collect every .md file under the extraction root; page text,
# tables and image descriptions all live in subdirectories of it.
base_path = Path("data/rag-data")
all_md_files = list(base_path.rglob("*.md"))

# ingest_file() infers the content type from each path and skips files whose
# hash is already in processed_hashes.
for md_file in all_md_files:
    ingest_file(md_file, processed_hashes)

# NOTE: printed unconditionally once the loop finishes without raising; it
# does not report how many files were ingested or skipped.
print("All data ingested successfully")

### 8. Verify Ingestion

In [None]:
# Fetch the collection's description from Qdrant and display it as the cell
# output, to check the result of the ingestion above.
collection_info = vector_store.client.get_collection(COLLECTION_NAME)
collection_info

### 9. Test Search

In [None]:
# Smoke-test retrieval: the store was created with RetrievalMode.HYBRID, so
# this query goes through hybrid (dense + sparse) search.
query = "What is Amazon's revenue?"
# Fetch the 5 best-matching documents.
results = vector_store.similarity_search(query, k=5)

# Display the returned documents as the cell output.
results