I put Gemini 2.0 Flash Lite to the test as an OCR solution for PDFs within a Retrieval-Augmented Generation (RAG) system. Instead of using the traditional document loaders in LangChain, I explore whether Gemini can extract text more effectively, especially when dealing with complex layouts.

In [None]:
pip install --upgrade google-genai pypdf2 langchain-core tqdm

In [None]:
from concurrent.futures import ThreadPoolExecutor
from langchain_core.documents import Document
from PyPDF2 import PdfReader, PdfWriter
from dotenv import load_dotenv
from google.genai import types
from google import genai
from typing import List
from tqdm import tqdm
import threading
import base64
import json
import io
import os
load_dotenv()

In [None]:
# Configure the Gemini client. Either paste a key below, or leave the
# placeholder untouched to fall back to the GOOGLE_API_KEY environment variable
# (preferable, since it keeps the key out of the notebook).

API_KEY = "[your-api-key]"

# Keep a real, non-empty key; otherwise read the key from the environment.
API_KEY = (
    API_KEY
    if API_KEY and API_KEY != "[your-api-key]"
    else os.getenv('GOOGLE_API_KEY')
)

client = genai.Client(api_key=API_KEY)

In [None]:
def get_pdf_reader_and_metadata(pdf_path):
    """
    Creates a PDF reader and extracts metadata from the PDF file.

    Args:
    - pdf_path (str): The file path of the input PDF.

    Returns:
    - tuple: (PdfReader object, metadata dictionary). The dictionary has the
      keys "total_pages", "title", "author" and "creation_date"; the last three
      are empty strings when the PDF does not provide them.
    """
    reader = PdfReader(pdf_path)

    # `reader.metadata` is None when the PDF has no document-information
    # dictionary; fall back to an empty mapping so the lookups below still work.
    info = reader.metadata or {}

    metadata = {
        "total_pages": len(reader.pages),
        "title": info.get('/Title', ''),
        "author": info.get('/Author', ''),
        "creation_date": info.get('/CreationDate', '')
    }
    return reader, metadata

def extract_page(reader, page_number):
    """
    Builds a standalone one-page PDF from a single page of an open reader.

    Args:
    - reader (PdfReader): The PDF reader object
    - page_number (int): The page number to extract (0-indexed)

    Returns:
    - bytes: The serialized single-page PDF, or None (after printing a
      message) when page_number is outside the document.
    """
    page_count = len(reader.pages)

    # Guard clause: reject indices outside [0, page_count)
    if page_number >= page_count or page_number < 0:
        print(f"Page {page_number} is out of range. This PDF has {page_count} pages.")
        return None

    # Copy only the requested page into a fresh PDF
    single_page_pdf = PdfWriter()
    single_page_pdf.add_page(reader.pages[page_number])

    # Serialize in memory rather than going through a temporary file
    with io.BytesIO() as out:
        single_page_pdf.write(out)
        return out.getvalue()

In [None]:


def generate(b64Page, metadata, model="gemini-2.0-flash-lite-preview-02-05"):
    """
    Converts a PDF page to markdown with Gemini and wraps the result in a Document.

    Args:
    - b64Page (bytes): Bytes of the PDF to convert. Despite the name, the data
      is handed to the API unchanged; no base64 encoding is done here.
    - metadata (dict): Metadata attached as-is to the returned Document.
    - model (str): Name of the Gemini model to call
      (default: "gemini-2.0-flash-lite-preview-02-05"; e.g. "gemini-2.0-flash-001"
      can be passed instead).

    Returns:
    - Document: page_content holds the markdown returned by the model,
      metadata is the dictionary passed in.

    Raises:
    - ValueError: If b64Page is None/empty, or if the model response has no text.
    """
    # extract_page returns None for an out-of-range page; fail with a clear
    # message here instead of an opaque error from the SDK.
    if not b64Page:
        raise ValueError("No PDF page data to convert (got empty or None page bytes).")

    document1_1 = types.Part.from_bytes(
        data=b64Page,
        mime_type="application/pdf",
    )
    textsi_1 = """You are tasked with converting a PDF document to text in markdown format. Your goal is to accurately represent the content, structure, and layout of the original PDF while using markdown syntax. Follow these instructions carefully:
      To convert this PDF content to markdown format, follow these steps:

      1. Document Structure:
       - Preserve the overall structure of the document.
       - Use appropriate markdown syntax for headers, subheaders, and sections.
       - Maintain the original hierarchy of the document.

      2. Text Formatting:
       - Convert basic text to plain markdown text.
       - Use markdown syntax for bold (**text**), italic (*text*), and strikethrough (~~text~~) where applicable.
       - Preserve any special characters or symbols as they appear in the original document.

      3. Headers:
       - Use the appropriate number of hash symbols (#) to represent different header levels.
       - Example: # for H1, ## for H2, ### for H3, and so on.

      4. Paragraphs:
       - Separate paragraphs with a blank line.
       - Preserve any indentation or special formatting within paragraphs.

      5. Lists:
       - Use - for unordered lists and 1. 2. 3. for ordered lists.
       - Maintain the original indentation for nested lists.

      6. Tables:
       - Convert tables to markdown table format.
       - Use | to separate columns and - to create the header row.
       - Align columns using : in the header row (e.g., |:---:| for center alignment).

      7. Links:
       - Convert hyperlinks to markdown format: [link text](URL)

      8. Images:
       - For each image/chart in the PDF, insert a placeholder in the following format:
        [Image Description]
       - Provide a full description of the image in place of \"Image Description\".
      - describe the image in detail, like you would describe it to a blind person.

      9. Footnotes:
       - Use markdown footnote syntax: [^1] for the reference and [^1]: Footnote text for the footnote content.
       - Place all footnotes at the end of the document.

      10. Code Blocks:
       - Use triple backticks (```) to enclose code blocks.
       - Specify the language after the opening backticks if applicable.

      11. Blockquotes:
       - Use > to indicate blockquotes.
       - For nested blockquotes, use multiple > symbols.

      12. Horizontal Rules:
       - Use three or more hyphens (---) on a line by themselves to create a horizontal rule.

      13. Special Elements:
       - If there are any special elements in the PDF (e.g., mathematical equations, diagrams), describe them in plain text within square brackets.

      14. Preserve Layout:
       - Maintain the original layout as much as possible, including line breaks and spacing.
       - Use empty lines and appropriate markdown syntax to recreate the visual structure of the document.

      Once you have converted the entire PDF content to markdown format.  Ensure that all elements of the original document are accurately represented in the markdown version."""

    contents = [
        types.Content(
            role="user",
            parts=[
                document1_1,
                types.Part.from_text(text="""convert this file"""),
            ],
        ),
    ]

    # All four safety filters are switched off -- presumably so that document
    # text is transcribed rather than blocked (NOTE(review): confirm intent).
    safety_settings = [
        types.SafetySetting(category=category, threshold="OFF")
        for category in (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HARASSMENT",
        )
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=0,
        top_p=0.95,
        max_output_tokens=8192,
        response_modalities=["TEXT"],
        safety_settings=safety_settings,
        # Request a JSON object with a single required string field so the
        # reply can be parsed with json.loads below.
        response_mime_type="application/json",
        response_schema={
            "type": "OBJECT",
            "properties": {
                "page_content": {
                    "type": "STRING",
                    "description": "The content of the document."
                },
            },
            "required": ["page_content"]
        },
        system_instruction=[types.Part.from_text(text=textsi_1)],
    )

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )

    # response.text is None when the reply carries no text part; json.loads(None)
    # would raise an unhelpful TypeError, so report the condition explicitly.
    raw_text = response.text
    if raw_text is None:
        raise ValueError(
            "Gemini returned no text content (the response may have been blocked or empty)."
        )
    response_dict = json.loads(raw_text)

    # Create a Document object carrying the caller-supplied metadata
    return Document(
        page_content=response_dict['page_content'],
        metadata=metadata
    )

# Single Page Demo

In [None]:
# Convert one page end-to-end as a quick check before processing a whole file.
pdf_path = "<your file path>"  # placeholder: replace with the path to a real PDF
page_number = 0  # Remember, the first page is 0
reader, metadata = get_pdf_reader_and_metadata(pdf_path)
metadata['page'] = page_number + 1  # record a 1-based page number in the metadata
# Despite its name, b64Page holds whatever extract_page returns: the bytes of a
# one-page PDF, or None when page_number is out of range.
b64Page = extract_page(reader, page_number)
document_page = generate(b64Page, metadata)
print(document_page)

# Load full PDF file

In [None]:
def process_all_pages(pdf_path):
    """
    Converts every page of a PDF, one after another, into Document objects.

    Args:
    - pdf_path (str): The file path of the input PDF.

    Returns:
    - list: Document objects in page order, one per page
    """
    reader, base_metadata = get_pdf_reader_and_metadata(pdf_path)

    def convert(index):
        # Each page gets its own metadata copy with a 1-based page number
        per_page_metadata = {**base_metadata, 'page': index + 1}
        page_bytes = extract_page(reader, index)
        return generate(page_bytes, per_page_metadata)

    page_indices = tqdm(range(len(reader.pages)), desc="Processing pages")
    return [convert(index) for index in page_indices]

In [None]:
# Convert every page sequentially; generate() is called once per page.
pdf_path = "<Your file path>"  # placeholder: replace with the path to a real PDF
all_documents = process_all_pages(pdf_path)

# Parallel processing

In [None]:
def process_page(args) -> tuple[int, Document]:
    """
    Worker that converts one page and reports which page it handled.

    Args:
    - args: tuple containing (page_number, pdf_path, metadata)

    Returns:
    - tuple: (page_number, Document)
    """
    index, source_path, shared_metadata = args

    # Every call opens its own reader instead of sharing one between workers
    own_reader = PdfReader(source_path)

    # Copy the shared metadata and stamp it with the 1-based page number
    per_page_metadata = {**shared_metadata, 'page': index + 1}

    page_bytes = extract_page(own_reader, index)
    return index, generate(page_bytes, per_page_metadata)

def process_all_pages_parallel(pdf_path: str, batch_size: int = 3) -> List[Document]:
    """
    Processes all pages in a PDF concurrently and returns an ordered array of Document objects.

    Pages are handed to a single thread pool limited to `batch_size` workers, so
    at most `batch_size` pages are in flight at any time and a slow page does not
    hold back the pages queued behind it.

    Args:
    - pdf_path (str): The file path of the input PDF
    - batch_size (int): Maximum number of pages to process in parallel (default: 3)

    Returns:
    - list: Ordered array of Document objects, one for each page

    Raises:
    - ValueError: If batch_size is less than 1
    """
    # A non-positive size cannot drive a thread pool; reject it up front rather
    # than returning a list of unprocessed placeholders.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Build the metadata with the same helper as the sequential path. The reader
    # is only needed for that; process_page opens its own reader per page.
    _reader, metadata = get_pdf_reader_and_metadata(pdf_path)
    total_pages = metadata["total_pages"]
    page_args = [(page_number, pdf_path, metadata) for page_number in range(total_pages)]

    documents = []
    # The context managers close the progress bar and shut the pool down even
    # when a page fails.
    with tqdm(total=total_pages, desc="Processing pages") as pbar:
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            # executor.map yields results in submission order, so appending keeps
            # the documents in page order. The bar is only updated from this
            # thread, so no lock is required.
            for _page_number, document in executor.map(process_page, page_args):
                documents.append(document)
                pbar.update(1)

    return documents

In [None]:
# Convert every page using the thread-based variant.
pdf_path = "<Your file path>"  # placeholder: replace with the path to a real PDF
# batch_size is used as the thread pool's max_workers, so up to 70 pages are
# requested concurrently.
# NOTE(review): this many simultaneous requests may exceed the API rate limit -- confirm.
batch_size = 70
all_documents = process_all_pages_parallel(pdf_path, batch_size)