In [1]:
# Prints the literal "hola" (presumably a quick check that the kernel runs cells).
print("hola")

hola


In [2]:
import base64
import os
from mistralai import Mistral

In [3]:
def encode_pdf(pdf_path):
    """Return the file at ``pdf_path`` as a base64 text string.

    The file is read in binary mode and its bytes are base64-encoded.
    Failures are reported on stdout rather than raised: a missing file
    and any other exception both print a message and yield ``None``.
    """
    encoded = None
    try:
        with open(pdf_path, "rb") as handle:
            raw_bytes = handle.read()
            encoded = base64.b64encode(raw_bytes).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
        encoded = None
    except Exception as err:  # best-effort: report and signal failure with None
        print(f"Error: {err}")
        encoded = None
    return encoded

# Path to your pdf
pdf_path = "C:/MoovMedia/ocr_exploration/input/The-Future-100-2025_part_1.pdf"

# Getting the base64 string
base64_pdf = encode_pdf(pdf_path)

# encode_pdf signals failure by returning None (after printing the reason).
# Stop here in that case: otherwise the f-string below would embed the
# literal text "None" in the data URL and send a useless request.
if base64_pdf is None:
    raise RuntimeError(f"Could not read {pdf_path}; aborting OCR request.")

# Read the key from the environment; a missing variable raises KeyError.
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)

# Send the PDF inline as a base64 data URL and ask for image data in the response.
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": f"data:application/pdf;base64,{base64_pdf}",
    },
    include_image_base64=True,
)


KeyboardInterrupt: 

In [None]:
# Display the entry at index 11 of the OCR response's `pages`
# (the 12th page if `pages` is a 0-indexed sequence — TODO confirm).
ocr_response.pages[11]

In [12]:
import base64
import os
import io
from typing import Optional, Dict, Any
import google.generativeai as genai
from PIL import Image
import fitz  # PyMuPDF for PDF processing

def encode_pdf(pdf_path: str) -> Optional[str]:
    """
    Read a PDF from disk and return its contents as base64 text.

    Errors are not raised to the caller; they are printed and signalled
    by a ``None`` return value.

    Args:
        pdf_path: Location of the PDF file to read

    Returns:
        The file's bytes as a base64 string, or None when the file is
        missing or any other exception occurs
    """
    try:
        with open(pdf_path, "rb") as source:
            payload = source.read()
            encoded_bytes = base64.b64encode(payload)
            return encoded_bytes.decode('utf-8')
    except FileNotFoundError:
        message = f"Error: The file {pdf_path} was not found."
    except Exception as exc:
        message = f"Error: {exc}"

    # Only reached on failure: report the problem and signal it with None.
    print(message)
    return None

def extract_text_from_pdf_with_gemini(pdf_path: str, api_key: str, start_page: int = 10, end_page: int = 12) -> Optional[Dict[str, Any]]:
    """
    Extract text from a page range of a PDF using the Gemini Vision API.

    Each requested page is rendered to a PNG image with PyMuPDF and sent to
    the model together with a text-extraction prompt. A failure while
    querying the model for one page is recorded in that page's entry and
    does not stop the remaining pages.

    Args:
        pdf_path: Path to the PDF file
        api_key: Google API key for Gemini
        start_page: First page to extract (1-indexed, default: 10)
        end_page: Last page to extract (1-indexed, inclusive, default: 12)

    Returns:
        Dictionary with the keys 'total_pages_extracted', 'pages' (a list of
        {'page', 'text'} dictionaries), 'start_page', 'end_page' and
        'total_pages_in_pdf'; or None when the requested page range is
        invalid for the document (the reason is printed)
    """
    # Configure Gemini
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash')

    # The prompt is the same for every page, so build it once. The text
    # (including its indentation) is what gets sent to the model.
    prompt = """
        Please extract all text from this image. 
        Maintain the original formatting and structure.
        If there are tables, preserve the table structure.
        Return only the extracted text without any additional commentary.
        """

    pdf_document = fitz.open(pdf_path)
    try:
        total_pages_in_pdf = len(pdf_document)

        # Adjust for 0-indexed pages in PyMuPDF
        start_page_idx = start_page - 1
        end_page_idx = end_page - 1

        # Validate page range
        if start_page_idx < 0 or end_page_idx >= total_pages_in_pdf:
            print(f"Error: Page range {start_page}-{end_page} is invalid for PDF with {total_pages_in_pdf} pages")
            return None

        if start_page_idx > end_page_idx:
            print(f"Error: Start page ({start_page}) cannot be greater than end page ({end_page})")
            return None

        print("Starting OCR extraction...")
        print(f"Total pages in PDF: {total_pages_in_pdf}")
        print(f"Extracting pages: {start_page} to {end_page}")
        print("-" * 50)

        extracted_text = []
        # Count successes explicitly: inspecting the text for an "Error"
        # prefix would miscount pages whose real content starts that way.
        successful_pages = 0

        for page_num in range(start_page_idx, end_page_idx + 1):
            actual_page_num = page_num + 1  # Convert back to 1-indexed for display
            print(f"Processing page {actual_page_num}...")

            page = pdf_document[page_num]

            # Render the page at 2x scale so small text stays legible
            pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
            img_data = pix.tobytes("png")

            # Create PIL Image
            image = Image.open(io.BytesIO(img_data))

            try:
                # Generate content with Gemini
                response = model.generate_content([prompt, image])
                page_text = response.text
                extracted_text.append({
                    'page': actual_page_num,
                    'text': page_text
                })
                successful_pages += 1
                print(f"✓ Page {actual_page_num} extracted successfully")

            except Exception as e:
                # Best-effort per page: keep going and record the failure
                # in place of the page text.
                print(f"✗ Error processing page {actual_page_num}: {e}")
                extracted_text.append({
                    'page': actual_page_num,
                    'text': f"Error extracting text: {e}"
                })
    finally:
        # Release the document on every exit path, including the early
        # returns above and errors raised while rendering a page.
        pdf_document.close()

    print("-" * 50)
    print("OCR extraction completed!")
    print(f"Successfully processed: {successful_pages}/{len(extracted_text)} pages")

    return {
        'total_pages_extracted': len(extracted_text),
        'pages': extracted_text,
        'start_page': start_page,
        'end_page': end_page,
        'total_pages_in_pdf': total_pages_in_pdf
    }

def save_extracted_text_to_file(extracted_data: Dict[str, Any], output_path: str) -> None:
    """
    Write an extraction result to a UTF-8 text report.

    The report starts with a short header (page range and PDF page count)
    followed by one section per entry in ``extracted_data['pages']``.

    Args:
        extracted_data: Result dictionary; the keys 'start_page', 'end_page',
            'total_pages_in_pdf' and 'pages' are read, and each page entry
            must provide 'page' and 'text'
        output_path: Destination file; it is opened in write mode, so an
            existing file is overwritten
    """
    header_rule = "=" * 50
    section_rule = "-" * 30

    with open(output_path, 'w', encoding='utf-8') as report:
        emit = report.write

        # Header
        emit("OCR Extraction Results\n")
        emit(f"Pages extracted: {extracted_data['start_page']} to {extracted_data['end_page']}\n")
        emit(f"Total pages in PDF: {extracted_data['total_pages_in_pdf']}\n")
        emit(header_rule + "\n\n")

        # One section per extracted page
        for entry in extracted_data['pages']:
            emit(f"Page {entry['page']}\n")
            emit(section_rule + "\n")
            emit(entry['text'])
            emit("\n\n")

# Usage example
if __name__ == "__main__":
    # Read the API key from the environment
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("Please set GOOGLE_API_KEY environment variable")
        # Raise SystemExit directly: the bare `exit` helper is injected by
        # the `site` module for interactive use and is not always available.
        raise SystemExit(1)

    # Path to your PDF
    pdf_path = "C:/MoovMedia/ocr_exploration/input/The-Future-100-2025_part_1.pdf"

    # Extract text from pages 10-12 only
    result = extract_text_from_pdf_with_gemini(pdf_path, api_key, start_page=10, end_page=12)

    # The extractor returns None (after printing the reason) when the page
    # range is invalid; there is nothing to save in that case.
    if result is None:
        raise SystemExit(1)

    # Save to file
    output_path = "C:/MoovMedia/ocr_exploration/output/extracted_text_gemini.txt"
    save_extracted_text_to_file(result, output_path)

    print(f"Results saved to {output_path}")

Starting OCR extraction...
Total pages in PDF: 46
Extracting pages: 10 to 12
--------------------------------------------------
Processing page 10...
✓ Page 10 extracted successfully
Processing page 11...
✓ Page 11 extracted successfully
Processing page 12...
✓ Page 12 extracted successfully
--------------------------------------------------
OCR extraction completed!
Successfully processed: 3/3 pages
Results saved to C:/MoovMedia/ocr_exploration/output/extracted_text_gemini.txt
