# In [2]:
import pytesseract
from PIL import Image
import pypdfium2 as pdfium
import io
import os

# In [None]:


# --- Configuration ---
# IMPORTANT: You must set the path to the Tesseract executable 
# if it is not automatically found by pytesseract.
# 
# Example for Windows:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def ocr_scanned_pdf(pdf_path: str) -> str:
    """
    Performs OCR on a scanned PDF file and returns the extracted text.

    Each page is rendered to an image with pypdfium2 and passed to
    Tesseract (via pytesseract). Progress messages are printed to stdout.
    The document and every page handle are closed explicitly once they are
    no longer needed, including when processing fails part-way through.

    Args:
        pdf_path: The file path to the scanned PDF.

    Returns:
        A string containing the text of all pages joined by newlines,
        or an error message if the file does not exist, Tesseract cannot
        be found, or any other error occurs during processing. Errors are
        reported through the return value rather than raised.
    """
    if not os.path.exists(pdf_path):
        return f"Error: PDF file not found at {pdf_path}"

    # Bound before the try so the finally clause can tell whether the
    # document was actually opened.
    pdf_document = None
    try:
        # Load the PDF file using pypdfium2
        pdf_document = pdfium.PdfDocument(pdf_path)
        num_pages = len(pdf_document)
        full_text = []

        print(f"Starting OCR on {num_pages} pages...")

        for i in range(num_pages):
            page = pdf_document.get_page(i)
            try:
                # Render the page to a bitmap (image).
                # Scale factor 2 provides good resolution for OCR.
                bitmap = page.render(scale=2)

                # Convert the bitmap to a PIL Image object
                image = bitmap.to_pil()

                # --- OCR Processing ---
                # Use pytesseract to extract text from the image
                text = pytesseract.image_to_string(image)
            finally:
                # Release the page handle even if rendering or OCR failed,
                # so handles do not accumulate on long documents.
                page.close()

            print(f"--- Page {i+1} OCR Completed ---")
            full_text.append(text)

        return "\n".join(full_text)

    except pytesseract.TesseractNotFoundError:
        return "Error: Tesseract is not installed or not in your PATH. Please install it or set 'pytesseract.pytesseract.tesseract_cmd'."
    except Exception as e:
        return f"An error occurred during processing: {e}"
    finally:
        # Runs on success and on every error path above: close the document
        # so the underlying file is not left open.
        if pdf_document is not None:
            pdf_document.close()

# --- Example Usage ---
if __name__ == "__main__":
    # Template path kept for reference only; the demo below does not read it.
    pdf_file = 'path/to/your/scanned_document.pdf'

    # The demo runs against this file. Point it at a real scanned
    # (image-only) PDF, e.g. a simple PDF containing a picture of text.
    sample_pdf_path = "sample_scanned_document.pdf"

    if not os.path.exists(sample_pdf_path):
        # Nothing to OCR yet: tell the user what to change and stop.
        print(f"Please replace '{sample_pdf_path}' with the path to an existing scanned PDF to run the example.")
    else:
        ocr_output = ocr_scanned_pdf(sample_pdf_path)

        # Banner, then the text (or error message) returned by the OCR call.
        print("\n====================================")
        print("         EXTRACTED TEXT")
        print("====================================\n")
        print(ocr_output)