In [16]:
pip install PyMuPDF





In [17]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import os

# Path of the Tesseract executable that pytesseract should invoke.
# Set the TESSERACT_CMD environment variable to override it on another machine;
# when the variable is absent the original Windows install location is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)


In [None]:
def pdf_to_images_with_pymupdf(pdf_path, output_folder, dpi=300):
    """
    Renders every page of a PDF to a PNG file using PyMuPDF.

    Pages are written to ``output_folder`` as ``page_<n>.png`` where ``n``
    starts at 1. The folder is created if it does not exist. The document is
    closed before returning, including when rendering or saving a page raises.

    :param pdf_path: Path to the PDF file
    :param output_folder: Folder to store output images
    :param dpi: Resolution passed to the page renderer (default: 300)
    :return: List of image file paths, in page order
    """
    # Open first so that a bad PDF path fails before any folder is created.
    pdf_document = fitz.open(pdf_path)
    try:
        os.makedirs(output_folder, exist_ok=True)

        image_paths = []
        for page_index in range(len(pdf_document)):
            page_number = page_index + 1  # file names and log lines are 1-based
            pixmap = pdf_document[page_index].get_pixmap(dpi=dpi)
            output_file = os.path.join(output_folder, f"page_{page_number}.png")
            pixmap.save(output_file)
            image_paths.append(output_file)
            print(f"Saved page {page_number} as {output_file}")
    finally:
        # Release the document handle even if a page failed to render or save.
        pdf_document.close()
    return image_paths

def perform_ocr_on_images(image_paths, lang="fra"):
    """
    Performs OCR on a list of images using Tesseract.

    Each image's text is preceded by a ``=== Extracted Text from <file> ===``
    header built from the image's base name. Every image is opened in a
    ``with`` block so its file handle is released as soon as OCR finishes.

    :param image_paths: List of image file paths
    :param lang: Language for OCR (default: French - 'fra')
    :return: Extracted text as a string (empty string for an empty list)
    """
    sections = []
    for image_path in image_paths:
        print(f"Performing OCR on {image_path}...")
        # Close the image right after OCR instead of leaving it to the garbage collector.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)
        sections.append(
            f"\n\n=== Extracted Text from {os.path.basename(image_path)} ===\n\n{text}"
        )
    # Join once at the end rather than growing a string on every iteration.
    return "".join(sections)

def pdf_to_text_with_ocr(pdf_path, output_folder, output_text_file, lang="fra", dpi=300):
    """
    Runs the full PDF -> page images -> OCR text pipeline and writes the result.

    :param pdf_path: Path to the PDF file
    :param output_folder: Folder that receives the intermediate page images
    :param output_text_file: Path of the UTF-8 text file to write
    :param lang: Language for OCR (default: French - 'fra')
    :param dpi: Resolution used when rendering pages (default: 300)
    :return: None
    """
    # Render the pages to image files.
    print("Converting PDF pages to images...")
    page_images = pdf_to_images_with_pymupdf(pdf_path, output_folder, dpi)

    # OCR the rendered images.
    print("Performing OCR on extracted images...")
    ocr_text = perform_ocr_on_images(page_images, lang)

    # Persist the combined text.
    with open(output_text_file, "w", encoding="utf-8") as text_sink:
        text_sink.write(ocr_text)
    print(f"Text extraction complete. Output saved to {output_text_file}")

# Input and output paths
# pdf_file = "tome_12.pdf"  
# output_dir = "output_12"  
# output_text = "extracted_text_12.txt"  

# # Convert PDF to text with OCR
# pdf_to_text_with_ocr(pdf_file, output_dir, output_text)


Converting PDF pages to images...
Saved page 1 as output_12\page_1.png
Saved page 2 as output_12\page_2.png
Saved page 3 as output_12\page_3.png
Saved page 4 as output_12\page_4.png
Saved page 5 as output_12\page_5.png
Saved page 6 as output_12\page_6.png
Saved page 7 as output_12\page_7.png
Saved page 8 as output_12\page_8.png
Saved page 9 as output_12\page_9.png
Saved page 10 as output_12\page_10.png
Saved page 11 as output_12\page_11.png
Saved page 12 as output_12\page_12.png
Saved page 13 as output_12\page_13.png
Saved page 14 as output_12\page_14.png
Saved page 15 as output_12\page_15.png
Saved page 16 as output_12\page_16.png
Saved page 17 as output_12\page_17.png
Saved page 18 as output_12\page_18.png
Saved page 19 as output_12\page_19.png
Saved page 20 as output_12\page_20.png
Saved page 21 as output_12\page_21.png
Saved page 22 as output_12\page_22.png
Saved page 23 as output_12\page_23.png
Saved page 24 as output_12\page_24.png
Saved page 25 as output_12\page_25.png
Saved pag

In [18]:
import os
import re

import pytesseract
from PIL import Image

def _natural_sort_key(file_name):
    """
    Builds a sort key that orders embedded numbers by value, not by character.

    :param file_name: File name to build the key for
    :return: Tuple of (alternating text/int tokens, original name as tie-breaker)
    """
    # re.split with a capturing group alternates text, digits, text, ...; the
    # odd positions are therefore always the digit runs.
    tokens = re.split(r"(\d+)", file_name)
    parts = [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]
    # The raw name breaks ties such as "page_1.png" vs "page_01.png" deterministically.
    return parts, file_name


def perform_ocr_on_existing_images(image_folder, output_text_file, lang="fra"):
    """
    Performs OCR on all images in a folder using Tesseract and saves the extracted text to a file.

    Files ending in .png, .jpg, .jpeg or .tiff (case-insensitive) are processed
    in natural order, so ``page_2.png`` comes before ``page_10.png`` even when
    the page numbers are not zero-padded. Each image's text is preceded by a
    ``=== Extracted Text from <file> ===`` header.

    :param image_folder: Folder containing image files
    :param output_text_file: File to save the extracted text (written as UTF-8)
    :param lang: Language for OCR (default: French - 'fra')
    :return: None
    """
    # Natural sort keeps page order; a plain lexicographic sort would place
    # "page_10.png" ahead of "page_2.png".
    image_files = sorted(
        (
            f for f in os.listdir(image_folder)
            if f.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff'))
        ),
        key=_natural_sort_key,
    )

    sections = []
    for image_file in image_files:
        image_path = os.path.join(image_folder, image_file)
        print(f"Performing OCR on {image_path}...")
        # Close each image as soon as OCR is done instead of leaking the handle.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)
        sections.append(f"\n\n=== Extracted Text from {image_file} ===\n\n{text}")

    with open(output_text_file, "w", encoding="utf-8") as f:
        f.write("".join(sections))

    print(f"OCR complete. Text saved to {output_text_file}")

# Example usage: OCR every image in the BW_12 folder into one text file.
image_folder, output_text_file = "BW_12", "extracted_text_12.txt"
perform_ocr_on_existing_images(image_folder=image_folder, output_text_file=output_text_file)


Performing OCR on BW_12\page_001.png...
Performing OCR on BW_12\page_002.png...
Performing OCR on BW_12\page_003.png...
Performing OCR on BW_12\page_004.png...
Performing OCR on BW_12\page_005.png...
Performing OCR on BW_12\page_006.png...
Performing OCR on BW_12\page_007.png...
Performing OCR on BW_12\page_008.png...
Performing OCR on BW_12\page_009.png...
Performing OCR on BW_12\page_010.png...
Performing OCR on BW_12\page_011.png...
Performing OCR on BW_12\page_012.png...
Performing OCR on BW_12\page_013.png...
Performing OCR on BW_12\page_014.png...
Performing OCR on BW_12\page_015.png...
Performing OCR on BW_12\page_016.png...
Performing OCR on BW_12\page_017.png...
Performing OCR on BW_12\page_018.png...
Performing OCR on BW_12\page_019.png...
Performing OCR on BW_12\page_020.png...
Performing OCR on BW_12\page_021.png...
Performing OCR on BW_12\page_022.png...
Performing OCR on BW_12\page_023.png...
Performing OCR on BW_12\page_024.png...
Performing OCR on BW_12\page_025.png...
