In [24]:
pip install PyPDF2 PyMuPDF

Note: you may need to restart the kernel to use updated packages.


In [25]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import os

# Location of the Tesseract executable used by pytesseract. The TESSERACT_CMD
# environment variable takes precedence so the notebook is not tied to one
# machine's install location; the default Windows path remains the fallback.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)


In [3]:
import PyPDF2

def create_subset_pdf(input_path, num_pages=20):
    """
    Create a new PDF containing the first ``num_pages`` pages of the input PDF.

    The subset is written next to the input file as
    ``<input path without extension>_first_<k>pages.pdf``, where ``k`` is the
    number of pages actually copied (fewer than ``num_pages`` when the
    document is shorter).

    Args:
        input_path (str): Path to the input PDF file
        num_pages (int): Maximum number of leading pages to include
            (default 20). Negative values are treated as 0.

    Returns:
        str: Path to the generated subset PDF file
    """
    # Local import: this cell may be executed before the notebook's import cell.
    import os

    # The input stream stays open until the writer has finished, because the
    # writer is handed page objects that belong to the reader.
    with open(input_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        pdf_writer = PyPDF2.PdfWriter()

        # Never request more pages than exist, and never a negative count.
        pages_to_extract = max(0, min(num_pages, len(pdf_reader.pages)))

        for page_num in range(pages_to_extract):
            pdf_writer.add_page(pdf_reader.pages[page_num])

        # splitext only strips an extension from the final path component, so
        # dots in parent directory names cannot truncate the output path.
        stem, _ext = os.path.splitext(input_path)
        output_path = f'{stem}_first_{pages_to_extract}pages.pdf'

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)

    return output_path

# Build an excerpt of the source volume (at most 20 pages, the function's default);
# `sample` holds the path of the generated excerpt PDF.
sample = create_subset_pdf("tome_12.pdf")

In [20]:
import fitz  # PyMuPDF
import cv2
import numpy as np
import os
from PIL import Image

# Function 1: Contrast/Brightness Adjustment
def preprocess_image_1(image):
    """
    Whiten the beige page background, then apply a linear contrast adjustment.

    Pixels whose HSV value lies inside the beige range are replaced with white;
    the page is then converted to grayscale and rescaled with
    ``alpha=1.4, beta=-20``.

    Args:
        image: PIL image of the page (any mode; converted to RGB first), or an
            RGB array-like accepted by ``np.array``.

    Returns:
        PIL.Image.Image: The processed page, stored as a 3-channel image.
    """
    # The RGB->HSV conversion and the 3-value white fill below assume exactly
    # three channels, so normalise RGBA / grayscale / palette images up front.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    img = np.array(image)

    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    lower_beige = np.array([10, 20, 100], dtype=np.uint8)
    upper_beige = np.array([40, 100, 255], dtype=np.uint8)
    mask = cv2.inRange(hsv, lower_beige, upper_beige)
    img[mask > 0] = [255, 255, 255]  # Replace beige with white

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    adjusted = cv2.convertScaleAbs(gray, alpha=1.4, beta=-20)
    result = cv2.cvtColor(adjusted, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(result)

# Function 2: Adaptive Thresholding + Blending
def preprocess_image_2(image):
    """
    Whiten the beige page background, then blend in an adaptive threshold.

    After the beige pixels are replaced with white, the grayscale page is
    thresholded with a Gaussian adaptive threshold (``blockSize=25, C=5``) and
    the output is a 70 % grayscale / 30 % thresholded blend.

    Args:
        image: PIL image of the page (any mode; converted to RGB first), or an
            RGB array-like accepted by ``np.array``.

    Returns:
        PIL.Image.Image: The processed page, stored as a 3-channel image.
    """
    # The RGB->HSV conversion and the 3-value white fill below assume exactly
    # three channels, so normalise RGBA / grayscale / palette images up front.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    img = np.array(image)

    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    lower_beige = np.array([10, 20, 100], dtype=np.uint8)
    upper_beige = np.array([40, 100, 255], dtype=np.uint8)
    mask = cv2.inRange(hsv, lower_beige, upper_beige)
    img[mask > 0] = [255, 255, 255]  # Replace beige with white

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    adaptive = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, blockSize=25, C=5)
    # Keep most of the grayscale detail and mix in the binarised version.
    result = cv2.addWeighted(gray, 0.7, adaptive, 0.3, 0)
    result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(result)

# Function 3: Tighter beige detection range than Function 1, same contrast adjustment
def preprocess_image_3(image):
    """
    Whiten the beige page background using HSV bounds (15, 15, 140)-(35, 80, 255),
    then apply a linear contrast adjustment (``alpha=1.4, beta=-20``).

    Args:
        image: PIL image of the page (any mode; converted to RGB first), or an
            RGB array-like accepted by ``np.array``.

    Returns:
        PIL.Image.Image: The processed page, stored as a 3-channel image.
    """
    # The RGB->HSV conversion and the 3-value white fill below assume exactly
    # three channels, so normalise RGBA / grayscale / palette images up front.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    img = np.array(image)

    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    # Wider alternative kept for reference (currently disabled):
    #   lower_beige = np.array([5, 10, 80], dtype=np.uint8)
    #   upper_beige = np.array([45, 130, 255], dtype=np.uint8)
    lower_beige = np.array([15, 15, 140], dtype=np.uint8)
    upper_beige = np.array([35, 80, 255], dtype=np.uint8)
    mask = cv2.inRange(hsv, lower_beige, upper_beige)
    img[mask > 0] = [255, 255, 255]  # Replace beige with white

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    adjusted = cv2.convertScaleAbs(gray, alpha=1.4, beta=-20)
    result = cv2.cvtColor(adjusted, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(result)

# Function 4: Beige removal with a mild morphological closing of the mask and gentler contrast
def preprocess_image_4(image):
    """
    Whiten the beige page background using a morphologically closed mask,
    then apply a mild linear contrast adjustment (``alpha=1.2, beta=-10``).

    Args:
        image: PIL image of the page (any mode; converted to RGB first), or an
            RGB array-like accepted by ``np.array``.

    Returns:
        PIL.Image.Image: The processed page, stored as a 3-channel image.
    """
    # The RGB->HSV conversion and the 3-value white fill below assume exactly
    # three channels, so normalise RGBA / grayscale / palette images up front.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    img = np.array(image)
    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)

    # Use a more conservative beige detection range
    lower_beige = np.array([15, 15, 140], dtype=np.uint8)
    upper_beige = np.array([35, 80, 255], dtype=np.uint8)
    mask = cv2.inRange(hsv, lower_beige, upper_beige)

    # Very mild morphological smoothing (2x2 closing) to preserve text edges
    kernel = np.ones((2, 2), np.uint8)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)

    img[mask > 0] = [255, 255, 255]  # Replace beige with white

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Mild contrast enhancement instead of thresholding
    adjusted = cv2.convertScaleAbs(gray, alpha=1.2, beta=-10)
    result = cv2.cvtColor(adjusted, cv2.COLOR_GRAY2RGB)

    return Image.fromarray(result)


# Load a sample page from the PDF and apply all four functions
def compare_preprocessing_methods(pdf_path, output_folder, page_num=0):
    """
    Render one PDF page and save it as processed by each preprocessing variant.

    The page is rasterised at 2x zoom, passed through ``preprocess_image_1`` to
    ``preprocess_image_4``, and every result is written to ``output_folder`` as
    ``page_<NNN>_v<k>.png``, where ``NNN`` is the 1-based page number and ``k``
    the variant index (1-4).

    Args:
        pdf_path (str): Path of the PDF to read.
        output_folder (str): Directory for the PNG files; created if missing.
        page_num (int): 0-based index of the page to render (default 0).
    """
    # Open the document in a context manager so its file handle is released
    # even when rendering fails; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        os.makedirs(output_folder, exist_ok=True)

        # Render the target page to an image
        page = doc[page_num]
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # High resolution
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # Apply all four processing methods
    methods = [
        preprocess_image_1,
        preprocess_image_2,
        preprocess_image_3,
        preprocess_image_4
    ]

    for idx, func in enumerate(methods, start=1):
        print(f"Applying preprocess_image_{idx}...")
        result = func(img)
        result.save(os.path.join(output_folder, f"page_{page_num+1:03d}_v{idx}.png"))

    print("✅ Done. All versions saved.")

# Example usage
# `sample` is the subset PDF path returned by create_subset_pdf() in an earlier cell.
pdf_path = sample  
output_folder = "BW_comparison"
# page_num is 0-based: 14 renders the 15th page, saved as page_015_v1.png .. page_015_v4.png.
compare_preprocessing_methods(pdf_path, output_folder, page_num=14)


Applying preprocess_image_1...
Applying preprocess_image_2...
Applying preprocess_image_3...
Applying preprocess_image_4...
✅ Done. All versions saved.


In [27]:
def ocr_four_versions_on_page(image_dir, base_filename="page_015", lang="fra"):
    """
    Perform OCR on four preprocessed images (v1 to v4) of a single PDF page.

    The files read are ``<image_dir>/<base_filename>_v1.png`` through
    ``<base_filename>_v4.png``.

    :param image_dir: Directory where preprocessed images are stored
    :param base_filename: Base name of the image files (without _vX suffix or extension)
    :param lang: OCR language passed to Tesseract (default: "fra", French)
    :return: Dictionary mapping "v1".."v4" to the OCR text of each version
    """
    ocr_results = {}

    for i in range(1, 5):
        version_file = os.path.join(image_dir, f"{base_filename}_v{i}.png")
        print(f"🔎 OCR on {version_file}")
        # Close each image as soon as it has been OCR'd instead of leaving
        # the file handles open until garbage collection.
        with Image.open(version_file) as page_image:
            ocr_results[f"v{i}"] = pytesseract.image_to_string(page_image, lang=lang)

    return ocr_results

In [28]:
# Same folder name that compare_preprocessing_methods() was given as output_folder.
image_folder = "BW_comparison"
# Relies on the function defaults: base_filename="page_015", lang="fra".
ocr_texts = ocr_four_versions_on_page(image_folder)

# Print or compare results
for version, text in ocr_texts.items():
    print(f"\n===== OCR Result for {version} =====\n")
    print(text[:1000])  # Show only first 1000 characters


🔎 OCR on BW_comparison\page_015_v1.png
🔎 OCR on BW_comparison\page_015_v2.png
🔎 OCR on BW_comparison\page_015_v3.png
🔎 OCR on BW_comparison\page_015_v4.png

===== OCR Result for v1 =====

(8)

1072.

28 janvier 1814.

BRIEVET DE PERFECTIONNEMENT DE DIX ANS,

Pour des perfectionnemens apportés aux procédés de
filtration des caux de boisson par les filtres de charbon,
de l’invention de MM. Smith et Cuchet (1),

Au sieur J. Ducommun, à Paris.

Lesdeux tuyaux en usage dans la fontaine domestique de MM.Smith
et Cuchet sont remplacés par un seul tuyau, qui ne descend pas
plus bas que la cloison horizontale de séparation, appelée pa-
nache.

Au lieu de tuyaux de plomb, les fontaines sont fabriquées avec
des tuyaux pratiqués dans l'épaisseur de la poterie, ce qui leur
donne une solidité parfaite.

Le plateau métallique est remplacé par un plateau en ardoise,
ce qui donne äla-fois économie . salubrité et solidité. Le cham-
pignon en plomb est aussi remplacé par une boîte en ‘faience ,
qui prése