In [2]:
pip install pytesseract pdf2image pymupdf pdfminer.six

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six)
  Downloading cryptography-44.0.2-cp39-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.

In [10]:
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import os

def extract_text_with_pymupdf(pdf_path):
    """Open a PDF with PyMuPDF and collect the plain text of all its pages.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        A ``(text, doc)`` tuple. ``text`` is the per-page text joined with
        newlines, with leading/trailing whitespace stripped. ``doc`` is the
        document object, returned still open: the caller must close it.
    """
    document = fitz.open(pdf_path)
    page_texts = (pdf_page.get_text("text") for pdf_page in document)
    full_text = "\n".join(page_texts)
    return full_text.strip(), document

def convert_pdf_to_images(pdf_path):
    """Rasterise a PDF through pdf2image.

    Args:
        pdf_path: Path handed to ``convert_from_path``.

    Returns:
        Whatever ``convert_from_path`` returns for that path, using its
        default settings (presumably one PIL image per page — the caller
        indexes the result and calls ``.save`` on the first element).
    """
    page_images = convert_from_path(pdf_path)
    return page_images

def redact_text_on_image(image, old_text, lang='eng+fra'):
    """Paint a black box over every OCR-detected occurrence of ``old_text``.

    The image is OCR'd with Tesseract and the words it reports are compared
    case-insensitively with ``old_text``. A single-word needle matches any
    OCR word that contains it (the whole word box is blacked out). A
    multi-word needle matches a run of consecutive OCR words, and every word
    box in the run is blacked out.

    Args:
        image: Image to redact. It is drawn on IN PLACE.
        old_text: Text to hide. An empty or whitespace-only value is a no-op.
        lang: Tesseract language string passed to ``image_to_data``.

    Returns:
        The same ``image`` object that was passed in.
    """
    needle_tokens = old_text.lower().split() if old_text else []
    if not needle_tokens:
        # "" is a substring of every string: without this guard an empty
        # needle would black out every box Tesseract reports.
        return image

    ocr_data = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)

    # Keep only entries that actually carry a word, remembering their index
    # so the matching geometry can be looked up afterwards. Entries whose
    # text is empty or whitespace are skipped, so a box with no word in it
    # is never painted.
    words = [
        (index, text.strip().lower())
        for index, text in enumerate(ocr_data["text"])
        if text and text.strip()
    ]

    # Slide a window of len(needle_tokens) consecutive words over the OCR
    # output. With a one-word needle this is exactly "needle in word"; with
    # several words it lets a phrase match across separate OCR word boxes.
    phrase = " ".join(needle_tokens)
    span = len(needle_tokens)
    hit_indices = set()
    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if phrase in " ".join(text for _, text in window):
            hit_indices.update(index for index, _ in window)

    draw = ImageDraw.Draw(image)
    for i in sorted(hit_indices):
        (x, y, w, h) = (ocr_data["left"][i], ocr_data["top"][i], ocr_data["width"][i], ocr_data["height"][i])
        draw.rectangle([x, y, x + w, y + h], fill="black")

    return image

def modify_pdf(pdf_path, old_text):
    """Black out every occurrence of ``old_text`` in a PDF and save a redacted copy.

    The strategy depends on whether PyMuPDF can extract any text:

    * Text PDF: every hit of ``page.search_for(old_text)`` receives a black
      redaction annotation, the redactions are applied, and the document is
      saved to the output path.
    * Scanned PDF (no extractable text): the pages are rasterised, redacted
      through OCR by ``redact_text_on_image``, and written back as a PDF made
      of images.

    Args:
        pdf_path: Path of the PDF to process. The result is always written to
            a different path, so the input file is not overwritten.
        old_text: Text to black out.

    Returns:
        Path of the redacted copy: the input path with its final extension
        replaced by ``_redacted.pdf``.
    """
    # Build the output name from the final extension only. A plain
    # str.replace(".pdf", ...) rewrites every ".pdf" in the path (directory
    # names included) and leaves names such as "IN.PDF" unchanged, which
    # makes the output path collide with the input.
    stem, _ = os.path.splitext(pdf_path)
    output_pdf_path = f"{stem}_redacted.pdf"

    extracted_text, doc = extract_text_with_pymupdf(pdf_path)
    has_text_layer = bool(extracted_text)
    try:
        if has_text_layer:
            # The PDF contains selectable text.
            for page in doc:
                for rect in page.search_for(old_text):
                    page.add_redact_annot(rect, fill=(0, 0, 0))
                page.apply_redactions()
            doc.save(output_pdf_path)
    finally:
        # Close the document on every path: the scanned branch never uses it,
        # and an error while redacting must not leave the file handle open.
        doc.close()

    if has_text_layer:
        print(f"PDF texte modifié sauvegardé sous {output_pdf_path}")
        return output_pdf_path

    # The PDF is a scan: fall back to OCR on rasterised pages.
    images = convert_pdf_to_images(pdf_path)
    redacted_images = [redact_text_on_image(img, old_text) for img in images]
    # NOTE(review): Pillow writes PDFs at 72 dpi unless `resolution=` is
    # given, while pdf2image rasterises at a higher default dpi, so the page
    # size of the output may differ from the input — confirm and pass
    # `resolution=` if that matters.
    redacted_images[0].save(output_pdf_path, save_all=True, append_images=redacted_images[1:])
    print(f"PDF image modifié sauvegardé sous {output_pdf_path}")
    return output_pdf_path

# Input document, resolved relative to the notebook's working directory.
pdf_file = "in.pdf"
# Redact every occurrence of the name and keep the path of the new file.
modified_pdf = modify_pdf(pdf_path=pdf_file, old_text="Jorge")

✅ PDF texte modifié sauvegardé sous in_redacted.pdf
