In [3]:
import os
import tempfile

import cv2
import layoutparser as lp
import numpy as np
import pytesseract
from PIL import Image
from reportlab.pdfgen import canvas
from table_transformer import TableExtractionPipeline

# Location of the Tesseract executable. The TESSERACT_CMD environment variable
# takes precedence; otherwise the default Windows install path is tried. The
# override is applied only when the file actually exists, so on machines
# without that path pytesseract keeps its own default command lookup instead
# of being pointed at a missing binary.
_tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)
if os.path.isfile(_tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd


In [None]:
# Progress message printed as soon as this cell starts executing, i.e. before
# the helper functions below are defined or called.
print('Running OCR on image...')

def preprocess_image(image_path, save_path="output/exp/hyper/new-prepro-img.jpg"):
    """Load an image and produce a denoised, contrast-enhanced copy for OCR.

    Args:
        image_path: Path of the image file to read with OpenCV.
        save_path: Where to write the preprocessed image for inspection.
            Parent directories are created as needed. Pass ``None`` (or an
            empty string) to skip writing the file.

    Returns:
        A ``(original, enhanced)`` tuple of BGR arrays: the image exactly as
        loaded, and the preprocessed version.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path`` (the file is
            missing or is not a decodable image).
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than deep inside the denoiser.
    if image is None:
        raise FileNotFoundError(
            f"Could not read image (missing or not a decodable image): {image_path!r}"
        )

    # Gentle denoising: h=5 (luminance), hColor=10, 7px template window,
    # 21px search window.
    denoised = cv2.fastNlMeansDenoisingColored(image, None, 5, 10, 7, 21)

    # Enhance contrast on the lightness channel only (CLAHE in LAB space) so
    # the colour channels are left untouched.
    lab = cv2.cvtColor(denoised, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    cl = clahe.apply(l)
    merged = cv2.merge((cl, a, b))
    enhanced = cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

    # Optionally keep a copy of the preprocessed image on disk.
    if save_path:
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        cv2.imwrite(save_path, enhanced)

    return image, enhanced


def image_to_searchable_pdf(image_path, pdf_path, min_confidence=60):
    """Create a single-page searchable PDF from an image.

    The page has the same size as the image (one point per pixel) and shows
    the original image. The text recognised by Tesseract on the preprocessed
    image is drawn over it, one string per detected line, as a fully
    transparent layer so it can be searched without hiding the picture.

    Args:
        image_path: Path of the image to OCR.
        pdf_path: Destination path of the PDF. Its parent directory is
            created if it does not exist.
        min_confidence: Only words whose Tesseract confidence is strictly
            greater than this value are kept.

    Raises:
        Any exception raised by ``preprocess_image`` for an unreadable image,
        and any error raised by Tesseract or ReportLab.
    """
    original_img, preprocessed_img = preprocess_image(image_path)

    height, width = original_img.shape[:2]

    # Use Tesseract to extract layout-aware data. OpenCV arrays are BGR while
    # PIL expects RGB, so convert before handing the image over.
    custom_config = r'--oem 3 --psm 12 -l eng'
    pil_image = Image.fromarray(cv2.cvtColor(preprocessed_img, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_image, output_type=pytesseract.Output.DICT, config=custom_config
    )

    # Group confident, non-empty words by the line they belong to.
    lines = {}
    for i, text in enumerate(data['text']):
        if not str(text).strip():
            continue
        # Confidence may arrive as an int, a float or a numeric string such
        # as '96.5'; float() accepts all of them, int() does not.
        if float(data['conf'][i]) <= min_confidence:
            continue
        key = (data['block_num'][i], data['par_num'][i], data['line_num'][i])
        lines.setdefault(key, []).append(i)

    # Make sure the destination directory exists.
    out_dir = os.path.dirname(pdf_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The background image goes through a uniquely named temporary JPEG that
    # is removed even if PDF generation fails.
    fd, temp_image_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        Image.fromarray(cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB)).save(temp_image_path)

        # Create canvas with same size as image
        c = canvas.Canvas(pdf_path, pagesize=(width, height))
        c.drawImage(temp_image_path, 0, 0, width=width, height=height)

        # Transparent text overlay: ReportLab colour components are in the
        # 0-1 range, and alpha=0 makes the text invisible.
        c.setFillColorRGB(1, 1, 1, alpha=0)
        c.setFont("Helvetica", 20)

        for word_indices in lines.values():
            line_text = " ".join(str(data['text'][i]) for i in word_indices)
            x = min(data['left'][i] for i in word_indices)

            # Use average for better alignment
            avg_top = int(np.mean([data['top'][i] for i in word_indices]))
            avg_height = int(np.mean([data['height'][i] for i in word_indices]))

            # PDF y grows upwards while image y grows downwards; place the
            # baseline roughly 80% of the way down the line box.
            y_adjusted = height - avg_top - int(avg_height * 0.8)

            c.drawString(x, y_adjusted, line_text)

        c.save()
    finally:
        os.remove(temp_image_path)

    print(f"✅ Searchable PDF saved to: {pdf_path}")

# Script entry point: convert the sample certificate image into a searchable
# PDF and report completion.
if __name__ == "__main__":
    image_to_searchable_pdf(
        image_path="input/cert.jpg",
        pdf_path="output/exp/hyper/new-output.pdf",
    )
    print("Searchable PDF created!")


Running OCR on image...
✅ Searchable PDF saved to: output/exp/hyper/new-output.pdf
Searchable PDF created!
Detection model initialized.
Detection model weights loaded.


Using CPU. Note: This module is much faster with a GPU.


Structure model initialized.
Structure model weights loaded.
OCR reader initialized.
'numpy.ndarray' object has no attribute 'read'
Table 0 is None and will be skipped.
Table 1 is None and will be skipped.


AttributeError: 'list' object has no attribute 'to_csv'