In [49]:
# 20250701_png_to_pdf_ocr.py
# Generate a large-font PNG, embed it in a PDF at the same size, convert the PDF back to an image, and extract the text with OCR.

from PIL import Image, ImageDraw, ImageFont
from reportlab.pdfgen import canvas
from pdf2image import convert_from_path
import pytesseract

# 1. Create PNG with big text
text = "Hello OCR!"
font_size = 100

# Try to load a TrueType font; fall back to Pillow's default font if it is
# not available. (On Python 3, IOError is just an alias of OSError.)
try:
    font = ImageFont.truetype("arial.ttf", font_size)
except OSError:
    font = ImageFont.load_default()
    print("Warning: TrueType font unavailable, using default bitmap font.")

# Measure the text using a throwaway 1x1 canvas.
tmp_img = Image.new("RGB", (1, 1))
tmp_draw = ImageDraw.Draw(tmp_img)
bbox = tmp_draw.textbbox((0, 0), text, font=font)  # (x0, y0, x1, y1)
w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]

# Create the final image: text extent plus uniform padding on every side.
padding = 10
img_size = (w + 2 * padding, h + 2 * padding)
img = Image.new("RGB", img_size, "white")
draw = ImageDraw.Draw(img)
# The bounding box measured from origin (0, 0) may start at a non-zero
# (x0, y0). The canvas is sized from the box's width/height only, so the draw
# origin must be shifted by (-x0, -y0); otherwise the glyphs are pushed
# down/right by that offset and can run past the image edge.
draw.text((padding - bbox[0], padding - bbox[1]), text, fill="black", font=font)
png_path = "20250701_big_text.png"
img.save(png_path)

# 2. Embed PNG in PDF at its native size
# The page size and the drawn image size both use the PNG's pixel dimensions
# as-is, so the image fills the whole page.
pdf_path = "20250701_big_text.pdf"
c = canvas.Canvas(pdf_path, pagesize=img_size)
c.drawImage(png_path, 0, 0, width=img_size[0], height=img_size[1])
c.showPage()
c.save()

# 3. Convert PDF back to images
# NOTE(review): no dpi is passed, so pdf2image rasterizes at its default
# resolution; the resulting page image need not match the PNG's pixel size.
pages = convert_from_path(pdf_path)

# 4. OCR each page and save text
# One newline-terminated chunk of recognized text per page.
ocr_text = "".join(pytesseract.image_to_string(page) + "\n" for page in pages)

txt_path = "20250701_ocr_output.txt"
with open(txt_path, "w", encoding="utf-8") as f:
    f.write(ocr_text)

print(f"Generated: {png_path}, {pdf_path}, {txt_path}")


Generated: 20250701_big_text.png, 20250701_big_text.pdf, 20250701_ocr_output.txt


In [51]:
# 20250701_png_to_pdf_ocr.py
# Generate a large-font PNG (uncropped), embed it in a PDF at the same size, convert the PDF back to an image, and extract the text with OCR.

from PIL import Image, ImageDraw, ImageFont
from reportlab.pdfgen import canvas
from pdf2image import convert_from_path
import pytesseract

# 1. Create PNG with big text
text = "Hello OCR!"
font_size = 100

# Try loading a TTF; fall back to Pillow's default font if unavailable.
# The fallback ignores font_size, so warn instead of degrading silently:
# a much smaller rendering is a likely cause of poor OCR output downstream.
# (On Python 3, IOError is just an alias of OSError.)
try:
    font = ImageFont.truetype("arial.ttf", font_size)
except OSError:
    font = ImageFont.load_default()
    print("Warning: TrueType font unavailable, using default bitmap font.")

# Measure text bounding box (its top-left corner may be offset from the origin)
tmp_img = Image.new("RGB", (1, 1))
tmp_draw = ImageDraw.Draw(tmp_img)
x0, y0, x1, y1 = tmp_draw.textbbox((0, 0), text, font=font)
w, h = x1 - x0, y1 - y0
padding = 10

# Create final image with padding and correct origin
img_size = (w + padding * 2, h + padding * 2)
img = Image.new("RGB", img_size, "white")
draw = ImageDraw.Draw(img)
# Shift the draw origin by (-x0, -y0) so the measured box starts exactly at
# (padding, padding) and nothing is cropped.
draw.text((padding - x0, padding - y0), text, fill="black", font=font)

png_path = "20250701_big_text.png"
img.save(png_path)

# 2. Embed PNG in PDF at its native size
# The page size and the drawn image size both use the PNG's pixel dimensions
# as-is, so the image fills the whole page.
pdf_path = "20250701_big_text.pdf"
c = canvas.Canvas(pdf_path, pagesize=img_size)
c.drawImage(png_path, 0, 0, width=img_size[0], height=img_size[1])
c.showPage()
c.save()

# 3. Convert PDF back to images
# NOTE(review): no dpi is passed, so pdf2image rasterizes at its default
# resolution; the resulting page image need not match the PNG's pixel size.
pages = convert_from_path(pdf_path)

# 4. OCR each page and save text
# One newline-terminated chunk of recognized text per page.
ocr_text = "".join(pytesseract.image_to_string(page) + "\n" for page in pages)

txt_path = "20250701_ocr_output.txt"
with open(txt_path, "w", encoding="utf-8") as f:
    f.write(ocr_text)

print(f"Generated: {png_path}, {pdf_path}, {txt_path}")



Generated: 20250701_big_text.png, 20250701_big_text.pdf, 20250701_ocr_output.txt
