In [1]:
import fitz  # PyMuPDF
from PIL import Image
import os
import ast  # To safely parse the tuple from size.txt
from PyPDF2 import PdfReader, PdfWriter, Transformation
import copy

In [2]:
def scale_box_to_pdf(jpg_box, jpg_size, pdf_size):
    """
    Convert a box expressed in JPG coordinates into PDF coordinates.

    Args:
        jpg_box: Four values ``(x1, y1, x2, y2)`` in JPG coordinates.
        jpg_size: ``(width, height)`` of the JPG.
        pdf_size: ``(width, height)`` of the PDF page.

    Returns:
        list: ``[x1, y1, x2, y2]`` where the x values are multiplied by
        ``pdf_width / jpg_width`` and the y values by
        ``pdf_height / jpg_height``.
    """
    bx1, by1, bx2, by2 = jpg_box
    src_w, src_h = jpg_size
    dst_w, dst_h = pdf_size

    # One ratio per axis; the two axes are scaled independently.
    ratio_x = dst_w / src_w
    ratio_y = dst_h / src_h

    return [bx1 * ratio_x, by1 * ratio_y, bx2 * ratio_x, by2 * ratio_y]

In [3]:
def insert_images(pdf_path, image_folder):
    """
    Insert cropped images back into the original PDF at scaled coordinates.

    Reads ``size.txt`` (line 1: JPG size tuple, line 2: PDF size tuple) and
    ``index.txt`` (one whitespace-separated ``img_id x1 y1 x2 y2`` record per
    line, box given in JPG coordinates) from ``image_folder``. Each box is
    converted with ``scale_box_to_pdf`` and ``images/<img_id>.jpg`` is placed
    into that rectangle on the first page. The result is saved in the current
    working directory as ``<pdf basename>_insert_images.pdf``.

    Note: a later cell in this notebook defines another ``insert_images``
    (driven by ``pdf_coor.txt``); once that cell runs, it replaces this one.

    Args:
        pdf_path (str): Path to the input PDF.
        image_folder (str): Folder containing index.txt, size.txt, and cropped images folder ('images').
    """
    # Load size.txt
    size_path = os.path.join(image_folder, 'size.txt')
    with open(size_path, 'r') as f:
        jpg_size = ast.literal_eval(f.readline().strip())
        pdf_size = ast.literal_eval(f.readline().strip())

    # Load index.txt
    index_path = os.path.join(image_folder, 'index.txt')
    with open(index_path, 'r') as f:
        lines = f.readlines()

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf = f"{pdf_name}_insert_images.pdf"

    # Open PDF; the try/finally guarantees the document handle is released
    # even when an insertion or the final save fails.
    pdf_doc = fitz.open(pdf_path)
    try:
        # Use first page (or extend later if needed)
        page = pdf_doc[0]

        for line in lines:
            parts = line.strip().split()
            if len(parts) != 5:
                continue

            try:
                img_id = int(parts[0])
                jpg_box = [float(value) for value in parts[1:]]
            except ValueError:
                # e.g. a header row that happens to have five tokens
                print(f"Warning: Could not parse line {line.strip()!r}, skipping.")
                continue

            img_path = os.path.join(image_folder, f"images/{img_id}.jpg")
            if not os.path.exists(img_path):
                print(f"Warning: Image {img_path} not found, skipping.")
                continue

            # Scale coordinates
            scaled_box = scale_box_to_pdf(jpg_box, jpg_size, pdf_size)
            rect = fitz.Rect(scaled_box)

            # Insert image
            page.insert_image(rect, filename=img_path)

        # Save PDF
        pdf_doc.save(output_pdf)
    finally:
        pdf_doc.close()

    print(f"Saved output PDF as {output_pdf}")

In [4]:
def scale_pdf(input_path, output_path, target_width=1025, target_height=1025):
    """
    Scale a PDF to the target dimensions ensuring both page size and content are scaled.

    Every page is stretched independently along x and y so that its MediaBox
    becomes ``(0, 0) - (target_width, target_height)``; the aspect ratio is
    therefore not preserved when the target ratio differs from the page's.

    Args:
        input_path (str): Path to the input PDF file
        output_path (str): Path where the scaled PDF will be saved
        target_width (int | float): Target width in PDF points, which equals
            pixels at 72 DPI; no unit conversion is applied (default: 1025)
        target_height (int | float): Target height in PDF points, same
            convention as ``target_width`` (default: 1025)

    Raises:
        ValueError: If a target dimension is not positive, or a page has a
            MediaBox with non-positive width or height.
    """
    # Validate before touching any file so a bad call has no side effects.
    if target_width <= 0 or target_height <= 0:
        raise ValueError(
            f"Target dimensions must be positive, got {target_width}x{target_height}."
        )

    # Read the original PDF
    reader = PdfReader(input_path)
    writer = PdfWriter()

    # Boxes (besides the MediaBox) that are resized only when the page
    # dictionary defines them explicitly.
    optional_boxes = (
        ("/CropBox", "cropbox"),
        ("/TrimBox", "trimbox"),
        ("/ArtBox", "artbox"),
        ("/BleedBox", "bleedbox"),
    )

    # Process each page
    for page_num, original_page in enumerate(reader.pages):
        # Get original page geometry
        mediabox = original_page.mediabox
        orig_width = float(mediabox.width)
        orig_height = float(mediabox.height)
        if orig_width <= 0 or orig_height <= 0:
            raise ValueError(
                f"Page {page_num} has an invalid MediaBox size {orig_width}x{orig_height}."
            )
        origin_x = float(mediabox.left)
        origin_y = float(mediabox.bottom)

        # Calculate scaling factors
        width_scale = target_width / orig_width
        height_scale = target_height / orig_height

        # Create a copy of the page to work with
        page = copy.deepcopy(original_page)

        # The new MediaBox starts at (0, 0), so content whose original
        # MediaBox starts elsewhere must be moved to the origin first;
        # otherwise it would land outside (or shifted within) the new page.
        transform = Transformation()
        if origin_x or origin_y:
            transform = transform.translate(-origin_x, -origin_y)
        transform = transform.scale(width_scale, height_scale)
        page.add_transformation(transform)

        # Update the mediabox to the new dimensions
        # PyPDF2 uses a coordinate system with (0,0) at the bottom left
        page.mediabox.lower_left = (0, 0)
        page.mediabox.upper_right = (target_width, target_height)

        # Also update the other boxes if they exist on this page
        for box_key, box_attr in optional_boxes:
            if box_key in page:
                box = getattr(page, box_attr)
                box.lower_left = (0, 0)
                box.upper_right = (target_width, target_height)

        # Add the scaled page to the output PDF
        writer.add_page(page)

    # Write the result to the output file
    with open(output_path, "wb") as output_file:
        writer.write(output_file)

    print(f"PDF scaled successfully to {target_width}x{target_height}.")
    print(f"Both page dimensions and content have been scaled. Saved to {output_path}")

In [5]:
def scale_pdf_from_folder(pdf_name, folder_name):
    """
    Read target dimensions from size.txt, check current PDF size, and rescale if needed.

    The target ``(width, height)`` tuple is taken from the second line of
    ``<folder_name>/size.txt``. If the first page of the PDF already has that
    size (within 0.01), nothing is written. Otherwise the PDF is rescaled with
    ``scale_pdf`` and saved as ``<pdf_name without extension>_scale.pdf``,
    i.e. alongside ``pdf_name`` (the current directory when ``pdf_name`` is a
    bare file name).

    Args:
        pdf_name (str): Path to the input PDF.
        folder_name (str): Folder containing size.txt.

    Raises:
        ValueError: If size.txt has fewer than two lines or its second line is
            not a literal pair of numbers.
    """
    size_file_path = os.path.join(folder_name, 'size.txt')
    pdf_input_path = pdf_name
    pdf_output_path = f"{os.path.splitext(pdf_name)[0]}_scale.pdf"

    # Read target dimensions (file is closed before any parsing happens)
    with open(size_file_path, 'r') as f:
        lines = f.readlines()
    if len(lines) < 2:
        raise ValueError("size.txt does not contain at least two lines.")

    size_line = lines[1].strip()
    try:
        # literal_eval only accepts Python literals, so text in size.txt can
        # never execute code (unlike eval).
        target_width, target_height = ast.literal_eval(size_line)
        if not (isinstance(target_width, (int, float)) and isinstance(target_height, (int, float))):
            raise ValueError("Size values must be numbers.")
    except Exception as e:
        raise ValueError(f"Invalid size tuple in size.txt: {size_line}") from e

    # Check current PDF size (first page)
    reader = PdfReader(pdf_input_path)
    first_page = reader.pages[0]
    orig_width = float(first_page.mediabox.width)
    orig_height = float(first_page.mediabox.height)

    # Compare with target size (allow tiny tolerance)
    tolerance = 0.01
    if abs(orig_width - target_width) < tolerance and abs(orig_height - target_height) < tolerance:
        print(f"No scaling needed. PDF already matches target size {target_width}x{target_height}.")
    else:
        scale_pdf(pdf_input_path, pdf_output_path, target_width, target_height)

In [8]:
def insert_images(pdf_path, image_folder):
    """
    Insert cropped images back into the PDF, rescaling the PDF first if needed.

    The target page size is the tuple on the second line of ``size.txt``. If
    the first page of ``pdf_path`` does not already have that size (within
    0.01), the PDF is rescaled with ``scale_pdf`` into
    ``<pdf basename>_scale.pdf`` in the current working directory and that
    copy is used instead. Each whitespace-separated ``img_id x1 y1 x2 y2``
    record in ``pdf_coor.txt`` then places ``images/<img_id>.jpg`` into the
    rectangle ``(x1, y1, x2, y2)`` on the first page; the coordinates are used
    as given, without further scaling. The result is saved in the current
    working directory as ``<used pdf basename>_insert_images.pdf``.

    Args:
        pdf_path (str): Path to the input PDF.
        image_folder (str): Folder containing pdf_coor.txt, size.txt, and cropped images folder ('images').

    Raises:
        ValueError: If size.txt has fewer than two lines.
        FileNotFoundError: If pdf_coor.txt is missing from ``image_folder``.
    """
    # Load size.txt; only the second line (PDF size) is needed here.
    size_path = os.path.join(image_folder, 'size.txt')
    with open(size_path, 'r') as f:
        size_lines = f.readlines()
    if len(size_lines) < 2:
        raise ValueError("size.txt does not contain at least two lines.")
    target_width, target_height = ast.literal_eval(size_lines[1].strip())

    # Load pdf_coor.txt up front so a missing file is reported before any
    # rescaled PDF is written to disk.
    pdf_coor_path = os.path.join(image_folder, 'pdf_coor.txt')
    if not os.path.exists(pdf_coor_path):
        raise FileNotFoundError(f"pdf_coor.txt not found in folder: {image_folder}")

    with open(pdf_coor_path, 'r') as f:
        lines = f.readlines()

    # Check current PDF size
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    orig_width = float(first_page.mediabox.width)
    orig_height = float(first_page.mediabox.height)

    tolerance = 0.01
    if abs(orig_width - target_width) < tolerance and abs(orig_height - target_height) < tolerance:
        scaled_pdf_path = pdf_path
        print("PDF size matches target. Proceeding to insert images.")
    else:
        print("PDF size does not match target. Rescaling PDF first...")
        # Read from the full input path (the PDF may live outside the current
        # directory) while still writing the scaled copy to the current
        # directory under the input's base name.
        scaled_pdf_path = f"{os.path.splitext(os.path.basename(pdf_path))[0]}_scale.pdf"
        scale_pdf(pdf_path, scaled_pdf_path, target_width, target_height)

    pdf_name = os.path.splitext(os.path.basename(scaled_pdf_path))[0]
    output_pdf = f"{pdf_name}_insert_images.pdf"

    # Open scaled PDF; the try/finally guarantees the document handle is
    # released even when an insertion or the final save fails.
    pdf_doc = fitz.open(scaled_pdf_path)
    try:
        # Use first page (or extend later if needed)
        page = pdf_doc[0]

        for line in lines:
            parts = line.strip().split()
            if len(parts) != 5:
                continue

            try:
                img_id = int(parts[0])
                x1, y1, x2, y2 = (float(value) for value in parts[1:])
            except ValueError:
                # e.g. a header row that happens to have five tokens
                print(f"Warning: Could not parse line {line.strip()!r}, skipping.")
                continue

            img_path = os.path.join(image_folder, f"images/{img_id}.jpg")
            if not os.path.exists(img_path):
                print(f"Warning: Image {img_path} not found, skipping.")
                continue

            rect = fitz.Rect(x1, y1, x2, y2)

            # Insert image
            page.insert_image(rect, filename=img_path)

        # Save PDF
        pdf_doc.save(output_pdf)
    finally:
        pdf_doc.close()

    print(f"Saved output PDF as {output_pdf}")


In [9]:
# Run the insertion for the 'Math_notation' folder against blank_Math_notation.pdf.
insert_images(
    pdf_path='blank_Math_notation.pdf',
    image_folder='Math_notation',
)

PDF size does not match target. Rescaling PDF first...
PDF scaled successfully to 612.0x792.0.
Both page dimensions and content have been scaled. Saved to blank_Math_notation_scale.pdf
Saved output PDF as blank_Math_notation_scale_insert_images.pdf
