In [1]:
import fitz  # PyMuPDF

def extract_images_from_pdf(pdf_path, output_folder):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Iterate through each page
    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)
        images = page.get_images(full=True)
        
        # Iterate through each image
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"{output_folder}/image_{page_number + 1}_{img_index + 1}.{image_ext}"
            
            # Save the image
            with open(image_filename, "wb") as image_file:
                image_file.write(image_bytes)
    
    print("Images extracted successfully.")



In [9]:
import fitz  # PyMuPDF
from PIL import Image, ImageOps
import io
import os

def extract_images_from_pdf(pdf_path, output_folder):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Open PDF
    pdf_document = fitz.open(pdf_path)
    
    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)
        image_list = page.get_images(full=True)
        
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            
            # Check if image has mask
            if base_image["colorspace"] == 1 and base_image.get("mask", False):
                # Handle inverted masks
                image_data = base_image["image"]
                img = Image.open(io.BytesIO(image_data))
                
                # Convert to RGB and invert if needed
                if img.mode != "RGB":
                    img = img.convert("RGB")
                img = ImageOps.invert(img)
            else:
                # Regular image processing
                image_data = base_image["image"]
                img = Image.open(io.BytesIO(image_data))
                if img.mode != "RGB":
                    img = img.convert("RGB")
            
            # Save image
            output_path = f"{output_folder}/image_{page_number + 1}_{img_index + 1}.png"
            img.save(output_path, "PNG")
    
    pdf_document.close()
    print(f"Extracted {len(image_list)} images from page {page_number + 1}")


In [10]:
# Example usage
pdf_path = "pdfs/De-novo-dna-catch-bonds.pdf"
output_folder = "pdfs/images/denovo"
extract_images_from_pdf(pdf_path, output_folder)

Extracted 1 images from page 20


In [7]:
from PIL import Image, ImageOps

def invert_image(image_path, output_path):
    # Open image
    image = Image.open(image_path)
    
    # Convert to RGB if not already
    if image.mode != 'RGB':
        image = image.convert('RGB')
    
    # Invert the colors
    inverted_image = ImageOps.invert(image)
    
    # Save inverted image
    inverted_image.save(output_path)



In [None]:
# Example usage
input_path = "pdfs/images/image_12_1.png"  # Replace with your image path
output_path = "pdfs/images/fixed_image_12_1.png"    # Replace with desired output path
invert_image(input_path, output_path)

- Above code snippets have certain drawbacks
    - the Biological-applications paper's images get extracted as negatives (inverted color)
    - the De-novo DNA catch bonds paper's few images get extracted as individual components, not as one single image as seen in the pdf document

Below attempt is to solve the second problem (stitiching together a single image from component images)

In [3]:
import fitz
from PIL import Image
import numpy as np
import io
import cv2

def extract_and_stitch_images(pdf_path, page_num=0):
    # Open PDF and get page
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_num)
    
    # Get images and their positions
    images = page.get_images(full=True)
    image_data = []
    
    # Extract image data and positions
    for img in images:
        xref = img[0]
        base = doc.extract_image(xref)
        image_bytes = base["image"]
        pil_img = Image.open(io.BytesIO(image_bytes))
        
        # Convert to RGB if necessary
        if pil_img.mode != 'RGB':
            pil_img = pil_img.convert('RGB')
        
        # Convert to numpy array for OpenCV processing
        cv_img = np.array(pil_img)
        rect = fitz.Rect(img[3], img[4], img[5], img[6])
        image_data.append({
            'image': cv_img,
            'rect': rect,
            'width': cv_img.shape[1],
            'height': cv_img.shape[0]
        })
    
    # Create output canvas
    page_width, page_height = int(page.rect.width), int(page.rect.height)
    output = np.zeros((page_height, page_width, 3), dtype=np.uint8)
    mask = np.zeros((page_height, page_width), dtype=np.uint8)
    
    # Sort images by area (larger images first)
    image_data.sort(key=lambda x: x['width'] * x['height'], reverse=True)
    
    for img_info in image_data:
        x0, y0 = int(img_info['rect'].x0), int(img_info['rect'].y0)
        img = img_info['image']
        h, w = img.shape[:2]
        
        # Create ROI
        roi = output[y0:y0+h, x0:x0+w]
        img_mask = mask[y0:y0+h, x0:x0+w]
        
        # Blend images in overlapping regions
        alpha = 0.5
        overlap_mask = img_mask > 0
        
        if overlap_mask.any():
            # In overlapping regions, blend with existing image
            roi[overlap_mask] = cv2.addWeighted(
                roi[overlap_mask], 1-alpha,
                img[overlap_mask], alpha, 0
            )
            # In non-overlapping regions, use new image
            roi[~overlap_mask] = img[~overlap_mask]
        else:
            # No overlap, just copy the image
            roi[:] = img
        
        # Update mask
        mask[y0:y0+h, x0:x0+w] = 255
    
    # Convert back to PIL Image
    result = Image.fromarray(output)
    return result



In [4]:
# Usage
pdf_path = "pdfs/De-novo-dna-catch-bonds.pdf"

result_image = extract_and_stitch_images(pdf_path, 1)
result_image.save("pdfs/images/denovo/stitched/page_2.png")

ValueError: could not convert string to float: 'ICCBased'

In [2]:
import cv2
print(cv2.__version__)

4.10.0
