In [1]:
# Relative path of the input PDF that the cells below open and process.
pdf_file = "data/sel12fotopages.pdf"

In [None]:
# Selecting Pages
import pymupdf

# Open the source PDF, keep only its first two pages, and write the result to a
# new file. The context manager closes the document once the copy is saved.
with pymupdf.open(pdf_file) as doc:
    doc.select([0, 1])  # page numbers are 0-based: keeps the 1st & 2nd page
    doc.save("just-page-one-and-two.pdf")  # written to the current working directory

In [4]:
import fitz  # PyMuPDF

def extract_images_from_pdf(pdf_path, output_folder="data/extracted_images"):
    """Save every image embedded in a PDF as an individual file.

    Files are written as ``{output_folder}/page{N}_img{K}.{ext}`` where ``N``
    is the 1-based page number, ``K`` the 0-based position of the image in
    that page's image list and ``ext`` the extension reported by PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the image files; it is created
            when it does not exist yet.

    Returns:
        A status string: a summary containing the number of extracted images
        on success, ``"Error: PDF file not found."`` when ``pdf_path`` does
        not exist, or ``"An error occurred: ..."`` for any other failure.
    """
    import os  # local import: the cell defining this function only imports fitz

    try:
        # Detect a missing file up front instead of catching a PyMuPDF-specific
        # exception class: the nested ``fitz.fitz`` attribute is not available
        # in every PyMuPDF release, and a missing attribute inside an
        # ``except`` clause would itself raise and hide the real error.
        if not os.path.exists(pdf_path):
            return "Error: PDF file not found."

        os.makedirs(output_folder, exist_ok=True)
        image_count = 0
        # The context manager closes the document even if extraction fails.
        with fitz.open(pdf_path) as doc:
            for page_num in range(doc.page_count):
                page = doc[page_num]
                image_list = page.get_images(full=True)  # Get image info
                for img_index, img in enumerate(image_list):
                    xref = img[0]  # first item of each entry is the image's xref
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    with open(f"{output_folder}/page{page_num+1}_img{img_index}.{image_ext}", "wb") as f:
                        f.write(image_bytes)
                    image_count += 1
        return f"Extracted {image_count} images to {output_folder}"
    except Exception as e:
        return f"An error occurred: {e}"


In [7]:
#Screenshots
from PIL import Image

def pdf_page_to_image(pdf_path, output_folder="data/pdf_screenshots", zoom_x=2.0, zoom_y=2.0):  # zoom increases resolution
    """Render every page of a PDF file to ``{output_folder}/page{N}.png``.

    NOTE: a later cell of this notebook defines another ``pdf_page_to_image``
    that takes an open document instead of a path; whichever cell ran last
    determines which version a call uses.

    Args:
        pdf_path: Path of the PDF file to render.
        output_folder: Directory that receives the PNG files; it is created
            when it does not exist yet.
        zoom_x: Horizontal scale factor applied while rendering.
        zoom_y: Vertical scale factor applied while rendering.

    Returns:
        A status string: ``"Screenshots saved to ..."`` on success,
        ``"Error: PDF file not found."`` when ``pdf_path`` does not exist, or
        ``"An error occurred: ..."`` for any other failure.
    """
    import os  # local import: os is not imported before this cell

    try:
        # Detect a missing file up front instead of catching a PyMuPDF-specific
        # exception class: the nested ``fitz.fitz`` attribute is not available
        # in every PyMuPDF release, and a missing attribute inside an
        # ``except`` clause would itself raise and hide the real error.
        if not os.path.exists(pdf_path):
            return "Error: PDF file not found."

        os.makedirs(output_folder, exist_ok=True)
        mat = fitz.Matrix(zoom_x, zoom_y)  # scaling matrix built from the zoom factors

        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as doc:
            for page_num in range(doc.page_count):
                page = doc[page_num]
                pix = page.get_pixmap(matrix=mat)  # render page to an image
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                img.save(f"{output_folder}/page{page_num + 1}.png")
        return f"Screenshots saved to {output_folder}"
    except Exception as e:
        return f"An error occurred: {e}"


In [5]:

# Extract the embedded images of the sample PDF and show the returned status message.
result = extract_images_from_pdf(pdf_file)
print(result)

Extracted 96 images to data/extracted_images


In [10]:

# Render the pages of the sample PDF to PNG files and show the returned status message.
# NOTE: this call passes a file path, so it relies on the path-based
# pdf_page_to_image defined above; a later cell redefines that name to take an
# already opened document instead.
result = pdf_page_to_image(pdf_file)
print(result)

Screenshots saved to data/pdf_screenshots


In [None]:
# TODO: allow choosing which pages to process
# TODO: create the output folder if it does not exist
# TODO: cleanup - remove the contents of the output folder

In [11]:
import pymupdf
from PIL import Image
import os
import shutil

doc = pymupdf.open(pdf_file) # open the document used by the cells below
# Direct equivalent of the helper call below: doc.select([0, 1]) keeps the 1st & 2nd page.
def pdf_select_pages(doc, start_page, end_page):
    """Reduce a document to an inclusive, 1-based range of its pages.

    The selection is applied to ``doc`` itself via ``doc.select``; no copy is
    made, and the same object is returned for convenience.

    Args:
        doc: Open document exposing ``page_count`` and ``select``.
        start_page: First page to keep (1-based).
        end_page: Last page to keep (1-based, inclusive).

    Returns:
        The same ``doc`` object that was passed in.

    Raises:
        ValueError: If the range is empty or lies outside the document.
    """
    if not 1 <= start_page <= end_page <= doc.page_count:
        raise ValueError(
            f"Invalid page range {start_page}-{end_page} "
            f"for a document with {doc.page_count} page(s)."
        )

    # select() expects 0-based page numbers, hence the shift of the start index.
    doc.select(list(range(start_page - 1, end_page)))
    return doc

# Keep only pages 1-2 (1-based, inclusive) of the opened document; the bare
# `doc` on the last line displays the document as the cell output.
doc=pdf_select_pages(doc, 1, 2)
doc

Document('data/sel12fotopages.pdf')

In [12]:

def ensure_folder_exists(folder_path):
    """Create ``folder_path`` (including missing parents) unless it already exists.

    Returns a status message saying whether the folder was created or was
    already present.
    """
    # Guard clause: nothing to do when the path is already there.
    if os.path.exists(folder_path):
        return f"Folder '{folder_path}' already exists."

    os.makedirs(folder_path)
    return f"Folder '{folder_path}' created."


In [13]:

def remove_folder_content(folder_path):
    """Delete everything inside ``folder_path`` while keeping the folder itself.

    The entries are removed one by one instead of deleting and recreating the
    folder, so the folder keeps its permissions and never disappears, even if
    the operation fails part-way.

    Args:
        folder_path: Directory whose content is removed.

    Returns:
        A status string: a confirmation on success, a notice when the folder
        does not exist, or ``"An error occurred: ..."`` on failure.
    """
    try:
        if not os.path.exists(folder_path):
            return f"Folder '{folder_path}' does not exist."

        # Snapshot the listing first so entries are not deleted while the
        # directory iterator is still open.
        with os.scandir(folder_path) as listing:
            entries = list(listing)

        for entry in entries:
            # Real sub-directories are removed recursively; files and symlinks
            # (including links to directories) are simply unlinked.
            if entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)
            else:
                os.unlink(entry.path)

        return f"All contents of the folder '{folder_path}' have been removed."
    except Exception as e:
        return f"An error occurred: {e}"



In [24]:
#Screenshots

def pdf_page_to_image(doc, output_folder="data/pdf_screenshots", zoom_x=2.0, zoom_y=2.0):  # zoom increases resolution
    """Render every page of an open document to a PNG file.

    Files are written as ``{output_folder}/{name}_p{N}.png`` where ``name`` is
    the base name of ``doc.name`` without its extension and ``N`` the 1-based
    page number.

    Args:
        doc: Open PyMuPDF document to render.
        output_folder: Directory that receives the PNG files; it is created
            when it does not exist yet.
        zoom_x: Horizontal scale factor applied while rendering.
        zoom_y: Vertical scale factor applied while rendering.

    Returns:
        ``"Screenshots saved to ..."`` on success, otherwise
        ``"An error occurred: ..."`` with the failure message.
    """
    # The document is already open here, so there is no "file not found" case
    # to handle; the former ``except pymupdf.fitz.FileNotFoundError`` clause
    # referenced a nested attribute that is not guaranteed to exist and could
    # itself raise while an error was being handled, hiding the real cause.
    try:
        ensure_folder_exists(output_folder)
        # Base name of the source file without extension, used as file-name prefix.
        just_filename = os.path.splitext(os.path.basename(doc.name))[0]

        mat = pymupdf.Matrix(zoom_x, zoom_y)  # scaling matrix built from the zoom factors

        for page_num in range(doc.page_count):
            page = doc[page_num]
            pix = page.get_pixmap(matrix=mat)  # render page to an image
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img.save(f"{output_folder}/{just_filename}_p{page_num + 1}.png")

        return f"Screenshots saved to {output_folder}"
    except Exception as e:
        return f"An error occurred: {e}"


In [25]:
# Render the pages of the opened (page-selected) document to PNG files in the default folder.
pdf_page_to_image(doc)

'Screenshots saved to data/pdf_screenshots'

In [23]:
# Empty the screenshot folder. NOTE: the execution counts show this cell was run
# before the screenshot cell above (In [23] vs In [25]); when the notebook is run
# top to bottom it deletes the PNG files that were just written.
remove_folder_content("data/pdf_screenshots")

"All contents of the folder 'data/pdf_screenshots' have been removed."