# Convert a pdf to an image pdf

sources: 

https://levelup.gitconnected.com/4-python-libraries-to-convert-pdf-to-images-7a09eba83a09


https://pymupdf.readthedocs.io/en/latest/recipes-images.html

## Walk-Through

In [1]:
import os
import shutil
import fitz
# !pip3 install PyMuPDF

### Convert pdf to imgs

In [3]:
pdffile = "Handout_Trust.pdf"
doc = fitz.open(pdffile)
zoom = 4
mat = fitz.Matrix(zoom, zoom)
count = 0


folder_name = pdffile.replace('.pdf','_imgs')
if not os.path.exists(folder_name):
    os.mkdir(folder_name)
    
    
# Count variable is to get the number of pages in the pdf
for p in doc:
    count += 1
for i in range(count):
    val = os.path.join(folder_name,f"image_{i+1000000}.png")
    page = doc.load_page(i)
    pix = page.get_pixmap(matrix=mat)
    pix.save(val)
doc.close()

### Convert folder of imgs to pdf

In [26]:
import os, fitz
# import PySimpleGUI as psg  # for showing a progress bar
doc = fitz.open()  # PDF with the pictures
imgdir = folder_name # "D:/2012_10_05"  # where the pics are
imglist = os.listdir(imgdir)  # list of them
imglist = [f for f in imglist if '.png' in f]
imgcount = len(imglist)  # pic count

for i, f in enumerate(imglist):
    img = fitz.open(os.path.join(imgdir, f))  # open pic as document
    rect = img[0].rect  # pic dimension
    pdfbytes = img.convert_to_pdf()  # make a PDF stream
    img.close()  # no longer needed
    imgPDF = fitz.open("pdf", pdfbytes)  # open stream as PDF
    page = doc.new_page(width = rect.width,  # new page with ...
                       height = rect.height)  # pic dimension
    page.show_pdf_page(rect, imgPDF, 0)  # image fills the page
    # psg.EasyProgressMeter("Import Images",  # show our progress
    #     i+1, imgcount)

doc.save("all-my-pics.pdf")

## Wrap Up Functions 

### pdf_to_imgs

In [5]:
def pdf_to_imgs(pdf_file,save=False):
    '''
    f(x): make a folder of images out of a pdf file
    in  : pdf file path
    out : if out='folder' => saves to same name folder all the images corresponding to pages, else returns a list of images
    '''
    doc = fitz.open(pdf_file)
    zoom = 4
    mat = fitz.Matrix(zoom, zoom)
    count = 0

    folder_name = pdf_file.replace('.pdf','_imgs')
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    
    for p in doc:
        count += 1
    if save == True:
        for i in range(count):
            val = os.path.join(folder_name,f"image_{i+1000000}.png")
            page = doc.load_page(i)
            pix = page.get_pixmap(matrix=mat)
            pix.save(val)    
        doc.close()
        return f'Images saved to {folder_name}'
    else:
        img_ls = [] 
        for i in range(count):
            val = os.path.join(folder_name,f"image_{i+1000000}.png")
            page = doc.load_page(i)
            pix = page.get_pixmap(matrix=mat)
            img_ls.append(pix)
        doc.close()
        return img_ls

In [None]:
test = pdf_to_imgs("input.pdf")

### imgs_to_pdf

In [6]:
import os, fitz

def imgs_to_pdf(imgdir , save = True):
    doc = fitz.open() 
    imglist = os.listdir(imgdir)  
    imglist = [f for f in imglist if '.png' in f]
    imgcount = len(imglist)  
    
    for i, f in enumerate(imglist):
        img = fitz.open(os.path.join(imgdir, f)) 
        rect = img[0].rect  
        pdfbytes = img.convert_to_pdf() 
        img.close() 
        imgPDF = fitz.open("pdf", pdfbytes)  
        page = doc.new_page(width = rect.width, height = rect.height) 
        page.show_pdf_page(rect, imgPDF, 0)  
    if save:
        new_pdf_name = imgdir.replace('_imgs','_img.pdf')
        c = 1
        while os.path.exists(new_pdf_name):
            new_pdf_name = imgdir.replace('_imgs',f'_img_{c}.pdf')
            c += 1
        doc.save(new_pdf_name)
        print(f"{new_pdf_name} created successfully")
    
    if os.path.exists(imgdir):
        os.rmdir(imgdir)
    
    return doc

# FINAL MODULE : pdf_to_img_df()

In [9]:
def pdf_to_img_pdf(pdf):
    '''
    f(x): it takes a pdf and creates a new pdf out of images of it.
    in  : pdf file path
    out : new pdf made of png images
    '''
    # 1° creates folder with imgs:
    file_name = pdf_to_imgs("input.pdf",save=True)
    
    # 2° create files:
    imgs_to_pdf(file_name)
    
    # 3° remove_all:
    delete_folder('temp')
    


def pdf_to_imgs(pdf_file,save=True):
    '''
    f(x): it creates a temp folder with png images from the original pdf and provides a file_name for the final pdf
    in  : pdf file path
    out : file_name 
    '''
    doc = fitz.open(pdf_file)
    zoom = 4
    mat = fitz.Matrix(zoom, zoom)
    count = 0

    folder_name = 'temp'
    os.makedirs(folder_name, exist_ok=True)
    
    for p in doc:
        count += 1
    for i in range(count):
        val = os.path.join(folder_name,f"image_{i+1000000}.png")
        page = doc.load_page(i)
        pix = page.get_pixmap(matrix=mat)
        pix.save(val)    
    doc.close()
    print('Images saved to temp')
    
    return pdf_file.replace('.pdf','')
 
def imgs_to_pdf(file_name, save = True):
    doc = fitz.open() 
    imglist = os.listdir('temp')  
    imglist = [f for f in imglist if f.endswith('.png')]  # Check for PNG files
    imgcount = len(imglist)  
    
    for i, f in enumerate(imglist):
        img = fitz.open(os.path.join('temp', f)) 
        rect = img[0].rect  
        pdfbytes = img.convert_to_pdf() 
        img.close() 
        imgPDF = fitz.open("pdf", pdfbytes)  
        page = doc.new_page(width = rect.width, height = rect.height) 
        page.show_pdf_page(rect, imgPDF, 0)  

    if save:
        new_pdf_name = file_name + '_img.pdf'
        c = 1
        while os.path.exists(new_pdf_name):
            new_pdf_name = file_name + f'_img_{c}.pdf'
            c += 1
        doc.save(new_pdf_name)
        print(f"{new_pdf_name} created successfully")
    return doc   

def delete_folder(folder_path):
    '''
    f(x): Deletes a folder and all its contents.
    in  : folder_path (str) - Path of the folder to be deleted.
    '''
    # Delete the folder and all its contents
    try:
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' and its contents deleted.")
    except:
        print(f"{folder_path} not deleted")


In [12]:
import fitz  # PyMuPDF
import os
import shutil

def pdf_to_img_pdf(pdf):
    '''
    f(x): it takes a pdf and creates a new pdf out of images of it.
    in  : pdf file path
    out : new pdf made of png images
    '''
    # 1° creates folder with imgs:
    file_name = pdf_to_imgs(pdf, save=True)
    
    # 2° create files:
    new_pdf_path = imgs_to_pdf(file_name)
    
    if new_pdf_path:
        # 3° remove_all if PDF saved:
        delete_folder('temp')
    else:
        print("PDF creation failed. Temp folder not deleted.")


def pdf_to_imgs(pdf_file, save=True):
    '''
    f(x): it creates a temp folder with png images from the original pdf and provides a file_name for the final pdf
    in  : pdf file path
    out : file_name 
    '''
    doc = fitz.open(pdf_file)
    zoom = 4
    mat = fitz.Matrix(zoom, zoom)
    count = 0

    folder_name = 'temp'
    os.makedirs(folder_name, exist_ok=True)
    
    for p in doc:
        count += 1
    for i in range(count):
        val = os.path.join(folder_name, f"image_{i + 1000000}.png")
        page = doc.load_page(i)
        pix = page.get_pixmap(matrix=mat)
        pix.save(val)    
    doc.close()
    print('Images saved to temp')
    
    return pdf_file.replace('.pdf','')
 

def imgs_to_pdf(file_name, save=True):
    '''
    Converts images from temp folder back to a PDF.
    '''
    doc = fitz.open() 
    imglist = os.listdir('temp')  
    imglist = [f for f in imglist if f.endswith('.png')]  # Only process PNG files
    imglist.sort()  # Ensure images are in order
    
    if not imglist:
        print("No images found in the 'temp' folder.")
        return None
    
    for f in imglist:
        img = fitz.open(os.path.join('temp', f)) 
        rect = img[0].rect  
        pdfbytes = img.convert_to_pdf() 
        img.close() 
        imgPDF = fitz.open("pdf", pdfbytes)  
        page = doc.new_page(width=rect.width, height=rect.height) 
        page.show_pdf_page(rect, imgPDF, 0)  
    
    if save:
        new_pdf_name = file_name + '_img.pdf'
        c = 1
        while os.path.exists(new_pdf_name):
            new_pdf_name = file_name + f'_img_{c}.pdf'
            c += 1
        try:
            doc.save(new_pdf_name)
            print(f"{new_pdf_name} created successfully")
        except Exception as e:
            print(f"Error saving PDF: {e}")
            return None
        finally:
            doc.close()
        
        return


In [13]:
#test = pdf_to_imgs("teo_sdt_0_dist.pdf")
pdf_to_img_pdf("teo_sdt_0_dist.pdf")

Images saved to temp
teo_sdt_0_dist_img.pdf created successfully
PDF creation failed. Temp folder not deleted.


In [58]:
pdf_to_img_pdf("input.pdf")

Images saved to temp
input_img.pdf created successfully
Folder 'temp' and its contents deleted.


# Backup

I found a fast way around that doesnt save the pngs...

but the size of the final pdfs is *large* because images are not png compressed... much larger.

In [None]:
import fitz

def convert_to_image_only_pdf(input_pdf_path, output_pdf_path):
    '''
    f(x): Converts a PDF to an image-only PDF.
    in  : input_pdf_path (str) - Path of the input PDF file.
          output_pdf_path (str) - Path to save the output image-only PDF file.
    '''
    # Open the input PDF
    pdf = fitz.open(input_pdf_path)

    # Create a new PDF with image-only pages
    new_pdf = fitz.open()
    for page in pdf:
        pix = page.get_pixmap()
        new_page = new_pdf.new_page(width=pix.width, height=pix.height)
        new_page.insert_image(new_page.rect, pixmap=pix)

    # Save the new PDF
    new_pdf.save(output_pdf_path)
    new_pdf.close()
    pdf.close()

# Example usage:
input_pdf_path = "input.pdf"
output_pdf_path = "image_only_example.pdf"
convert_to_image_only_pdf(input_pdf_path, output_pdf_path)

In [45]:
import fitz
import os

def convert_pdf_to_images_and_compile(pdf_path, dpi = 300):
    '''
    This function is the same as the previous one but allows you to change dots per inches.
    '''
    
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Create a new PDF document
    output_pdf = fitz.open()

    # Iterate through each page of the PDF
    for page_number in range(pdf_document.page_count):
        # Get the page
        page = pdf_document.load_page(page_number)
        
        # Convert the page to a pixmap
        # pixmap = page.get_pixmap()
        pixmap = page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))
        
        # Add the pixmap image to the output PDF document
        output_page = output_pdf.new_page(width=pixmap.width, height=pixmap.height)
        output_page.insert_image(output_page.rect, pixmap=pixmap)


    output_pdf_path = pdf_path.replace('.pdf','_.pdf')
    # Save the output PDF document
    output_pdf.save(output_pdf_path)
    
    # Close the documents
    pdf_document.close()
    output_pdf.close()

# Example usage:
input_pdf_path = "Handout_Trust.pdf"
output_pdf_path = "test.pdf"
convert_pdf_to_images_and_compile(input_pdf_path)
