# PDF Resolution & Size Converter

Convert a PDF to a smaller, lower-resolution version by rendering each page as a JPEG at your chosen DPI and quality.

**Steps:**
1. Run the setup cell below.
2. Set your input/output paths and options in the main cell.
3. Run the main cell to create your optimized PDF.


In [None]:
# Package Installation

!python3 -m venv PDF --clear
!source PDF/bin/activate && pip install --upgrade pip
!source PDF/bin/activate && pip install PyMuPDF Pillow tqdm

print("Setup complete! Ready to convert images to PDF.")

In [4]:
import os
import io
import fitz  
from tqdm import tqdm
import multiprocessing
from PIL import Image

# Input PDF path
pdf_path = "path/to/your/input.pdf"  # Change this to your input PDF path

# Define the output path and the target resolution (DPI)
output_pdf_path = os.path.splitext(pdf_path)[0] + "_dwn.pdf"
dpi = 120 # Target DPI. Lower values mean lower resolution and smaller file size.                                                              
jpeg_quality = 60 # JPEG quality (0-100), lower is smaller file                            

# Get original file stats
original_file_size = os.path.getsize(pdf_path)
original_doc = fitz.open(pdf_path)
num_pages = original_doc.page_count
original_doc.close()  # Reopen in each process

# Detect number of available workers
num_procs = os.cpu_count() or 1 # Set to lower the number of processes in large files to avoid memory issues
print(f"Detected {num_procs} CPU cores for multiprocessing.")
print(f"Input PDF: {pdf_path}")
print(f"Total pages to process: {num_pages}")
print(f"Target DPI for downsampling: {dpi}")
print(f"JPEG quality: {jpeg_quality}")
print(f"Output PDF will be saved as: {output_pdf_path}\n")

def process_chunk_mp(args):
    start, end, pdf_path, dpi, jpeg_quality = args
    print(f"Process handling pages {start} to {end-1}")
    doc = fitz.open(pdf_path)
    chunk_results = []
    for page_num in range(start, end):
        page = doc.load_page(page_num)
        # Render page as pixmap
        downsampled_pix = page.get_pixmap(dpi=dpi)
        # Convert pixmap to PIL Image
        img = Image.frombytes("RGB", [downsampled_pix.width, downsampled_pix.height], downsampled_pix.samples)
        # Save as JPEG to bytes
        img_bytes_io = io.BytesIO()
        img.save(img_bytes_io, format="JPEG", quality=jpeg_quality)
        img_bytes = img_bytes_io.getvalue()
        chunk_results.append((page_num, img_bytes, downsampled_pix.width, downsampled_pix.height))
    doc.close()
    print(f"Process finished pages {start} to {end-1}")
    return chunk_results

# Split pages into chunks based on number of processes
chunk_size = (num_pages + num_procs - 1) // num_procs
chunks = [(i*chunk_size, min((i+1)*chunk_size, num_pages), pdf_path, dpi, jpeg_quality) for i in range(num_procs) if i*chunk_size < num_pages]

print(f"Splitting {num_pages} pages into {len(chunks)} chunks for multiprocessing.\n")

with multiprocessing.Pool(processes=num_procs) as pool:
    results = []
    for chunk_result in tqdm(pool.imap_unordered(process_chunk_mp, chunks), total=len(chunks), desc="Process chunks"):
        results.extend(chunk_result)

# Sort results by page number to preserve order
results.sort(key=lambda x: x[0])

print("\nAssembling downsampled pages into new PDF (multiprocessing, JPEG)...")
new_doc = fitz.open()
for idx, (page_num, img_bytes, width, height) in enumerate(tqdm(results, desc="Writing pages")):
    img_stream = io.BytesIO(img_bytes)
    new_page_rect = fitz.Rect(0, 0, width, height)
    new_page = new_doc.new_page(width=new_page_rect.width, height=new_page_rect.height)
    new_page.insert_image(new_page_rect, stream=img_stream)
    if (idx+1) % 10 == 0 or (idx+1) == len(results):
        print(f"  Written {idx+1}/{len(results)} pages...")

new_doc.save(output_pdf_path)
new_doc.close()

new_file_size = os.path.getsize(output_pdf_path)

print("\nPDF processing complete (multiprocessing, JPEG).")
print(f"Original file size: {original_file_size/1024/1024:.2f} MB")
print(f"Downsampled file size: {new_file_size/1024/1024:.2f} MB")
print(f"Downsampled file saved at: {output_pdf_path}")


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/your/input.pdf'