In [1]:
import os

In [2]:
from PyPDF2 import PdfReader, PdfWriter

In [3]:
# function for split 
def split_pdf(input_pdf_path, output_dir):
    """
    Splits a PDF into individual pages and saves them in the output directory.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    with open(input_pdf_path, 'rb') as file:
        reader = PdfReader(file)
        for i in range(len(reader.pages)):
            page = reader.pages[i]
            writer = PdfWriter()
            writer.add_page(page)
            output_file_path = os.path.join(output_dir, f"page_{i+1}.pdf")
            with open(output_file_path, 'wb') as output_file:
                writer.write(output_file)
            print(f"Saved page {i+1} to {output_file_path}")

In [4]:
def process_all_pdfs_in_directory(directory_path, output_dir):
    """
    Processes all PDFs in the given directory, splitting each into individual pages.
    """
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory_path, filename)
            split_pdf(pdf_path, output_dir)
            print(f"Processed {filename}")

In [5]:
directory_path = "/Users/dominiclakshan/Desktop/pdfs/input_pdfs"
output_dir = "/Users/dominiclakshan/Desktop/pdfs/output_pdfs"
process_all_pdfs_in_directory(directory_path, output_dir)

Saved page 1 to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/page_1.pdf
Saved page 2 to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/page_2.pdf
Saved page 3 to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/page_3.pdf
Saved page 4 to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/page_4.pdf
Processed file-sample_150kB.pdf


In [9]:
# splitting pdfs by page nums
def split_pdf_by_pages_in_directory(directory_path, output_dir, page_numbers):
    """
    Iterates over all PDF files in the given directory, splits each into individual pages specified by page_numbers,
    and saves them in the output directory.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory_path, filename)
            with open(pdf_path, 'rb') as file:
                reader = PdfReader(file)
                for page_number in page_numbers:
                    page = reader.pages[page_number - 1]  # Pages are 0-indexed
                    writer = PdfWriter()
                    writer.add_page(page)
                    output_file_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_page_{page_number}.pdf")
                    with open(output_file_path, 'wb') as output_file:
                        writer.write(output_file)
                    print(f"Saved page {page_number} of {filename} to {output_file_path}")

In [10]:
directory_path = "/Users/dominiclakshan/Desktop/pdfs/input_pdfs"
output_dir = "/Users/dominiclakshan/Desktop/pdfs/output_pdfs"
page_numbers = [1, 3]  # Array of page numbers to split

split_pdf_by_pages_in_directory(directory_path, output_dir, page_numbers)

Saved page 1 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/file-sample_150kB_page_1.pdf
Saved page 3 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/file-sample_150kB_page_3.pdf


In [14]:
def split_pdf_into_purchase_orders_in_directory(directory_path, output_dir, purchase_order_starts):
    """
    Iterates over all PDF files in the given directory, splits each into separate documents for each purchase order,
    starting at the specified page numbers, and saves them in the output directory.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory_path, filename)
            with open(pdf_path, 'rb') as file:
                reader = PdfReader(file)
                
                for i, start_page in enumerate(purchase_order_starts, start=1):
                    current_page = start_page
                    writer = PdfWriter()
                    
                    while current_page <= len(reader.pages):
                        page = reader.pages[current_page - 1]  # Pages are 0-indexed
                        writer.add_page(page)
                        current_page += 1
                    
                    output_file_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_purchase_order_{i}.pdf")
                    with open(output_file_path, 'wb') as output_file:
                        writer.write(output_file)
                    print(f"Saved purchase order {i} of {filename} to {output_file_path}")

In [17]:
input_pdf_path = "/Users/dominiclakshan/Desktop/pdfs/input_pdfs"  # Path to the PDF you want to split
output_dir = "/Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages"  # Directory to save the split PDFs
purchase_order_starts = [1, 3, 4]  # Start page numbers for each purchase order

split_pdf_into_purchase_orders_in_directory(directory_path, output_dir, purchase_order_starts)

Saved purchase order 1 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages/file-sample_150kB_purchase_order_1.pdf
Saved purchase order 2 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages/file-sample_150kB_purchase_order_2.pdf
Saved purchase order 3 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages/file-sample_150kB_purchase_order_3.pdf


In [32]:
def split_pdf_into_purchase_orders(directory_path, output_dir, purchase_order_starts):
    """
    Iterates over all PDF files in the given directory, splits each into separate documents for each purchase order,
    starting at the specified page numbers, and saves them in the output directory.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory_path, filename)
            with open(pdf_path, 'rb') as file:
                reader = PdfReader(file)
                writer = PdfWriter()
                
                for i, start_page in enumerate(purchase_order_starts, start=1):
                    # Calculate the end page for the current purchase order
                    # Subtract 1 from the next start page if it exists; otherwise, consider the last page of the PDF
                    if i < len(purchase_order_starts):
                        end_page = purchase_order_starts[i] - 1
                    else:
                        end_page = len(reader.pages)
                    
                    # Ensure the end page does not exceed the total number of pages
                    end_page = min(end_page, len(reader.pages))
                    
                    # Extract pages for the current purchase order
                    for page_num in range(start_page - 1, end_page):
                        page = reader.pages[page_num]
                        writer.add_page(page)
                    
                    # Save the extracted pages to a new PDF
                    output_file_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_purchase_order_{i}.pdf")
                    with open(output_file_path, 'wb') as output_file:
                        writer.write(output_file)
                    print(f"Saved purchase order {i} of {filename} to {output_file_path}")
                    # Reset the writer for the next purchase order
                    writer = PdfWriter()

In [34]:
directory_path = "/Users/dominiclakshan/Desktop/pdfs/input_pdfs"
output_dir = "/Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages"
purchase_order_starts = [1, 2, 3, 4]  # Start page numbers for each purchase order

split_pdf_into_purchase_orders(directory_path, output_dir, purchase_order_starts)

Saved purchase order 1 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages/file-sample_150kB_purchase_order_1.pdf
Saved purchase order 2 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages/file-sample_150kB_purchase_order_2.pdf
Saved purchase order 3 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages/file-sample_150kB_purchase_order_3.pdf
Saved purchase order 4 of file-sample_150kB.pdf to /Users/dominiclakshan/Desktop/pdfs/output_pdfs/pdfs_with_defined_pages/file-sample_150kB_purchase_order_4.pdf
