# 📚 PDF Merger & Splitter

Merge multiple PDF files into one or split PDFs into individual pages! Features:
- **Merge:** Combine multiple PDFs with custom page ranges
- **Split:** Extract specific pages or split by page count
- **Reorder:** Custom page ordering and arrangement
- **Metadata:** Preserve or update document information
- **Watermark:** Add watermarks during merge process

**Advanced features:** Page rotation, compression, encryption, and more!


## 🚀 How to Use

- **Option 1:** Set `operation` and `path_arg` in the Configuration cell below
- **Option 2:** Leave `path_arg` as `None` and upload files directly (Colab only)

### Examples:
```python
# Merge all PDFs in a folder
operation = "merge"
path_arg = "/path/to/pdfs/"

# Split a PDF into individual pages
operation = "split"
path_arg = "document.pdf"

# Merge with custom page ranges (1-based; one range per PDF, matched by position)
operation = "merge_custom"
path_arg = ["doc1.pdf", "doc2.pdf"]
page_ranges = ["1-5", "2-10"]
```


In [None]:
# Import libraries
import os, zipfile, shutil, uuid
from pathlib import Path
from datetime import datetime

# Detect Google Colab: the `google.colab` package is only importable there.
# `files` (upload/download helper) is bound only when the import succeeds.
try:
    from google.colab import files
except ImportError:
    IS_COLAB = False
    print("🔧 Running locally")
else:
    IS_COLAB = True
    print("🔧 Running in Google Colab")

# Import the PDF libraries, installing them on first use if they are missing.
try:
    from PyPDF2 import PdfReader, PdfWriter, PdfMerger
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    print("✅ Required packages available")
except ImportError:
    print("📦 Installing required packages...")
    import subprocess
    import sys
    # Run pip through the interpreter that is executing this cell so the
    # packages are installed into the environment we import from; a bare
    # "pip" found on PATH may belong to a different Python installation.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2", "reportlab"])
    from PyPDF2 import PdfReader, PdfWriter, PdfMerger
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    print("✅ Packages installed successfully")


In [None]:
def _find_pdfs(root):
    """Return every ``*.pdf`` (case-insensitive) under ``root``, sorted by path."""
    # Sorting makes the merge order deterministic; os.walk order is arbitrary.
    return sorted(
        os.path.join(folder, name)
        for folder, _, names in os.walk(root)
        for name in names
        if name.lower().endswith('.pdf')
    )


def get_pdfs(path):
    """Collect the PDF files reachable from ``path``.

    ``path`` may be a ZIP archive (extracted into a fresh ``ext_<id>``
    directory under the current working directory), a folder (scanned
    recursively), or a single file (returned as-is, without checking its
    extension).

    Args:
        path: File, folder, or ZIP archive to inspect.

    Returns:
        A ``(pdfs, base, tmp_dir)`` tuple: the list of PDF paths, a base name
        derived from ``path`` (used by callers to name outputs), and the
        extraction directory the caller must delete, or ``None`` when nothing
        was extracted.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ Path not found: {path}")

    if zipfile.is_zipfile(path):
        print(f"📦 Extracting ZIP: {os.path.basename(path)}")
        base = os.path.splitext(os.path.basename(path))[0]
        tmp_dir = f"ext_{uuid.uuid4().hex[:6]}"
        os.makedirs(tmp_dir, exist_ok=True)
        try:
            # Context manager closes the archive handle once extraction ends.
            with zipfile.ZipFile(path) as archive:
                archive.extractall(tmp_dir)
        except Exception:
            # The caller never learns about tmp_dir if we raise, so remove it here.
            shutil.rmtree(tmp_dir, ignore_errors=True)
            raise
        return _find_pdfs(tmp_dir), base, tmp_dir

    if os.path.isdir(path):
        # normpath drops a trailing separator so the folder name is not empty.
        base = os.path.basename(os.path.normpath(path))
        print(f"📁 Scanning folder: {base}")
        return _find_pdfs(path), base, None

    print(f"📄 Processing single PDF: {os.path.basename(path)}")
    return [path], os.path.splitext(os.path.basename(path))[0], None

def get_pdf_info(pdf_path):
    """Summarize a PDF file.

    Returns a dict with ``pages``, ``title``, ``author``, ``creator``,
    ``creation_date`` and ``file_size`` (bytes on disk). Metadata entries that
    are absent are reported as ``'Unknown'``. If anything goes wrong the
    result is ``{'error': <message>}`` instead.
    """
    metadata_keys = (
        ('title', '/Title'),
        ('author', '/Author'),
        ('creator', '/Creator'),
        ('creation_date', '/CreationDate'),
    )
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PdfReader(handle)
            summary = {'pages': len(reader.pages)}
            for field, pdf_key in metadata_keys:
                document_info = reader.metadata
                if document_info:
                    summary[field] = document_info.get(pdf_key, 'Unknown')
                else:
                    summary[field] = 'Unknown'
            summary['file_size'] = os.path.getsize(pdf_path)
            return summary
    except Exception as exc:
        return {'error': str(exc)}

def create_watermark(text="CONFIDENTIAL", opacity=0.3):
    """Write a one-page, letter-sized PDF carrying ``text`` and return its path.

    The file is created in the current working directory under a random
    ``watermark_<id>.pdf`` name; removing it is the caller's job.
    ``opacity`` is forwarded as the fourth argument of ``setFillColorRGB``
    (presumably the alpha channel — confirm against reportlab's docs).
    """
    unique_id = uuid.uuid4().hex[:6]
    watermark_path = f"watermark_{unique_id}.pdf"

    sheet = canvas.Canvas(watermark_path, pagesize=letter)

    # Light gray, bold 50pt text.
    sheet.setFont("Helvetica-Bold", 50)
    sheet.setFillColorRGB(0.7, 0.7, 0.7, opacity)

    # Rotate the coordinate system 45 degrees, then draw in rotated space.
    sheet.rotate(45)
    sheet.drawString(200, 100, text)

    sheet.save()
    return watermark_path


In [None]:
def merge_pdfs(pdf_paths, output_path, add_watermark=False, watermark_text="CONFIDENTIAL"):
    """Merge several PDF files into ``output_path``.

    Args:
        pdf_paths: Paths of the PDFs to concatenate, in output order.
        output_path: Destination file; overwritten if it already exists.
        add_watermark: When True, ``watermark_text`` is overlaid on every page
            of the merged document (not appended as an extra page).
        watermark_text: Text passed to ``create_watermark``.

    Returns:
        True on success; False (after printing the error) if any step fails.
    """
    import io  # local import: only used for the in-memory intermediate below

    merger = PdfMerger()
    watermark_path = None

    try:
        for pdf_path in pdf_paths:
            print(f"  📄 Adding: {os.path.basename(pdf_path)}")
            merger.append(pdf_path)

        if not add_watermark:
            with open(output_path, 'wb') as output_file:
                merger.write(output_file)
            return True

        # Merge into memory first, then stamp each page with the watermark.
        # NOTE(review): rebuilding through PdfWriter may not carry over
        # document-level items such as bookmarks — confirm if that matters.
        merged_buffer = io.BytesIO()
        merger.write(merged_buffer)
        merged_buffer.seek(0)

        watermark_path = create_watermark(watermark_text)
        stamp = PdfReader(watermark_path).pages[0]

        writer = PdfWriter()
        for page in PdfReader(merged_buffer).pages:
            page.merge_page(stamp)  # draw the watermark on top of the page
            writer.add_page(page)

        with open(output_path, 'wb') as output_file:
            writer.write(output_file)
        return True

    except Exception as e:
        print(f"❌ Error merging PDFs: {e}")
        return False

    finally:
        merger.close()
        # Remove the temporary watermark file on every exit path.
        if watermark_path and os.path.exists(watermark_path):
            os.remove(watermark_path)

def _parse_page_spec(page_range):
    """Convert a 1-based spec ("4", "1-5", "1,3,5", "1-3,7") to 0-based (start, stop) spans."""
    spans = []
    for part in str(page_range).split(','):
        part = part.strip()
        if not part:
            continue
        first, dash, last = part.partition('-')
        start = int(first)
        end = int(last) if dash else start
        if start < 1 or end < start:
            raise ValueError(f"Invalid page range: {part!r}")
        # Half-open, 0-based span: pages start..end inclusive (1-based).
        spans.append((start - 1, end))
    if not spans:
        raise ValueError(f"Empty page range: {page_range!r}")
    return spans


def merge_pdfs_custom(pdf_paths, page_ranges, output_path):
    """Merge selected pages from several PDFs into ``output_path``.

    Args:
        pdf_paths: PDF paths, in output order.
        page_ranges: 1-based page specs matched to ``pdf_paths`` by position.
            Each spec is a comma-separated list of single pages and/or
            ``start-end`` ranges (inclusive), e.g. ``"1-5"``, ``"1,3,5"`` or
            ``"1-3,7"``. PDFs without a matching spec are added in full.
        output_path: Destination file; overwritten if it already exists.

    Returns:
        True on success; False (after printing the error) if a spec is
        invalid or any PDF operation fails.
    """
    merger = PdfMerger()
    page_ranges = page_ranges or []

    try:
        for i, pdf_path in enumerate(pdf_paths):
            if i >= len(page_ranges):
                # No page range specified, add all pages
                merger.append(pdf_path)
                continue

            page_range = page_ranges[i]
            print(f"  📄 Adding {page_range} from: {os.path.basename(pdf_path)}")

            # Append each span as a (start, stop) tuple: that is the form
            # PdfMerger.append interprets consistently, whereas a plain list
            # of page indices is not handled as individual pages by PyPDF2.
            for span in _parse_page_spec(page_range):
                merger.append(pdf_path, pages=span)

        with open(output_path, 'wb') as output_file:
            merger.write(output_file)
        return True

    except Exception as e:
        print(f"❌ Error merging PDFs: {e}")
        return False

    finally:
        merger.close()

def split_pdf(pdf_path, output_dir, split_mode="pages", pages_per_split=1):
    """Split one PDF into several files written to ``output_dir``.

    Args:
        pdf_path: PDF to split.
        output_dir: Directory for the pieces (created if missing).
        split_mode: ``"pages"`` writes one ``page_NNN.pdf`` per page;
            ``"count"`` writes ``split_NNN.pdf`` files holding up to
            ``pages_per_split`` pages each.
        pages_per_split: Chunk size for ``"count"`` mode; must be an
            integer >= 1. Ignored in ``"pages"`` mode.

    Returns:
        True on success; False (after printing the error) on invalid
        arguments or any read/write failure.
    """
    try:
        # Validate up front so bad arguments are reported instead of silently
        # producing no output (unknown mode) or dividing by zero.
        if split_mode not in ("pages", "count"):
            raise ValueError(
                f"Unknown split_mode {split_mode!r}; use 'pages' or 'count'"
            )
        if split_mode == "count" and (
            not isinstance(pages_per_split, int) or pages_per_split < 1
        ):
            raise ValueError(
                f"pages_per_split must be an integer >= 1, got {pages_per_split!r}"
            )

        os.makedirs(output_dir, exist_ok=True)

        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            total_pages = len(reader.pages)

            if split_mode == "pages":
                # Split into individual pages
                for page_num in range(total_pages):
                    writer = PdfWriter()
                    writer.add_page(reader.pages[page_num])

                    name = f"page_{page_num + 1:03d}.pdf"
                    with open(os.path.join(output_dir, name), 'wb') as output:
                        writer.write(output)

                    print(f"  📄 Created: {name}")
            else:
                # Split by page count
                chunk_starts = range(0, total_pages, pages_per_split)
                for split_count, start in enumerate(chunk_starts, start=1):
                    stop = min(start + pages_per_split, total_pages)
                    writer = PdfWriter()
                    for page_num in range(start, stop):
                        writer.add_page(reader.pages[page_num])

                    name = f"split_{split_count:03d}.pdf"
                    with open(os.path.join(output_dir, name), 'wb') as output:
                        writer.write(output)

                    # Report the real size: the last chunk may be shorter.
                    print(f"  📄 Created: {name} ({stop - start} pages)")

            return True

    except Exception as e:
        print(f"❌ Error splitting PDF: {e}")
        return False


In [None]:
def process_merge(path, add_watermark=False, watermark_text="CONFIDENTIAL"):
    """Merge every PDF found at ``path`` into ``<base>_merged.pdf``.

    ``path`` is resolved by ``get_pdfs`` (file, folder, or ZIP). The result is
    written to the current working directory and, in Colab, offered for
    download. Errors are printed rather than raised.

    Args:
        path: File, folder, or ZIP archive containing the PDFs.
        add_watermark: Forwarded to ``merge_pdfs``.
        watermark_text: Forwarded to ``merge_pdfs``.

    Returns:
        None.
    """
    # Bound before the try so the finally clause is safe even when
    # get_pdfs raises before assigning it.
    tmp = None
    try:
        pdfs, base, tmp = get_pdfs(path)

        output_file = f"{base}_merged.pdf"
        # A previous run's output sitting in the scanned folder must not be
        # merged into itself (it would be read while being overwritten).
        output_abs = os.path.abspath(output_file)
        pdfs = [p for p in pdfs if os.path.abspath(p) != output_abs]

        if not pdfs:
            print("❌ No valid PDF files found.")
            return

        print(f"📊 Found {len(pdfs)} PDF file(s)")

        # Show PDF information
        print("\n📋 PDF Information:")
        for pdf in pdfs:
            info = get_pdf_info(pdf)
            if 'error' in info:
                print(f"  ⚠️ {os.path.basename(pdf)}: could not read ({info['error']})")
            else:
                print(f"  📄 {os.path.basename(pdf)}: {info['pages']} pages, {info['file_size']/1024:.1f} KB")

        # Merge PDFs
        print("\n🔄 Merging PDFs...")

        if not merge_pdfs(pdfs, output_file, add_watermark, watermark_text):
            print("❌ Failed to merge PDFs")
            return

        print(f"✅ Merged successfully: {output_file}")

        # Show result info
        result_info = get_pdf_info(output_file)
        if 'error' not in result_info:
            print(f"📊 Result: {result_info['pages']} pages, {result_info['file_size']/1024:.1f} KB")

        if IS_COLAB:
            files.download(output_file)
            print("📥 Download started!")
        else:
            print(f"📁 Output: {os.path.abspath(output_file)}")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        # Remove the ZIP extraction directory, if one was created.
        if tmp:
            shutil.rmtree(tmp, ignore_errors=True)

def process_split(path, split_mode="pages", pages_per_split=1):
    """Split the first PDF found at ``path`` and package the pieces.

    ``path`` is resolved by ``get_pdfs``. Pieces are written to a scratch
    directory; several pieces are bundled into ``<base>_split.zip``, a single
    piece is moved to ``<base>_split.pdf``. The result is placed in the
    current working directory and, in Colab, offered for download. Errors are
    printed rather than raised.

    Args:
        path: File, folder, or ZIP archive; only the first PDF is processed.
        split_mode: Forwarded to ``split_pdf``.
        pages_per_split: Forwarded to ``split_pdf``.

    Returns:
        None.
    """
    # Bound before the try so the finally clause is safe on early returns
    # and when an exception occurs before either directory exists.
    tmp = None
    out_dir = None
    try:
        pdfs, base, tmp = get_pdfs(path)
        if not pdfs:
            print("❌ No valid PDF files found.")
            return

        if len(pdfs) > 1:
            print("⚠️ Multiple PDFs found. Processing the first one only.")

        pdf_path = pdfs[0]
        info = get_pdf_info(pdf_path)

        if 'error' in info:
            print(f"❌ Error reading PDF: {info['error']}")
            return

        print(f"📄 Processing: {os.path.basename(pdf_path)}")
        print(f"📊 Pages: {info['pages']}, Size: {info['file_size']/1024:.1f} KB")

        # Create output directory
        out_dir = f"split_{uuid.uuid4().hex[:6]}"
        os.makedirs(out_dir, exist_ok=True)

        print("\n🔄 Splitting PDF...")

        if not split_pdf(pdf_path, out_dir, split_mode, pages_per_split):
            print("❌ Failed to split PDF")
            return

        # Sorted so ZIP members appear in page order.
        files_in_dir = sorted(f for f in os.listdir(out_dir) if f.endswith('.pdf'))
        if not files_in_dir:
            print("❌ Split produced no output files.")
            return

        if len(files_in_dir) > 1:
            # Several pieces: bundle them into one archive.
            final_file = f"{base}_split.zip"
            with zipfile.ZipFile(final_file, 'w', zipfile.ZIP_DEFLATED) as z:
                for file in files_in_dir:
                    z.write(os.path.join(out_dir, file), file)
            print(f"✅ Created ZIP: {final_file} ({len(files_in_dir)} files)")
        else:
            # Single piece: move it out of the scratch directory before cleanup.
            final_file = f"{base}_split.pdf"
            shutil.move(os.path.join(out_dir, files_in_dir[0]), final_file)
            print(f"✅ Created: {final_file}")

        if IS_COLAB:
            files.download(final_file)
            print("📥 Download started!")
        else:
            print(f"📁 Output: {os.path.abspath(final_file)}")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        if out_dir:
            shutil.rmtree(out_dir, ignore_errors=True)
        if tmp:
            shutil.rmtree(tmp, ignore_errors=True)


## ⚙️ Configuration

Set your operation and file path here:


In [None]:
# Configuration — read by the run cell at the end of the notebook.
operation = "merge"  # One of "merge", "split", "merge_custom"
path_arg = None  # File, folder, or ZIP path (a list of PDF paths for "merge_custom"); leave None to upload in Colab

# Merge options (operation="merge")
add_watermark = False  # Set to True to enable the watermark step during merge
watermark_text = "CONFIDENTIAL"  # Text used for the watermark

# Split options (operation="split")
split_mode = "pages"  # "pages" (one file per page) or "count" (fixed number of pages per file)
pages_per_split = 1  # Pages per output file; only used when split_mode="count"

# Custom merge options (operation="merge_custom")
page_ranges = ["1-5", "2-10"]  # 1-based page ranges, matched to the PDFs in path_arg by position


## 🎯 Run PDF Operations

Execute the PDF merge/split process:


In [None]:
# Dispatch on the configuration: use `path_arg` when it is set, otherwise
# fall back to an interactive upload when running in Colab.
if path_arg:
    print(f"🚀 Processing: {path_arg}")
    print(f"🔧 Operation: {operation}")

    if operation in ("merge", "split") and isinstance(path_arg, (list, tuple)):
        # process_merge/process_split resolve one path; a list is only
        # meaningful for "merge_custom".
        print(f"❌ For '{operation}', path_arg must be a single file, folder, or ZIP path")
    elif operation == "merge":
        process_merge(path_arg, add_watermark, watermark_text)
    elif operation == "split":
        process_split(path_arg, split_mode, pages_per_split)
    elif operation == "merge_custom":
        # For custom merge, path_arg should be a list of PDF paths so each
        # entry lines up with the page range at the same index.
        if isinstance(path_arg, (list, tuple)):
            output_file = "custom_merged.pdf"
            if merge_pdfs_custom(list(path_arg), page_ranges, output_file):
                print(f"✅ Custom merge successful: {output_file}")
                if IS_COLAB:
                    files.download(output_file)
                    print("📥 Download started!")
                else:
                    print(f"📁 Output: {os.path.abspath(output_file)}")
            else:
                print("❌ Failed to merge PDFs")
        else:
            print("❌ For custom merge, path_arg must be a list of PDF paths")
    else:
        print("❌ Invalid operation. Use 'merge', 'split', or 'merge_custom'")

elif IS_COLAB:
    print("📤 Upload your PDF files...")
    uploaded = files.upload()
    pdf_files = [fname for fname in uploaded if fname.lower().endswith('.pdf')]

    if not pdf_files:
        print("❌ No PDF files found in upload")
    else:
        print(f"🚀 Processing {len(pdf_files)} PDF file(s)")

        if operation == "merge":
            # Gather the uploads in a scratch folder so process_merge can
            # treat them as one directory; always remove it afterwards.
            temp_dir = f"temp_{uuid.uuid4().hex[:6]}"
            os.makedirs(temp_dir, exist_ok=True)
            try:
                for fname in pdf_files:
                    shutil.move(fname, os.path.join(temp_dir, fname))
                process_merge(temp_dir, add_watermark, watermark_text)
            finally:
                shutil.rmtree(temp_dir, ignore_errors=True)

        elif operation == "split":
            # Process first PDF only
            if len(pdf_files) > 1:
                print("⚠️ Multiple PDFs uploaded. Processing the first one only.")
            process_split(pdf_files[0], split_mode, pages_per_split)
        elif operation == "merge_custom":
            print("❌ Custom merge not supported with file upload")
        else:
            print("❌ Invalid operation. Use 'merge', 'split', or 'merge_custom'")
else:
    print("❗ Please set path_arg or upload in Colab.")
