# 📄 PDF to HTML Converter

Convert PDF files to HTML format easily! Supports:
- Single PDF files
- Multiple PDF files (creates ZIP file)
- ZIP files containing PDFs
- Folders with PDF files

**Features:**
- Preserves text content
- Maintains basic formatting
- Lists embedded images as placeholders (optional)
- Clean HTML output


## 🚀 How to Use

- **Option 1:** Set `path_arg` to your file/folder path
- **Option 2:** Upload files directly (in Colab)

### Examples:
```python
# Single PDF
path_arg = "document.pdf"

# Folder with PDFs  
path_arg = "/path/to/pdfs/"

# ZIP file
path_arg = "documents.zip"
```


In [None]:
# Import libraries
import os, zipfile, shutil, uuid
from pathlib import Path

# Detect Google Colab: the `files` helper (used later for upload/download)
# can only be imported inside a Colab runtime.
try:
    from google.colab import files
    IS_COLAB = True
    print("🔧 Running in Google Colab")
except ImportError:
    IS_COLAB = False
    print("🔧 Running locally")

# Import the PDF libraries, installing them on first use if they are missing.
try:
    import pdfplumber
    import fitz  # PyMuPDF
    print("✅ Required packages available")
except ImportError:
    print("📦 Installing required packages...")
    import subprocess
    import sys
    # Run pip through the current interpreter so the packages are installed
    # into the environment this notebook is running in, rather than into
    # whichever `pip` executable happens to be first on PATH.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "pdfplumber", "PyMuPDF"]
    )
    import pdfplumber
    import fitz  # PyMuPDF
    print("✅ Packages installed successfully")


In [None]:
def get_pdfs(path):
    """Collect the PDF files referred to by *path*.

    Args:
        path: A single file, a directory (searched recursively), or a ZIP
            archive (extracted into a fresh ``ext_<random>`` directory
            under the current working directory, then searched).

    Returns:
        A tuple ``(pdfs, base, tmp_dir)`` where ``pdfs`` is a list of file
        paths, ``base`` is the input's name without extension (used by
        callers to name the output), and ``tmp_dir`` is the extraction
        directory the caller should delete, or ``None`` when nothing was
        extracted.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ Path not found: {path}")

    if zipfile.is_zipfile(path):
        print(f"📦 Extracting ZIP: {os.path.basename(path)}")
        base = os.path.splitext(os.path.basename(path))[0]
        tmp_dir = f"ext_{uuid.uuid4().hex[:6]}"
        os.makedirs(tmp_dir, exist_ok=True)
        # Context manager so the archive handle is closed once extracted.
        with zipfile.ZipFile(path) as archive:
            archive.extractall(tmp_dir)
        search_root = tmp_dir
    elif os.path.isdir(path):
        # normpath first: basename of "dir/" would otherwise be empty.
        base = os.path.basename(os.path.normpath(path))
        print(f"📁 Scanning folder: {base}")
        tmp_dir = None
        search_root = path
    else:
        # Any other existing file is handed back as-is; it is not checked
        # to actually be a PDF.
        print(f"📄 Processing single PDF: {os.path.basename(path)}")
        return [path], os.path.splitext(os.path.basename(path))[0], None

    pdfs = [
        os.path.join(root, name)
        for root, _, names in os.walk(search_root)
        for name in names
        if name.lower().endswith('.pdf')
    ]
    return pdfs, base, tmp_dir


In [None]:
def extract_text_with_formatting(page, line_tolerance=5):
    """Group a page's characters into words annotated with font and position.

    Args:
        page: An object exposing ``chars`` (a sequence of dicts with the
            keys ``text``, ``fontname``, ``size``, ``x0`` and ``y0``) and
            ``extract_text()``.
        line_tolerance: Maximum difference in ``y0`` between a character
            and the first character of the current word before the
            character is treated as starting a new word on another line.

    Returns:
        A list of dicts with keys ``text``, ``font``, ``size``, ``x0`` and
        ``y0``. ``x0``/``y0`` come from the word's first character;
        ``font``/``size`` come from its last character. When the page has
        no character data, the result is built from ``extract_text()``
        split on whitespace, with default font, size and position values.
    """
    chars = page.chars
    if not chars:
        # Always return word dicts so callers can treat the result
        # uniformly, even when only plain text is available.
        fallback = page.extract_text() or ""
        return [
            {'text': token, 'font': '', 'size': 12, 'x0': 0, 'y0': 0}
            for token in fallback.split()
        ]

    words = []
    current = None  # word being assembled, or None between words

    for char in chars:
        char_text = char.get('text', '')
        char_y = char.get('y0', 0)

        # A jump in vertical position ends the word even when no space
        # character separates the end of one line from the next.
        if current is not None and abs(char_y - current['y0']) > line_tolerance:
            words.append(current)
            current = None

        if char_text.isspace():
            if current is not None:
                words.append(current)
                current = None
            continue
        if not char_text:
            continue

        if current is None:
            # Anchor the word at its first character.
            current = {
                'text': '',
                'font': None,
                'size': None,
                'x0': char.get('x0', 0),
                'y0': char_y,
            }
        current['text'] += char_text
        current['font'] = char.get('fontname', '')
        current['size'] = char.get('size', 12)

    # Add last word
    if current is not None:
        words.append(current)

    return words

def _esc(value):
    """Return *value* as HTML-safe text (``None`` becomes an empty string)."""
    # Imported locally because `html` is also used as a variable name
    # elsewhere in this file.
    from html import escape
    return escape("" if value is None else str(value))


def _render_text(words):
    """Return one ``<div class='paragraph'>`` string per visual line of *words*."""
    if isinstance(words, str):
        # Tolerate a plain-text result instead of word dicts.
        return [
            f"<div class='paragraph'>{_esc(line)}</div>"
            for line in words.splitlines()
            if line.strip()
        ]

    fragments = []
    current_line = []
    current_y = None

    for word in words:
        word_y = word.get('y0', 0)
        word_size = word.get('size', 12)
        word_font = (word.get('font') or '').lower()

        # A vertical jump of more than 5 units starts a new line.
        if current_y is None or abs(word_y - current_y) > 5:
            if current_line:
                fragments.append(
                    f"<div class='paragraph'>{' '.join(current_line)}</div>"
                )
                current_line = []
            current_y = word_y

        # Apply formatting based on font size and name.
        css_classes = []
        if word_size > 14:
            css_classes.append("large-text")
        elif word_size < 10:
            css_classes.append("small-text")
        if 'bold' in word_font or 'black' in word_font:
            css_classes.append("bold-text")

        word_html = _esc(word.get('text', ''))
        if css_classes:
            word_html = f"<span class='{' '.join(css_classes)}'>{word_html}</span>"
        current_line.append(word_html)

    if current_line:
        fragments.append(f"<div class='paragraph'>{' '.join(current_line)}</div>")
    return fragments


def _render_tables(tables):
    """Return HTML fragments for *tables*, using each table's first row as the header."""
    fragments = []
    for table in tables:
        if not table:
            continue
        fragments.append("<table class='table'>")
        for row_index, row in enumerate(table):
            tag = "th" if row_index == 0 else "td"
            fragments.append("<tr>")
            fragments.extend(f"<{tag}>{_esc(cell or '')}</{tag}>" for cell in row)
            fragments.append("</tr>")
        fragments.append("</table>")
    return fragments


def _render_images(images):
    """Return a placeholder box (name and position) for each entry in *images*."""
    fragments = []
    for img_num, img in enumerate(images, 1):
        fragments.append("<div class='image-placeholder'>")
        fragments.append(f"📷 Image {img_num}: {_esc(img.get('name', 'Unknown'))}")
        fragments.append(
            f"<br>Position: {img.get('x0', 0):.0f}, {img.get('y0', 0):.0f}"
        )
        fragments.append("</div>")
    return fragments


def pdf_to_html(pdf_path, extract_images=True):
    """Convert a single PDF to a standalone HTML document.

    Each page becomes a ``<div class='page'>`` containing its text (one
    paragraph per detected line), any tables found on the page, and, when
    *extract_images* is true, a placeholder box for each image. Image data
    itself is not embedded. All text taken from the PDF is HTML-escaped.

    Args:
        pdf_path: Path of the PDF file to convert.
        extract_images: Whether to emit image placeholders.

    Returns:
        The HTML document as a string. If processing fails, the exception
        is not raised; its message is written into the document in a red
        error box instead.
    """
    html_content = []
    html_content.append("<!DOCTYPE html>")
    html_content.append("<html><head><meta charset='utf-8'>")
    html_content.append(f"<title>{_esc(os.path.basename(pdf_path))}</title>")
    html_content.append("<style>")
    html_content.append("""
        body { 
            font-family: 'Times New Roman', serif; 
            margin: 0; 
            padding: 20px; 
            background: #f5f5f5;
            line-height: 1.6;
        }
        .container {
            max-width: 800px;
            margin: 0 auto;
            background: white;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        .page { 
            margin-bottom: 20px; 
            padding: 40px;
            min-height: 1000px;
            background: white;
            position: relative;
            border-bottom: 1px solid #ddd;
        }
        .page-number { 
            position: absolute;
            bottom: 20px;
            right: 40px;
            color: #666; 
            font-size: 12px;
            font-weight: bold;
        }
        .text-content {
            font-size: 12pt;
            color: #333;
            text-align: justify;
        }
        .paragraph {
            margin-bottom: 12px;
            text-indent: 24px;
        }
        .heading {
            font-weight: bold;
            margin: 20px 0 10px 0;
            color: #000;
        }
        .h1 { font-size: 18pt; }
        .h2 { font-size: 16pt; }
        .h3 { font-size: 14pt; }
        .bold { font-weight: bold; }
        .italic { font-style: italic; }
        .underline { text-decoration: underline; }
        .image-placeholder {
            border: 2px dashed #ccc;
            padding: 20px;
            margin: 20px 0;
            text-align: center;
            background: #f9f9f9;
            color: #666;
        }
        .table {
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }
        .table td, .table th {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        .table th {
            background-color: #f2f2f2;
            font-weight: bold;
        }
        .word {
            display: inline;
        }
        .large-text { font-size: 14pt; }
        .small-text { font-size: 10pt; }
        .bold-text { font-weight: bold; }
    """)
    html_content.append("</style></head><body>")
    html_content.append("<div class='container'>")

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                html_content.append("<div class='page'>")
                html_content.append(f"<div class='page-number'>Page {page_num}</div>")
                html_content.append("<div class='text-content'>")

                words = extract_text_with_formatting(page)
                if words:
                    html_content.extend(_render_text(words))

                html_content.extend(_render_tables(page.extract_tables()))

                if extract_images and page.images:
                    html_content.extend(_render_images(page.images))

                html_content.append("</div>")
                html_content.append("</div>")

    except Exception as e:
        # Best effort: report the failure inside the document rather than
        # aborting a batch conversion.
        html_content.append(
            f"<div style='color: red; padding: 20px;'>Error processing PDF: {_esc(e)}</div>"
        )

    html_content.append("</div>")
    html_content.append("</body></html>")
    return "\n".join(html_content)


In [None]:
def process(path, extract_images=True):
    """Convert every PDF found at *path* to HTML and deliver the result.

    A single PDF produces ``<base>.html``; several PDFs produce
    ``<base>_htmls.zip``. Both are written to the current working
    directory. In Colab the result is offered as a download; otherwise its
    absolute path is printed. Errors are reported with a printed message
    instead of being raised.

    Args:
        path: File, folder or ZIP archive, as accepted by ``get_pdfs``.
        extract_images: Forwarded to ``pdf_to_html``.

    Returns:
        None.
    """
    # Bound before the try so the cleanup in `finally` is safe even when
    # the PDF lookup fails or finds nothing.
    out_dir = None
    tmp = None
    try:
        pdfs, base, tmp = get_pdfs(path)
        if not pdfs:
            print("❌ No valid PDF files found.")
            return

        print(f"📊 Found {len(pdfs)} PDF file(s)")

        out_dir = f"htmls_{uuid.uuid4().hex[:6]}"
        os.makedirs(out_dir, exist_ok=True)

        html_paths = []
        used_names = set()
        for i, pdf in enumerate(pdfs, 1):
            print(f"🔄 Converting {i}/{len(pdfs)}: {os.path.basename(pdf)}")
            html_content = pdf_to_html(pdf, extract_images)

            # PDFs in different subfolders can share a file name; add a
            # numeric suffix so one result does not overwrite another.
            # Compared case-insensitively for case-insensitive filesystems.
            stem = os.path.splitext(os.path.basename(pdf))[0]
            html_name = f"{stem}.html"
            suffix = 1
            while html_name.lower() in used_names:
                suffix += 1
                html_name = f"{stem}_{suffix}.html"
            used_names.add(html_name.lower())
            html_path = os.path.join(out_dir, html_name)

            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(html_content)

            html_paths.append(html_path)
            print(f"  ✅ Created: {html_name}")

        if len(html_paths) == 1:
            final_file = f"{base}.html"
            shutil.move(html_paths[0], final_file)
            print(f"✅ Final output: {final_file}")
        else:
            final_file = f"{base}_htmls.zip"
            with zipfile.ZipFile(final_file, 'w', zipfile.ZIP_DEFLATED) as z:
                for html_file in html_paths:
                    z.write(html_file, os.path.basename(html_file))
            print(f"✅ Created ZIP: {final_file} ({len(html_paths)} HTML files)")

        if IS_COLAB:
            files.download(final_file)
            print("📥 Download started!")
        else:
            print(f"📁 Output: {os.path.abspath(final_file)}")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        # Remove working directories; the final output is outside them.
        if out_dir:
            shutil.rmtree(out_dir, ignore_errors=True)
        if tmp:
            shutil.rmtree(tmp, ignore_errors=True)


## ⚙️ Configuration

Set your file path and options here:


In [None]:
# Configuration
# Path to a single PDF, a folder of PDFs, or a ZIP archive containing PDFs.
# Leave as None to be prompted for an upload when running in Google Colab.
path_arg = None  # Set your file/folder path here
# Passed to process(); when True, the images found on each page are listed
# as placeholder boxes in the generated HTML.
extract_images = True  # Set to False to omit the image placeholders


## 🎯 Run Conversion

Execute the conversion process:


In [None]:
# Entry point: use the configured path if one is set, otherwise fall back
# to an interactive upload when running in Colab.
if path_arg:
    print(f"🚀 Processing: {path_arg}")
    process(path_arg, extract_images)
elif IS_COLAB:
    print("📤 Upload your PDF files...")
    uploaded = files.upload()
    for fname in uploaded:
        # process() handles ZIP archives as well as single PDFs, so both
        # kinds of upload are accepted.
        if fname.lower().endswith(('.pdf', '.zip')):
            print(f"🚀 Processing: {fname}")
            process(fname, extract_images)
        else:
            print(f"⚠️ Skipping non-PDF file: {fname}")
else:
    print("❗ Please set path_arg or upload in Colab.")
