# 🌐 Web Scraper & PDF Converter

Convert web pages to PDF documents! Features:
- **Single page conversion:** Convert individual URLs to PDF
- **Batch processing:** Convert multiple URLs from a list
- **Website crawling:** Follow links and convert entire sections
- **Custom styling:** Apply CSS modifications for better PDF output
- **Screenshot mode:** Capture page screenshots (saved as PNG images)
- **Content filtering:** Extract specific content before conversion

**Advanced features:** Headless browsing, JavaScript rendering, mobile view simulation!


## 🚀 How to Use

**Option 1:** Set `urls` and `mode` for your conversion
**Option 2:** Set `crawl_mode = True` (together with `start_url` and `max_pages`) for automatic link discovery

### Examples:
```python
# Single URL
urls = ["https://example.com"]
mode = "pdf"

# Multiple URLs
urls = ["https://site1.com", "https://site2.com"]
mode = "screenshot"

# Crawl a website
crawl_mode = True
start_url = "https://example.com"
max_pages = 10
```


In [None]:
# Import libraries
import os, zipfile, shutil, uuid, json, re, time
from pathlib import Path
from datetime import datetime
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup

# Detect the runtime: the `google.colab` package is only importable inside
# Colab, so a failed import means we are running somewhere else.
try:
    from google.colab import files
except ImportError:
    IS_COLAB = False
    print("🔧 Running locally")
else:
    IS_COLAB = True
    print("🔧 Running in Google Colab")

# Make sure Selenium and webdriver-manager are importable, installing them on
# demand, then import the names the rest of the notebook uses exactly once.
try:
    import selenium  # noqa: F401  (availability probe only)
    import webdriver_manager  # noqa: F401  (availability probe only)
except ImportError:
    print("📦 Installing required packages...")
    import importlib
    import subprocess
    import sys

    # Run pip through the interpreter that is executing this cell: a bare
    # "pip" found on PATH may belong to a different Python installation, in
    # which case the packages would install but still not be importable here.
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "selenium", "webdriver-manager", "beautifulsoup4", "requests",
    ])
    # Let the import system notice the freshly installed packages.
    importlib.invalidate_caches()
    _packages_status = "✅ Packages installed successfully"
else:
    _packages_status = "✅ Required packages available"

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
print(_packages_status)


In [None]:
def setup_driver(headless=True, mobile=False):
    """Create a Chrome WebDriver configured for page capture.

    Args:
        headless: When true, start Chrome with ``--headless`` (no window).
        mobile: When true, send an iPhone user agent and use a 375x667
            window; otherwise use a 1920x1080 window.

    Returns:
        The WebDriver instance, or ``None`` if Chrome could not be started
        (the error is printed rather than raised).
    """
    chrome_options = Options()

    if headless:
        chrome_options.add_argument("--headless")

    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")

    # Pass exactly one --window-size so the effective size does not depend on
    # how Chrome resolves a repeated switch.
    if mobile:
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1")
        chrome_options.add_argument("--window-size=375,667")
    else:
        chrome_options.add_argument("--window-size=1920,1080")

    # NOTE(review): these two switches relax the browser's same-origin and
    # mixed-content protections. They are kept because the original code set
    # them, but they should only be used against pages you trust.
    chrome_options.add_argument("--disable-web-security")
    chrome_options.add_argument("--allow-running-insecure-content")

    try:
        if IS_COLAB:
            # In Colab no driver path is given; Selenium locates it itself.
            driver = webdriver.Chrome(options=chrome_options)
        else:
            # Selenium 4 takes the driver executable through a Service object;
            # passing the path as the first positional argument is no longer
            # supported.
            from selenium.webdriver.chrome.service import Service

            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)

        return driver
    except Exception as e:
        print(f"❌ Error setting up WebDriver: {e}")
        return None

def clean_filename(url):
    """Derive a filesystem-safe base filename (no extension) from a URL.

    The name is ``<host>_<path>``: a leading ``www.`` is dropped from the
    host, ``/`` and ``.`` in the path become ``_``, an empty or root path
    becomes ``index``, characters that are invalid in filenames are replaced
    by ``_`` and the result is truncated to 100 characters. The query string
    and fragment are ignored, so URLs that differ only there map to the same
    name.

    Args:
        url: Absolute URL to convert.

    Returns:
        The sanitized filename as a string.
    """
    parsed = urlparse(url)

    # Strip only a leading "www." -- str.replace would also remove the
    # sequence from the middle of a host such as "awww.example.com".
    domain = parsed.netloc
    if domain.startswith('www.'):
        domain = domain[len('www.'):]

    path = parsed.path.replace('/', '_').replace('.', '_')
    if not path or path == '_':
        path = 'index'

    # Replace characters that are not allowed in filenames (e.g. the ":" of a
    # port number), then cap the length.
    filename = re.sub(r'[<>:"/\\|?*]', '_', f"{domain}_{path}")
    return filename[:100]

def get_page_info(driver, url):
    """Collect basic facts about the page currently loaded in *driver*.

    Args:
        driver: WebDriver that has already navigated to the page.
        url: Accepted for call-site symmetry but not used; the reported URL
            is read from ``driver.current_url``.

    Returns:
        A dict with ``title``, ``url``, ``dimensions`` (``width``/``height``
        taken from ``document.body`` scroll size) and ``meta`` (the
        ``description`` and ``keywords`` meta tags that could be read).
        If the title, URL or dimensions cannot be read, returns
        ``{'error': <message>}`` instead.
    """
    try:
        info = {
            'title': driver.title,
            'url': driver.current_url,
            'dimensions': {
                'width': driver.execute_script("return document.body.scrollWidth"),
                'height': driver.execute_script("return document.body.scrollHeight"),
            },
        }
    except Exception as e:
        return {'error': str(e)}

    # Meta tags are optional: a lookup failure just leaves the key out.
    # `except Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
    meta_info = {}
    for meta_name in ('description', 'keywords'):
        try:
            element = driver.find_element(By.CSS_SELECTOR, f'meta[name="{meta_name}"]')
            meta_info[meta_name] = element.get_attribute('content')
        except Exception:
            continue

    info['meta'] = meta_info
    return info

def apply_custom_css(driver, css_rules):
    """Append a ``<style>`` element with *css_rules* to the current page.

    Args:
        driver: WebDriver with the target page loaded.
        css_rules: CSS text to inject. Nothing happens when it is empty or
            ``None``.

    Returns:
        None. Injection errors are printed, not raised.
    """
    if not css_rules:
        return

    # The CSS is handed over as a script argument instead of being pasted into
    # the JavaScript source, so backticks, "${...}" or backslashes inside the
    # CSS cannot break or alter the script.
    css_script = """
    var style = document.createElement('style');
    style.textContent = arguments[0];
    document.head.appendChild(style);
    """

    try:
        driver.execute_script(css_script, css_rules)
        time.sleep(1)  # Wait for CSS to apply
    except Exception as e:
        print(f"⚠️ Error applying custom CSS: {e}")


In [None]:
def convert_to_pdf(driver, url, output_path, wait_time=3):
    """Load *url* and print it to an A4 PDF via the DevTools protocol.

    Args:
        driver: Chrome WebDriver used to load and print the page.
        url: Page to convert.
        output_path: Path of the PDF file to write.
        wait_time: Seconds to sleep after navigation before printing.

    Returns:
        ``{'success': True, 'file': output_path, 'info': <page info>}`` on
        success, otherwise ``{'success': False, 'error': <message>,
        'url': url}``.
    """
    import base64

    try:
        print(f"  🌐 Loading: {url}")
        driver.get(url)

        # Wait for page to load
        time.sleep(wait_time)

        # Wait for any dynamic content; a timeout here is not fatal, we print
        # whatever has been rendered so far.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except Exception:
            pass

        # Get page info
        page_info = get_page_info(driver, url)

        # Generate PDF
        print("  📄 Converting to PDF...")
        # Page.printToPDF has no "paperFormat" option; the paper size is given
        # as paperWidth/paperHeight in inches (A4 = 8.27 x 11.69 in). Margins
        # are in inches as well.
        pdf_options = {
            'paperWidth': 8.27,
            'paperHeight': 11.69,
            'printBackground': True,
            'marginTop': 0.5,
            'marginBottom': 0.5,
            'marginLeft': 0.5,
            'marginRight': 0.5
        }

        pdf_data = driver.execute_cdp_cmd('Page.printToPDF', pdf_options)

        # The PDF comes back base64-encoded in the 'data' field
        with open(output_path, 'wb') as f:
            f.write(base64.b64decode(pdf_data['data']))

        return {
            'success': True,
            'file': output_path,
            'info': page_info
        }

    except Exception as e:
        return {
            'success': False,
            'error': str(e),
            'url': url
        }

def convert_to_screenshot(driver, url, output_path, wait_time=3):
    """Load *url* and save a PNG screenshot of the page.

    Before capturing, the window is grown to the document's scroll height so
    the image covers the whole page rather than just the initial viewport;
    the original window size is restored afterwards because the driver is
    reused for other URLs.
    NOTE(review): enlarging the window is expected to work in headless mode;
    with a visible window the size may be capped by the screen -- confirm.

    Args:
        driver: WebDriver used to load and capture the page.
        url: Page to capture.
        output_path: Path of the PNG file to write.
        wait_time: Seconds to sleep after navigation before capturing.

    Returns:
        ``{'success': True, 'file': output_path, 'info': <page info>}`` on
        success, otherwise ``{'success': False, 'error': <message>,
        'url': url}``.
    """
    try:
        print(f"  🌐 Loading: {url}")
        driver.get(url)

        # Wait for page to load
        time.sleep(wait_time)

        # Wait for any dynamic content; a timeout here is not fatal
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except Exception:
            pass

        # Get page info
        page_info = get_page_info(driver, url)

        # Take full page screenshot
        print("  📸 Taking screenshot...")
        original_size = driver.get_window_size()
        try:
            # Best effort: if the page height cannot be measured or the window
            # cannot be resized, fall back to a viewport-sized capture.
            try:
                page_height = driver.execute_script(
                    "return Math.max(document.body.scrollHeight, "
                    "document.documentElement.scrollHeight);"
                )
                if page_height and page_height > original_size['height']:
                    driver.set_window_size(original_size['width'], page_height)
            except Exception as e:
                print(f"  ⚠️ Could not resize for full-page capture: {e}")

            # save_screenshot signals a write failure by returning False
            # instead of raising, so the result has to be checked.
            if not driver.save_screenshot(output_path):
                raise IOError(f"Could not write screenshot to {output_path}")
        finally:
            driver.set_window_size(original_size['width'], original_size['height'])

        return {
            'success': True,
            'file': output_path,
            'info': page_info
        }

    except Exception as e:
        return {
            'success': False,
            'error': str(e),
            'url': url
        }

def crawl_website(start_url, max_pages=10, same_domain_only=True):
    """Breadth-first crawl from *start_url* collecting linked URLs.

    Pages are fetched with ``requests`` and parsed with BeautifulSoup; every
    ``<a href>`` is resolved against the page URL and its fragment removed.
    The start URL itself is not part of the returned list -- only URLs found
    through links are.

    Args:
        start_url: URL where crawling begins.
        max_pages: Maximum number of URLs to return.
        same_domain_only: When true, keep only links whose host equals the
            host of *start_url*.

    Returns:
        A list of at most *max_pages* distinct URLs in discovery order, or an
        empty list if crawling failed as a whole.
    """
    from collections import deque

    try:
        print(f"🕷️ Crawling website: {start_url}")

        # `seen` holds every URL ever queued, which gives O(1) duplicate
        # checks; the deque gives O(1) pops from the front of the queue.
        seen = {start_url}
        to_visit = deque([start_url])
        discovered_urls = []

        base_domain = urlparse(start_url).netloc

        while to_visit and len(discovered_urls) < max_pages:
            current_url = to_visit.popleft()

            try:
                response = requests.get(current_url, timeout=10)
                # Do not harvest links from HTTP error pages
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find all links
                for link in soup.find_all('a', href=True):
                    try:
                        full_url = urljoin(current_url, link['href']).split('#')[0]
                        link_domain = urlparse(full_url).netloc
                    except ValueError:
                        # A malformed href must not abort the rest of the page
                        continue

                    if same_domain_only and link_domain != base_domain:
                        continue

                    # Skip non-HTTP URLs (mailto:, javascript:, ...)
                    if not full_url.startswith(('http://', 'https://')):
                        continue

                    if full_url in seen:
                        continue

                    seen.add(full_url)
                    to_visit.append(full_url)
                    discovered_urls.append(full_url)

                    if len(discovered_urls) >= max_pages:
                        break

                print(f"  ✅ Found {len(discovered_urls)} URLs so far...")

            except Exception as e:
                # One unreachable or unparsable page should not stop the crawl
                print(f"  ⚠️ Error crawling {current_url}: {e}")
                continue

        print(f"🕷️ Crawling complete. Found {len(discovered_urls)} URLs")
        return discovered_urls[:max_pages]

    except Exception as e:
        print(f"❌ Error during crawling: {e}")
        return []


In [None]:
def process_urls(urls, mode="pdf", headless=True, mobile=False, custom_css=None, wait_time=3):
    """Convert every URL in *urls* and package the results.

    Each URL is converted once into a temporary directory. A JSON summary is
    written next to the outputs. If exactly one output file was produced it
    is moved to the working directory; otherwise everything (outputs and
    summary) is zipped into ``web_conversion_<mode>.zip``. In Colab the final
    file is offered for download. The browser and the temporary directory are
    always cleaned up.

    Args:
        urls: Sequence of URLs to convert.
        mode: ``"pdf"`` or ``"screenshot"``.
        headless: Passed to ``setup_driver``.
        mobile: Passed to ``setup_driver``.
        custom_css: Optional CSS text applied to every page before it is
            converted.
        wait_time: Seconds each converter waits after navigation.

    Returns:
        The summary dict (``total_urls``, ``successful``, ``failed``,
        ``mode``, ``timestamp``, ``results``), or ``None`` when the mode is
        unknown, the browser cannot be started, or processing fails.
    """
    converters = {
        "pdf": (convert_to_pdf, ".pdf"),
        "screenshot": (convert_to_screenshot, ".png"),
    }
    # Reject a bad mode before paying for a browser start-up.
    if mode not in converters:
        print(f"❌ Unknown mode: {mode}")
        return
    convert, extension = converters[mode]

    driver = setup_driver(headless, mobile)
    if not driver:
        print("❌ Failed to setup WebDriver")
        return

    # Named before the try block so the cleanup in `finally` can always refer to it.
    out_dir = f"web_conversion_{uuid.uuid4().hex[:6]}"

    try:
        os.makedirs(out_dir, exist_ok=True)

        if custom_css:
            # The converters navigate to the URL themselves, so CSS injected
            # into an already loaded page would be lost on that navigation.
            # Registering the injection with the browser makes it run on every
            # document the converters load, and each page is converted once.
            css_injector = """
            (function () {
                var css = __CSS__;
                function inject() {
                    var style = document.createElement('style');
                    style.textContent = css;
                    (document.head || document.documentElement).appendChild(style);
                }
                if (document.readyState === 'loading') {
                    document.addEventListener('DOMContentLoaded', inject);
                } else {
                    inject();
                }
            })();
            """.replace("__CSS__", json.dumps(custom_css))
            try:
                driver.execute_cdp_cmd(
                    'Page.addScriptToEvaluateOnNewDocument',
                    {'source': css_injector},
                )
            except Exception as e:
                print(f"⚠️ Error applying custom CSS: {e}")

        results = []
        successful = 0
        failed = 0
        used_names = set()

        for i, url in enumerate(urls, 1):
            print(f"🔄 Processing {i}/{len(urls)}: {url}")

            # Clean filename; add a numeric suffix when two URLs map to the
            # same name so a later page cannot overwrite an earlier one.
            base_name = clean_filename(url)
            filename = base_name
            suffix = 2
            while filename in used_names:
                filename = f"{base_name}_{suffix}"
                suffix += 1
            used_names.add(filename)

            output_path = os.path.join(out_dir, f"{filename}{extension}")
            result = convert(driver, url, output_path, wait_time)
            results.append(result)

            if result.get('success'):
                successful += 1
                print(f"  ✅ Success: {os.path.basename(result['file'])}")
            else:
                failed += 1
                print(f"  ❌ Failed: {result.get('error', 'Unknown error')}")

        # Create summary
        summary = {
            'total_urls': len(urls),
            'successful': successful,
            'failed': failed,
            'mode': mode,
            'timestamp': datetime.now().isoformat(),
            'results': results
        }

        # Save summary
        summary_path = os.path.join(out_dir, "conversion_summary.json")
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)

        # Create final output
        files_in_dir = [f for f in os.listdir(out_dir) if not f.endswith('.json')]

        if len(files_in_dir) == 1:
            # Single file
            final_file = files_in_dir[0]
            shutil.move(os.path.join(out_dir, final_file), final_file)
            print(f"✅ Conversion complete: {final_file}")
        else:
            # Multiple files - create ZIP
            final_file = f"web_conversion_{mode}.zip"
            with zipfile.ZipFile(final_file, 'w', zipfile.ZIP_DEFLATED) as z:
                # The loop variables must not be called `files`: that would
                # make `files` local to this function and hide the Colab
                # `files` module used for the download below.
                for root, _dirs, names in os.walk(out_dir):
                    for name in names:
                        z.write(os.path.join(root, name), name)
            print(f"✅ Conversion complete: {final_file} ({len(files_in_dir)} files)")

        print(f"📊 Summary: {successful} successful, {failed} failed")

        if IS_COLAB:
            files.download(final_file)
            print("📥 Download started!")
        else:
            print(f"📁 Output: {os.path.abspath(final_file)}")

        return summary

    except Exception as e:
        print(f"❌ Error during processing: {e}")
    finally:
        # Remove the temporary directory even if closing the browser fails.
        try:
            driver.quit()
        finally:
            shutil.rmtree(out_dir, ignore_errors=True)


## ⚙️ Configuration

Set your URLs and conversion options here:


In [None]:
# ---- What to convert ------------------------------------------------------
urls = ["https://example.com"]  # pages to convert when crawl mode is off
mode = "pdf"  # output type: "pdf" or "screenshot"

# ---- Browser behaviour ----------------------------------------------------
headless = True  # False shows the browser window while it works
mobile = False  # True simulates a mobile device
wait_time = 3  # seconds to pause after each page load

# ---- Crawling (used instead of `urls` when crawl_mode is True) --------------
crawl_mode = False  # True discovers pages by following links
start_url = "https://example.com"  # where the crawl begins
max_pages = 10  # upper bound on discovered pages
same_domain_only = True  # ignore links that leave the start domain

# ---- Optional stylesheet applied for cleaner output -------------------------
custom_css = """
/* Hide navigation and ads for cleaner PDF */
nav, .navigation, .navbar, .menu { display: none !important; }
.ad, .advertisement, .ads { display: none !important; }
.sidebar, .widget { display: none !important; }

/* Improve text readability */
body { font-size: 12pt !important; line-height: 1.4 !important; }
h1, h2, h3 { page-break-after: avoid !important; }
"""


## 🎯 Run Web Scraping & Conversion

Execute the web scraping and conversion process:


In [None]:
# Accept a single URL given as a plain string as well as a list of URLs;
# iterating a bare string would otherwise yield one "URL" per character.
url_list = [urls] if isinstance(urls, str) else list(urls or [])

if crawl_mode:
    print(f"🕷️ Crawl mode enabled")
    print(f"🚀 Starting URL: {start_url}")
    print(f"📊 Max pages: {max_pages}")

    # Discover URLs by crawling
    discovered_urls = crawl_website(start_url, max_pages, same_domain_only)

    if discovered_urls:
        print(f"✅ Found {len(discovered_urls)} URLs to convert")
        process_urls(discovered_urls, mode, headless, mobile, custom_css, wait_time)
    else:
        print("❌ No URLs discovered during crawling")

# Any non-empty URL list is processed. The documented example value
# ["https://example.com"] used to be rejected here as "not configured", which
# contradicted both the usage notes and the help text printed below.
elif url_list:
    print(f"🚀 Processing {len(url_list)} URL(s)")
    print(f"🔧 Mode: {mode}")
    print(f"📱 Mobile view: {mobile}")
    print(f"👻 Headless: {headless}")

    process_urls(url_list, mode, headless, mobile, custom_css, wait_time)

else:
    print("❗ Please configure URLs or enable crawl mode.")
    print("💡 Examples:")
    print("   urls = ['https://example.com']")
    print("   crawl_mode = True")
