# 🌐 Website Crawler & PDF Generator

This notebook crawls a website, analyzes content using AI, and generates professional PDFs with summaries and project ideas for each module.

## Features
- 🕷️ Crawls websites and extracts content from all pages
- 🤖 Uses Claude AI to generate intelligent summaries
- 📋 Identifies buildable projects from documentation
- 📄 Creates professional PDF reports for each module
- 🎨 Beautiful formatting with tables and sections

## How to Use
1. Run the setup cell to install dependencies
2. Configure your settings (URL, API key, max pages)
3. Run the crawler and generator
4. Download your PDFs from the 'pdfs' folder

## 📦 Step 1: Install Dependencies

In [None]:
%%capture
!pip install beautifulsoup4 requests reportlab

## ⚙️ Step 2: Import Libraries and Define Classes

In [None]:
import os
import re
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from collections import defaultdict
import json
from datetime import datetime
import time

# PDF generation
from reportlab.lib.pagesizes import letter, A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib import colors
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY

print("‚úÖ Libraries imported successfully!")

In [None]:
class WebsiteCrawler:
    """Breadth-first crawler that collects text content from a single site.

    Starting at ``base_url``, the crawler follows ``<a href>`` links whose
    host matches the start URL's host and fetches at most ``max_pages`` URLs.

    Attributes:
        base_url: URL the crawl starts from.
        max_pages: Upper bound on attempted URLs (failed fetches count too).
        visited_urls: Set of URLs that have been attempted so far.
        pages_content: List with one dict per successfully extracted page.
        domain: Network location (``host[:port]``) of ``base_url``.
    """

    def __init__(self, base_url, max_pages=50):
        """Remember the start URL and page budget; no network access here."""
        self.base_url = base_url
        self.max_pages = max_pages
        self.visited_urls = set()
        self.pages_content = []
        self.domain = urlparse(base_url).netloc

    def is_valid_url(self, url):
        """Return True if ``url`` is an http(s) URL on the crawled host.

        Other schemes (``ftp:``, ``mailto:``, ``javascript:`` ...) are
        rejected because the crawler can only fetch over HTTP. Host names are
        compared case-insensitively.
        """
        parsed = urlparse(url)
        return (
            parsed.scheme in ('http', 'https')
            and bool(parsed.netloc)
            and parsed.netloc.lower() == self.domain.lower()
        )

    def get_all_links(self, url, soup):
        """Collect not-yet-visited internal links found in ``soup``.

        Args:
            url: URL of the page, used as the base for relative links.
            soup: Parsed page; every ``<a>`` carrying an ``href`` is examined.

        Returns:
            A set of absolute URLs with fragment and query string removed.
        """
        links = set()
        for anchor in soup.find_all('a', href=True):
            absolute = urljoin(url, anchor['href'])
            # Drop '#fragment' and '?query' so variants of one page dedupe.
            clean_url = absolute.split('#')[0].split('?')[0]

            if self.is_valid_url(clean_url) and clean_url not in self.visited_urls:
                links.add(clean_url)
        return links

    def extract_content(self, soup, url):
        """Extract title, text and headings from a parsed page.

        NOTE: this mutates ``soup``: script, style, nav, footer and header
        elements are removed from the tree. Collect links before calling it.

        Args:
            soup: Parsed page.
            url: URL stored in the returned record.

        Returns:
            A dict with keys ``url``, ``title``, ``content`` (first 5000
            characters of text), ``headings`` (h1-h3 as ``level``/``text``
            dicts) and ``word_count`` (words in the untruncated text), or
            ``None`` when the page has no main, article or body element.
        """
        for unwanted in soup(["script", "style", "nav", "footer", "header"]):
            unwanted.decompose()

        title_tag = soup.find('title')
        title = title_tag.get_text().strip() if title_tag else "Untitled"

        main_content = soup.find('main') or soup.find('article') or soup.find('body')
        if not main_content:
            return None

        text = main_content.get_text(separator='\n', strip=True)
        text = re.sub(r'\n\s*\n', '\n\n', text)

        headings = [
            {'level': heading.name, 'text': heading.get_text().strip()}
            for heading in main_content.find_all(['h1', 'h2', 'h3'])
        ]

        return {
            'url': url,
            'title': title,
            'content': text[:5000],
            'headings': headings,
            'word_count': len(text.split())
        }

    def crawl(self):
        """Crawl the site breadth-first starting from ``base_url``.

        Pages are fetched in FIFO order and newly found links are queued in
        sorted order, so the set of pages that fits into ``max_pages`` is
        reproducible between runs. A failure on one page is reported and the
        crawl continues with the next URL.

        Returns:
            ``self.pages_content``, the list of extracted page records.
        """
        # Local import: the file-level imports only bring in defaultdict.
        from collections import deque

        print(f"üï∑Ô∏è  Starting crawl of {self.base_url}")
        print(f"   Maximum pages: {self.max_pages}\n")

        queue = deque([self.base_url])
        queued = {self.base_url}

        while queue and len(self.visited_urls) < self.max_pages:
            url = queue.popleft()

            if url in self.visited_urls:
                continue

            try:
                print(f"üìÑ Crawling ({len(self.visited_urls) + 1}/{self.max_pages}): {url[:80]}...")

                response = requests.get(url, timeout=10, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')
                self.visited_urls.add(url)

                # Links must be gathered BEFORE extract_content(): it strips
                # nav/header/footer from the soup, which is where most site
                # navigation links live. Relative links are resolved against
                # the final URL so they stay correct after a redirect.
                new_links = self.get_all_links(response.url or url, soup)

                content = self.extract_content(soup, url)
                if content:
                    self.pages_content.append(content)

                for link in sorted(new_links):
                    if link not in queued:
                        queued.add(link)
                        queue.append(link)

                time.sleep(0.5)  # be polite to the server between requests

            except Exception as e:
                # Best-effort crawl: report the problem and move on.
                print(f"   ‚ö†Ô∏è  Error: {str(e)}")
                self.visited_urls.add(url)

        print(f"\n‚úÖ Crawl complete! Visited {len(self.visited_urls)} pages")
        return self.pages_content


class ContentAnalyzer:
    """Group crawled pages into modules and summarise each module.

    With an API key, summaries are requested from the Anthropic Messages API.
    Without one, or whenever the request or its parsing fails, a simple
    word-frequency analysis is used instead. Both paths return a dict with
    the keys ``summary`` (str), ``buildable_projects`` (list of str) and
    ``key_concepts`` (list of str).
    """

    def __init__(self, api_key=None):
        """Store the API key and decide whether AI analysis is enabled.

        Args:
            api_key: Anthropic API key; ``None`` or an empty string disables
                AI analysis.
        """
        self.api_key = api_key
        self.use_ai = bool(api_key)
        if not self.use_ai:
            print("‚ö†Ô∏è  No API key provided. Using basic analysis.")

    def organize_by_modules(self, pages_content):
        """Group pages by the first segment of their URL path.

        The segment has ``-`` and ``_`` replaced by spaces and is title-cased;
        pages at the site root go into the ``"Home"`` module.

        Args:
            pages_content: Iterable of page dicts that contain a ``url`` key.

        Returns:
            A plain dict mapping module name to the list of its pages.
        """
        modules = defaultdict(list)

        for page in pages_content:
            path_parts = [part for part in urlparse(page['url']).path.split('/') if part]

            if path_parts:
                module_name = path_parts[0].replace('-', ' ').replace('_', ' ').title()
            else:
                module_name = "Home"

            modules[module_name].append(page)

        return dict(modules)

    def analyze_with_ai(self, module_name, pages):
        """Analyse one module, using the AI when enabled.

        At most the first 10 pages (500 characters each) are sent to the API.
        Any API error, non-200 status or unparsable reply falls back to
        ``_basic_analysis``.

        Args:
            module_name: Display name of the module.
            pages: Page dicts with ``title``, ``content`` and ``word_count``.

        Returns:
            Analysis dict with ``summary``, ``buildable_projects`` and
            ``key_concepts``.
        """
        if not self.use_ai:
            return self._basic_analysis(module_name, pages)

        try:
            page_blurbs = [
                f"Page {i}: {page['title']}\nContent: {page['content'][:500]}...\n\n"
                for i, page in enumerate(pages[:10], 1)
            ]
            content_summary = f"Module: {module_name}\n\n" + "".join(page_blurbs)

            response = requests.post(
                'https://api.anthropic.com/v1/messages',
                headers={
                    'Content-Type': 'application/json',
                    'x-api-key': self.api_key,
                    'anthropic-version': '2023-06-01'
                },
                json={
                    'model': 'claude-sonnet-4-20250514',
                    'max_tokens': 2000,
                    'messages': [{
                        'role': 'user',
                        'content': f"""Analyze this website module and provide:

1. A concise summary (2-3 paragraphs)
2. 3-5 specific projects that could be built
3. Key technologies/concepts

{content_summary}

Format as JSON with keys: summary, buildable_projects (array), key_concepts (array)"""
                    }]
                },
                timeout=30
            )

            if response.status_code != 200:
                print(f"   ‚ö†Ô∏è  API Error: {response.status_code}")
                return self._basic_analysis(module_name, pages)

            result = response.json()
            return self._parse_ai_response(result['content'][0]['text'])

        except Exception as e:
            # Network problems, unexpected payloads and invalid JSON all end
            # up here; the notebook should still produce a report.
            print(f"   ‚ö†Ô∏è  AI Error: {str(e)}")
            return self._basic_analysis(module_name, pages)

    @classmethod
    def _parse_ai_response(cls, text):
        """Convert the model's reply text into a normalised analysis dict.

        Code fences and any prose around the JSON object are ignored. The
        three expected keys are always present in the result and always hold
        a str / list of str / list of str, so consumers can index and join
        them safely. Extra keys returned by the model are kept.

        Raises:
            ValueError: if the reply contains no parsable JSON object
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        cleaned = re.sub(r'```json\s*|\s*```', '', text).strip()
        start, end = cleaned.find('{'), cleaned.rfind('}')
        if start == -1 or end < start:
            raise ValueError("AI response did not contain a JSON object")

        data = json.loads(cleaned[start:end + 1])

        analysis = dict(data)
        analysis['summary'] = cls._as_text(data.get('summary')) or 'No summary available.'
        analysis['buildable_projects'] = cls._as_text_list(data.get('buildable_projects'))
        analysis['key_concepts'] = cls._as_text_list(data.get('key_concepts'))
        return analysis

    @classmethod
    def _as_text(cls, item):
        """Render a JSON value (str, object, array, scalar) as display text."""
        if item is None:
            return ''
        if isinstance(item, str):
            return item.strip()
        if isinstance(item, dict):
            # e.g. {"name": ..., "description": ...} -> "name - description"
            return " - ".join(str(value).strip() for value in item.values())
        if isinstance(item, (list, tuple)):
            return "\n\n".join(cls._as_text(part) for part in item)
        return str(item)

    @classmethod
    def _as_text_list(cls, value):
        """Coerce a JSON value into a list of non-empty display strings."""
        if value is None:
            return []
        if not isinstance(value, (list, tuple)):
            value = [value]
        texts = (cls._as_text(item) for item in value)
        return [text for text in texts if text]

    def _basic_analysis(self, module_name, pages):
        """Fallback analysis without AI, based on word frequencies.

        The five most frequent lowercase words of four or more letters
        become the key topics/concepts; the project list is generic.
        """
        total_words = sum(page['word_count'] for page in pages)
        all_text = ' '.join(page['content'] for page in pages)

        word_freq = defaultdict(int)
        for word in re.findall(r'\b[a-z]{4,}\b', all_text.lower()):
            word_freq[word] += 1

        # Stable sort: words with equal counts keep first-seen order.
        common_words = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:10]
        top_words = [word for word, _count in common_words[:5]]

        return {
            'summary': f"This '{module_name}' module contains {len(pages)} pages with ~{total_words} words. "
                      f"Key topics: {', '.join(top_words)}.",
            'buildable_projects': [
                f"Documentation site for {module_name}",
                f"Tutorial application based on {module_name}",
                "Reference implementation tool"
            ],
            'key_concepts': top_words
        }


class PDFGenerator:
    """Render one analysis report per module as a PDF using ReportLab.

    Attributes:
        output_dir: Directory the PDFs are written to (created on init).
        styles: ReportLab sample stylesheet extended with the custom
            ``CustomTitle`` and ``SectionHeading`` paragraph styles.
    """

    def __init__(self, output_dir='./pdfs'):
        """Create ``output_dir`` if needed and prepare the stylesheet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.styles = getSampleStyleSheet()
        self._setup_custom_styles()

    def _setup_custom_styles(self):
        """Register the title and section-heading styles used by reports."""
        self.styles.add(ParagraphStyle(
            name='CustomTitle',
            parent=self.styles['Title'],
            fontSize=24,
            textColor=colors.HexColor('#1e40af'),
            spaceAfter=30,
            alignment=TA_CENTER
        ))

        self.styles.add(ParagraphStyle(
            name='SectionHeading',
            parent=self.styles['Heading2'],
            fontSize=14,
            textColor=colors.HexColor('#dc2626'),
            spaceBefore=12,
            spaceAfter=6
        ))

    @staticmethod
    def _safe_filename(module_name):
        """Build the report filename for a module.

        The name is lower-cased and every run of characters other than word
        characters, ``.`` and ``-`` becomes a single ``_`` so that path
        separators or characters illegal on some filesystems cannot end up
        in the filename. A name with nothing usable left becomes ``module``.
        """
        slug = re.sub(r'[^\w.-]+', '_', module_name.lower())
        if not slug.strip('._'):
            slug = 'module'
        return f"{slug}_analysis.pdf"

    @staticmethod
    def _to_markup(text):
        """Make arbitrary text safe for a ReportLab ``Paragraph``.

        ``Paragraph`` interprets its text as XML-like markup, so ``&``, ``<``
        and ``>`` coming from page titles, URLs or AI output must be escaped.
        Newlines are turned into ``<br/>`` so multi-paragraph text keeps its
        line breaks.
        """
        # Local import: not part of the file-level imports.
        from xml.sax.saxutils import escape

        return escape(str(text)).replace('\n', '<br/>')

    def generate_pdf(self, module_name, analysis, pages, base_url):
        """Write the report for one module and return its file path.

        Args:
            module_name: Display name of the module; also used (sanitised)
                for the filename.
            analysis: Dict with optional keys ``summary``,
                ``buildable_projects`` and ``key_concepts``.
            pages: Page dicts with ``title`` and ``word_count``; only the
                first 20 are listed in the table.
            base_url: Source site shown in the report header.

        Returns:
            Path of the generated PDF inside ``output_dir``.
        """
        filename = self._safe_filename(module_name)
        filepath = os.path.join(self.output_dir, filename)

        doc = SimpleDocTemplate(filepath, pagesize=letter,
                                topMargin=0.75*inch, bottomMargin=0.75*inch)
        normal = self.styles['Normal']
        section = self.styles['SectionHeading']
        story = []

        # Title
        story.append(Paragraph(f"Module: {self._to_markup(module_name)}", self.styles['CustomTitle']))
        story.append(Spacer(1, 0.2*inch))
        story.append(Paragraph("Analysis Report", self.styles['Heading2']))
        story.append(Spacer(1, 0.1*inch))
        story.append(Paragraph(f"Generated: {datetime.now().strftime('%B %d, %Y')}", normal))
        story.append(Paragraph(f"Source: {self._to_markup(base_url)}", normal))
        story.append(Spacer(1, 0.3*inch))

        # Summary
        story.append(Paragraph("üìã Summary", section))
        story.append(Spacer(1, 0.1*inch))
        summary = analysis.get('summary') or 'No summary available.'
        story.append(Paragraph(self._to_markup(summary), normal))
        story.append(Spacer(1, 0.3*inch))

        # Projects
        story.append(Paragraph("üöÄ Things You Can Build", section))
        story.append(Spacer(1, 0.1*inch))

        projects = analysis.get('buildable_projects') or []
        for i, project in enumerate(projects, 1):
            story.append(Paragraph(f"{i}. {self._to_markup(project)}", normal))
            story.append(Spacer(1, 0.05*inch))

        story.append(Spacer(1, 0.3*inch))

        # Concepts
        story.append(Paragraph("üí° Key Concepts", section))
        story.append(Spacer(1, 0.1*inch))
        # str() guards against non-string items in AI-produced lists.
        concepts = ", ".join(str(concept) for concept in analysis.get('key_concepts') or [])
        story.append(Paragraph(self._to_markup(concepts) if concepts else "None identified", normal))
        story.append(Spacer(1, 0.3*inch))

        # Pages table
        story.append(PageBreak())
        story.append(Paragraph("üìö Pages in Module", section))
        story.append(Spacer(1, 0.2*inch))

        table_data = [['#', 'Title', 'Words']]
        for i, page in enumerate(pages[:20], 1):
            table_data.append([
                str(i),
                # Truncate first, then escape, so an entity is never cut in half.
                Paragraph(self._to_markup(page['title'][:60]), normal),
                str(page['word_count'])
            ])

        t = Table(table_data, colWidths=[0.5*inch, 5*inch, 1*inch])
        t.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        story.append(t)

        doc.build(story)
        print(f"   ‚úÖ Generated: {filename}")
        return filepath

print("‚úÖ All classes defined successfully!")

## 🎯 Step 3: Configure Your Settings

In [None]:
# Configuration
WEBSITE_URL = "https://docs.python.org"  # Change this to your target website
MAX_PAGES = 30  # Maximum number of pages to crawl
# Optional Anthropic API key for AI-powered analysis. It is read from the
# ANTHROPIC_API_KEY environment variable so the secret does not have to be
# stored in (and shared with) the notebook; when unset it stays empty and the
# basic analysis is used.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "").strip()

print("‚öôÔ∏è  Configuration:")
print(f"   Website: {WEBSITE_URL}")
print(f"   Max Pages: {MAX_PAGES}")
print(f"   AI Analysis: {'Enabled' if ANTHROPIC_API_KEY else 'Disabled (using basic analysis)'}")

## 🚀 Step 4: Run the Crawler and Generate PDFs

In [None]:
# Step 1: Crawl the website
# Print a banner, crawl WEBSITE_URL (at most MAX_PAGES pages) and report how
# many pages yielded content. `pages_content` is used by the following cells.
print("=" * 70, "Starting Website Analysis...", "=" * 70, "", sep="\n")

crawler = WebsiteCrawler(WEBSITE_URL, max_pages=MAX_PAGES)
pages_content = crawler.crawl()

if pages_content:
    print(f"\n‚úÖ Extracted content from {len(pages_content)} pages")
else:
    print("‚ùå No content extracted. Please check the URL.")

In [None]:
# Step 2: Organize into modules
# Runs only when the crawl produced content. Defines `analyzer` and `modules`
# for the cells below.
if pages_content:
    # An empty key string is passed on as None (no API key).
    analyzer = ContentAnalyzer(api_key=ANTHROPIC_API_KEY or None)
    modules = analyzer.organize_by_modules(pages_content)

    print(f"\nüìä Organized into {len(modules)} modules:")
    for module_name, pages in modules.items():
        print(f"   ‚Ä¢ {module_name}: {len(pages)} pages")

In [None]:
# Step 3: Analyze and generate PDFs
# For every module: run the analysis, render its PDF and remember the path.
# Finishes with a summary listing the generated file names.
if pages_content:
    pdf_generator = PDFGenerator()
    generated_pdfs = []

    print("\nüî¨ Analyzing modules and generating PDFs...\n")

    for module_name, pages in modules.items():
        print(f"üìù Processing: {module_name}")
        analysis = analyzer.analyze_with_ai(module_name, pages)
        pdf_path = pdf_generator.generate_pdf(module_name, analysis, pages, WEBSITE_URL)
        generated_pdfs.append(pdf_path)
        print()

    print(
        "=" * 70,
        "‚ú® All done!",
        f"Generated {len(generated_pdfs)} PDF reports",
        "\nPDF files:",
        sep="\n",
    )
    for pdf in generated_pdfs:
        print(f"   üìÑ {os.path.basename(pdf)}")
    print("=" * 70)

## 📥 Step 5: Download PDFs

Your PDFs are saved in the `pdfs` folder. In Google Colab:
1. Click the folder icon on the left sidebar
2. Navigate to the `pdfs` folder
3. Right-click any PDF and select "Download"

Or run the cell below to zip all PDFs for easy download:

In [None]:
# Zip all PDFs for easy download
import shutil

try:
    # google.colab is only importable inside a Colab runtime; elsewhere
    # (local Jupyter, plain Python) the archive is still created and its
    # location is printed instead of triggering a browser download.
    from google.colab import files
except ImportError:
    files = None

# isdir (not exists) so that a stray file named 'pdfs' does not crash listdir.
if os.path.isdir('pdfs') and os.listdir('pdfs'):
    shutil.make_archive('website_analysis_pdfs', 'zip', 'pdfs')
    print("üì¶ PDFs zipped successfully!")
    if files is not None:
        print("‚¨áÔ∏è  Downloading...")
        files.download('website_analysis_pdfs.zip')
    else:
        print(f"Archive saved to: {os.path.abspath('website_analysis_pdfs.zip')}")
else:
    print("‚ùå No PDFs found. Please run the analysis first.")

## 🎨 Optional: View Sample Analysis

Run this cell to see a preview of one module's analysis:

In [None]:
# Display a sample analysis
# Previews the analysis of the first module. Note that this calls
# analyze_with_ai() again, so with an API key it issues another API request.
if 'modules' in locals() and modules:
    sample_module = next(iter(modules))
    sample_pages = modules[sample_module]

    print(f"üìñ Sample Analysis for: {sample_module}")
    print("=" * 70)

    sample_analysis = analyzer.analyze_with_ai(sample_module, sample_pages)

    # Read keys with the same defaults the PDF report uses, so an analysis
    # that lacks a key does not abort the preview with a KeyError.
    print("\nüìã Summary:")
    print(sample_analysis.get('summary') or 'No summary available.')

    print("\nüöÄ Buildable Projects:")
    for i, project in enumerate(sample_analysis.get('buildable_projects') or [], 1):
        print(f"   {i}. {project}")

    print("\nüí° Key Concepts:")
    # str() keeps the join from failing on non-string items.
    sample_concepts = ', '.join(str(concept) for concept in sample_analysis.get('key_concepts') or [])
    print(f"   {sample_concepts or 'None identified'}")
    print("\n" + "=" * 70)