In [None]:
import sys
!{sys.executable} -m pip install aiohttp aiosqlite beautifulsoup4 pandas ipython


Collecting aiohttp
  Downloading aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting aiosqlite
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.14.2-py3-none-any.whl.metadata (3.8 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp)
  Downloading aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting attrs>=17.3.0 (from aiohttp)
  Using cached attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting frozenlist>=1.1.1 (from aiohttp)
  Downloading frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (20 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp)

In [None]:
import asyncio
import aiohttp
import aiosqlite
import logging
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse, urljoin
import time
import csv
import sys
from datetime import datetime, timezone
import re
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display, clear_output
import os
# Pull crawl settings from the sibling config.py; report a readable hint instead
# of a traceback when the module (or one of its names) is missing.
try:
    from config import (
        SEEDS,
        MAX_URLS,
        MAX_CONCURRENT_TASKS,
        PER_DOMAIN_DELAY,
        REQUEST_TIMEOUT,
        KEYWORD_FILTER,
        ALLOWED_DOMAINS,
    )
except ImportError as exc:
    print(f"‚ùå Error loading config: {exc}")
    print("üìù Pastikan file config.py ada di directory yang sama")
else:
    print("‚úÖ Config loaded successfully!")
    print(f"üéØ Target: {MAX_URLS:,} URLs")
    print(f"üåê Seeds: {len(SEEDS)} domains")
    print(f"‚ö° Concurrency: {MAX_CONCURRENT_TASKS} tasks")

ModuleNotFoundError: No module named 'aiohttp'

In [None]:
# Cell 2: Optimized crawler class definition
class TechScopeCrawler:
    """Asynchronous crawler that walks allowed domains and records product-like URLs.

    The crawl frontier is kept in a SQLite database (``crawler_queue`` table) and
    URLs judged relevant are appended to a CSV file. Settings (``SEEDS``,
    ``MAX_URLS``, ``MAX_CONCURRENT_TASKS``, ``PER_DOMAIN_DELAY``,
    ``REQUEST_TIMEOUT``, ``KEYWORD_FILTER``, ``ALLOWED_DOMAINS``) are read from
    module-level names imported from ``config``.

    Note: the CSV file is recreated at the start of every ``run()`` while the
    SQLite queue persists between runs.
    """

    DB_PATH = 'techscope_crawler.db'
    CSV_PATH = 'collected_urls.csv'
    # Links found more than this many hops away from a seed are not queued.
    MAX_DEPTH = 5
    # Seconds allowed for a robots.txt download.
    ROBOTS_TIMEOUT = 10
    # The dashboard is redrawn each time this many more URLs have been processed.
    PROGRESS_EVERY = 50
    # A one-line progress message is printed each time this many URLs were saved.
    SAVE_REPORT_EVERY = 100
    PRODUCT_URL_PATTERNS = ('/product/', '/p/', '/item/', '/laptop/', '/smartphone/', '/tablet/')

    def __init__(self):
        self.setup_logging()
        self.stats = {
            'queued': 0, 'processed': 0, 'saved': 0,
            'failed': 0, 'robots_denied': 0, 'duplicates': 0,
            'start_time': time.time()
        }
        self.url_cache = set()
        self.session = None
        self.db = None
        self.is_running = True
        self.processed_urls = set()
        # "<scheme>://<domain>" -> RobotFileParser, so robots.txt is fetched once per site.
        self.robots_cache = {}
        # domain -> earliest wall-clock time at which the next request may start.
        self._next_request_at = {}

    def setup_logging(self):
        """Configure root logging to stdout and create ``self.logger``."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[logging.StreamHandler(sys.stdout)]
        )
        self.logger = logging.getLogger(__name__)

    def stop(self):
        """Ask ``run()`` to stop scheduling new work after the current pass."""
        self.is_running = False

    @staticmethod
    def _strip_www(host):
        """Lower-case ``host`` and drop a single leading ``www.`` label only."""
        host = host.lower()
        return host[4:] if host.startswith('www.') else host

    def normalize_url(self, url, base_url=None):
        """Return a canonical form of ``url`` or ``None`` if it cannot be parsed.

        Resolves ``url`` against ``base_url`` when given, drops the fragment,
        lower-cases scheme and host, removes a leading ``www.`` and any trailing
        slash from the path, and keeps the query string.
        """
        try:
            if base_url:
                url = urljoin(base_url, url)
            url = url.split('#')[0]
            parsed = urlparse(url)
            scheme = parsed.scheme.lower() or 'https'
            # Only a leading "www." is removed; str.replace would also mangle
            # hosts such as "awww.example.com".
            netloc = self._strip_www(parsed.netloc)
            path = parsed.path.rstrip('/') or '/'
            normalized = f"{scheme}://{netloc}{path}"
            if parsed.query:
                normalized += f"?{parsed.query}"
            return normalized
        except Exception:
            return None

    def get_domain(self, url):
        """Return the lower-cased network location of ``url`` without a leading ``www.``.

        Returns an empty string when the URL cannot be parsed.
        """
        try:
            return self._strip_www(urlparse(url).netloc)
        except Exception:
            return ""

    def _is_allowed_domain(self, domain):
        """Tell whether ``domain`` is one of ``ALLOWED_DOMAINS`` or a subdomain of one.

        Matching respects label boundaries so that ``evil-example.com`` or
        ``example.com.evil.net`` do not pass for ``example.com``. An entry
        without a dot (e.g. ``"example"``) matches any host containing it as a
        complete label.
        """
        # get_domain() returns the netloc, which may carry credentials or a port.
        host = domain.rsplit('@', 1)[-1].split(':', 1)[0]
        labels = host.split('.')
        for entry in ALLOWED_DOMAINS:
            allowed = self._strip_www(str(entry).strip()).strip('.')
            if not allowed:
                continue
            if '.' not in allowed:
                if allowed in labels:
                    return True
            elif host == allowed or host.endswith('.' + allowed):
                return True
        return False

    def should_crawl_url(self, url):
        """Return True when ``url`` is new, http(s), on an allowed domain and matches a keyword."""
        if not url or url in self.processed_urls:
            return False

        try:
            scheme = urlparse(url).scheme.lower()
        except Exception:
            return False
        if scheme not in ('http', 'https'):
            return False

        domain = self.get_domain(url)
        if not domain or not self._is_allowed_domain(domain):
            return False

        url_lower = url.lower()
        return any(str(keyword).lower() in url_lower for keyword in KEYWORD_FILTER)

    async def init_database(self):
        """Open the SQLite database, create the schema if needed and enqueue the seeds.

        Rows left in the ``processing`` state by an interrupted run are put back
        into the queue. Re-raises any exception after printing it.
        """
        try:
            self.db = await aiosqlite.connect(self.DB_PATH, timeout=30)

            await self.db.executescript("""
                PRAGMA journal_mode=WAL;
                PRAGMA synchronous=NORMAL;
                PRAGMA cache_size=10000;
                
                CREATE TABLE IF NOT EXISTS crawler_queue (
                    url TEXT PRIMARY KEY,
                    status TEXT DEFAULT 'queued',
                    depth INTEGER DEFAULT 0,
                    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
                
                CREATE TABLE IF NOT EXISTS domain_delays (
                    domain TEXT PRIMARY KEY,
                    last_request TIMESTAMP DEFAULT 0,
                    delay_seconds REAL DEFAULT 1.5
                );
                
                CREATE INDEX IF NOT EXISTS idx_queue_status ON crawler_queue(status);
                CREATE INDEX IF NOT EXISTS idx_queue_depth ON crawler_queue(depth);
            """)
            # No task from a previous run is still alive, so anything marked
            # 'processing' was abandoned and must be retried.
            await self.db.execute(
                "UPDATE crawler_queue SET status = 'queued' WHERE status = 'processing'"
            )
            await self.db.commit()

            for seed in SEEDS:
                normalized = self.normalize_url(seed)
                if normalized and self.should_crawl_url(normalized):
                    await self.add_url_to_queue(normalized, depth=0)
            await self.db.commit()

            print(f"üå± Loaded {len(SEEDS)} seeds, {self.stats['queued']} URLs queued")

        except Exception as e:
            print(f"‚ùå Database initialization failed: {e}")
            raise

    async def add_url_to_queue(self, url, depth=0):
        """Insert ``url`` into the queue table.

        Returns True only when a new row was actually inserted. URLs beyond
        ``MAX_DEPTH`` are skipped; URLs already seen in this run or already
        present in the table are counted as duplicates.
        """
        if depth > self.MAX_DEPTH:
            return False
        if url in self.url_cache:
            self.stats['duplicates'] += 1
            return False

        try:
            cursor = await self.db.execute(
                "INSERT OR IGNORE INTO crawler_queue (url, status, depth) VALUES (?, 'queued', ?)",
                (url, depth)
            )
            self.url_cache.add(url)
            # rowcount is 0 when the row already existed and the insert was ignored.
            if cursor.rowcount > 0:
                self.stats['queued'] += 1
                return True
            self.stats['duplicates'] += 1
            return False
        except Exception as exc:
            self.logger.warning("Could not queue %s: %s", url, exc)
            return False

    @staticmethod
    def _build_robots_parser(rules_text):
        """Build a ``RobotFileParser`` from robots.txt text; empty text allows everything."""
        parser = RobotFileParser()
        # parse() must always be called: a parser that has never been fed any
        # rules answers False to every can_fetch() call.
        parser.parse(rules_text.splitlines() if rules_text else [])
        return parser

    async def get_robots_parser(self, domain, scheme='https'):
        """Return the robots.txt parser for ``domain``, downloading it at most once.

        A missing robots.txt (non-200 status) or a failed download yields a
        parser that allows every URL.
        """
        cache_key = f"{scheme}://{domain}"
        cached = self.robots_cache.get(cache_key)
        if cached is not None:
            return cached

        rules_text = ""
        try:
            async with self.session.get(
                f"{cache_key}/robots.txt",
                timeout=aiohttp.ClientTimeout(total=self.ROBOTS_TIMEOUT),
                headers={'User-Agent': 'TechScopeBot/1.0'}
            ) as response:
                if response.status == 200:
                    rules_text = await response.text(errors='replace')
        except Exception as exc:
            self.logger.debug("robots.txt fetch failed for %s: %s", cache_key, exc)

        parser = self._build_robots_parser(rules_text)
        self.robots_cache[cache_key] = parser
        return parser

    async def can_fetch(self, url):
        """Return whether robots.txt permits ``TechScopeBot`` to fetch ``url`` (True on error)."""
        try:
            scheme = urlparse(url).scheme.lower() or 'https'
            parser = await self.get_robots_parser(self.get_domain(url), scheme)
            return parser.can_fetch("TechScopeBot", url)
        except Exception:
            return True

    async def apply_domain_delay(self, domain):
        """Wait until ``domain`` may be contacted again, spacing requests by ``PER_DOMAIN_DELAY``.

        Each caller reserves its own time slot before sleeping, so tasks that
        arrive together are spread out instead of all waking at the same moment.
        """
        now = time.time()
        slot = max(now, self._next_request_at.get(domain, 0.0))
        # No await happens between reading and writing the slot, so concurrent
        # tasks on the event loop cannot reserve the same one.
        self._next_request_at[domain] = slot + PER_DOMAIN_DELAY
        if slot > now:
            await asyncio.sleep(slot - now)

        try:
            await self.db.execute(
                "INSERT OR REPLACE INTO domain_delays (domain, last_request) VALUES (?, ?)",
                (domain, time.time())
            )
        except Exception as exc:
            # Recording the timestamp is best effort; the in-memory schedule
            # above is what enforces the delay.
            self.logger.debug("Could not record request time for %s: %s", domain, exc)

    async def _fetch_html(self, url):
        """Download ``url`` and return its text, or ``None`` on a non-200 status or any error."""
        try:
            async with self.session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT),
                headers={
                    'User-Agent': 'Mozilla/5.0 (compatible; TechScopeBot/1.0)',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.5',
                }
            ) as response:
                if response.status != 200:
                    return None
                # Undecodable bytes are replaced rather than failing the page.
                return await response.text(errors='replace')
        except Exception as exc:  # includes asyncio.TimeoutError
            self.logger.debug("Fetch failed for %s: %s", url, exc)
            return None

    def _parse_html(self, html):
        """Parse ``html`` with lxml when available, else the built-in parser; ``None`` if both fail."""
        for parser_name in ('lxml', 'html.parser'):
            try:
                return BeautifulSoup(html, parser_name)
            except Exception:
                continue
        return None

    def _extract_links(self, soup, page_url):
        """Return the set of normalized, crawlable link targets found on the page."""
        links = set()
        for anchor in soup.find_all('a', href=True):
            # Relative hrefs are resolved against the page URL; non-http(s)
            # targets (mailto:, javascript:, ...) are rejected by should_crawl_url.
            normalized = self.normalize_url(anchor.get('href', ''), page_url)
            if normalized and self.should_crawl_url(normalized):
                links.add(normalized)
        return links

    def _report_saved_milestone(self):
        """Print a one-line progress message each time ``SAVE_REPORT_EVERY`` more URLs are saved."""
        saved = self.stats['saved']
        if saved and saved % self.SAVE_REPORT_EVERY == 0:
            elapsed = time.time() - self.stats['start_time']
            urls_per_sec = self.stats['processed'] / elapsed if elapsed > 0 else 0
            print(f"üìà Progress: {saved:,}/{MAX_URLS:,} | "
                  f"Speed: {urls_per_sec:.1f} URLs/sec")

    async def process_url(self, url, depth):
        """Fetch one URL, queue the links it contains and save it when it looks relevant.

        The URL's queue row ends as ``completed`` or ``failed``; robots denials
        and fetch/parse failures are reflected in ``self.stats``.
        """
        try:
            self.processed_urls.add(url)
            domain = self.get_domain(url)

            if not await self.can_fetch(url):
                self.stats['robots_denied'] += 1
                await self.mark_url_completed(url, 'failed')
                return

            await self.apply_domain_delay(domain)

            html = await self._fetch_html(url)
            soup = self._parse_html(html) if html is not None else None
            if soup is None:
                await self.mark_url_completed(url, 'failed')
                self.stats['failed'] += 1
                return

            for link in self._extract_links(soup, url):
                if self.stats['saved'] >= MAX_URLS:
                    break
                await self.add_url_to_queue(link, depth + 1)

            # Re-check the target here: other tasks may have reached it while
            # this one was waiting on the network.
            if (self.stats['saved'] < MAX_URLS
                    and self.is_relevant_content(soup, url)
                    and await self.save_url_to_csv(url)):
                self._report_saved_milestone()

            await self.mark_url_completed(url, 'completed')

        except Exception as exc:
            self.logger.debug("Processing failed for %s: %s", url, exc)
            await self.mark_url_completed(url, 'failed')
            self.stats['failed'] += 1

    def is_relevant_content(self, soup, url):
        """Return True when the page or its URL carries a product indicator.

        Checks, in order: an ``og:type`` meta tag containing "product", a
        price meta tag / price ``span`` / buy-cart ``button``, and finally
        well-known product path segments in the URL.
        """
        og_type = soup.find('meta', property='og:type')
        if og_type and 'product' in str(og_type.get('content', '')).lower():
            return True

        if (soup.find('meta', property='product:price:amount')
                or soup.find('span', class_=re.compile(r'price', re.I))
                or soup.find('button', class_=re.compile(r'buy|cart|beli', re.I))):
            return True

        url_lower = url.lower()
        return any(pattern in url_lower for pattern in self.PRODUCT_URL_PATTERNS)

    async def save_url_to_csv(self, url):
        """Append ``url`` with a sequential id and UTC timestamp to the CSV; return success."""
        try:
            url_id = f"07-{self.stats['saved'] + 1:07d}"
            timestamp = datetime.now(timezone.utc).replace(microsecond=0).isoformat()

            with open(self.CSV_PATH, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow([url_id, url, timestamp])

            self.stats['saved'] += 1
            return True
        except Exception as exc:
            self.logger.warning("Could not save %s: %s", url, exc)
            return False

    async def mark_url_completed(self, url, status):
        """Store the final ``status`` for ``url`` and count it as processed."""
        try:
            await self.db.execute(
                "UPDATE crawler_queue SET status = ?, last_updated = CURRENT_TIMESTAMP WHERE url = ?",
                (status, url)
            )
            self.stats['processed'] += 1
        except Exception as exc:
            self.logger.warning("Could not update status of %s: %s", url, exc)

    async def get_next_batch(self, limit=10):
        """Claim up to ``limit`` queued URLs (shallowest first) and mark them ``processing``.

        Returns a list of ``(url, depth)`` rows; empty when nothing is queued,
        ``limit`` is not positive, or the database call fails.
        """
        if limit <= 0:
            return []
        try:
            cursor = await self.db.execute(
                "SELECT url, depth FROM crawler_queue WHERE status = 'queued' ORDER BY depth ASC LIMIT ?",
                (limit,)
            )
            results = await cursor.fetchall()

            if results:
                placeholders = ','.join('?' * len(results))
                urls = [result[0] for result in results]
                await self.db.execute(
                    f"UPDATE crawler_queue SET status = 'processing' WHERE url IN ({placeholders})",
                    urls
                )
                await self.db.commit()

            return results
        except Exception as exc:
            self.logger.warning("Could not read the next batch: %s", exc)
            return []

    def _print_progress(self, active_tasks):
        """Redraw the live progress dashboard in the notebook output area."""
        elapsed = time.time() - self.stats['start_time']
        progress_pct = (self.stats['saved'] / MAX_URLS) * 100 if MAX_URLS else 0.0
        urls_per_sec = self.stats['processed'] / elapsed if elapsed > 0 else 0

        clear_output(wait=True)
        print("üìä REAL-TIME PROGRESS:")
        print(f"‚úÖ Saved: {self.stats['saved']:,} / {MAX_URLS:,} ({progress_pct:.1f}%)")
        print(f"üîÑ Processed: {self.stats['processed']:,}")
        print(f"‚ùå Failed: {self.stats['failed']:,}")
        print(f"üö´ Robots Denied: {self.stats['robots_denied']:,}")
        print(f"‚è≥ Active Tasks: {active_tasks}")
        print(f"‚ö° Speed: {urls_per_sec:.1f} URLs/sec")
        print(f"‚è∞ Elapsed: {elapsed:.0f}s")
        print("-" * 50)

    async def run(self):
        """Run the crawl until ``MAX_URLS`` are saved, the queue is exhausted, or ``stop()`` is called.

        Always closes the HTTP session and database and prints final statistics,
        also when the crawl fails or is interrupted. ``self.stats['end_time']``
        is set when the run finishes.
        """
        print("üöÄ Starting TechScope Crawler...")
        print(f"üéØ Target: {MAX_URLS:,} URLs")
        print(f"‚ö° Concurrency: {MAX_CONCURRENT_TASKS} tasks")
        print("=" * 60)

        tasks = set()
        try:
            await self.init_database()

            with open(self.CSV_PATH, 'w', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(['id', 'source_url', 'found_at'])

            connector = aiohttp.TCPConnector(
                limit=MAX_CONCURRENT_TASKS,
                limit_per_host=3,
                ttl_dns_cache=300
            )
            self.session = aiohttp.ClientSession(connector=connector)

            last_report_bucket = -1
            while self.stats['saved'] < MAX_URLS and self.is_running:
                # Claim only as many URLs as there are free task slots; every
                # claimed row is flagged 'processing' and must really be run.
                free_slots = MAX_CONCURRENT_TASKS - len(tasks)
                if free_slots > 0:
                    batch = await self.get_next_batch(free_slots)
                    for url, depth in batch:
                        task = asyncio.create_task(self.process_url(url, depth))
                        tasks.add(task)
                        task.add_done_callback(tasks.discard)

                    # Nothing queued and nothing in flight: no new URL can
                    # appear any more, so the crawl is finished.
                    if not batch and not tasks:
                        print("‚è≥ No more URLs in queue, stopping.")
                        break

                # Redraw once per PROGRESS_EVERY processed URLs instead of on
                # every pass of this loop.
                bucket = self.stats['processed'] // self.PROGRESS_EVERY
                if bucket != last_report_bucket:
                    last_report_bucket = bucket
                    self._print_progress(len(tasks))

                await asyncio.sleep(0.1)

            if tasks:
                print(f"‚è≥ Waiting for {len(tasks)} remaining tasks...")
                await asyncio.gather(*tasks, return_exceptions=True)

            print("=" * 60)
            print("üéâ CRAWLING COMPLETED!")

        except Exception as e:
            print(f"‚ùå Crawling failed: {e}")
            raise
        finally:
            # After an error or interrupt, stop in-flight tasks before the
            # session and database they use are closed.
            pending = [task for task in tasks if not task.done()]
            for task in pending:
                task.cancel()
            if pending:
                await asyncio.gather(*pending, return_exceptions=True)

            if self.session:
                await self.session.close()
                self.session = None
            if self.db:
                await self.db.commit()
                await self.db.close()
                self.db = None

            self.stats['end_time'] = time.time()
            elapsed = self.stats['end_time'] - self.stats['start_time']
            average_speed = self.stats['processed'] / elapsed if elapsed > 0 else 0.0
            print(f"üìà Final Stats:")
            print(f"   ‚úÖ URLs Saved: {self.stats['saved']:,}")
            print(f"   üîÑ Total Processed: {self.stats['processed']:,}")
            print(f"   ‚ùå Failed: {self.stats['failed']:,}")
            print(f"   üö´ Robots Denied: {self.stats['robots_denied']:,}")
            print(f"   üîÅ Duplicates: {self.stats['duplicates']:,}")
            print(f"   ‚è∞ Total Time: {elapsed:.1f}s")
            print(f"   ‚ö° Average Speed: {average_speed:.1f} URLs/sec")

# Confirmation that this cell ran and the crawler class is now defined.
print("‚úÖ TechScope Crawler class ready!")

In [None]:
# Cell 3: Run the crawler
async def main():
    """Build a crawler, run it to completion and hand back its statistics dict."""
    techscope = TechScopeCrawler()
    await techscope.run()
    return techscope.stats

print("üîÑ Starting TechScope Crawling Process...")
print("‚ö†Ô∏è  This may take a while for 1,000,000 URLs")
print("üí° Press Ctrl+C to stop gracefully")

# Jalankan crawler
final_stats = await main()

In [None]:
# Cell 4: Results and analysis
def analyze_results():
    """Print a short report on collected_urls.csv and return it as a DataFrame.

    Returns:
        The DataFrame read from the CSV with an added ``domain`` column, or
        ``None`` when the file does not exist or cannot be analysed.
    """
    csv_path = 'collected_urls.csv'
    if not os.path.exists(csv_path):
        print("‚ùå No results file found")
        return None

    def _netloc(source_url):
        # Network location (host[:port]) of one collected URL.
        return urlparse(source_url).netloc

    try:
        df = pd.read_csv(csv_path)
        print("üìä RESULTS ANALYSIS:")
        print(f"   üìÑ Total URLs Collected: {len(df):,}")

        df['domain'] = df['source_url'].apply(_netloc)

        print(f"   üåê Unique Domains: {df['domain'].nunique()}")
        print(f"   üìÖ First URL: {df['found_at'].min()}")
        print(f"   üìÖ Last URL: {df['found_at'].max()}")

        # Ten most frequent domains.
        print("\nüèÜ TOP DOMAINS:")
        for domain, count in df['domain'].value_counts().head(10).items():
            print(f"   {domain}: {count:,} URLs")

        # First five collected URLs as a sample.
        print("\nüîç SAMPLE URLs:")
        for position, url in enumerate(df['source_url'].head(5).tolist(), start=1):
            print(f"   {position}. {url}")

        return df

    except Exception as exc:
        print(f"‚ùå Error analyzing results: {exc}")
        return None

# Run the analysis now; results_df is None when the CSV is missing or unreadable.
results_df = analyze_results()

In [None]:
# Cell 5: Export Summary Report
def create_detailed_summary():
    """Build a summary of the last crawl, write it to JSON and print the highlights.

    Reads the module-level ``final_stats`` dictionary and the crawl settings,
    writes ``crawling_summary_detailed.json`` to the working directory and
    returns the summary dictionary.

    The session end is taken from ``final_stats['end_time']`` when present, so
    duration and throughput describe the crawl itself rather than the moment
    this report is generated; without it the current time is used.
    """
    import json

    # Sample the clock once so every derived figure is mutually consistent.
    started_at = final_stats['start_time']
    finished_at = final_stats.get('end_time', time.time())
    duration = max(finished_at - started_at, 0.0)
    processed = final_stats['processed']
    saved = final_stats['saved']

    summary = {
        'crawling_session': {
            'start_time': datetime.fromtimestamp(started_at).isoformat(),
            'end_time': datetime.fromtimestamp(finished_at).isoformat(),
            'duration_seconds': duration,
            'target_urls': MAX_URLS,
            'achieved_urls': saved
        },
        'performance_metrics': {
            'urls_processed': processed,
            'urls_saved': saved,
            'success_rate': (saved / processed) * 100 if processed > 0 else 0,
            # Guarded so a zero-length session cannot raise ZeroDivisionError.
            'urls_per_second': processed / duration if duration > 0 else 0,
            'failures': final_stats['failed'],
            'robots_denied': final_stats['robots_denied'],
            'duplicates_skipped': final_stats['duplicates']
        },
        'configuration': {
            'max_concurrent_tasks': MAX_CONCURRENT_TASKS,
            'domain_delay': PER_DOMAIN_DELAY,
            'request_timeout': REQUEST_TIMEOUT,
            'seeds_count': len(SEEDS),
            'allowed_domains': len(ALLOWED_DOMAINS)
        }
    }

    with open('crawling_summary_detailed.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print("üìã DETAILED SUMMARY:")
    print(f"   üéØ Target: {summary['crawling_session']['target_urls']:,} URLs")
    print(f"   ‚úÖ Achieved: {summary['crawling_session']['achieved_urls']:,} URLs")
    print(f"   üìà Success Rate: {summary['performance_metrics']['success_rate']:.1f}%")
    print(f"   ‚ö° Speed: {summary['performance_metrics']['urls_per_second']:.1f} URLs/sec")
    print(f"   ‚è∞ Duration: {summary['crawling_session']['duration_seconds']:.0f} seconds")

    return summary

# The report needs the stats produced by the crawl cell; without them, tell the
# reader which cell to run instead of failing with a NameError.
if 'final_stats' not in locals():
    print("‚ùå Run Cell 3 first to get crawling results")
else:
    detailed_summary = create_detailed_summary()