In [1]:
import asyncio
import aiohttp
from aiolimiter import AsyncLimiter
from bs4 import BeautifulSoup
import json
from pathlib import Path
import logging
from datetime import datetime
from tqdm import tqdm
from urllib.parse import quote
import time
import pandas as pd
from dotenv import load_dotenv
import os

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# BrightData Proxy Configuration
BRIGHTDATA_USERNAME = os.getenv('BRIGHTDATA_USERNAME')
BRIGHTDATA_PASSWORD = os.getenv('BRIGHTDATA_PASSWORD')

if BRIGHTDATA_USERNAME and BRIGHTDATA_PASSWORD:
    PROXY_URL = f'http://{BRIGHTDATA_USERNAME}:{BRIGHTDATA_PASSWORD}@brd.superproxy.io:33335'
    logger.info('BrightData proxy configured')
else:
    PROXY_URL = None
    logger.warning('BrightData credentials not found - running without proxy')

# Configuration - optimized for BrightData proxy
DISCOVERY_START_INDEX = 250000  # Resume from 180k
DISCOVERY_BATCH_SIZE = 5000
DISCOVERY_TIMEOUT = 15  # Increased timeout
SCRAPING_START_INDEX = 211700
SCRAPING_BATCH_SIZE = 300
SCRAPING_SKIP_ON_ERROR = True
SCRAPING_TIMEOUT = 60  # Increased timeout for reliability
MAX_CONCURRENT = 200  # Keep high with proxy
RATE_LIMIT = 300  # Aggressive with proxy
BATCH_DELAY = 0.5  # Reduced delay

SCRAPED_DATA_DIR = "../scraped_data"
DISCOVERED_TITLES_DIR = "../discovered_titles"

# Create output directories
Path(DISCOVERED_TITLES_DIR).mkdir(exist_ok=True)
Path(SCRAPED_DATA_DIR).mkdir(exist_ok=True)


2025-10-29 00:35:13,497 - INFO - BrightData proxy configured


In [2]:
# Configure aiohttp with BrightData proxy if available
if PROXY_URL:
    logger.info(f"Using proxy: {PROXY_URL.split('@')[1]}")  # Don't log credentials
    proxy_auth = aiohttp.BasicAuth(BRIGHTDATA_USERNAME, BRIGHTDATA_PASSWORD) if BRIGHTDATA_USERNAME else None
    
    # Proxy configuration for aiohttp
    connector = aiohttp.TCPConnector(limit=MAX_CONCURRENT)
    proxy_config = {
        'http': f'http://brd.superproxy.io:33335',
        'https': f'http://brd.superproxy.io:33335'
    }
else:
    proxy_config = None
    logger.info("Running without proxy")


2025-10-29 00:35:13,662 - INFO - Using proxy: brd.superproxy.io:33335


In [3]:
# Helper function to parse grokipedia HTML
def parse_grokipedia_html(html_content, url, title=None):
    """Parse grokipedia HTML and extract structured data"""
    if title is None:
        title = url.split('/page/')[-1]
    
    soup = BeautifulSoup(html_content, 'html.parser')
    data = {
        'title': title,
        'url': url,
        'main_title': None,
        'sections': [],
        'paragraphs': [],
        'tables': [],
        'references': [],
    }
    
    # Find article container
    article = soup.find('div', class_='mx-auto max-w-[850px]')
    if not article:
        return data
    
    # Extract main title (h1)
    h1 = article.find('h1')
    if h1:
        data['main_title'] = h1.get_text(strip=True)
    
    # Extract sections with proper content
    for heading in article.find_all(['h1', 'h2', 'h3'], id=True):
        section_data = {
            'level': heading.name,
            'id': heading.get('id'),
            'title': heading.get_text(strip=True),
            'content': []
        }
        
        # Walk through siblings after heading
        current = heading.next_sibling
        while current:
            if hasattr(current, 'name') and current.name in ['h1', 'h2', 'h3']:
                if current.name <= heading.name:
                    break
            
            if hasattr(current, 'name'):
                if current.name == 'span' and 'mb-4' in (current.get('class') or []):
                    text = current.get_text(strip=True)
                    if text:
                        # Join sentences with proper spacing
                        section_data['content'].append({'type': 'paragraph', 'text': ' '.join(text.split())})
                elif current.name == 'ul':
                    items = [li.get_text(strip=True) for li in current.find_all('li')]
                    if items:
                        section_data['content'].append({'type': 'list', 'items': items})
                elif current.name == 'ol':
                    items = [li.get_text(strip=True) for li in current.find_all('li')]
                    if items:
                        section_data['content'].append({'type': 'ordered_list', 'items': items})
            
            current = current.next_sibling
        
        data['sections'].append(section_data)
    
    # Extract paragraphs with proper spacing
    for span in article.find_all('span', class_='mb-4'):
        text = span.get_text(strip=True)
        # Normalize whitespace
        text = ' '.join(text.split())
        if text and text not in data['paragraphs']:
            data['paragraphs'].append(text)
    
    # Extract tables
    for table in article.find_all('table'):
        table_data = []
        headers = []
        
        if table.find('thead'):
            for th in table.find('thead').find_all('th'):
                headers.append(th.get_text(strip=True))
        
        if table.find('tbody'):
            for tr in table.find('tbody').find_all('tr'):
                row = []
                for td in tr.find_all('td'):
                    row.append(td.get_text(strip=True))
                if row:
                    table_data.append(row)
        
        if headers or table_data:
            data['tables'].append({'headers': headers, 'rows': table_data})
    
    # Extract references WITH links
    references_section = soup.find('div', id='references')
    if references_section:
        for li in references_section.find_all('li'):
            ref_text = li.get_text(strip=True)
            ref_link = None
            
            link = li.find('a')
            if link and link.get('href'):
                ref_link = {'href': link.get('href'), 'text': link.get_text(strip=True)}
            
            if ref_text:
                data['references'].append({'text': ref_text, 'link': ref_link})
    
    # Remove references from paragraphs
    data['paragraphs'] = [p for p in data['paragraphs'] 
                          if not any(ref['text'].split()[0:3] == p.split()[0:3] for ref in data['references'])]
    
    return data

In [4]:
async def discover_page_exists(session, limiter, title):
    """Check if a grokipedia page exists using HEAD request"""
    url = f"https://grokipedia.com/page/{quote(title)}"
    try:
        async with limiter:
            async with session.head(
                url,
                timeout=DISCOVERY_TIMEOUT,
                headers={'Accept-Encoding': 'gzip, deflate'},
                proxy=PROXY_URL,
            ) as response:
                status = response.status
                if status == 200:
                    return {'title': title, 'url': url, 'status': 'exists', 'checked_at': datetime.now().isoformat()}
                elif status == 404:
                    return {'title': title, 'url': url, 'status': 'not_found', 'checked_at': datetime.now().isoformat()}
                elif status == 429:
                    await asyncio.sleep(5)
                    return {'title': title, 'url': url, 'status': 'rate_limited', 'checked_at': datetime.now().isoformat()}
                else:
                    return {'title': title, 'url': url, 'status': f'error_{status}', 'checked_at': datetime.now().isoformat()}
    except Exception as e:
        return {'title': title, 'url': url, 'status': 'error', 'error': str(e), 'checked_at': datetime.now().isoformat()}

async def discovery_phase(titles, start_index=0, batch_size=10000):
    """Run discovery phase to find which pages exist"""
    limiter = AsyncLimiter(max_rate=RATE_LIMIT, time_period=DISCOVERY_TIMEOUT)

    async with aiohttp.ClientSession() as session:
        discovered_count = 0
        not_found_count = 0
        error_count = 0
        results = []

        pbar = tqdm(total=len(titles), desc="Discovery", initial=start_index)

        for i in range(start_index, len(titles), MAX_CONCURRENT):
            batch = titles[i:i + MAX_CONCURRENT]

            tasks = [discover_page_exists(session, limiter, title) for title in batch]
            batch_results = await asyncio.gather(*tasks)

            for result in batch_results:
                if result['status'] == 'exists':
                    discovered_count += 1
                elif result['status'] == 'not_found':
                    not_found_count += 1
                else:
                    error_count += 1
                results.append(result)

            pbar.update(len(batch))
            pbar.set_postfix({'found': discovered_count, 'not_found': not_found_count, 'errors': error_count})

            if i % MAX_CONCURRENT < MAX_CONCURRENT - 1:
                await asyncio.sleep(BATCH_DELAY)

            if len(results) >= batch_size:
                batch_num = (i // batch_size) + 1
                batch_start = (batch_num - 1) * batch_size
                batch_end = batch_start + len(results)
                with open(f'discovered_titles/batch_{batch_start}_{batch_end}.jsonl', 'w') as f:
                    for result in results:
                        f.write(json.dumps(result) + '\n')
                checkpoint = {
                    'last_processed_index': i,
                    'discovered_count': discovered_count,
                    'not_found_count': not_found_count,
                    'error_count': error_count,
                    'total_processed': i + len(batch)
                }
                with open('discovery_checkpoint.json', 'w') as f:
                    json.dump(checkpoint, f)
                results = []

        if results:
            batch_num = (len(titles) // batch_size)
            batch_start = batch_num * batch_size
            batch_end = batch_start + len(results)
            with open(f'discovered_titles/batch_{batch_start}_{batch_end}.jsonl', 'w') as f:
                for result in results:
                    f.write(json.dumps(result) + '\n')

        pbar.close()

        stats = {
            'total_checked': len(titles),
            'discovered': discovered_count,
            'not_found': not_found_count,
            'errors': error_count,
            'completed_at': datetime.now().isoformat()
        }
        with open('discovery_stats.json', 'w') as f:
            json.dump(stats, f, indent=2)

        logger.info(f"Discovery complete: Found {discovered_count} existing pages out of {len(titles)} checked")
        return discovered_count


In [5]:
async def scrape_page(session, limiter, url, skip_on_error=True):
    """Scrape a single grokipedia page"""
    try:
        title = url.split('/page/')[-1]
        async with limiter:
            async with session.get(
                url,
                timeout=60,
                headers={'Accept-Encoding': 'gzip, deflate'},
                proxy=PROXY_URL,
            ) as response:
                if response.status == 200:
                    html = await response.text()
                    data = parse_grokipedia_html(html, url)
                    return {'success': True, 'data': data}
                elif response.status == 404:
                    return {'success': False, 'error': 'not_found', 'title': title}
                else:
                    return {'success': False, 'error': f'status_{response.status}', 'title': title}
    except asyncio.TimeoutError:
        if skip_on_error:
            return {'success': False, 'error': 'timeout', 'title': title}
        else:
            for delay in [2, 4, 8]:
                await asyncio.sleep(delay)
                try:
                    async with limiter:
                        async with session.get(
                            url,
                            timeout=45,
                            headers={'Accept-Encoding': 'gzip, deflate'},
                            proxy=PROXY_URL,
                        ) as response:
                            if response.status == 200:
                                html = await response.text()
                                data = parse_grokipedia_html(html, url, title)
                                return {'success': True, 'data': data}
                except:
                    continue
            return {'success': False, 'error': 'timeout_retries_exhausted', 'title': title}
    except Exception as e:
        return {'success': False, 'error': str(e), 'title': title}

async def scraping_phase(urls, start_index=0, batch_size=1000, skip_on_error=True):
    """Run scraping phase to extract data from discovered pages"""
    limiter = AsyncLimiter(max_rate=RATE_LIMIT, time_period=60)  # Fixed: use 60 second window
    connector = aiohttp.TCPConnector(limit=MAX_CONCURRENT, force_close=True, enable_cleanup_closed=True)
    
    # Ensure output directory exists
    Path(SCRAPED_DATA_DIR).mkdir(exist_ok=True)
    
    async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=SCRAPING_TIMEOUT)) as session:
        success_count = 0
        fail_count = 0
        scraped_data = []
        failed_pages = []
        batch_count = 0  # Track batch number
        last_save_index = 0  # Track what's been saved
        
        # Create progress bar
        pbar = tqdm(total=len(urls), desc="Scraping", initial=start_index)
        
        # Process in batches
        for i in range(start_index, len(urls), MAX_CONCURRENT):
            batch = urls[i:i + MAX_CONCURRENT]
            
            # Create tasks for concurrent requests
            tasks = [scrape_page(session, limiter, url, skip_on_error) for url in batch]
            
            # Use gather with return_exceptions to handle individual failures
            try:
                batch_results = await asyncio.gather(*tasks, return_exceptions=True)
            except Exception as e:
                logger.error(f"Batch gather failed: {e}")
                batch_results = [{'success': False, 'error': str(e), 'title': 'batch_error'} for _ in batch]
            
            # Process results
            for j, result in enumerate(batch_results):
                # Handle exceptions
                if isinstance(result, Exception):
                    fail_count += 1
                    failed_pages.append({
                        'title': batch[j] if j < len(batch) else 'unknown',
                        'url': batch[j] if j < len(batch) else 'unknown',
                        'error': str(result),
                        'failed_at': datetime.now().isoformat()
                    })
                    continue
                
                if result['success']:
                    success_count += 1
                    scraped_data.append({
                        'title': result['data']['title'],
                        'url': result['data']['url'],
                        'data': result['data'],
                        'scraped_at': datetime.now().isoformat()
                    })
                else:
                    fail_count += 1
                    failed_pages.append({
                        'title': result.get('title', 'unknown'),
                        'url': result.get('url', 'unknown'),
                        'error': result.get('error', 'unknown'),
                        'failed_at': datetime.now().isoformat()
                    })
            
            pbar.update(len(batch))
            pbar.set_postfix({
                'success': success_count,
                'failed': fail_count,
                'fail_rate': f'{fail_count/(success_count+fail_count)*100:.1f}%' if (success_count+fail_count) > 0 else '0%'
            })
            
            # Save batch when we hit the batch_size limit OR every 100 successful items
            should_save = False
            save_reason = ""
            
            if len(scraped_data) >= batch_size:
                should_save = True
                save_reason = "batch_size"
            elif len(scraped_data) >= 100 and (i - last_save_index) >= 5000:  # Save every 5000 processed pages
                should_save = True
                save_reason = "progress"
            
            if should_save and scraped_data:
                batch_start = start_index + (batch_count * batch_size)
                batch_end = batch_start + len(scraped_data)
                
                batch_file = Path(f'{SCRAPED_DATA_DIR}/batch_{batch_start}_{batch_end}.jsonl')
                
                with open(batch_file, 'w') as f:
                    for item in scraped_data:
                        f.write(json.dumps(item) + '\n')
                
                # Save checkpoint
                checkpoint = {
                    'last_processed_index': i,
                    'success_count': success_count,
                    'fail_count': fail_count,
                    'total_processed': i + len(batch),
                    'last_save_index': batch_end,
                    'batch_count': batch_count,
                    'save_reason': save_reason
                }
                
                with open('scraping_checkpoint.json', 'w') as f:
                    json.dump(checkpoint, f)
                
                logger.info(f"Saved batch: {batch_start} to {batch_end} ({len(scraped_data)} items) - Reason: {save_reason}")
                last_save_index = i
                batch_count += 1
                scraped_data = []
            
            # Save failed pages periodically too
            if len(failed_pages) >= 1000:
                failed_file = Path('scraping_failed_partial.jsonl')
                with open(failed_file, 'a') as f:
                    for item in failed_pages:
                        f.write(json.dumps(item) + '\n')
                logger.info(f"Saved {len(failed_pages)} failed pages to partial file")
                failed_pages = []
            
            # Adaptive delay - increase if failure rate is high
            failure_rate = fail_count / (success_count + fail_count) if (success_count + fail_count) > 0 else 0
            if failure_rate > 0.3:  # More than 30% failure rate
                delay = 1.0
                logger.warning(f"High failure rate ({failure_rate*100:.1f}%), increasing delay to {delay}s")
            else:
                delay = 0.1
            
            await asyncio.sleep(delay)
        
        # Save any remaining results
        if scraped_data:
            batch_start = start_index + (batch_count * batch_size)
            batch_end = batch_start + len(scraped_data)
            
            batch_file = Path(f'{SCRAPED_DATA_DIR}/batch_{batch_start}_{batch_end}.jsonl')
            
            with open(batch_file, 'w') as f:
                for item in scraped_data:
                    f.write(json.dumps(item) + '\n')
            
            logger.info(f"Saved final batch: {batch_start} to {batch_end} ({len(scraped_data)} items)")
        
        # Save all failed pages
        if failed_pages:
            with open('scraping_failed.jsonl', 'w') as f:
                for item in failed_pages:
                    f.write(json.dumps(item) + '\n')
            logger.info(f"Saved {len(failed_pages)} failed pages")
        
        pbar.close()
        
        # Save final stats
        stats = {
            'total_scraped': len(urls),
            'success': success_count,
            'failed': fail_count,
            'completed_at': datetime.now().isoformat()
        }
        
        with open('scraping_stats.json', 'w') as f:
            json.dump(stats, f, indent=2)
        
        logger.info(f"Scraping complete: Successfully scraped {success_count} pages out of {len(urls)} attempted")
        return success_count

In [6]:
# Utility functions
def load_titles():
    """Load and sort enwiki titles from pandas DataFrame"""
    df = pd.read_pickle('../enwiki_titles_20251027.pkl')
    titles = df['page_title'].tolist()
    titles_sorted = sorted(titles)
    logger.info(f"Loaded {len(titles_sorted)} titles from DataFrame")
    return titles_sorted

def load_discovered_titles():
    """Load all discovered titles from batch files"""
    discovered = []
    for file in sorted(Path('../discovered_titles').glob('batch_*.jsonl')):
        with open(file, 'r') as f:
            for line in f:
                result = json.loads(line)
                if result['status'] == 'exists':
                    discovered.append(result['title'])
    logger.info(f"Loaded {len(discovered)} discovered titles")
    return discovered

def load_scraped_data():
    """Load all scraped data from batch files"""
    scraped = []
    for file in sorted(Path('{SCRAPED_DATA_DIR}').glob('batch_*.jsonl')):
        with open(file, 'r') as f:
            for line in f:
                scraped.append(json.loads(line))
    logger.info(f"Loaded {len(scraped)} scraped pages")
    return scraped

def retry_failed_pages():
    """Retry scraping failed pages"""
    failed = []
    if Path('scraping_failed.jsonl').exists():
        with open('scraping_failed.jsonl', 'r') as f:
            for line in f:
                failed.append(json.loads(line))
    
    if not failed:
        logger.info("No failed pages to retry")
        return []
    
    titles = [item['title'] for item in failed]
    logger.info(f"Retrying {len(titles)} failed pages")
    return titles


In [None]:
# PHASE 1: DISCOVERY
# Load and sort all enwiki titles
titles = load_titles()

In [None]:
# Run discovery phase
# This will check all 7M titles to find which ~885k exist on grokipedia
discovered_count = await discovery_phase(
    titles,
    start_index=DISCOVERY_START_INDEX,
    batch_size=DISCOVERY_BATCH_SIZE
)

print(f"\nDiscovery Summary:")
print(f"Checked {len(titles)} titles")
print(f"Found {discovered_count} existing pages on grokipedia")
print(f"Success rate: {discovered_count/len(titles)*100:.2f}%")

In [7]:
# PHASE 2: SCRAPING
# Load discovered titles from Phase 1
# discovered_titles = load_discovered_titles()
df_urls = pd.read_json("hf://datasets/stefan-it/grokipedia-urls/urls.jsonl", lines=True)
urls = df_urls['url'].tolist()


In [None]:
# Run scraping phase
# This will extract structured data from the discovered pages
scraped_count = await scraping_phase(
    urls,
    start_index=SCRAPING_START_INDEX,
    batch_size=SCRAPING_BATCH_SIZE,
    skip_on_error=SCRAPING_SKIP_ON_ERROR
)

Scraping:  24%|██▍       | 211900/885279 [00:16<15:03:34, 12.42it/s, success=200, failed=0, fail_rate=0.0%]2025-10-29 00:35:51,759 - INFO - Saved batch: 211700 to 211900 (200 items) - Reason: progress
Scraping:  24%|██▍       | 212500/885279 [03:21<51:50:54,  3.60it/s, success=530, failed=270, fail_rate=33.8%]2025-10-29 00:38:57,209 - INFO - Saved batch: 212000 to 212330 (330 items) - Reason: batch_size
Scraping:  24%|██▍       | 213300/885279 [07:03<52:46:43,  3.54it/s, success=1009, failed=591, fail_rate=36.9%]2025-10-29 00:42:38,965 - INFO - Saved batch: 212300 to 212779 (479 items) - Reason: batch_size
Scraping:  24%|██▍       | 213700/885279 [09:07<55:23:00,  3.37it/s, success=1354, failed=646, fail_rate=32.3%]2025-10-29 00:44:42,730 - INFO - Saved batch: 212600 to 212945 (345 items) - Reason: batch_size
Scraping:  24%|██▍       | 214100/885279 [11:10<56:33:25,  3.30it/s, success=1726, failed=674, fail_rate=28.1%]2025-10-29 00:46:46,649 - INFO - Saved batch: 212900 to 213272 (372 

In [None]:
print(f"\nScraping Summary:")
print(f"Attempted to scrape {len(urls)} pages")
print(f"Successfully scraped {scraped_count} pages")
print(f"Success rate: {scraped_count/len(urls)*100:.2f}%")

In [None]:
# ANALYSIS: Compare coverage
titles = load_titles()
discovered = load_discovered_titles()

print("Coverage Analysis:")
print(f"Total enwiki titles: {len(titles):,}")
print(f"Grokipedia pages found: {len(discovered):,}")
print(f"Coverage: {len(discovered)/len(titles)*100:.2f}%")
print(f"Missing: {len(titles)-len(discovered):,} pages")


In [None]:
# RETRY FAILED PAGES (optional)
# Uncomment to retry pages that failed during Phase 2
# failed_titles = retry_failed_pages()
# if failed_titles:
#     await scraping_phase(
#         failed_titles,
#         start_index=0,
#         batch_size=SCRAPING_BATCH_SIZE,
#         skip_on_error=False  # Use exponential backoff for retries
#     )


In [None]:
# LOAD ALL SCRAPED DATA
# Load all scraped data into a list for analysis
scraped = load_scraped_data()

# Example: View first scraped page
if scraped:
    print(f"Total pages scraped: {len(scraped)}")
    print("\nExample page structure:")
    import json
    print(json.dumps(scraped[0], indent=2)[:500] + "...")
