In [None]:
import json
from pathlib import Path
import re
from urllib.parse import quote
import pandas as pd

In [None]:
def _iter_jsonl_records(path):
    """
    Yield one parsed JSON value per non-blank line of a JSONL file.

    The file is read as UTF-8. Blank lines are ignored, and lines that are
    not valid JSON (e.g. a truncated last line left by an interrupted run)
    are reported with a warning and skipped instead of aborting the caller.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                print(f"  Warning: skipping malformed JSON at {path}:{line_no} ({exc})")
                continue
            yield record


def analyze_batch_gaps(urls, scraped_data_dir="../scraped_data", start_index=None, end_index=None,
                       failed_file="../scraping_failed.jsonl"):
    """
    Analyze scraped batches to find (a) skipped pages and (b) failed/errored pages

    Args:
        urls: List of urls that should have been scraped (in order)
        scraped_data_dir: Directory containing scraped batch files named
            'batch_<start>_<end>.jsonl'; each line is a JSON object with a
            'title' key
        start_index: Start index in urls (None = start from beginning)
        end_index: End index in urls, exclusive as in slicing (None = end at last)
        failed_file: Path of the JSONL log of failed pages; if the file does
            not exist, no failed pages are reported

    Returns:
        tuple: (skipped_pages, failed_pages)
        - skipped_pages: List of titles (the part of each url after '/page/')
          that do not appear in any batch file overlapping the index range
        - failed_pages: List of dicts with keys 'title', 'url', 'error' and
          'failed_at' for every record in failed_file (not limited to the
          index range)
    """
    # Get the range of urls to analyze
    if start_index is None:
        start_index = 0
    if end_index is None:
        end_index = len(urls)

    urls_to_analyze = urls[start_index:end_index]

    # Load all scraped data
    scraped_titles = set()
    failed_titles = []

    batch_files = sorted(Path(scraped_data_dir).glob('batch_*.jsonl'))

    print(f"Analyzing batches from index {start_index} to {end_index}")
    print(f"Total urls to analyze: {len(urls_to_analyze)}")

    for batch_file in batch_files:
        # Extract batch range from filename
        match = re.search(r'batch_(\d+)_(\d+)\.jsonl', batch_file.name)
        if not match:
            continue

        batch_start_idx = int(match.group(1))
        batch_end_idx = int(match.group(2))

        # Skip batches entirely outside the range of interest. The test is
        # deliberately generous at both edges: reading an extra batch only
        # adds titles to the lookup set.
        if batch_end_idx < start_index or batch_start_idx > end_index:
            continue

        print(f"Processing {batch_file.name} (range: {batch_start_idx}-{batch_end_idx})")

        for data in _iter_jsonl_records(batch_file):
            scraped_titles.add(data['title'])

    # Load failed pages
    failed_path = Path(failed_file)
    if failed_path.exists():
        for failed_data in _iter_jsonl_records(failed_path):
            failed_titles.append({
                'title': failed_data.get('title', ''),
                'url': failed_data.get('url', ''),
                'error': failed_data.get('error', 'unknown'),
                'failed_at': failed_data.get('failed_at', '')
            })

    # Find skipped pages (expected but not scraped).
    # NOTE(review): the url suffix is compared verbatim with the stored
    # 'title'; if the urls are percent-encoded and the titles are not, they
    # would never match - confirm against the scraper's output format.
    skipped_pages = []
    for url in urls_to_analyze:
        title = url.split('/page/')[-1]
        if title not in scraped_titles:
            skipped_pages.append(title)

    print(f"\nAnalysis Results:")
    print(f"  Scraped successfully: {len(scraped_titles)}")
    print(f"  Skipped (not scraped): {len(skipped_pages)}")
    print(f"  Failed with errors: {len(failed_titles)}")

    return skipped_pages, failed_titles


def _write_jsonl(path, records):
    """Write each record in *records* as one JSON line to *path* (UTF-8, overwriting)."""
    with open(path, 'w', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')


def save_retry_lists(skipped_pages, failed_pages, output_dir="../retry"):
    """
    Save skipped and failed pages to files for retry

    Up to three JSONL files are written into output_dir; each one is only
    written when it would contain at least one record:
    - 'skipped_pages.jsonl': {'title', 'url'} per skipped title
    - 'failed_pages.jsonl': the failed page dicts as given
    - 'retry_list.jsonl': skipped pages (reason 'skipped') followed by
      failed pages (reason and previous_error both set to the page's error)

    Args:
        skipped_pages: List of titles that were skipped
        failed_pages: List of dicts with failed page info
        output_dir: Directory to save retry lists (created, including any
            missing parent directories, if it does not exist)
    """
    out_dir = Path(output_dir)
    # parents=True so a nested output directory does not raise FileNotFoundError
    out_dir.mkdir(parents=True, exist_ok=True)

    # Build the skipped records once; the url is the percent-quoted title
    # appended to the site's page prefix.
    skipped_records = [
        {'title': title, 'url': f"https://grokipedia.com/page/{quote(title)}"}
        for title in skipped_pages
    ]

    # Save skipped pages
    if skipped_records:
        skipped_file = out_dir / 'skipped_pages.jsonl'
        _write_jsonl(skipped_file, skipped_records)
        print(f"Saved {len(skipped_pages)} skipped pages to {skipped_file}")

    # Save failed pages
    if failed_pages:
        failed_file = out_dir / 'failed_pages.jsonl'
        _write_jsonl(failed_file, failed_pages)
        print(f"Saved {len(failed_pages)} failed pages to {failed_file}")

    # Save combined retry list: skipped pages first, then failed pages
    all_retry = [{**record, 'reason': 'skipped'} for record in skipped_records]
    for item in failed_pages:
        all_retry.append({
            'title': item.get('title', ''),
            'url': item.get('url', ''),
            'reason': item.get('error', 'unknown'),
            'previous_error': item.get('error', 'unknown')
        })

    if all_retry:
        retry_file = out_dir / 'retry_list.jsonl'
        _write_jsonl(retry_file, all_retry)
        print(f"Saved {len(all_retry)} total pages to retry list: {retry_file}")

In [None]:
# Usage example:
# Load the list of expected page urls (replace with your actual source)
df_urls = pd.read_json("hf://datasets/stefan-it/grokipedia-urls/urls.jsonl", lines=True)
urls = df_urls['url'].tolist()

# Analyze batch gaps for url indices 0 up to (but not including) 176500
skipped, failed = analyze_batch_gaps(
    urls, 
    start_index=0, 
    end_index=176500
)

# Save results for retry
save_retry_lists(skipped, failed)