# Download Wikipedia Articles via Wikimedia Enterprise API

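This notebook loads the Grokipedia URL list and extracts the corresponding Wikipedia articles from a Wikimedia Enterprise snapshot archive (`enwiki_structured.tar.gz`) that is already available locally. The cells shown here do not call Wikimedia Enterprise's On-demand Article Lookup API; they filter the snapshot by article title.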

The downloaded data includes:
- Article metadata (title, identifier, language, project)
- Revision information
- Article body in wikitext format
- Structured content (abstract, sections, etc.)

Reference: https://enterprise.wikimedia.com/docs/

In [None]:
import pandas as pd
import requests
import json
import time
import os
from urllib.parse import unquote, quote
from dotenv import load_dotenv
from pathlib import Path
from tqdm import tqdm
import logging
import tarfile
from pathlib import Path

# Set up logging: INFO-level root configuration plus the module-level
# `logger` that every cell below writes its progress messages to
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables (python-dotenv)
# NOTE(review): none of the cells visible here read an environment variable —
# confirm whether this call is still needed
load_dotenv()


In [None]:
# Load grokipedia URLs from HuggingFace dataset
# The file is read as JSON Lines (lines=True), one record per line; later
# cells read the `url` column of the resulting DataFrame.
df_urls = pd.read_json("hf://datasets/stefan-it/grokipedia-urls/urls.jsonl", lines=True)
logger.info(f"Loaded {len(df_urls)} URLs from grokipedia")

# Preview the data (last expression of the cell, so the notebook displays it)
df_urls.head()


In [None]:
def extract_wikipedia_title(grokipedia_url):
    """Extract Wikipedia article title from grokipedia URL.

    URLs are expected to look like ``https://grokipedia.com/page/TITLE``.
    Everything after the first ``/page/`` is percent-decoded and its
    underscores are turned into spaces.

    Args:
        grokipedia_url: The URL to parse. Non-string values (for example
            ``None`` or ``NaN`` from a DataFrame column) are accepted.

    Returns:
        The decoded title, or ``None`` when the input is not a string, has no
        ``/page/`` segment, or has nothing after it.
    """
    # Missing values reach this function when it is applied to a DataFrame column
    if not isinstance(grokipedia_url, str):
        return None

    # partition() splits on the FIRST '/page/', so a title that itself
    # contains '/page/' is kept whole instead of being cut to its last part
    _, marker, raw_title = grokipedia_url.partition('/page/')
    if not marker or not raw_title:
        # An empty title must not be returned: it would match every snapshot
        # record that has no name
        return None

    # URL decode the title, then replace underscores with spaces
    return unquote(raw_title).replace('_', ' ')

# Load grokipedia URLs and create title set
# NOTE(review): this repeats the load done in the previous cell; it keeps this
# cell runnable on its own but fetches the dataset a second time
df_urls = pd.read_json("hf://datasets/stefan-it/grokipedia-urls/urls.jsonl", lines=True)
logger.info(f"Loaded {len(df_urls)} URLs from grokipedia")

# One title per URL; rows for which no title could be extracted hold None and
# are dropped before the set is built. The set also removes duplicate titles.
df_urls['wikipedia_title'] = df_urls['url'].apply(extract_wikipedia_title)
grokipedia_titles = set(df_urls['wikipedia_title'].dropna().tolist())
logger.info(f"Created set of {len(grokipedia_titles)} unique Wikipedia titles")

# Show some examples (set iteration order is arbitrary, so these are not the
# first five rows of the DataFrame)
print("Sample titles:", list(grokipedia_titles)[:5])


In [None]:
def stream_extract_grokipedia_pages(snapshot_path, grokipedia_titles, output_path, batch_size=1000):
    """
    Stream through a gzipped tar snapshot and copy the wanted articles to an NDJSON file.

    The archive is walked member by member with ``tar.next()``. Every member
    whose name ends in ``.ndjson`` is read line by line; each line is parsed
    as one JSON object and kept when its ``name`` field is in
    ``grokipedia_titles``. Kept lines are written to ``output_path`` as the
    exact bytes read from the archive, one record per line.

    Args:
        snapshot_path: Path of the ``.tar.gz`` snapshot to read.
        grokipedia_titles: Collection of article titles to keep. Membership is
            tested once per article, so pass a ``set``.
        output_path: File to write the matching lines to (overwritten).
        batch_size: Number of matching lines buffered between writes.

    Returns:
        The number of lines written to ``output_path``.
    """

    logger.info(f"Starting streaming extraction from {snapshot_path}")
    logger.info(f"Looking for {len(grokipedia_titles)} titles")

    extracted_count = 0
    total_processed = 0
    skipped_lines = 0
    batch_buffer = []

    # The output is opened in binary mode: matching lines are copied through
    # byte-for-byte, so the result does not depend on the platform's default
    # text encoding or newline translation.
    with tarfile.open(snapshot_path, 'r:gz') as tar, open(output_path, 'wb') as outfile:
        # Process files as we encounter them using tar.next()
        while True:
            member = tar.next()
            if member is None:
                break

            # Only process .ndjson files
            if not member.name.endswith('.ndjson'):
                continue

            # extractfile() returns None for members that carry no data
            # (e.g. a directory whose name happens to end in .ndjson)
            source = tar.extractfile(member)
            if source is None:
                continue

            logger.info(f"Processing file: {member.name} ({member.size / 1024 / 1024:.2f} MB)")

            with source as f:
                with tqdm(total=member.size, unit='B', unit_scale=True, desc=f"Processing {member.name}") as pbar:
                    for line in f:
                        # Advance the bar for every line read, including the
                        # ones skipped below, so it can reach 100%
                        pbar.update(len(line))

                        if not line.strip():
                            continue

                        try:
                            article = json.loads(line.decode('utf-8'))
                        except ValueError:
                            # JSONDecodeError and UnicodeDecodeError are both
                            # ValueError subclasses; the line is not a record
                            skipped_lines += 1
                            continue

                        total_processed += 1

                        # A parsed line that is not a JSON object has no title
                        if not isinstance(article, dict):
                            continue

                        # Check if this article is in our grokipedia set (O(1) lookup)
                        article_title = article.get('name', '')
                        if not isinstance(article_title, str) or article_title not in grokipedia_titles:
                            continue

                        # The last line of a member may lack its newline;
                        # add it so the next record starts on its own line
                        if not line.endswith(b'\n'):
                            line += b'\n'

                        batch_buffer.append(line)
                        extracted_count += 1

                        # Write batch when buffer is full
                        if len(batch_buffer) >= batch_size:
                            outfile.writelines(batch_buffer)
                            batch_buffer = []
                            logger.info(f"Extracted {extracted_count} articles so far...")

            # Write any remaining articles in buffer after each file
            if batch_buffer:
                outfile.writelines(batch_buffer)
                batch_buffer = []

    if skipped_lines:
        logger.warning(f"Skipped {skipped_lines:,} lines that were not valid UTF-8 JSON")

    logger.info(f"Streaming extraction complete!")
    logger.info(f"Total articles processed: {total_processed:,}")
    logger.info(f"Grokipedia articles extracted: {extracted_count:,}")
    logger.info(f"Output saved to: {output_path}")

    return extracted_count

# Run the extraction
# Both paths are relative to the notebook's working directory; the snapshot
# archive must already exist there, and the output file is overwritten.
snapshot_file = "../enwiki_structured.tar.gz"
output_file = "../grokipedia_wikipedia_articles.ndjson"

# extracted_count is the number of matching article lines written to output_file
extracted_count = stream_extract_grokipedia_pages(
    snapshot_file, 
    grokipedia_titles, 
    output_file,
    batch_size=1000
)

In [None]:
def quick_verify_extraction(output_file, sample_size=10):
    """Quick verification by sampling articles.

    Reads the NDJSON file once, counting every line and parsing the first
    ``sample_size`` lines as JSON. The count, the ``name`` of each sampled
    article and the file size are logged.

    Args:
        output_file: Path of the NDJSON file to inspect.
        sample_size: How many leading lines to try to parse. Lines among them
            that are not valid JSON are skipped, so fewer articles may be
            returned.

    Returns:
        A ``(total_lines, sample_articles)`` tuple: the number of lines in the
        file and the list of parsed sample records.
    """

    logger.info(f"Quick verification of {output_file}")

    total_lines = 0
    sample_articles = []

    # Single pass over the file. NDJSON is UTF-8, so the encoding is stated
    # explicitly instead of relying on the platform default.
    with open(output_file, 'r', encoding='utf-8') as f:
        for total_lines, line in enumerate(f, start=1):
            if total_lines > sample_size:
                # Past the sample window: keep counting, stop parsing
                continue
            try:
                sample_articles.append(json.loads(line))
            except json.JSONDecodeError:
                continue

    logger.info(f"Total articles extracted: {total_lines:,}")

    if sample_articles:
        logger.info(f"Sample articles:")
        for position, article in enumerate(sample_articles, start=1):
            title = article.get('name', 'Unknown')
            logger.info(f"  {position}. {title}")

    # Show file size
    file_size = Path(output_file).stat().st_size
    logger.info(f"Output file size: {file_size / 1024 / 1024:.2f} MB")

    return total_lines, sample_articles

# Run verification on the NDJSON file written by the extraction cell.
# total_extracted is the file's line count; sample_articles holds the parsed
# records from its first lines.
total_extracted, sample_articles = quick_verify_extraction(output_file)

In [None]:
def create_final_summary(grokipedia_titles, total_extracted, output_path=None,
                         summary_path='extraction_summary.json'):
    """Create final summary report.

    Logs the extraction statistics and saves them as JSON.

    Args:
        grokipedia_titles: Collection of titles that were searched for; only
            its length is used.
        total_extracted: Number of articles that were extracted.
        output_path: Extracted NDJSON file whose size is reported. When
            omitted, the notebook-level ``output_file`` variable is used.
        summary_path: Where the JSON summary is written.

    Returns:
        The summary as a dict with the keys ``total_grokipedia_titles``,
        ``successfully_extracted``, ``extraction_rate`` (percent) and
        ``file_size_mb``.
    """

    if output_path is None:
        # Backward-compatible default: the path defined by the extraction cell
        output_path = output_file

    title_count = len(grokipedia_titles)
    # An empty title set would otherwise raise ZeroDivisionError
    extraction_rate = total_extracted / title_count * 100 if title_count else 0.0

    summary = {
        'total_grokipedia_titles': title_count,
        'successfully_extracted': total_extracted,
        'extraction_rate': extraction_rate,
        'file_size_mb': Path(output_path).stat().st_size / 1024 / 1024
    }

    logger.info(f"Final Summary:")
    logger.info(f"  Total grokipedia titles: {summary['total_grokipedia_titles']:,}")
    logger.info(f"  Successfully extracted: {summary['successfully_extracted']:,}")
    logger.info(f"  Extraction rate: {summary['extraction_rate']:.2f}%")
    logger.info(f"  Output file size: {summary['file_size_mb']:.2f} MB")

    # Save summary
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    return summary

# Generate final summary from the title set and the line count returned by
# the verification cell; also writes extraction_summary.json to the current
# working directory
final_summary = create_final_summary(grokipedia_titles, total_extracted)