# Filter the enwiki page dump down to pages that exist on Grokipedia

In [None]:
import pandas as pd
import json
from urllib.parse import unquote
from dotenv import load_dotenv
from tqdm import tqdm
import logging
import tarfile

# Set up logging: configure the root logger at INFO level so the progress
# messages emitted via `logger.info(...)` in the cells below are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables (python-dotenv).
# NOTE(review): no environment variable is read explicitly in the visible
# cells; presumably this supplies credentials for the hf:// dataset read
# further down — confirm, or drop the call if it is unused.
load_dotenv()


## Get titles from URLs

In [None]:
def extract_wikipedia_title(grokipedia_url):
    """Extract the Wikipedia article title from a Grokipedia page URL.

    Page URLs look like ``https://grokipedia.com/page/TITLE``. Everything
    after the *first* ``/page/`` marker is treated as the title, so titles
    that themselves contain slashes (``AC/DC``) or even the text ``/page/``
    are preserved in full.

    Args:
        grokipedia_url: The URL string. Non-string values (``None``, or the
            ``NaN`` pandas uses for missing cells) are treated as having no
            title instead of raising.

    Returns:
        The percent-decoded title with underscores replaced by spaces, or
        ``None`` if the input is not a string or has no ``/page/`` segment.
    """
    if not isinstance(grokipedia_url, str):
        return None

    # partition() splits on the first occurrence only; an empty `marker`
    # means '/page/' was not present at all.
    _, marker, raw_title = grokipedia_url.partition('/page/')
    if not marker:
        return None

    # Decode %XX escapes first, then map underscores to spaces
    return unquote(raw_title).replace('_', ' ')

# Fetch the Grokipedia URL list (JSON Lines, one record per line) and derive
# the set of Wikipedia titles used for filtering the dump.
df_urls = pd.read_json(
    "hf://datasets/stefan-it/grokipedia-urls/urls.jsonl",
    lines=True,
)
logger.info(f"Loaded {len(df_urls)} URLs from grokipedia")

# One title per URL; rows whose URL yields no title are dropped before the
# titles are collected into a set for fast membership tests.
title_column = df_urls['url'].apply(extract_wikipedia_title)
df_urls['wikipedia_title'] = title_column
grokipedia_titles = set(title_column.dropna().tolist())
logger.info(f"Created set of {len(grokipedia_titles)} unique Wikipedia titles")

# Peek at a handful of titles as a sanity check
sample_titles = list(grokipedia_titles)[:5]
print("Sample titles:", sample_titles)


## Do memory-light extraction using `tar.next()`

In [None]:
def stream_extract_grokipedia_pages(snapshot_path, grokipedia_titles, output_path, batch_size=1000):
    """Stream a gzipped tar of NDJSON files and keep the matching articles.

    Archive members are visited one at a time with ``tar.next()``; every
    ``.ndjson`` member is read line by line, each line is parsed as JSON, and
    lines whose ``name`` field is in ``grokipedia_titles`` are written to
    ``output_path`` (one JSON document per line).

    Args:
        snapshot_path: Path to the ``.tar.gz`` archive to read.
        grokipedia_titles: Container of titles to keep; it is used only for
            ``in`` tests and ``len()``, so a set is the natural choice.
        output_path: Path of the NDJSON file to create (overwritten).
        batch_size: Number of matched lines buffered before each write.

    Returns:
        The number of articles written to ``output_path``.

    Notes:
        Lines that are not valid JSON are skipped silently; any other
        per-line parsing problem is logged as a warning and skipped. Errors
        from reading the archive or writing the output propagate to the
        caller, after the matches buffered so far have been flushed.
    """
    logger.info(f"Starting streaming extraction from {snapshot_path}")
    logger.info(f"Looking for {len(grokipedia_titles)} titles")

    extracted_count = 0
    total_processed = 0
    batch_buffer = []

    # Explicit UTF-8 so non-ASCII articles are written correctly regardless
    # of the platform's default text encoding.
    with tarfile.open(snapshot_path, 'r:gz') as tar, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        try:
            # Process files as we encounter them using tar.next()
            while True:
                member = tar.next()
                if member is None:
                    break

                # Only process .ndjson files
                if not member.name.endswith('.ndjson'):
                    continue

                # extractfile() returns None for members that carry no data
                # (e.g. directories); there is nothing to read from those.
                member_file = tar.extractfile(member)
                if member_file is None:
                    continue

                logger.info(f"Processing file: {member.name} ({member.size / 1024 / 1024:.2f} MB)")

                with member_file as f, \
                        tqdm(total=member.size, unit='B', unit_scale=True,
                             desc=f"Processing {member.name}") as pbar:
                    for line in f:
                        # Count the raw bytes up front so skipped or
                        # malformed lines still advance the progress bar.
                        pbar.update(len(line))

                        # Only parsing/matching is best-effort; output
                        # errors below must not be swallowed per line.
                        try:
                            text = line.decode('utf-8')
                            article = json.loads(text)
                            total_processed += 1
                            is_match = article.get('name', '') in grokipedia_titles
                        except json.JSONDecodeError:
                            continue
                        except Exception as e:
                            logger.warning(f"Error processing line: {e}")
                            continue

                        if not is_match:
                            continue

                        # The last line of a member may lack a newline;
                        # add one so records never run together.
                        if not text.endswith('\n'):
                            text += '\n'
                        batch_buffer.append(text)
                        extracted_count += 1

                        # Write batch when buffer is full
                        if len(batch_buffer) >= batch_size:
                            outfile.writelines(batch_buffer)
                            batch_buffer = []
                            logger.info(f"Extracted {extracted_count} articles so far...")
        finally:
            # Flush whatever is still buffered, including when an error
            # aborts the run, so matches found so far are not lost.
            if batch_buffer:
                outfile.writelines(batch_buffer)
                batch_buffer = []

    logger.info("Streaming extraction complete!")
    logger.info(f"Total articles processed: {total_processed:,}")
    logger.info(f"Grokipedia articles extracted: {extracted_count:,}")
    logger.info(f"Output saved to: {output_path}")

    return extracted_count

In [None]:
# Input archive and destination for the filtered NDJSON output
snapshot_file = "../enwiki_structured.tar.gz"
output_file = "../grokipedia_wikipedia_articles.ndjson"

# Kick off the streaming extraction; the return value is the number of
# articles written to `output_file`.
extracted_count = stream_extract_grokipedia_pages(
    snapshot_path=snapshot_file,
    grokipedia_titles=grokipedia_titles,
    output_path=output_file,
    batch_size=1000,
)