In [2]:
import os
import json
import requests
from time import sleep

BATCH_SIZE = 60
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"

def fetch_pages(page_titles: list, output_dir: str) -> dict:
    """
    Fetches the content of multiple Wikipedia pages and saves them in HTML files in the specified directory.
    Returns a dictionary mapping page titles to file names.
    """
    params = {
        "action": "query",
        "titles": "|".join(page_titles),
        "prop": "revisions",
        "rvprop": "content",
        "format": "json",
        "formatversion": "2"
    }
    response = requests.get(WIKIPEDIA_API_URL, params=params)
    response.raise_for_status()
    data = response.json()
    
    page_title_to_file_name = {}
    for page in data["query"]["pages"]:
        page_title = page["title"]
        underscored_page_title = page_title.replace(" ", "_")
        file_name = f"{underscored_page_title}.html"
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, "w") as file:
            file.write(page["revisions"][0]["content"])
        page_title_to_file_name[page_title] = file_name
    
    return page_title_to_file_name

def update_checkpoint(checkpoint_data: dict, checkpoint_file: str):
    with open(checkpoint_file, 'w') as file:
        json.dump(checkpoint_data, file, indent=4)

def load_checkpoint(checkpoint_file: str) -> dict:
    if not os.path.exists(checkpoint_file):
        return {}
    with open(checkpoint_file, 'r') as file:
        return json.load(file)

def process_batch(pages: list, directory: str, checkpoint_data: dict, checkpoint_file: str):
    page_title_to_file_name = fetch_pages(pages, directory)
    checkpoint_data.update(page_title_to_file_name)
    update_checkpoint(checkpoint_data, checkpoint_file)

def process_directory(directory: str):
    response_file = os.path.join(directory, 'response.json')
    checkpoint_file = os.path.join(directory, 'page_title_file_name_map.json')
    
    with open(response_file, 'r') as file:
        response_data = json.load(file)
    
    checkpoint_data = load_checkpoint(checkpoint_file)
    pages_to_process = [member['title'] for member in response_data['query']['categorymembers'] if member['ns'] == 0]
    subcategories = [member['title'] for member in response_data['query']['categorymembers'] if member['ns'] == 14]
    
    # Process pages in batches
    for i in range(0, len(pages_to_process), BATCH_SIZE):
        batch = pages_to_process[i:i + BATCH_SIZE]
        process_batch(batch, directory, checkpoint_data, checkpoint_file)
        sleep(1)  # Politeness delay between batches

    # Process subcategories
    for subcategory in subcategories:
        subcategory_dir = os.path.join(directory, subcategory.replace('Category:', ''))
        if os.path.isdir(subcategory_dir):
            process_directory(subcategory_dir)

In [3]:
process_directory('Dinosaurs')

KeyError: 'query'

Abandon experiment for later:
- Main issue is that the downloaded docs are not cleaned of unwanted tags. Lot of preprocessing may be required to get the desired result.
- Investigate key error in above process