In [8]:
import wikipediaapi
import os

def fetch_page(page_title: str, output_dir: str) -> str:
    """
    Fetch a Wikipedia page as HTML and save it inside ``output_dir``.

    The file is named after the page title with spaces replaced by
    underscores and a ``.html`` suffix. Path separators in the title are
    percent-encoded so that a title such as "AC/DC" is written as a single
    file inside ``output_dir`` rather than being interpreted as a nested path.

    Args:
        page_title: Title of the English Wikipedia page to fetch.
        output_dir: Directory the HTML file is written to. It is not created
            here, so it must already exist.

    Returns:
        The name (not the full path) of the file that was written.
    """
    wiki_html = wikipediaapi.Wikipedia(
        user_agent='Tinker/0.1 (kartikeyapophali@gmail.com)',
        language='en',
        extract_format=wikipediaapi.ExtractFormat.HTML
    )

    p_html = wiki_html.page(page_title)

    # Keep the title a single path component: "/" (and the native separator
    # on platforms where it differs) must not create or escape directories.
    safe_title = page_title.replace(" ", "_").replace("/", "%2F")
    if os.sep != "/":
        safe_title = safe_title.replace(os.sep, "%5C")

    file_name = f"{safe_title}.html"
    file_path = os.path.join(output_dir, file_name)
    # Article text is Unicode; write UTF-8 explicitly instead of relying on
    # the platform default encoding, which cannot represent every character.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(p_html.text)

    return file_name

In [9]:
import os
import json

def update_checkpoint(checkpoint_file: str, page_title: str, file_name: str):
    """
    Record ``page_title -> file_name`` in the JSON checkpoint file.

    The checkpoint is a single JSON object. If the file does not exist yet it
    is first created holding an empty object; the existing mapping is then
    read, the entry is added (or overwritten), and the whole mapping is
    written back in place.

    Args:
        checkpoint_file: Path of the JSON checkpoint file.
        page_title: Key to store.
        file_name: Value to store under ``page_title``.
    """
    checkpoint_missing = not os.path.exists(checkpoint_file)
    if checkpoint_missing:
        # Seed the file so the read-modify-write below always has valid JSON.
        with open(checkpoint_file, 'w') as handle:
            json.dump({}, handle, indent=4)

    with open(checkpoint_file, 'r+') as handle:
        mapping = json.loads(handle.read())
        mapping[page_title] = file_name
        # Rewind and overwrite from the start; truncate afterwards in case the
        # new serialization is shorter than what was there before.
        handle.seek(0)
        json.dump(mapping, handle, indent=4)
        handle.truncate()

def load_checkpoint(checkpoint_file: str):
    """
    Return the contents of the JSON checkpoint file.

    If the file does not exist it is created holding an empty JSON object
    before being read, so a missing checkpoint yields an empty mapping.

    Args:
        checkpoint_file: Path of the JSON checkpoint file.

    Returns:
        The parsed JSON content of the checkpoint file.
    """
    checkpoint_missing = not os.path.exists(checkpoint_file)
    if checkpoint_missing:
        with open(checkpoint_file, 'w') as handle:
            json.dump({}, handle, indent=4)

    with open(checkpoint_file, 'r') as handle:
        contents = handle.read()
    return json.loads(contents)

In [10]:
def process_directory(directory: str):
    """
    Download every page listed in ``directory/response.json`` and recurse
    into subcategory directories.

    ``response.json`` is read as ``{"query": {"categorymembers": [...]}}``
    where every member has an ``ns`` and a ``title``. Members with ``ns == 0``
    are fetched with ``fetch_page`` and recorded in
    ``page_title_file_name_map.json`` inside the same directory. A page is
    skipped only when it is recorded in that map AND the recorded file is
    still present in ``directory``, so a stale map entry whose file was
    removed is fetched again. Members with ``ns == 14`` are treated as
    subcategories: the ``Category:`` prefix is dropped from the title and, if
    a directory with that name exists under ``directory``, it is processed
    recursively.

    Errors while fetching or recording a single page are printed and do not
    stop the run.

    Args:
        directory: Directory containing ``response.json``.
    """
    response_file = os.path.join(directory, 'response.json')
    checkpoint_file = os.path.join(directory, 'page_title_file_name_map.json')

    with open(response_file, 'r') as file:
        response_data = json.load(file)

    members = response_data['query']['categorymembers']
    checkpoint_data = load_checkpoint(checkpoint_file)

    for member in members:
        if member['ns'] != 0:  # Only process pages, not categories
            continue

        page_title = member['title']
        saved_name = checkpoint_data.get(page_title)
        # Trust a checkpoint entry only while the file it points at exists;
        # otherwise the page would never be downloaded again.
        if isinstance(saved_name, str) and os.path.exists(os.path.join(directory, saved_name)):
            continue

        try:
            file_name = fetch_page(page_title, directory)
            update_checkpoint(checkpoint_file, page_title, file_name)
            # Keep the in-memory view current so a title listed twice is not
            # downloaded twice in the same run.
            checkpoint_data[page_title] = file_name
        except Exception as e:
            print(f"Error processing page {page_title}: {e}")

    category_prefix = 'Category:'
    for member in members:
        if member['ns'] != 14:  # Process subcategories
            continue

        subcategory_name = member['title']
        # Drop only the leading namespace prefix, not later occurrences of
        # the same text inside the category name.
        if subcategory_name.startswith(category_prefix):
            subcategory_name = subcategory_name[len(category_prefix):]

        subcategory_dir = os.path.join(directory, subcategory_name)
        if os.path.isdir(subcategory_dir):
            process_directory(subcategory_dir)

In [10]:
# Fetch the pages listed under the 'Dinosaurs' directory tree.
# NOTE(review): later cells use "data/Dinosaurs" — confirm which path is current.
process_directory('Dinosaurs')

Satisfactory result.

***Notes:***
- Fetching 127 pages took around 93 seconds in the above experiment (about 0.73 seconds per page). At that rate, 3,800 pages will take roughly 46 minutes, i.e. close to an hour.
- Performance could be improved with batch processing. This was tried, but the pages fetched in batches were not in a satisfactory format.
- Since downloading and indexing will not be done often, about an hour of download time is an acceptable trade-off.

In [11]:
# Fetch the pages listed under data/Dinosaurs, recursing into subcategory directories that exist on disk.
process_directory("data/Dinosaurs")

The above was also tested with partial runs in which some of the files were arbitrarily deleted to exercise checkpointing. The checkpoint file was created wherever it did not exist, and the files were fetched.

In [12]:
# Re-run with the checkpoint map already populated; little or nothing should need fetching.
process_directory("data/Dinosaurs")

The above run was expected to be fast because all files already exist.

Next, delete a few entries from `page_title_file_name_map.json` and delete the corresponding files.

In [13]:
# Re-run after manually removing entries: pages absent from the checkpoint map are fetched again.
process_directory("data/Dinosaurs")

Works! Three arbitrary files were removed along with their corresponding entries in the map. Upon rerunning the function, the files were restored along with their mappings.