In [2]:
import json
import os
from typing import List, Optional

import redis
import requests
import wikipediaapi

In [3]:
# Shared Redis client (local server, port 6379, database 0). The crawl functions
# in this notebook use it to remember which pages and categories were handled.
r = redis.Redis(
    host="localhost",
    port=6379,
    db=0,
)

In [21]:
def wiki_category_members(category: str, filepath: str) -> list:
    """
    Returns the list of pages and sub-categories of a given Wikipedia category.

    The result is cached in ``<filepath>/.metadata/download/response.json``. When
    that file already exists its content is returned and Wikipedia is not contacted.

    Args:
        category: Category title without the ``Category:`` prefix.
        filepath: Directory that holds (or will hold) the ``.metadata/download`` cache.

    Returns:
        The ``categorymembers`` entries reported by the MediaWiki API, across all
        result pages of the query.

    Raises:
        requests.HTTPError: If Wikipedia answers with an HTTP error status.
        RuntimeError: If the API payload itself reports an error. Nothing is cached
            in that case, so a later call can retry.
    """
    metadata_download_path = os.path.join(filepath, ".metadata/download")
    os.makedirs(metadata_download_path, exist_ok=True)

    response_filepath = os.path.join(metadata_download_path, "response.json")
    if os.path.exists(response_filepath):
        with open(response_filepath, "r", encoding="utf-8") as file:
            response_data = json.load(file)
        return response_data["query"]["categorymembers"]

    url = "https://en.wikipedia.org/w/api.php"
    headers = {"User-Agent": "Tinker/0.1 (kartikeyapophali@gmail.com)"}
    # Passing the query as params lets requests URL-encode the category title
    # (spaces, "&", non-ASCII characters) instead of splicing it into the URL raw.
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "max",
    }

    members = []
    while True:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        payload = response.json()
        if "error" in payload:
            raise RuntimeError(
                f"Wikipedia API error for category {category!r}: {payload['error']}"
            )
        members.extend(payload.get("query", {}).get("categorymembers", []))

        # "cmlimit=max" still caps a single response; the API signals further
        # results through a "continue" object whose keys go into the next request.
        continuation = payload.get("continue")
        if not continuation:
            break
        params = {**params, **continuation}

    # Keep the on-disk layout the cache reader above expects.
    with open(response_filepath, "w", encoding="utf-8") as file:
        json.dump({"query": {"categorymembers": members}}, file)

    return members

In [29]:
def title_pathname_map(
    category: str, filepath: str, inverse_filter: Optional[List[str]] = None
) -> dict:
    """
    Returns a map of page_title to file name and category title to directory name derived from Wikipedia response query.
    Pages/categories in the inverse_filter list are not included.

    The map is cached in ``<filepath>/.metadata/download/title_pathname.json``. Once
    that file exists it is returned as-is, so a different ``inverse_filter`` on a
    later call has no effect until the cache file is removed.

    Args:
        category: Category title without the ``Category:`` prefix.
        filepath: Directory that holds (or will hold) the ``.metadata/download`` cache.
        inverse_filter: Keywords or phrases; any member whose title contains one of
            them (case-insensitive) is skipped. ``None`` or an empty list keeps
            every member.

    Returns:
        ``{"pages": {title: file_name}, "categories": {title: dir_name}}``.
    """
    metadata_download_path = os.path.join(filepath, ".metadata/download")

    title_pathname_filepath = os.path.join(
        metadata_download_path, "title_pathname.json"
    )
    if os.path.exists(title_pathname_filepath):
        with open(title_pathname_filepath, "r", encoding="utf-8") as file:
            return json.load(file)

    def to_pathname(title: str) -> str:
        # A "/" in a title would otherwise be read as a directory separator when
        # the name is joined onto a path; percent-encode it to keep one path
        # component per title.
        return title.replace(" ", "_").replace("/", "%2F")

    # Example: inverse_filter = ["birds", "list of"] will filter out all
    # pages/categories with "birds" or "list of" in their title.
    excluded_keywords = [keyword.lower() for keyword in (inverse_filter or [])]

    title_pathname = {"pages": {}, "categories": {}}

    for member in wiki_category_members(category, filepath):
        lowered_title = member["title"].lower()
        if any(keyword in lowered_title for keyword in excluded_keywords):
            continue

        if member["ns"] == 0:  # namespace 0: regular page
            page_title = member["title"]
            title_pathname["pages"][page_title] = f"{to_pathname(page_title)}.html"
        elif member["ns"] == 14:  # namespace 14: category
            category_title = member["title"]
            # Strip only the leading namespace prefix, not later occurrences.
            if category_title.startswith("Category:"):
                category_title = category_title[len("Category:"):]
            title_pathname["categories"][category_title] = to_pathname(category_title)

    # Do not rely on the member lookup having created the cache directory.
    os.makedirs(metadata_download_path, exist_ok=True)
    with open(title_pathname_filepath, "w", encoding="utf-8") as file:
        json.dump(title_pathname, file)

    return title_pathname

In [30]:
def download_page(page_title: str, page_filename: str, filepath: str) -> None:
    """
    Fetches the page content from Wikipedia and saves it in an HTML file in the specified directory.

    Args:
        page_title: Title of the Wikipedia page to fetch.
        page_filename: Name of the file to write inside ``filepath``.
        filepath: Existing directory that receives the file.

    Returns:
        None. The only result is the file written to disk.
    """
    wiki_html = wikipediaapi.Wikipedia(
        user_agent='Tinker/0.1 (kartikeyapophali@gmail.com)',
        language='en',
        extract_format=wikipediaapi.ExtractFormat.HTML
    )

    p_html = wiki_html.page(page_title)
    file_path = os.path.join(filepath, page_filename)
    # Write UTF-8 explicitly: without it open() uses the platform's locale
    # encoding, which can fail on non-ASCII article text.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(p_html.text)

In [31]:
def fetch_wiki_data(category: str, filepath: str, inverse_filter: List[str], category_pages_downloaded: dict, depth: int, max_depth: int = 100) -> int:
    """
    Downloads the pages of a Wikipedia category and, recursively, of its sub-categories.

    Pages are written into ``filepath`` and each sub-category into a directory
    below it. The Redis sets ``downloaded_pages`` and ``downloaded_categories``
    record what was already handled, so nothing is fetched twice.

    Args:
        category: Category title without the ``Category:`` prefix.
        filepath: Directory for this category's pages; created if missing.
        inverse_filter: Keywords/phrases; members whose title contains one are skipped.
        category_pages_downloaded: Dict updated in place with
            ``category -> number of pages downloaded directly in it``. Categories
            with no newly downloaded page are not added.
        depth: Depth of this call; the recursion passes ``depth + 1``.
        max_depth: Calls with ``depth > max_depth`` return 0 without doing anything.

    Returns:
        Number of pages downloaded by this call, sub-categories included.

    Note:
        A sub-category that is cut off by the depth limit is still recorded in
        ``downloaded_categories``.
    """
    if depth > max_depth:
        return 0

    os.makedirs(filepath, exist_ok=True)

    num_total_pages_downloaded = 0
    title_pathname = title_pathname_map(category, filepath, inverse_filter)

    pages = title_pathname["pages"]
    for page_title, page_filename in pages.items():
        if r.sismember("downloaded_pages", page_title):
            continue
        download_page(page_title, page_filename, filepath)
        num_total_pages_downloaded += 1
        r.sadd("downloaded_pages", page_title)

    if num_total_pages_downloaded > 0:
        category_pages_downloaded[category] = num_total_pages_downloaded

    subcategories = title_pathname["categories"]
    for subcategory_title, subcategory_dirname in subcategories.items():
        # Claim the sub-category BEFORE descending. sadd reports how many members
        # were newly added, so 0 means it is already claimed. Marking it only after
        # the recursion returned let a category cycle (A -> B -> A ...) recurse
        # all the way to the depth limit.
        if not r.sadd("downloaded_categories", subcategory_title):
            continue
        subcategory_path = os.path.join(filepath, subcategory_dirname)
        try:
            num_total_pages_downloaded += fetch_wiki_data(
                subcategory_title,
                subcategory_path,
                inverse_filter,
                category_pages_downloaded,
                depth + 1,
                max_depth,
            )
        except BaseException:
            # Release the claim so an interrupted sub-category is retried next run.
            r.srem("downloaded_categories", subcategory_title)
            raise

    return num_total_pages_downloaded

In [47]:
# Filled in place by fetch_wiki_data with per-category download counts.
category_pages_downloaded = {}

# fetch_wiki_data stops once depth exceeds 100, so starting at depth 99 limits
# this sample run to two levels of categories (depths 99 and 100).
pages_downloaded = fetch_wiki_data(
    "Dinosaurs",
    "data/v2/Dinosaurs",
    ["bird", "list of", "lists of"],
    category_pages_downloaded,
    99,
)

pages_downloaded

109

In [48]:
# Per-category count of pages downloaded in the run above; categories in which
# no new page was downloaded are not present in the dict.
category_pages_downloaded

{'Dinosaurs': 11,
 'Dinosaur-related lists': 3,
 'Dinosaur paleontology': 3,
 'Dinosaurs in popular culture': 14,
 'Dinosaur taxonomy': 1,
 'Ornithischians': 3,
 'Saurischians': 11,
 'Dinosaur stubs': 63}

In [4]:
# Number of page titles recorded in the Redis set "downloaded_pages".
r.scard("downloaded_pages")

109

In [5]:
# Number of category titles recorded in the Redis set "downloaded_categories".
r.scard("downloaded_categories")

43

***Note: There is no conflict between the number of entries in `category_pages_downloaded` and the size of `downloaded_categories` in Redis. The code does not add a category to the dict when its page count is 0. In this sample run the crawl starts at depth 99 against a limit of 100, so only two levels are processed; third-level categories are not processed, but they are still added to `downloaded_categories` in Redis.***  

In [44]:
# Reset the crawl bookkeeping. With both sets deleted, fetch_wiki_data no longer
# skips any page or category on its next run.
r.delete("downloaded_pages")
r.delete("downloaded_categories")

1



***TODO: Include error handling (network failures, Wikipedia API errors, file and Redis I/O) before using this in production code.***