### Single request to a Wikipedia category; examine the response

In [2]:
import requests

cat = 'Physics'
# Make the request to fetch category information.
# The query is passed via `params` so that `requests` URL-encodes the category
# title (titles containing spaces, '&' or '+' would otherwise corrupt the query).
url = "https://en.wikipedia.org/w/api.php"
params = {
    'action': 'query',
    'format': 'json',
    'list': 'categorymembers',
    'cmtitle': f'Category:{cat}',
    'cmlimit': 'max',
}
headers = {
    'User-Agent': 'Tinker/0.1 (kartikeyapophali@gmail.com)'
}
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, params=params, headers=headers, timeout=30)
# Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
response.raise_for_status()

# Print the response
print(response.json())

{'batchcomplete': '', 'limits': {'categorymembers': 500}, 'query': {'categorymembers': [{'pageid': 22939, 'ns': 0, 'title': 'Physics'}, {'pageid': 844186, 'ns': 0, 'title': 'Modern physics'}, {'pageid': 1653925, 'ns': 100, 'title': 'Portal:Physics'}, {'pageid': 78053369, 'ns': 0, 'title': 'Bijel'}, {'pageid': 74985603, 'ns': 0, 'title': 'Edge states'}, {'pageid': 1996857, 'ns': 0, 'title': 'Nucleation'}, {'pageid': 2137509, 'ns': 0, 'title': 'Perfect fluid'}, {'pageid': 27481335, 'ns': 0, 'title': 'Plasmaron'}, {'pageid': 75463818, 'ns': 0, 'title': 'Quasi-isodynamic stellarator'}, {'pageid': 76197486, 'ns': 0, 'title': 'Shockwave cosmology'}, {'pageid': 21276538, 'ns': 0, 'title': 'Surface stress'}, {'pageid': 467047, 'ns': 0, 'title': 'Thermal energy'}, {'pageid': 74170779, 'ns': 0, 'title': 'Toroidal solenoid'}, {'pageid': 70983414, 'ns': 14, 'title': 'Category:Physics by country'}, {'pageid': 49740128, 'ns': 14, 'title': 'Category:Subfields of physics'}, {'pageid': 37468090, 'ns': 

### Fetch category and page information from wiki recursively
- Create a directory structure that mirrors the categories and their subcategories
- Save each category's API response as `response.json` in the corresponding directory

In [3]:
import requests
import json
import os

# Crawl-wide accumulators, recreated empty each time this cell runs:
#   pageTitleSet     - titles of the ns-0 (article) members seen
#   pageIdSet        - page ids of the ns-0 (article) members seen
#   categoryTitleSet - despite its name, the page ids of the subcategories
#                      already visited in this cell
pageTitleSet, pageIdSet, categoryTitleSet = set(), set(), set()

def _fetch_category_members(category, headers, timeout=30):
    """Fetch every member of a Wikipedia category, following API continuation.

    Args:
        category: Category title without the "Category:" prefix.
        headers: HTTP headers sent with every request.
        timeout: Per-request timeout in seconds.

    Returns:
        A ``(payload, members)`` tuple. ``members`` is the list of
        ``categorymembers`` entries gathered from all batches. ``payload`` is
        the API response to persist: the untouched response when a single
        batch was enough, otherwise the final batch with its
        ``categorymembers`` list replaced by the combined list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    # Passing the query as a dict lets requests URL-encode the category title.
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "max",
    }

    members = []
    payload = None
    batch_count = 0
    while True:
        response = requests.get(api_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
        batch_count += 1
        # An API error payload has no "query" key; treat it as "no members".
        members.extend(payload.get("query", {}).get("categorymembers", []))

        continuation = payload.get("continue")
        if not continuation:
            break
        # Feed the continuation tokens back to request the next batch.
        params.update(continuation)

    if batch_count > 1:
        payload.setdefault("query", {})["categorymembers"] = members
    return payload, members


def fetch_category_pages(category, filepath, depth):
    """Crawl a Wikipedia category, recording its pages and recursing into subcategories.

    The API response for the category is saved as ``response.json`` inside
    ``filepath``; each subcategory gets its own sub-directory named after it.
    Article titles and ids are added to the module-level ``pageTitleSet`` and
    ``pageIdSet``. Page ids of visited subcategories are added to
    ``categoryTitleSet`` so that a subcategory is crawled only once.

    Args:
        category: Category title without the "Category:" prefix.
        filepath: Directory in which this category's data is stored.
        depth: Depth counter. The call works with ``depth + 1`` and returns
            immediately, without touching the disk or the network, when that
            value exceeds 1.

    Returns:
        None.
    """
    dep = depth + 1
    if dep > 1:
        return

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(filepath, exist_ok=True)

    headers = {"User-Agent": "Tinker/0.1 (kartikeyapophali@gmail.com)"}
    data, members = _fetch_category_members(category, headers)

    # Write the response to a response.json file
    response_filepath = os.path.join(filepath, "response.json")
    with open(response_filepath, "w") as file:
        json.dump(data, file)

    # Filter out pages and subcategories with 'bird' (any casing) in the title
    members = [member for member in members if 'bird' not in member["title"].lower()]

    # Separate pages (namespace 0) and subcategories (namespace 14)
    actual_pages = [member for member in members if member["ns"] == 0]
    subcategories = [member for member in members if member["ns"] == 14]

    # Process actual pages
    for page in actual_pages:
        pageTitleSet.add(page["title"])
        pageIdSet.add(page["pageid"])

    # Process subcategories
    for subcategory in subcategories:
        subcategory_id = subcategory["pageid"]
        if subcategory_id in categoryTitleSet:
            continue
        categoryTitleSet.add(subcategory_id)
        # Strip only the leading namespace prefix, not later occurrences.
        subcategory_name = subcategory["title"].replace("Category:", "", 1)
        subcategory_path = os.path.join(filepath, subcategory_name)
        # Recursively fetch pages from subcategories
        fetch_category_pages(subcategory_name, subcategory_path, dep)
                
# Crawl the Physics tree. The function stops once depth + 1 exceeds 1, so a
# starting depth of -1000 permits roughly a thousand levels of nesting, which
# makes the crawl effectively unbounded.
fetch_category_pages("Physics", "/aux/data/wiki/physics/Physics", -1000)
# Shallow alternative (root category plus its direct subcategories only):
# fetch_category_pages("Physics", "Physics", -1)
summary_lines = [
    f"Count of pages by page titles: {len(pageTitleSet)}",
    f"Count of pages by page id: {len(pageIdSet)}",
    f"Count of categories: {len(categoryTitleSet)}",
]
print("\n".join(summary_lines))

KeyboardInterrupt: 

In [None]:
# Report how many subcategories have been recorded so far.
category_summary = f"Count of categories: {len(categoryTitleSet)}"
print(category_summary)

### Fetch category pages down to depth 1 (the root category plus its direct subcategories)

In [4]:
import requests
import json
import os

# Crawl-wide accumulators, recreated empty each time this cell runs:
#   pageTitleSet     - titles of the ns-0 (article) members seen
#   categoryTitleSet - titles (without the "Category:" prefix) of the
#                      subcategories already visited
pageTitleSet, categoryTitleSet = set(), set()


def _fetch_category_members(category, headers, timeout=30):
    """Fetch every member of a Wikipedia category, following API continuation.

    Args:
        category: Category title without the "Category:" prefix.
        headers: HTTP headers sent with every request.
        timeout: Per-request timeout in seconds.

    Returns:
        A ``(payload, members)`` tuple. ``members`` is the list of
        ``categorymembers`` entries gathered from all batches. ``payload`` is
        the API response to persist: the untouched response when a single
        batch was enough, otherwise the final batch with its
        ``categorymembers`` list replaced by the combined list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    # Passing the query as a dict lets requests URL-encode the category title.
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "max",
    }

    members = []
    payload = None
    batch_count = 0
    while True:
        response = requests.get(api_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
        batch_count += 1
        # An API error payload has no "query" key; treat it as "no members".
        members.extend(payload.get("query", {}).get("categorymembers", []))

        continuation = payload.get("continue")
        if not continuation:
            break
        # Feed the continuation tokens back to request the next batch.
        params.update(continuation)

    if batch_count > 1:
        payload.setdefault("query", {})["categorymembers"] = members
    return payload, members


def fetch_category_pages(category, filepath, depth):
    """Crawl a Wikipedia category, recording new pages and recursing into subcategories.

    Three files are written into ``filepath``: ``response.json`` (the API
    response), ``pages_skipped.json`` (article titles that were already in the
    module-level ``pageTitleSet``) and ``categories_skipped.json`` (subcategory
    titles that were already in ``categoryTitleSet``). New titles are added to
    those two sets, and each new subcategory is crawled into a sub-directory
    named after it.

    Args:
        category: Category title without the "Category:" prefix.
        filepath: Directory in which this category's data is stored.
        depth: Current depth. The call returns immediately, without touching
            the disk or the network, when it is greater than 1.

    Returns:
        None.
    """
    if depth > 1:
        return

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(filepath, exist_ok=True)

    headers = {"User-Agent": "Tinker/0.1 (kartikeyapophali@gmail.com)"}
    data, category_members = _fetch_category_members(category, headers)

    # Write the response to a response.json file
    response_filepath = os.path.join(filepath, "response.json")
    with open(response_filepath, "w") as file:
        json.dump(data, file)

    pages_skipped = []
    categories_skipped = []

    # Filter out pages and subcategories with 'bird' (any casing) in the title
    category_members = [
        member for member in category_members if "bird" not in member["title"].lower()
    ]

    # Separate pages (namespace 0) and subcategories (namespace 14)
    pages = [member for member in category_members if member["ns"] == 0]
    subcategories = [member for member in category_members if member["ns"] == 14]

    # Process actual pages
    for page in pages:
        page_title = page["title"]
        if page_title in pageTitleSet:
            pages_skipped.append(page_title)
            continue
        pageTitleSet.add(page_title)

    # Process subcategories
    for subcategory in subcategories:
        # Strip only the leading namespace prefix, not later occurrences.
        subcategory_title = subcategory["title"].replace("Category:", "", 1)
        if subcategory_title in categoryTitleSet:
            categories_skipped.append(subcategory_title)
            continue
        categoryTitleSet.add(subcategory_title)

        subcategory_path = os.path.join(filepath, subcategory_title)
        # Recursively fetch pages from subcategories
        fetch_category_pages(subcategory_title, subcategory_path, depth + 1)

    # Write the skipped pages to a file
    pages_skipped_filepath = os.path.join(filepath, "pages_skipped.json")
    with open(pages_skipped_filepath, "w") as file:
        json.dump(pages_skipped, file)

    # Write the skipped categories to a file
    categories_skipped_filepath = os.path.join(filepath, "categories_skipped.json")
    with open(categories_skipped_filepath, "w") as file:
        json.dump(categories_skipped, file)

In [None]:
# Crawl the Dinosaurs category starting at depth 0; results are written under
# the relative data/Dinosaurs directory.
# Earlier variant using an absolute output path:
# fetch_category_pages("Dinosaurs", "/aux/data/wiki/Dinosaurs", -1000)
fetch_category_pages("Dinosaurs", "data/Dinosaurs", 0)
summary_lines = [
    f"Count of pages by page titles: {len(pageTitleSet)}",
    f"Count of categories: {len(categoryTitleSet)}",
]
print("\n".join(summary_lines))