### Single request to wiki category, examine response

In [1]:
import requests

cat = 'Dinosaurs'

# Fetch the member listing of one category from the MediaWiki API.
# The query is passed via `params` so that requests percent-encodes it;
# interpolating the title into the URL breaks for titles containing
# characters such as '&', '#' or '+'.
url = "https://en.wikipedia.org/w/api.php"
params = {
    'action': 'query',
    'format': 'json',
    'list': 'categorymembers',
    'cmtitle': f"Category:{cat}",
    'cmlimit': 'max',
}
headers = {
    'User-Agent': 'Tinker/0.1 (kartikeyapophali@gmail.com)'
}
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, params=params, headers=headers, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
response.raise_for_status()

# Print the response
print(response.json())

{'batchcomplete': '', 'limits': {'categorymembers': 500}, 'query': {'categorymembers': [{'pageid': 8311, 'ns': 0, 'title': 'Dinosaur'}, {'pageid': 1070621, 'ns': 0, 'title': 'Dinosaur classification'}, {'pageid': 10319435, 'ns': 100, 'title': 'Portal:Dinosaurs'}, {'pageid': 31102995, 'ns': 0, 'title': 'Outline of dinosaurs'}, {'pageid': 74223457, 'ns': 0, 'title': 'Attenborough and the Giant Dinosaur'}, {'pageid': 3410, 'ns': 0, 'title': 'Bird'}, {'pageid': 174609, 'ns': 0, 'title': 'Chicxulub crater'}, {'pageid': 44503418, 'ns': 0, 'title': 'Cretaceous–Paleogene extinction event'}, {'pageid': 41707879, 'ns': 0, 'title': 'Glossary of dinosaur anatomy'}, {'pageid': 54226283, 'ns': 0, 'title': 'List of non-avian dinosaur species preserved with evidence of feathers'}, {'pageid': 24865137, 'ns': 0, 'title': 'Opisthotonic death pose'}, {'pageid': 6763404, 'ns': 0, 'title': 'Origin of birds'}, {'pageid': 25790373, 'ns': 0, 'title': 'Ornithoscelida'}, {'pageid': 6040372, 'ns': 0, 'title': 'Ph

### Fetch category and page information from wiki recursively
- Create a directory tree that mirrors the category/subcategory hierarchy
- Save the API response as `response.json` in each category's directory
- Skip any page or subcategory whose title contains the substring 'bird' (case-insensitive)

In [13]:
import requests
import json
import os

# Module-level accumulators filled in by fetch_category_pages:
#   pageTitleSet  - titles of the article pages encountered
#   pageIdSet     - page ids of the same article pages
#   categoryIdSet - page ids of subcategories already scheduled for a visit
pageTitleSet, pageIdSet, categoryIdSet = set(), set(), set()

def fetch_category_pages(category, filepath, depth):
    """Crawl a Wikipedia category tree and record the pages it contains.

    For ``category`` this fetches the complete member listing from the
    MediaWiki API (``list=categorymembers``), following the API's
    ``continue`` tokens so that categories with more members than one
    response can hold are not truncated. The merged listing is written to
    ``<filepath>/response.json``. Members whose title contains 'bird'
    (case-insensitive) are ignored. Remaining namespace-0 members are added
    to the module-level ``pageTitleSet`` / ``pageIdSet``; namespace-14
    members (subcategories) whose page id is not yet in ``categoryIdSet``
    are recorded there and crawled recursively into a sub-directory named
    after the subcategory.

    Args:
        category: Category title without the ``Category:`` prefix.
        filepath: Directory that receives this category's ``response.json``;
            created if it does not exist.
        depth: Recursion counter. A category is processed only while
            ``depth <= 0``; every recursive call passes ``depth + 1``.
            Start from a negative value to allow a deeper crawl (``-1``
            fetches the root and its direct subcategories).

    Raises:
        requests.HTTPError: If the API answers with a non-success status.
        requests.RequestException: On connection failures or timeouts.
    """
    dep = depth + 1
    if dep > 1:
        return

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(filepath, exist_ok=True)

    api_url = "https://en.wikipedia.org/w/api.php"
    headers = {"User-Agent": "Tinker/0.1 (kartikeyapophali@gmail.com)"}
    # Passing the query as a dict lets requests percent-encode the title;
    # interpolating it into the URL breaks for titles containing '&', '#', '+'.
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "max",
    }

    # Page through the listing: while the API returns a "continue" object,
    # its entries must be echoed back as parameters of the next request.
    data = None
    while True:
        response = requests.get(api_url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        batch = response.json()
        if data is None:
            data = batch
        else:
            extra_members = batch.get("query", {}).get("categorymembers", [])
            data.setdefault("query", {}).setdefault("categorymembers", []).extend(extra_members)
        continuation = batch.get("continue")
        if not continuation:
            break
        params = {**params, **continuation}
    # The saved document holds the merged listing, so the token is obsolete.
    data.pop("continue", None)

    # Write the (merged) response to a response.json file
    response_filepath = os.path.join(filepath, "response.json")
    with open(response_filepath, "w") as file:
        json.dump(data, file)

    # An API-level error payload has no "query" key; it has been saved above
    # for inspection, so skip this category instead of aborting the crawl.
    if "query" not in data:
        print(f"Warning: no 'query' in API response for Category:{category}; see {response_filepath}")
        return

    members = data["query"].get("categorymembers", [])

    # Filter out pages and subcategories with 'bird' (any case) in the title
    members = [member for member in members if 'bird' not in member["title"].lower()]

    # Namespace 0 holds article pages.
    for member in members:
        if member["ns"] == 0:
            pageTitleSet.add(member["title"])
            pageIdSet.add(member["pageid"])

    # Namespace 14 holds categories; recurse into each one only once.
    prefix = "Category:"
    for member in members:
        if member["ns"] != 14:
            continue
        subcategory_id = member["pageid"]
        if subcategory_id in categoryIdSet:
            continue
        categoryIdSet.add(subcategory_id)
        title = member["title"]
        # Strip only the leading namespace prefix, not every occurrence.
        subcategory_name = title[len(prefix):] if title.startswith(prefix) else title
        subcategory_path = os.path.join(filepath, subcategory_name)
        fetch_category_pages(subcategory_name, subcategory_path, dep)
                
# Crawl the "Dinosaurs" category tree, writing each category's response under
# /aux/data/wiki/Dinosaurs. The function stops once its depth counter rises
# above 0, so starting at -1000 makes the crawl effectively unbounded in depth;
# starting at -1 instead would fetch only the root and its direct subcategories.
fetch_category_pages(
    category="Dinosaurs",
    filepath="/aux/data/wiki/Dinosaurs",
    depth=-1000,
)

# Summarise what the crawl collected in the module-level sets.
print(f"Count of pages by page titles: {len(pageTitleSet)}")
print(f"Count of pages by page id: {len(pageIdSet)}")
print(f"Count of categories: {len(categoryIdSet)}")

Count of pages by page titles: 3810
Count of pages by page id: 3810
Count of categories: 364


In [14]:
# Re-display the number of distinct subcategory ids recorded in categoryIdSet.
print(f"Count of categories: {len(categoryIdSet)}")

Count of categories: 364


### Fetch category pages for depth=1

In [3]:
import requests
import json
import os

# Fresh, independent accumulators for the depth-limited crawl; they are
# populated as a side effect of calling fetch_category_pages:
#   pageTitleSet  - titles of the article pages encountered
#   pageIdSet     - page ids of the same article pages
#   categoryIdSet - page ids of subcategories already scheduled for a visit
pageTitleSet, pageIdSet, categoryIdSet = set(), set(), set()

def fetch_category_pages(category, filepath, depth, max_depth=1):
    """Crawl a Wikipedia category tree down to a limited depth.

    For ``category`` this fetches the complete member listing from the
    MediaWiki API (``list=categorymembers``), following the API's
    ``continue`` tokens so that categories with more members than one
    response can hold are not truncated. The merged listing is written to
    ``<filepath>/response.json``. Members whose title contains 'bird'
    (case-insensitive) are ignored. Remaining namespace-0 members are added
    to the module-level ``pageTitleSet`` / ``pageIdSet``; namespace-14
    members (subcategories) whose page id is not yet in ``categoryIdSet``
    are recorded there and crawled recursively into a sub-directory named
    after the subcategory.

    Args:
        category: Category title without the ``Category:`` prefix.
        filepath: Directory that receives this category's ``response.json``;
            created if it does not exist.
        depth: Depth of ``category`` in the crawl; pass 0 for the root.
        max_depth: Deepest level that is still fetched. The default of 1
            fetches the root and its direct subcategories. Subcategories
            found at ``max_depth`` are still recorded in ``categoryIdSet``
            but are not fetched.

    Raises:
        requests.HTTPError: If the API answers with a non-success status.
        requests.RequestException: On connection failures or timeouts.
    """
    if depth > max_depth:
        return
    next_depth = depth + 1

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(filepath, exist_ok=True)

    api_url = "https://en.wikipedia.org/w/api.php"
    headers = {"User-Agent": "Tinker/0.1 (kartikeyapophali@gmail.com)"}
    # Passing the query as a dict lets requests percent-encode the title;
    # interpolating it into the URL breaks for titles containing '&', '#', '+'.
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "max",
    }

    # Page through the listing: while the API returns a "continue" object,
    # its entries must be echoed back as parameters of the next request.
    data = None
    while True:
        response = requests.get(api_url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        batch = response.json()
        if data is None:
            data = batch
        else:
            extra_members = batch.get("query", {}).get("categorymembers", [])
            data.setdefault("query", {}).setdefault("categorymembers", []).extend(extra_members)
        continuation = batch.get("continue")
        if not continuation:
            break
        params = {**params, **continuation}
    # The saved document holds the merged listing, so the token is obsolete.
    data.pop("continue", None)

    # Write the (merged) response to a response.json file
    response_filepath = os.path.join(filepath, "response.json")
    with open(response_filepath, "w") as file:
        json.dump(data, file)

    # An API-level error payload has no "query" key; it has been saved above
    # for inspection, so skip this category instead of aborting the crawl.
    if "query" not in data:
        print(f"Warning: no 'query' in API response for Category:{category}; see {response_filepath}")
        return

    members = data["query"].get("categorymembers", [])

    # Filter out pages and subcategories with 'bird' (any case) in the title
    members = [member for member in members if 'bird' not in member["title"].lower()]

    # Namespace 0 holds article pages.
    for member in members:
        if member["ns"] == 0:
            pageTitleSet.add(member["title"])
            pageIdSet.add(member["pageid"])

    # Namespace 14 holds categories; recurse into each one only once.
    prefix = "Category:"
    for member in members:
        if member["ns"] != 14:
            continue
        subcategory_id = member["pageid"]
        if subcategory_id in categoryIdSet:
            continue
        categoryIdSet.add(subcategory_id)
        title = member["title"]
        # Strip only the leading namespace prefix, not every occurrence.
        subcategory_name = title[len(prefix):] if title.startswith(prefix) else title
        subcategory_path = os.path.join(filepath, subcategory_name)
        fetch_category_pages(subcategory_name, subcategory_path, next_depth, max_depth)

In [4]:
# Depth-limited crawl of the "Dinosaurs" category, starting at depth 0 and
# writing each category's response into the relative "Dinosaurs" directory.
fetch_category_pages(
    category="Dinosaurs",
    filepath="Dinosaurs",
    depth=0,
)

# Summarise what the crawl collected in the module-level sets.
print(f"Count of pages by page titles: {len(pageTitleSet)}")
print(f"Count of pages by page id: {len(pageIdSet)}")
print(f"Count of categories: {len(categoryIdSet)}")

Count of pages by page titles: 127
Count of pages by page id: 127
Count of categories: 46
