<a href="https://colab.research.google.com/github/kousiknandy/pycolab/blob/main/SimpleWiki_AsyncIO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
from bs4 import BeautifulSoup
import aiohttp
import asyncio

class AWikiCrawler:
    """Fetch one page from simple.wikipedia.org and extract its related-page links.

    ``url`` and ``host`` are set by ``__init__``; ``data`` (the raw HTML of the
    fetched page) is set by ``next_urls``.
    """

    def __init__(self, url):
        """Remember the site-relative path to fetch, e.g. ``"/wiki/Language"``."""
        self.url = url
        self.host = "https://simple.wikipedia.org"

    async def next_urls(self):
        """Download the page and return the links listed under "Related pages".

        The page is fetched from ``self.host + self.url`` and its HTML is kept
        in ``self.data``. The element with ``id="Related_pages"`` is located,
        and the siblings following its parent are scanned for the first one
        that is, or contains, a ``<ul>``.

        Returns:
            A list of ``href`` values starting with ``/wiki/`` taken from that
            first list, or ``None`` when the page has no ``Related_pages``
            element or no list follows it.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.host + self.url) as resp:
                self.data = await resp.text()

        soup = BeautifulSoup(self.data, "html.parser")
        heading = soup.find(id="Related_pages")
        if heading is None:
            return None

        for sibling in heading.find_parent().find_next_siblings():
            if sibling.name == "ul" or sibling.find("ul"):
                # href=True skips anchors that carry no href attribute, which
                # would otherwise raise KeyError on the subscript below.
                return [
                    anchor["href"]
                    for anchor in sibling.find_all("a", href=True)
                    if anchor["href"].startswith("/wiki/")
                ]
        return None



async def scrape(url, depth):
    """Fetch the related-page links of ``url`` after a short fixed pause.

    Prints a "Visiting" progress line, sleeps about a third of a second, then
    returns ``(depth, links)`` where ``links`` is whatever
    ``AWikiCrawler.next_urls`` produced for the page.
    """
    crawler = AWikiCrawler(url)
    print("Visiting", depth, url)
    # Fixed delay before each request.
    await asyncio.sleep(0.3333333)
    related = await crawler.next_urls()
    return depth, related

async def main(seed_url):
    """Crawl related-page links outward from ``seed_url`` and print progress.

    Every queued URL is scraped in its own task. As tasks finish, the links
    they return are queued at ``depth + 1`` unless they were already seen.
    Links are no longer queued once the finishing page's depth exceeds 5 or
    more than 50 URLs have been recorded. A summary line with the recorded
    URLs is printed at the end.

    Args:
        seed_url: Site-relative path of the starting page.
    """
    url_queue = [(seed_url, 0)]
    # The seed counts as seen from the start; otherwise any page linking back
    # to it would cause it to be crawled a second time.
    visited_urls = {seed_url}
    pending = set()
    while url_queue or pending:
        # Rebuild the list of new tasks on every pass. Reusing the previous
        # pass's list when the queue is empty would hand already-finished
        # tasks back to asyncio.wait and process their results again.
        new_tasks = [asyncio.create_task(scrape(u, d)) for u, d in url_queue]
        url_queue = []
        done, pending = await asyncio.wait(
            pending | set(new_tasks), return_when=asyncio.FIRST_COMPLETED
        )
        for task in done:
            # The task is finished, so result() returns (or raises) at once.
            depth, new_urls = task.result()
            print("next", new_urls)
            if not new_urls:
                continue
            if depth > 5 or len(visited_urls) > 50:
                continue
            for u in new_urls:
                if u in visited_urls:
                    print("Already visited", u)
                    continue
                visited_urls.add(u)
                url_queue.append((u, depth + 1))

    print("Overall visited", len(visited_urls), visited_urls)

await main("/wiki/Computer_science")
await main("/wiki/Language")

Visiting 0 /wiki/Computer_science
next ['/wiki/Computing', '/wiki/Formal_language', '/wiki/Turing_Award', '/wiki/Computer_jargon', '/wiki/List_of_words_about_computers']
Visiting 1 /wiki/Computing
Visiting 1 /wiki/Formal_language
Visiting 1 /wiki/Turing_Award
Visiting 1 /wiki/Computer_jargon
Visiting 1 /wiki/List_of_words_about_computers
next None
next None
next ['/wiki/Language', '/wiki/Syntax', '/wiki/Semantics', '/wiki/Natural_language', '/wiki/Computer_language', '/wiki/Programming_language']
next ['/wiki/Computing', '/wiki/Formal_language', '/wiki/Turing_Award', '/wiki/Computer_jargon', '/wiki/List_of_words_about_computers']
Already visited /wiki/Computing
Already visited /wiki/Formal_language
Already visited /wiki/Turing_Award
Already visited /wiki/Computer_jargon
Already visited /wiki/List_of_words_about_computers
next None
Visiting 2 /wiki/Language
Visiting 2 /wiki/Syntax
Visiting 2 /wiki/Semantics
Visiting 2 /wiki/Natural_language
Visiting 2 /wiki/Computer_language
Visiting 2 