<a href="https://colab.research.google.com/github/kousiknandy/pycolab/blob/main/SimpleWiki_Related_Asyncio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
from bs4 import BeautifulSoup
import aiohttp
import asyncio

class AWikiCrawler:
  """Fetch one page from simple.wikipedia.org and list its "Related pages" links.

  Attributes:
    url: Site-relative path of the page, e.g. "/wiki/Computer_science".
    host: Scheme and host that is prepended to `url` when fetching.
    data: Raw HTML text of the last fetched page; None until `next_urls` ran.
  """

  def __init__(self, url):
    self.url = url
    self.host = "https://simple.wikipedia.org"
    self.data = None

  @staticmethod
  def _wiki_hrefs(anchors):
    """Return the href of every anchor whose target starts with "/wiki/".

    Args:
      anchors: Iterable of objects exposing `.get("href")` (bs4 tags do).

    Returns:
      List of href strings, in input order. Anchors that have no href
      attribute are skipped instead of raising KeyError.
    """
    hrefs = (a.get("href") for a in anchors)
    return [h for h in hrefs if h and h.startswith("/wiki/")]

  async def next_urls(self):
    """Download the page and extract the links listed under "Related pages".

    The page HTML is stored on `self.data` as a side effect.

    Returns:
      A list of "/wiki/..." hrefs found in the first element that follows
      the parent of the node with id "Related_pages", or None when the page
      has no such node or nothing follows it.
    """
    async with aiohttp.ClientSession() as session:
      async with session.get(self.host + self.url) as resp:
        self.data = await resp.text()
    soup = BeautifulSoup(self.data, "html.parser")
    heading = soup.find(id="Related_pages")
    if heading is None:
      return None
    # Passing True matches any tag, so whitespace text nodes between the
    # heading and the list are skipped, and a missing sibling yields None
    # rather than a StopIteration escaping from the coroutine.
    container = heading.parent.find_next_sibling(True)
    if container is None:
      return None
    return self._wiki_hrefs(container.find_all("a"))


In [16]:
async def scrape(url, depth):
  """Log the visit, then fetch `url` and return its related-page links.

  Args:
    url: Site-relative page path handed to `AWikiCrawler`.
    depth: Crawl depth of this page; only used in the progress message.

  Returns:
    Whatever `AWikiCrawler.next_urls` yields for the page.
  """
  crawler = AWikiCrawler(url)
  print("Visiting", depth, url)
  related = await crawler.next_urls()
  return related

async def main(seed_url, max_depth=5, max_visited=150, fetch=None):
  """Crawl related pages breadth-first, fetching each level concurrently.

  Every URL in the current level is fetched as its own task; the links each
  page returns form the next level. Progress and a final summary are printed.

  Args:
    seed_url: Site-relative path of the page to start from.
    max_depth: Links found on pages deeper than this are not followed.
    max_visited: Upper bound on the number of distinct URLs scheduled,
      the seed included.
    fetch: Coroutine function `(url, depth) -> list of URLs or None` used to
      load a page. Defaults to `scrape`.
  """
  if fetch is None:
    fetch = scrape
  depth = 0
  url_queue = [seed_url]
  # The seed counts as seen from the start; otherwise a page that links back
  # to it would schedule it a second time.
  visited_urls = {seed_url}
  while url_queue:
    ws = [asyncio.create_task(fetch(u, depth)) for u in url_queue]
    url_queue = []
    depth += 1
    for w in asyncio.as_completed(ws):
      new_urls = await w
      print("next", new_urls)
      # Results past the depth limit are still awaited (and logged) so no
      # task is left pending, but their links are dropped.
      if not new_urls or depth > max_depth: continue
      for u in new_urls:
        # Checked per URL so a single page with many links cannot push the
        # total past the cap.
        if len(visited_urls) >= max_visited: break
        if u in visited_urls: continue
        visited_urls.add(u)
        url_queue.append(u)

  print("Overall visited", len(visited_urls), visited_urls)

# Top-level `await` relies on the notebook (IPython) running cells inside an
# event loop; in a plain Python script use asyncio.run(main(...)) instead.
await main("/wiki/Computer_science")

Visiting 0 /wiki/Computer_science
next ['/wiki/Computing', '/wiki/Formal_language', '/wiki/Turing_Award', '/wiki/Computer_jargon', '/wiki/List_of_words_about_computers']
Visiting 1 /wiki/Computing
Visiting 1 /wiki/Formal_language
Visiting 1 /wiki/Turing_Award
Visiting 1 /wiki/Computer_jargon
Visiting 1 /wiki/List_of_words_about_computers
next None
next None
next None
next ['/wiki/Computing', '/wiki/Formal_language', '/wiki/Turing_Award', '/wiki/Computer_jargon', '/wiki/List_of_words_about_computers']
next ['/wiki/Language', '/wiki/Syntax', '/wiki/Semantics', '/wiki/Natural_language', '/wiki/Computer_language', '/wiki/Programming_language']
Visiting 2 /wiki/Language
Visiting 2 /wiki/Syntax
Visiting 2 /wiki/Semantics
Visiting 2 /wiki/Natural_language
Visiting 2 /wiki/Computer_language
Visiting 2 /wiki/Programming_language
next ['/wiki/Artificial_language', '/wiki/Formal_language', '/wiki/Natural_language_processing']
next ['/wiki/Algorithm', '/wiki/Formal_language', '/wiki/List_of_progra