<a href="https://colab.research.google.com/github/kousiknandy/pycolab/blob/main/SimpleWiki_Related.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [88]:
import urllib3
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from collections import deque
import time

class WikiRelated:
  host = "https://simple.wikipedia.org"

  def __init__(self, url):
    self.url = url
    self.data = self._fetch()

  def _fetch(self):
    resp = urllib3.request("GET", self.host + self.url, retries=2, timeout=2)
    if resp.status != 200: return None
    return resp.data

  def related(self):
    if not self.data: return None
    soup = BeautifulSoup(self.data, "html.parser")
    if not (h2 := soup.find(id="Related_pages")):
      return None
    rl = h2.parent.next_siblings
    _ = next(rl)
    ul = next(rl)
    links = ul.find_all("a")
    return [l["href"] for l in links if l["href"].startswith("/wiki/")]


In [89]:
s = WikiRelated("/wiki/Computer_science")
s.related()

['/wiki/Computing',
 '/wiki/Formal_language',
 '/wiki/Turing_Award',
 '/wiki/Computer_jargon',
 '/wiki/List_of_words_about_computers']

In [90]:
visited_urls, visited_lock = set(), Lock()
max_depth = 5

In [91]:
def crawl_url(url, visited_url, visited_lock, depth):
  if depth > max_depth: return None
  with visited_lock:
    if url in visited_url: return None
    visited_url.add(url)
  print("Fetching", depth, url)
  time.sleep(0.333333)
  page = WikiRelated(url)
  return page.related()

# s = crawl_url("/wiki/Computer_science", visited_urls, visited_lock, 15)
# print(list(s),  visited_urls, visited_lock)

In [92]:
seed_url = "/wiki/Computer_science"
exec_queue = [(seed_url, 0)]

In [93]:
with ThreadPoolExecutor(max_workers=2) as executor:
  while exec_queue:
    fs = {executor.submit(crawl_url, url, visited_urls, visited_lock, d): d for url,d in exec_queue}
    exec_queue = []
    for fut in as_completed(fs):
      if next_urls := fut.result():
        print(next_urls)
        exec_queue.extend([(u, fs[fut]+1) for u in next_urls])

Fetching 0 /wiki/Computer_science
['/wiki/Computing', '/wiki/Formal_language', '/wiki/Turing_Award', '/wiki/Computer_jargon', '/wiki/List_of_words_about_computers']
Fetching 1 /wiki/Computing
Fetching 1 /wiki/Formal_language




Fetching 1 /wiki/Turing_Award
['/wiki/Computing', '/wiki/Formal_language', '/wiki/Turing_Award', '/wiki/Computer_jargon', '/wiki/List_of_words_about_computers']
Fetching['/wiki/Language', '/wiki/Syntax', '/wiki/Semantics', '/wiki/Natural_language', '/wiki/Computer_language', '/wiki/Programming_language']
 1 /wiki/Computer_jargon
Fetching 1 /wiki/List_of_words_about_computers
FetchingFetching 2 /wiki/Syntax
 2 /wiki/Language




Fetching 2 /wiki/Semantics
['/wiki/Phrase', '/wiki/Grammar']
Fetching 2 /wiki/Natural_language
Fetching 2 /wiki/Computer_language
Fetching 2 /wiki/Programming_language
['/wiki/Artificial_language', '/wiki/Formal_language', '/wiki/Natural_language_processing']
['/wiki/Algorithm', '/wiki/Formal_language', '/wiki/List_of_programming_languages', '/wiki/Programmer', '/wiki/Compiler', '/wiki/Computer_programming', '/wiki/Programming_paradigm', '/wiki/Pseudocode']
['/wiki/Algorithm', '/wiki/Formal_language', '/wiki/List_of_programming_languages', '/wiki/Programmer', '/wiki/Compiler', '/wiki/Computer_programming', '/wiki/Programming_paradigm', '/wiki/Pseudocode']
Fetching 3 /wiki/Phrase
Fetching 3 /wiki/Grammar




Fetching 3 /wiki/Artificial_language
Fetching['/wiki/Hebrew', '/wiki/Linguistics', '/wiki/Greek_alphabet', '/wiki/Iranic_languages', '/wiki/List_of_languages', '/wiki/Hebrew_alphabet', '/wiki/Japonic_languages', '/wiki/Phoenician_alphabet', '/wiki/Afro-Asiatic_languages', '/wiki/Indo-Iranian_languages', '/wiki/Sino-Tibetan_languages', '/wiki/Indo-European_languages']
 3 /wiki/Natural_language_processing
Fetching 3 /wiki/Algorithm
['/wiki/Constructed_language', '/wiki/Natural_language', '/wiki/Computer_language']




Fetching 3 /wiki/List_of_programming_languages
Fetching['/wiki/Euclidean_algorithm', '/wiki/Greatest_common_divisor', '/wiki/Numerical_analysis']
 3 /wiki/Programmer
Fetching 3 /wiki/Compiler
Fetching 3 /wiki/Computer_programming
Fetching 3 /wiki/Programming_paradigm
Fetching 3 /wiki/Pseudocode
FetchingFetching 4 /wiki/Linguistics
 4 /wiki/Hebrew




Fetching['/wiki/Phoenician_alphabet']
 4 /wiki/Greek_alphabet
Fetching 4 /wiki/Iranic_languages
Fetching['/wiki/Ancient_Greek_language', '/wiki/Greek_language', '/wiki/History_of_the_alphabet']
 4 /wiki/List_of_languages
Fetching 4 /wiki/Hebrew_alphabet
Fetching['/wiki/Language', '/wiki/Constructed_language', '/wiki/List_of_fictional_languages', '/wiki/List_of_programming_languages', '/wiki/Sign_language']
 4 /wiki/Japonic_languages
Fetching 4 /wiki/Phoenician_alphabet
['/wiki/Phoenician_alphabet']
Fetching 4 /wiki/Afro-Asiatic_languages
Fetching 4 /wiki/Indo-Iranian_languages
Fetching 4 /wiki/Sino-Tibetan_languages
Fetching 4 /wiki/Indo-European_languages
Fetching['/wiki/Chinese_language']
 4 /wiki/Constructed_language
Fetching 4 /wiki/Euclidean_algorithm
Fetching 4 /wiki/Greatest_common_divisor
Fetching 4 /wiki/Numerical_analysis
['/wiki/Coprime', '/wiki/Euclid%27s_Elements', '/wiki/Least_common_multiple']
['/wiki/Wikipedia:Sister_projects']
Fetching 5 /wiki/Ancient_Greek_language
Fe



Fetching 5 /wiki/History_of_the_alphabet
['/wiki/Greek_alphabet', '/wiki/Greek_language']
Fetching['/wiki/Greek_numbers']
 5 /wiki/List_of_fictional_languages
Fetching 5 /wiki/Sign_language
Fetching 5 /wiki/Chinese_language
Fetching 5 /wiki/Coprime
['/wiki/American_Sign_Language', '/wiki/Deaf']
Fetching['/wiki/Chinese_remainder_theorem', '/wiki/Euler%27s_totient_function', '/wiki/Euler%27s_totient_theorem']
 5 /wiki/Euclid%27s_Elements
Fetching 5 /wiki/Least_common_multiple
['/wiki/Standard_Chinese', '/wiki/Mandarin_Chinese', '/wiki/Cantonese_language']
Fetching 5 /wiki/Wikipedia:Sister_projects
['/wiki/Scalar', '/wiki/Algebra', '/wiki/Arithmetic', '/wiki/Pure_mathematics', '/wiki/Mathematical_analysis', '/wiki/Greatest_common_divisor']


In [94]:
visited_lock

<unlocked _thread.lock object at 0x7f4b9928a840>