<a href="https://colab.research.google.com/github/kousiknandy/pycolab/blob/main/Crawler_Hostcounter_Asyncio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [159]:
from bs4 import BeautifulSoup
from urllib3.util import parse_url
import asyncio
import aiohttp
import random
from functools import partial, reduce
from itertools import tee
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor
import heapq

base_host = "simple.m.wikipedia.org"
max_depth = 1

In [160]:
async def scrape_page(url):
    await asyncio.sleep(1+ random.random())
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            data = await resp.text()
    soup = BeautifulSoup(data, "html.parser")
    links = soup.find_all("a")
    hosts, urls = tee(l.get("href") for l in links if l.get("href"))
    hosts = (parse_url(h).hostname or base_host for h in hosts)
    urls = (
        l
        for l in urls
        if parse_url(l).hostname == None and parse_url(l).path.startswith("/wiki/")
    )
    return hosts, urls



In [161]:
async def mapper(url, depth):
    hosts, urls = await scrape_page("https://" + base_host + url)
    hosts = Counter(hosts)
    return depth, hosts, urls

In [162]:
def host_hash(host, parts):
    h = 5381
    for c in host:
        h = h * 33 + ord(c)
    return h % parts

def partitioner(parts):
    hashes = [defaultdict(list) for _ in range(parts)]
    while hc := (yield):
        hashes[host_hash(hc[0], parts)][hc[0]].append(hc[1])
    return hashes

In [163]:
def host_summer(hashes):
    host_count = {}
    for h, l in hashes.items():
        host_count[h] = sum(l)
    return host_count


In [164]:
async def main(seed_url, partitions):
    p = partitioner(partitions)
    p.send(None)
    pending = []
    visited_url = set()
    to_scrape = [(seed_url, 0)]
    while to_scrape or pending:
        # print(to_scrape)
        ts = [asyncio.create_task(mapper(u, d)) for u,d in to_scrape]
        to_scrape = []
        done, pending = await asyncio.wait(ts + list(pending), return_when=asyncio.FIRST_COMPLETED)
        for t in done:
            d, hosts, urls = await t
            for h, c in hosts.items():
                p.send((h,c))
            if d < max_depth and len(visited_url) < 10:
                urls = [u for u in urls if u not in visited_url]
                visited_url.update(urls)
                to_scrape += [(u, d+1) for u in urls]
    try:
        p.send(None)
    except StopIteration as e:
        partitions = e.value
    with ProcessPoolExecutor(max_workers=2) as executor:
        res = executor.map(host_summer, partitions)
    h_c = reduce(lambda a, b: {**a, **b}, res )
    return h_c

In [165]:
hosts = await main("/wiki/Computer_science", 2)
hosts = [(v, k) for k,v in hosts.items()]
heapq.nlargest(15, hosts)

[(10611, 'simple.m.wikipedia.org'),
 (319, 'foundation.wikimedia.org'),
 (268, 'simple.wikipedia.org'),
 (149, 'web.archive.org'),
 (135, 'archive.org'),
 (123, 'en.wikipedia.org'),
 (109, 'www.mediawiki.org'),
 (106, 'es.wikipedia.org'),
 (105, 'www.wikimedia.org'),
 (105, 'stats.wikimedia.org'),
 (105, 'foundation.m.wikimedia.org'),
 (105, 'donate.wikimedia.org'),
 (105, 'developer.wikimedia.org'),
 (102, 'creativecommons.org'),
 (101, 'uk.wikipedia.org')]