<a href="https://colab.research.google.com/github/kousiknandy/pycolab/blob/main/Wordcount_Asyncio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [102]:
import asyncio
import aiohttp
import random
import re
from collections import defaultdict

url_prefix = "https://raw.githubusercontent.com/kousiknandy/pycolab/refs/heads/main/data/book/frankenstein/xba"
wordlen = 8

async def fetch_text(suffx):
    await asyncio.sleep(random.random()+0.5)
    async with aiohttp.ClientSession() as session:
        async with session.get(url_prefix + suffx) as resp:
            async for line in resp.content:
                yield line

async def fetch_words(suffx):
    async for line in  fetch_text(suffx):
        words = re.finditer(r"\b\w+\b", line.decode("UTF-8"), re.ASCII)
        for wm in words:
            w = wm.group(0).lower()
            if len(w) >= wordlen:
                yield w

async def mapper(suffx):
    words = defaultdict(int)
    async for w in fetch_words(suffx):
        words[w] += 1
    return words


In [103]:
def wordhash(word, m):
    h = 5381
    for c in word:
        h = h * 33 + ord(c)
    return h % m

partitions = 2
bucket = lambda word: wordhash(word, partitions)

def partition(words, parts):
    for w,c in words.items():
        parts[bucket(w)][w].append(c)


In [104]:
import heapq
from functools import partial

def top_few(word, n):
    counts = [(sum(v), k) for k,v in word.items()]
    tops = heapq.nlargest(n, counts)
    return tops

In [105]:
from concurrent.futures import ProcessPoolExecutor

async def main(file_suffixes):
    fs = [asyncio.create_task(mapper(f)) for f in file_suffixes]
    parts = [defaultdict(list) for _ in range(partitions)]
    for r in asyncio.as_completed(fs):
        words = await r
        partition(words, parts)
    reducers = [partial(top_few, word=parts[i], n=10) for i in range(partitions)]
    loop = asyncio.get_event_loop()
    tops = []
    with ProcessPoolExecutor(max_workers=2) as pool:
        fs = [loop.run_in_executor(pool, r) for r in reducers]
        for r in asyncio.as_completed(fs):
            t = await r
            tops.append(t)
    res = heapq.merge(*tops)
    return res

In [106]:
from string import ascii_lowercase
file_suffixes = random.sample(ascii_lowercase[:25], k=6)

res = await main(file_suffixes=file_suffixes)
print(list(res)[:10])

[(18, 'feelings'), (12, 'magistrate'), (12, 'countenance'), (11, 'expressed'), (11, 'creature'), (10, 'darkness'), (10, 'continued'), (9, 'evidence'), (8, 'witnesses'), (8, 'returned')]
