Below is a solution to [LeetCode 1236 - Web Crawler](https://leetcode.com/problems/web-crawler/)

In [35]:
# """
# This is HtmlParser's API interface.
# You should not implement it, or speculate about its implementation
# """
# class HtmlParser(object):
#    def getUrls(self, url):
#        """
#        :type url: str
#        :rtype List[str]
#        """
from typing import List
from urllib.parse import urlparse


def get_hostname(url: str):
    """Return the last two dot-separated labels of the host in ``url``.

    For example ``http://news.yahoo.com/x`` yields ``"yahoo.com"``. Hosts
    containing fewer than two dots are returned unchanged, and ``None`` is
    returned when ``urlparse`` finds no host in ``url``.
    """
    host = urlparse(url).hostname
    if not host:
        return None
    # Splitting from the right at most twice isolates the two trailing labels.
    pieces = host.rsplit('.', 2)
    if len(pieces) < 3:
        return host
    return '.'.join(pieces[1:])


class Solution:
    """Single-threaded crawler for LeetCode 1236."""

    def crawl(self, start_url: str, html_parser: 'HtmlParser') -> List[str]:
        """Collect every URL reachable from ``start_url`` on the same host.

        The link graph is walked depth-first with an explicit stack. Two
        URLs count as being on the same host when ``get_hostname`` returns
        the same value for both.

        :param start_url: URL the crawl starts from; it is always part of
            the result.
        :param html_parser: object exposing ``getUrls(url)``, which returns
            the links found on ``url``.
        :return: the visited URLs, in no particular order.
        """
        hostname = get_hostname(start_url)
        # A URL is marked as visited the moment it is discovered, so it is
        # pushed (and therefore fetched with getUrls) at most once.
        visited = {start_url}
        stack = [start_url]
        while stack:
            curr_url = stack.pop()
            for child in html_parser.getUrls(curr_url):
                if child in visited or get_hostname(child) != hostname:
                    continue
                visited.add(child)
                stack.append(child)
        return list(visited)


## Concurrency with `concurrent.futures`

Let's now expand this web crawler to run concurrently. First, here is Python pseudocode showing how we would implement a multithreaded crawler if we could magically use threads.
```python3
def crawl_thread(task_pool, htmlParser, new_url, visited_urls):
    visited_urls.lock()
    if new_url in visited_urls:
        visited_urls.unlock()
        return
    visited_urls.add(new_url)
    visited_urls.unlock()
    
    children = htmlParser.getUrls(new_url)
    for child in children:
        if child not in visited_urls:
            task_pool.lock()
            new_task = crawl_thread(task_pool, htmlParser, child, visited_urls)
            task_pool.add(new_task)
            task_pool.unlock()
            new_task.start()


def crawl(starting_url):
    visited_urls = []
    task_pool = []
    task_pool.add(crawl_thread(task_pool, htmlParser, starting_url, visited_urls))
    task_pool.run()
    return visited_urls

```
Python has some challenges around writing concurrent code because of the Global Interpreter Lock (GIL) and a memory model that is not thread-safe, so there are a couple of workarounds. One workaround is the `concurrent.futures` module. We can use it to implement a solution to [LeetCode 1242 - Web Crawler Multithreaded](https://leetcode.com/problems/web-crawler-multithreaded/).


In [None]:
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

def get_hostname(url: str):
    """Reduce ``url`` to the final two dot-separated labels of its host.

    ``http://news.yahoo.com/x`` becomes ``"yahoo.com"``; a host with fewer
    than two dots is returned as is. Returns ``None`` when ``urlparse``
    reports no host for ``url``.
    """
    host = urlparse(url).hostname
    if not host:
        return None
    labels = host.split('.')
    # Slicing the last two labels also covers hosts with one or two labels,
    # in which case the join simply reassembles the whole host.
    return '.'.join(labels[-2:])

# Serializes access to the ``visited_urls`` and ``futures`` sets that the
# worker threads and the coordinating thread share.
_CRAWL_STATE_LOCK = threading.Lock()


def crawl_thread(executor, current_url, base_url, html_parser, visited_urls, futures):
    """Fetch the links on ``current_url`` and schedule the unseen ones.

    The worker never waits for the tasks it submits. Blocking on a child
    future from inside a pool thread would tie up that thread, so a chain
    of links longer than the pool size could leave every worker waiting on
    work that can no longer be scheduled.

    :param executor: pool that child crawls are submitted to.
    :param current_url: URL whose links are fetched by this call.
    :param base_url: value ``get_hostname`` must return for a link to be
        followed.
    :param html_parser: object exposing ``getUrls(url)``.
    :param visited_urls: shared set of URLs that have been claimed.
    :param futures: shared set that receives the future of every child
        task, so the caller can wait for the whole crawl.
    :return: ``None``.
    """
    with _CRAWL_STATE_LOCK:
        visited_urls.add(current_url)

    for child in html_parser.getUrls(current_url):
        if get_hostname(child) != base_url:
            continue
        with _CRAWL_STATE_LOCK:
            # Check, claim and submit under one lock so two workers that
            # discover the same link cannot both schedule it, and so the
            # new future is registered before anyone can observe the set.
            if child in visited_urls:
                continue
            visited_urls.add(child)
            futures.add(
                executor.submit(
                    crawl_thread, executor, child, base_url,
                    html_parser, visited_urls, futures,
                )
            )


class Solution:
    """Thread-pool crawler for LeetCode 1242."""

    def crawl(self, start_url, html_parser):
        """Crawl from ``start_url`` using a pool of worker threads.

        :param start_url: URL the crawl starts from; always in the result.
        :param html_parser: object exposing ``getUrls(url)``; it is called
            from several threads at once.
        :return: list of visited URLs, in no particular order.
        :raises Exception: whatever a worker raised (for example from
            ``getUrls``) is re-raised here instead of being dropped.
        """
        base_url = get_hostname(start_url)
        visited_urls = {start_url}
        futures = set()

        with ThreadPoolExecutor(max_workers=50) as executor:
            with _CRAWL_STATE_LOCK:
                futures.add(
                    executor.submit(
                        crawl_thread, executor, start_url, base_url,
                        html_parser, visited_urls, futures,
                    )
                )

            # Workers keep adding futures while the crawl is running, so
            # take a fresh snapshot after each batch until nothing new
            # shows up. Every task registers its children before it
            # finishes, hence an empty batch means the crawl is complete.
            awaited = set()
            while True:
                with _CRAWL_STATE_LOCK:
                    batch = futures - awaited
                if not batch:
                    break
                for future in as_completed(batch):
                    future.result()  # re-raises an exception from the worker
                awaited |= batch

        return list(visited_urls)

## Concurrency with `asyncio`
Another option is the `asyncio` library, which is great for I/O-bound work. Here's an example of the crawler implemented with `asyncio` coroutines (this will not actually pass the LeetCode test battery, because it exceeds the time limit):

In [None]:
from typing import List
from urllib.parse import urlparse
import asyncio

def get_hostname(url: str):
    """Return the part of the URL's host after its second-to-last dot.

    ``http://news.yahoo.com/x`` gives ``"yahoo.com"``. When the host holds
    fewer than two dots it is returned whole, and ``None`` is returned if
    ``urlparse`` finds no host in ``url``.
    """
    host = urlparse(url).hostname
    if not host:
        return None
    last_dot = host.rfind('.')
    if last_dot == -1:
        return host
    # Look for another dot strictly to the left of the last one.
    prev_dot = host.rfind('.', 0, last_dot)
    if prev_dot == -1:
        return host
    return host[prev_dot + 1:]


async def crawl_task(base_host, url, html_parser, visited_urls, semaphore):
    """Crawl ``url`` and, concurrently, every unseen same-host link under it.

    :param base_host: value ``get_hostname`` must return for a link to be
        followed.
    :param url: URL whose links are fetched by this task.
    :param html_parser: object exposing a synchronous ``getUrls(url)``.
        NOTE(review): it is invoked from executor threads, so it is assumed
        to tolerate concurrent calls - confirm for the parser in use.
    :param visited_urls: shared set of URLs that have been claimed.
    :param semaphore: limits how many ``getUrls`` calls are in flight.
    :return: ``None``; results accumulate in ``visited_urls``.
    """
    visited_urls.add(url)

    loop = asyncio.get_running_loop()
    async with semaphore:
        # getUrls is a blocking call; running it on the loop's default
        # executor lets other crawl tasks make progress in the meantime.
        children = await loop.run_in_executor(None, html_parser.getUrls, url)

    tasks = []
    for child in children:
        if child in visited_urls or get_hostname(child) != base_host:
            continue
        # Claim the child before its task is created. Nothing is awaited
        # between the check and the add, so no other task can schedule the
        # same URL a second time.
        visited_urls.add(child)
        tasks.append(
            asyncio.create_task(
                crawl_task(base_host, child, html_parser, visited_urls, semaphore)
            )
        )
    if tasks:
        await asyncio.gather(*tasks)


class Solution:
    """``asyncio``-based crawler."""

    def crawl(self, start_url: str, html_parser: 'HtmlParser') -> List[str]:
        """Crawl from ``start_url`` and return every same-host URL reached.

        :param start_url: URL the crawl starts from; always in the result.
        :param html_parser: object exposing ``getUrls(url)``.
        :return: list of visited URLs, in no particular order.
        """
        base_host = get_hostname(start_url)

        async def _run():
            # Create the semaphore while the event loop is running so that
            # it is associated with the loop that asyncio.run() started.
            semaphore = asyncio.Semaphore(60)
            visited_urls = set()
            await crawl_task(base_host, start_url, html_parser, visited_urls, semaphore)
            return visited_urls

        return list(asyncio.run(_run()))
