Description

    Most of us are familiar with web spiders and crawlers like GoogleBot - they visit a web page, index content there, and then visit outgoing links from that page. Crawlers are an interesting technology with continuing development.

    Web crawlers marry queuing and HTML parsing and form the basis of search engines etc. Writing a simple crawler is a good exercise in putting a few things together. Writing a well behaved crawler is another step up.
    For this challenge you may use any single shot web client you wish, e.g. Python's httplib or any of a number of libcurl bindings; you may NOT use a crawling library like Mechanize or whatnot. You may use an HTML parsing library like BeautifulSoup; you may NOT use a headless browser like PhantomJS. The purpose of this challenge is to tie together fetching a page, reassembling links, discovering links and assembling them, adding them to a queue, managing the depth of the queue, and visiting them in some reasonable order - while avoiding duplicate visits.

    Your crawler MUST support the following features:

    HTTP/1.1 client behaviors
    GET requests are the only method you must support
    Parse all links presented in HTML - anchors, images, scripts, etc
    Take at least two options - a starting (seed) URL and a maximum depth to recurse to (e.g. "1" would be fetch the HTML page and all resources like images and script associated with it but don't visit any outgoing anchor links; a depth of "2" would visit the anchor links found on that first page only, etc ...)

    Do not visit the same link more than once per session
    Optional features include HTTPS support, support for robots.txt, support for domains to which you restrict the crawler, and storing results (for example how wget does so).
    
    Be careful with what you crawl! Don't get yourself banned from the Internet. I highly suggest you crawl a local server you control as you may trigger rate limits and other mechanisms to identify unwanted visitors.

[Reddit Challange](https://www.reddit.com/r/dailyprogrammer/comments/7dlaeq/20171117_challenge_340_hard_write_a_web_crawler/)

[Extra source](http://www.east5th.co/blog/2017/10/09/learning-to-crawl-building-a-bare-bones-web-crawler-with-elixir/)

In [15]:
import re
import requests
from bs4 import BeautifulSoup as bs
import collections as co

pattern = re.compile(r'''[a-z]*://|          # protocol                                             
                         (?<=://)[a-z0-9.]*| # main_url
                         (?<=[a-z])/.*|      # detail
                         ^/.*                # local ref
                         ''',re.X)

def process_link(link,old=''):

    found = re.findall(pattern,link)
    
    if len(found)>1:
        parts = found.pop(1).split('.')
        two_parts = ['.'.join(parts[:-2]) + '.'*bool(parts[:-2]),
                     '.'.join(parts[-2:])]
        d_if = {True:found.pop, False:lambda:''}   
        found += [*two_parts, d_if[bool(len(found)>1)]()]
    elif found:
        found = old + found
    else:
        found = old
    return found

def visit(link):
    
    r = requests.get(link)
    if r.status_code != 200:
        return
    else:
        old = process_link(link)[:3]
        out_links = []
        soup = bs(r.text,'html.parser')
        for line in soup.find_all('a'):
            a_link = line.get('href')
            if a_link:
                out_links += [process_link(a_link,old)]
        return out_links
            
def crawl(a_link,visits):
    
    l_list = process_link(a_link)
    queue = co.deque([l_list])
    main_visited = co.defaultdict(int) 
    main_visited[l_list[2]] += 1
    all_visited = set()

    while queue and visits:
        
        l_list = queue.popleft()
        link = ''.join(l_list)
        if not link in all_visited and main_visited[l_list[2]] < 4:
            main_visited[l_list[2]] += 1
            all_visited.add(link)
            visits -= 1
            now = visit(''.join(link))
            queue.extend(now)
    
    return main_visited
            
# crawl('https://www.google.com',40)

In [16]:
%%time

crawl('https://www.google.com',40)

Wall time: 27 s


defaultdict(int,
            {'ad.nl': 1,
             'android.com': 2,
             'blogger.com': 1,
             'demorgen.be': 3,
             'google.be': 4,
             'google.com': 4,
             'hbvl.be': 1,
             'hln.be': 4,
             'knack.be': 1,
             'metrotime.be': 1,
             'nieuwsblad.be': 4,
             'nos.nl': 1,
             'nrc.nl': 1,
             'sporza.be': 1,
             'standaard.be': 4,
             'tijd.be': 2,
             'vrt.be': 1,
             'welingelichtekringen.nl': 1,
             'youtube.com': 4})

In [1]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup

visited = set()

async def crawl(node, max_depth, session):
    if node['depth'] > max_depth:
        return
    node['next'] = {
        'depth': node['depth'] + 1,
        'urls': [],
    }
    for link in node['urls']:
        links = await get_links(link, session)
        node['next']['urls'].extend(links)
    await crawl(node['next'], max_depth, session)

async def get_links(url, session):
    if url in visited or url.startswith('/'):
        return []
    visited.add(url)
    links = []
    async with session.get(url) as resp:
        text = await resp.text()
        soup = BeautifulSoup(text, 'html.parser')
        for a in soup.find_all('a', href=True):
            if '//' in a['href']:
                links.append(a['href']) 
    return links

async def main():
    # max_depth = 2
    # root = r'http://quotes.toscrape.com/'
    root = input('Website to crawl: ')
    max_depth = int(input('Depth to crawl (int): '))
    links = {
        'urls': [root],
        'depth': 1,
    }
    with aiohttp.ClientSession() as session:
        await crawl(links, max_depth, aiohttp.ClientSession())
    print(f'Number of pages crawled: {len(visited)}')



In [2]:
%%time

loop = asyncio.get_event_loop()
loop.run_until_complete(main())

Website to crawl: http://quotes.toscrape.com/
Depth to crawl (int): 1


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000001A536628748>
Unclosed connector
connections: ['[(<aiohttp.client_proto.ResponseHandler object at 0x000001A535A506D8>, 446205.906)]']
connector: <aiohttp.connector.TCPConnector object at 0x000001A536628C50>


Number of pages crawled: 1
Wall time: 4.45 s


In [45]:
import asyncio
import aiohttp
import re

from bs4 import BeautifulSoup
# from bloom_filter import BloomFilter

import collections as co


class Crawler():

    def __init__(self,source,max_visits=10,max_rude=3):
    
        # self.bloom = BloomFilter(max_elements=max_visits, error_rate=0.1)
        self.done = set()
        
        self.n = max_visits
        self.rude = max_rude

        self.main_visited = co.defaultdict(int)
        self.queue = co.deque([source])

    # not blocking
    async def top_level(self,url):   
        
        found = re.search(r'(?<=://)[a-z0-9.]*',url)
        if found:
            parts = found[0].split('.')
            tp_level = '.'.join(parts[-2:])
            await tp_level
        await False

    async def crawl(self):

        while self.queue and self.n:
            url = self.queue.popleft()
            if url not in self.done:
                await tp_level = self.top_level(url)
                if self.main_visited[tp_level] < self.rude:
                    await self.get_links(url, tp_level)
                self.main_visited[tp_level] += 1
                
    async def get_links(self, url, tp_level):

        try:
            async with aiohttp.ClientSession().get(url) as resp:
                text = await resp.text()
                self.n -= 1
                self.done.add(url)
                soup = BeautifulSoup(text, 'html.parser')
                for new_url in soup.find_all('a', href=True):
                    c = await self.top_level(new_url['href'])
                    if c:
                        self.queue.append(c)
        except:
            await asyncio.sleep(0)

SyntaxError: can't assign to await expression (<ipython-input-45-9cdd2b45b231>, line 39)

In [46]:
crow = Crawler('https://www.google.com/')
    
loop = asyncio.get_event_loop()
loop.run_until_complete(crow.crawl())

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000001BE7A4E3550>
Unclosed connector
connections: ['[(<aiohttp.client_proto.ResponseHandler object at 0x000001BE7A4E3588>, 491837.687)]', '[(<aiohttp.client_proto.ResponseHandler object at 0x000001BE799A0DD8>, 491837.828)]']
connector: <aiohttp.connector.TCPConnector object at 0x000001BE7A4E36A0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000001BE7A276668>
Unclosed connector
connections: ['[(<aiohttp.client_proto.ResponseHandler object at 0x000001BE79A204E0>, 491838.031)]']
connector: <aiohttp.connector.TCPConnector object at 0x000001BE79F43EF0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000001BE7A226748>
Unclosed connector
connections: ['[(<aiohttp.client_proto.ResponseHandler object at 0x000001BE7A237748>, 491838.187)]', '[(<aiohttp.client_proto.ResponseHandler object at 0x000001BE799163C8>, 491838.578)]']
connector: <aiohttp.

In [47]:
crow.main_visited,crow.n

(defaultdict(int,
             {<coroutine object Crawler.top_level at 0x000001BE7A650620>: 1,
              <coroutine object Crawler.top_level at 0x000001BE79E13CA8>: 1,
              <coroutine object Crawler.top_level at 0x000001BE79344C50>: 1,
              <coroutine object Crawler.top_level at 0x000001BE77C0DE60>: 1,
              <coroutine object Crawler.top_level at 0x000001BE7A650570>: 1,
              <coroutine object Crawler.top_level at 0x000001BE7A650A40>: 1,
              <coroutine object Crawler.top_level at 0x000001BE792DEDB0>: 1,
              <coroutine object Crawler.top_level at 0x000001BE792DED00>: 1,
              <coroutine object Crawler.top_level at 0x000001BE79FE0780>: 1,
              <coroutine object Crawler.top_level at 0x000001BE7AAF70A0>: 1}),
 0)

In [13]:
loop = asyncio.get_event_loop()
loop.run_until_complete(crawl('https://www.google.com',4))

defaultdict(int, {'google.be': 2, 'google.com': 3})

In [20]:
import re

a = re.search(r'(?<=://)[a-z0-9.]*','https://www.google.com')
a[0].split('.')[]

['www', 'google', 'com']

In [7]:
def small():
    yield 2
    a = 2
    while True:
        a = a**2
        yield a
        
g = small()
for i in range(10):
    print(next(g))


2
4
16
256
65536
4294967296
18446744073709551616
340282366920938463463374607431768211456
115792089237316195423570985008687907853269984665640564039457584007913129639936
13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006084096


In [None]:
import asyncio
import aiohttp

from bs4 import BeautifulSoup as bs
import collections as co

pattern = re.compile(r'''[a-z]*://|          # protocol                                             
                         (?<=://)[a-z0-9.]*| # main_url
                         (?<=[a-z])/.*|      # detail
                         ^/.*                # local ref
                         ''',re.X)

def process_link(link,old=''):
   
    found = re.findall(pattern,link)
    
    if len(found)>1:
        parts = found.pop(1).split('.')
        two_parts = ['.'.join(parts[:-2]) + '.'*bool(parts[:-2]),
                     '.'.join(parts[-2:])]
        d_if = {True:found.pop, False:lambda:''}   
        found += [*two_parts, d_if[bool(len(found)>1)]()]
    elif found:
        found = old + found
    else:
        found = old
    return found

async def visit(link):
    
    r = requests.get(link)
    if r.status_code != 200:
        return
    else:
        old = process_link(link)[:3]
        out_links = []
        soup = bs(r.text,'html.parser')
        for line in soup.find_all('a'):
            a_link = line.get('href')
            if a_link:
                out_links += [process_link(a_link,old)]
        return out_links
            
async def crawl(a_link,visits):
    
    l_list = process_link(a_link)
    queue = co.deque([l_list])
    main_visited = co.defaultdict(int) 
    main_visited[l_list[2]] += 1
    all_visited = set()

    while queue and visits:
        
        l_list = queue.popleft()
        link = ''.join(l_list)
        if not link in all_visited and main_visited[l_list[2]] < 4:
            main_visited[l_list[2]] += 1
            all_visited.add(link)
            visits -= 1
            now = await visit(''.join(link))
            queue.extend(now)
    
    return main_visited
            
crawl('https://www.google.com',4)