In [5]:
import multiprocessing as mp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re

base_url = 'https://morvanzhou.github.io'

In [6]:
def crawl(url):
    """Download a page and return its body as text.

    Args:
        url: Address to open with ``urlopen``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urlopen`` is propagated unchanged.
    """
    # The context manager closes the response (and its connection) even if
    # reading or decoding fails.
    with urlopen(url) as response:
        return response.read().decode('utf-8')

def parse(html):
    """Extract the title, in-site links and canonical URL from a page.

    Args:
        html: HTML source of the page as a string.

    Returns:
        A ``(title, page_urls, url)`` tuple where
        ``title`` is the stripped text of the first ``<h1>`` (``''`` if the
        page has none), ``page_urls`` is the set of absolute URLs obtained by
        joining ``base_url`` with every ``<a href>`` matching ``^/.+?/$``, and
        ``url`` is the ``content`` of ``<meta property="og:url">`` (``None``
        if that tag or attribute is missing).
    """
    soup = BeautifulSoup(html, features = 'lxml')

    # Only site-relative links of the form "/.../" are followed.
    anchors = soup.find_all('a', {'href' : re.compile('^/.+?/$')})
    page_urls = {urljoin(base_url, anchor['href']) for anchor in anchors}

    # find() returns None when the element is absent; fall back instead of
    # raising so one unusual page does not abort the whole crawl.
    heading = soup.find('h1')
    title = heading.get_text().strip() if heading is not None else ''

    og_url = soup.find('meta', {'property' : 'og:url'})
    url = og_url.get('content') if og_url is not None else None

    return title, page_urls, url

In [13]:
# Crawl frontier: URLs discovered but not fetched yet, seeded with the site root.
unseen = {base_url}
# URLs that have already been fetched.
seen = set()

### Normal Crawling (sequential, single process)

In [12]:
# Breadth-first crawl in a single process: each pass fetches every URL in
# `unseen`, parses the pages, then queues the newly discovered links.
# The size check runs once per pass, so a pass that starts with fewer than 20
# seen pages still crawls its whole frontier.
while unseen and len(seen) < 20:
    print('Crawling...')
    htmls = [crawl(url) for url in unseen]

    print('Parsing')
    results = [parse(html) for html in htmls]

    # Everything fetched in this pass is now visited; start a fresh frontier.
    seen.update(unseen)
    unseen.clear()

    for count, (title, page_urls, url) in enumerate(results, start=1):
        print(count, title, url)
        # Queue only links that have not been crawled yet.
        unseen.update(page_urls - seen)
    
    

Crawling...
Parsing
1 教程 https://morvanzhou.github.io/
Crawling...
Parsing
1 高级爬虫: 高效无忧的 Scrapy 爬虫库 https://morvanzhou.github.io/tutorials/data-manipulation/scraping/5-02-scrapy/
2 其他教学系列 https://morvanzhou.github.io/tutorials/others/
3 数据处理教程系列 https://morvanzhou.github.io/tutorials/data-manipulation/
4 进化算法 Evolutionary Strategies 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/evolutionary-algorithm/
5 Git 版本管理 教程系列 https://morvanzhou.github.io/tutorials/others/git/
6 说吧~ https://morvanzhou.github.io/discuss/
7 Threading 多线程教程系列 https://morvanzhou.github.io/tutorials/python-basic/threading/
8 Linux 简易教学 https://morvanzhou.github.io/tutorials/others/linux-basic/
9 推荐学习顺序 https://morvanzhou.github.io/learning-steps/
10 multiprocessing 多进程教程系列 https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/
11 Why? https://morvanzhou.github.io/tutorials/data-manipulation/scraping/1-00-why/
12 机器学习实践 https://morvanzhou.github.io/tutorials/machine-learning/ML-practice/
1

### Distributed Crawling (parallel, using a multiprocessing pool)

In [15]:
# Same breadth-first crawl, but each pass fans its downloads and its parsing
# out to a pool of 4 worker processes.
# NOTE(review): this cell works on the module-level `unseen`/`seen` sets; if an
# earlier crawl has already consumed them, re-run the cell that initialises
# them before running this one.
# Using the pool as a context manager guarantees the workers are shut down
# when the crawl ends or fails; every job result has been collected via
# .get() before the block is left.
with mp.Pool(4) as pool:
    while unseen and len(seen) < 20:
        print('Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args = (url,)) for url in unseen]
        htmls = [job.get() for job in crawl_jobs]

        print('Parsing...')
        parse_jobs = [pool.apply_async(parse, args = (html,)) for html in htmls]
        results = [job.get() for job in parse_jobs]

        # Everything fetched in this pass is now visited; start a fresh frontier.
        seen.update(unseen)
        unseen.clear()

        for count, (title, page_urls, url) in enumerate(results, start=1):
            print(count, title, url)
            # Queue only links that have not been crawled yet.
            unseen.update(page_urls - seen)

Crawling...
Parsing...
1 教程 https://morvanzhou.github.io/
Crawling...
Parsing...
1 高级爬虫: 高效无忧的 Scrapy 爬虫库 https://morvanzhou.github.io/tutorials/data-manipulation/scraping/5-02-scrapy/
2 其他教学系列 https://morvanzhou.github.io/tutorials/others/
3 数据处理教程系列 https://morvanzhou.github.io/tutorials/data-manipulation/
4 进化算法 Evolutionary Strategies 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/evolutionary-algorithm/
5 Git 版本管理 教程系列 https://morvanzhou.github.io/tutorials/others/git/
6 说吧~ https://morvanzhou.github.io/discuss/
7 Linux 简易教学 https://morvanzhou.github.io/tutorials/others/linux-basic/
8 Threading 多线程教程系列 https://morvanzhou.github.io/tutorials/python-basic/threading/
9 推荐学习顺序 https://morvanzhou.github.io/learning-steps/
10 multiprocessing 多进程教程系列 https://morvanzhou.github.io/tutorials/python-basic/multiprocessing/
11 Why? https://morvanzhou.github.io/tutorials/data-manipulation/scraping/1-00-why/
12 机器学习实践 https://morvanzhou.github.io/tutorials/machine-learning/ML-pract