In [20]:
import hashlib
import os
import re
from html import escape, unescape
from urllib import request
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

class WebPageDownload():
    """Download a web page, keeping only its title, its content and the images in that content.

    Typical use::

        page = WebPageDownload(url, titleKey, contentKey)
        page.run()          # fetch, extract, download the images
        page.ouputHtml()    # write "<title>.html" into the working directory

    Attributes:
        url: Address of the page to download.
        titleKey: CSS selector whose first match holds the title.
        contentKey: CSS selector whose first match holds the article body.
        title: Extracted title text ('' until ``extract_from_soup`` has run).
        content: Extracted article body ('' until ``extract_from_soup`` has run).
            It is a BeautifulSoup element right after ``extract_from_soup`` and
            an HTML string once ``download_images`` has run.
    """

    # Directory (relative to the working directory) that receives the images.
    # Kept with a forward slash in generated paths so they are valid in HTML.
    IMAGE_DIR = 'images'

    # Captures the value of the src attribute of every <img> tag. The leading
    # \s makes sure attributes such as data-src are not mistaken for src.
    _IMG_SRC_RE = re.compile(r'<img\b[^>]*?\ssrc="([^"]*)"')

    # Characters that are not allowed in file names on common file systems.
    _UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\r\n]+')

    def __init__(self, url, titleKey, contentKey):
        """Store the page address and the two CSS selectors; nothing is fetched yet."""
        self.url = url
        self.titleKey = titleKey
        self.contentKey = contentKey
        self.title = ''
        self.content = ''

    def url_to_html(self):
        """Fetch ``self.url`` and return the response body as text.

        The body is decoded as UTF-8. If that fails and the response declares
        another charset in its Content-Type header, that charset is used.

        Raises:
            UnicodeDecodeError: the body is not valid UTF-8 and no usable
                charset was declared.
        """
        req = request.Request(self.url)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req) as res:
            raw = res.read()
            declared = res.headers.get_content_charset()
        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            if not declared:
                raise
            return raw.decode(declared)

    def extract_from_soup(self, html):
        """Parse ``html`` and store the title text and the content element on the instance.

        Args:
            html: Markup of the whole page.

        Raises:
            IndexError: ``titleKey`` or ``contentKey`` matches nothing in the page.
        """
        soup = BeautifulSoup(html, 'html.parser')
        self.title = soup.select(self.titleKey)[0].text.strip()
        self.content = soup.select(self.contentKey)[0]

    def md5(self, string):
        """Return the hexadecimal MD5 digest of ``string``.

        Values that are not ``str`` are converted with ``str()`` first. The
        digest is only used to build file names.
        """
        if not isinstance(string, str):
            string = str(string)
        return hashlib.md5(string.encode('utf-8')).hexdigest()

    def download_images(self):
        """Download the images referenced by the content and point the content at the local copies.

        Every distinct ``<img src="...">`` value is resolved against
        ``self.url`` (so relative addresses work), saved as
        ``images/<md5 of the absolute url><extension>`` and, when the download
        succeeded, replaced by that local path everywhere in the content.
        A failed download is reported with ``print`` and leaves that image
        address untouched. Afterwards ``self.content`` is an HTML string.
        """
        # The regex and str.replace need text; the content may still be the
        # BeautifulSoup element stored by extract_from_soup.
        content = str(self.content)

        # dict.fromkeys drops duplicates while keeping the document order.
        sources = [src for src in dict.fromkeys(self._IMG_SRC_RE.findall(content)) if src]
        if sources:
            os.makedirs(self.IMAGE_DIR, exist_ok=True)

        for src in sources:
            # Serialised HTML escapes '&' as '&amp;'; undo that before fetching.
            imageUrl = urljoin(self.url, unescape(src))
            # Take the extension from the path only, ignoring any query string.
            extension = os.path.splitext(urlparse(imageUrl).path)[-1]
            filename = self.IMAGE_DIR + '/' + self.md5(imageUrl) + extension
            try:
                request.urlretrieve(imageUrl, filename)
            except Exception as e:
                # Best effort: one broken image must not abort the whole page.
                print('图片出错', e)
            else:
                content = content.replace(src, filename)

        self.content = content

    def ouputHtml(self):
        """Write the title and the content to ``<title>.html`` in the working directory.

        The title is HTML-escaped inside the page, characters that are not
        valid in file names are replaced by ``_`` in the file name, and the
        file is written as UTF-8 to match the charset declared in the page.
        """
        html_template = """<!DOCTYPE html>
            <html><head><meta charset="UTF-8">
            </head><body>
            <p><center><h1>{title}</h1></center></p>
                {content}
            </body></html>"""
        title = str(self.title)
        html = html_template.format(title=escape(title, quote=False),
                                    content=self.content)
        safe_name = self._UNSAFE_FILENAME_RE.sub('_', title).strip() or 'untitled'
        filename = safe_name + ".html"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)

    # Correctly spelled alias; the original name is kept for existing callers.
    outputHtml = ouputHtml

    def run(self):
        """Fetch the page, extract title and content, then download the images.

        The HTML file is not written here; call ``ouputHtml()`` afterwards.
        """
        html = self.url_to_html()
        self.extract_from_soup(html)
        self.download_images()
    

# Page to download, plus the CSS selectors that locate its title and its article body.
url = 'http://www.codingpy.com/article/getting-started-with-jupyter-notebook-part-1/'
titleKey, contentKey = '.header h1', '.article-content'

# Build the downloader from the settings above and run it.
download = WebPageDownload(url=url, titleKey=titleKey, contentKey=contentKey)
download.run()

In [22]:
# Show the title stored on the downloader by the run above.
print(download.title)

Jupyter Notebook 快速入门（上）


In [23]:
# Show the article body stored on the downloader by the run above.
print(download.content)

<div class="article-content">
<blockquote>
<p>本文作者为 <a href="https://www.packtpub.com/books/content/getting-started-jupyter-notebook-part-1">Marin Gilles</a>，他是来自法国的一位物理学博士生，用 Python 开发了自己的物理学模拟框架。本文分为两部分，是<a href="https://github.com/PythonTG/PythonTG"><strong> Python 翻译组</strong></a>成立后的第一篇译文，译者 <a href="http://codingpy.com">EarlGrey</a>。</p>
</blockquote>
<p>Jupyter Notebook（此前被称为 IPython notebook）是一个交互式笔记本，支持运行 40 多种编程语言。在本文中，我们将介绍 Jupyter notebook 的主要特性，以及为什么对于希望编写漂亮的交互式文档的人来说是一个强大工具。</p>
<p>在开始使用 notebook 之前，我们先需要安装该库。你可以在<a href="https://jupyter.readthedocs.org/en/latest/install.html"> Jupyter 官网</a>上找到完整的步骤。</p>
<blockquote>
<p>译者注：其实只要<code>pip install jupyter</code>就可以了</p>
</blockquote>
<div class="codehilite"><pre>jupyter notebook
</pre></div>
<p>运行上面的命令之后，你将看到类似下面这样的输出：</p>
<div class="codehilite"><pre>[I 20:06:36.367 NotebookApp] Writing notebook server cookie secret to /run/user/1000/jupyter/notebook_cookie_secret
[I 20:06:36.813 NotebookApp] Serving notebooks from local 

In [10]:
# Inspect the content selector held by the downloader.
# NOTE(review): this cell's execution count (In [10]) is lower than the cells
# above (In [20]-[23]), so it was run out of order; re-run the notebook top to
# bottom on a fresh kernel before relying on this output.
download.contentKey

'.article-content'