# 从网页中截取文章内容

## 步骤1：参数设置

In [1]:
# Address of the page whose article will be extracted; the later cells pick
# their CSS selectors based on the host name found in this URL.
url = "https://www.zhihu.com/question/49909565"
print(url)

https://www.zhihu.com/question/49909565


## 步骤2：查找相应的截取关键字

In [2]:
# Pick the CSS selectors used to locate the article title and body, based on
# which known site the URL belongs to.
#
# content_key2 is an optional extra selector (currently only used for Zhihu
# answers). It is reset here on every run so that the later `if content_key2:`
# check never hits an undefined name, and never reuses a stale value left over
# from a previous run of this cell with a different URL.
content_key2 = None

if 'blog.jobbole.com' in url:  # Jobbole blog
    title_key = '.entry-header'
    content_key = '.entry'
    print('伯乐在线 http://blog.jobbole.com')
elif 'blog.csdn.net' in url:  # CSDN blog
    title_key = '.link_title'
    content_key = '#article_content'
    print('csdn http://blog.csdn.net')
elif 'www.codingpy.com' in url:  # codingpy.com
    title_key = '.header h1'
    content_key = '.article-content'
    print('编程派网址 http://www.codingpy.com')
elif 'www.zhihu.com' in url:  # Zhihu question page
    title_key = '.QuestionHeader-title'
    content_key = '.QuestionHeader-main'
    content_key2 = '.RichContent-inner'  # each answer body
    print('知乎 http://www.zhihu.com')
else:  # fallback selectors for any other site
    title_key = '#keraspython'
    content_key = '.section'
    print('其它')

知乎 http://www.zhihu.com


## 步骤3：发出网页请求，接收响应

In [5]:
from urllib import request

# Fetch the page and decode the response body into text.
req = request.Request(url)
# Use the response as a context manager so the underlying connection is
# closed even if reading or decoding raises.
with request.urlopen(req) as res:
    # Prefer the charset declared in the Content-Type header; fall back to
    # UTF-8 when the server does not declare one.
    charset = res.headers.get_content_charset() or 'utf-8'
    html = res.read().decode(charset)
print(html)

<!doctype html>
<html lang="zh" data-hairline="true" data-reactroot="" data-reactid="1" data-react-checksum="294310922"><head data-reactid="2"><meta charset="utf-8" data-reactid="3"/><title data-react-helmet="true" data-reactid="4">TensorFlow 如何入门？ - 知乎</title><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1" data-reactid="5"/><meta name="renderer" content="webkit" data-reactid="6"/><meta name="force-rendering" content="webkit" data-reactid="7"/><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" data-reactid="8"/><meta name="google-site-verification" content="FTeR0c8arOPKh8c5DYh_9uu98_zJbaWw53J-Sch9MTg" data-reactid="9"/><meta data-react-helmet="true" name="apple-itunes-app" content="app-id=432274380, app-argument=zhihu://questions/49909565" data-reactid="10"/><link rel="shortcut icon" type="image/x-icon" href="https://static.zhihu.com/static/favicon.ico" data-reactid="11"/><link rel="dns-prefetch" href="//static.zhimg.com" data-reactid="12"/

## 步骤4：从网页中提取文章标题和内容

In [32]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')

# Title: stripped text of the first node matching the title selector.
title_node = soup.select(title_key)[0]
title = title_node.text.strip()
print('文章标题：', title)

# Body: markup of the first node matching the main content selector.
content = str(soup.select(content_key)[0])

# When a secondary selector is set, append every node it matches after the
# main content, each one preceded by a separator line.
if content_key2:
    extra_parts = [str(node) for node in soup.select(content_key2)]
    content = '\n\n======================\n\n'.join([content] + extra_parts)

文章标题： TensorFlow 如何入门？


## 步骤5：下载文章中的图片

### 5.1 哈希码生成函数，用于给图片重新命名

In [34]:
import hashlib
def md5(name):
    """Return the hexadecimal MD5 digest of *name*.

    Values that are not already ``str`` are converted with ``str()`` first;
    the resulting text is encoded as UTF-8 before hashing.
    """
    text = name if isinstance(name, str) else str(name)
    return hashlib.md5(text.encode('utf-8')).hexdigest()

### 5.2 下载图片，并修改文章图片的超链接

In [35]:
import os
import re
from urllib.parse import urlparse

# Download every image referenced by the article into ./images and rewrite
# the article markup so that it points at the local copies.
content = str(content)
pattern = r'<img .*?src="(.*?)"'
re_image = re.compile(pattern)

# dict.fromkeys drops repeated links while keeping first-seen order, so an
# image used several times in the article is only downloaded once.
image_links = list(dict.fromkeys(re_image.findall(content)))
if image_links:
    # Create the target directory once, before the loop.
    os.makedirs('images', exist_ok=True)

for image_link in image_links:
    try:
        # Take the extension from the URL path only, so a query string or
        # fragment (e.g. "a.jpg?x=1") does not leak into the file name.
        extension = os.path.splitext(urlparse(image_link).path)[-1]
        filename = 'images/' + md5(image_link) + extension
        request.urlretrieve(image_link, filename)
        print('下载完成', filename)
    except Exception as e:
        # Best effort: report the failure and leave the original link as is.
        print('图片出错', e)
    else:
        content = content.replace(image_link, filename)
print('已完成--------')

已完成--------


## 步骤6：将截取的文章标题和内容重新组合成新的网页文件

In [36]:
# Importing the function by name works even though `html` is also used as a
# variable name in this notebook.
from html import escape

# Skeleton of the generated page: a link back to the source, the title as a
# centred heading, then the extracted article markup.
html_template = """<!DOCTYPE html>
<html><head><meta charset="UTF-8">
</head><body>
<p><a href="{origin}">原文链接</a></p>
<p><center><h1>{title}</h1></center></p>
    {content}
</body></html>"""

# `title` is plain text and `url` goes inside an attribute, so both are
# HTML-escaped; `content` is already HTML markup and is inserted unchanged.
html = html_template.format(
    origin=escape(url, quote=True),
    title=escape(title),
    content=content,
)
print(html)

<!DOCTYPE html>
<html><head><meta charset="UTF-8">
</head><body>
<p><a href="https://www.zhihu.com/question/49909565">原文链接</a></p>
<p><center><h1>TensorFlow 如何入门？</h1></center></p>
    <div class="QuestionHeader-main"><div class="QuestionHeader-tags"><div class="QuestionHeader-topics"><div class="Tag QuestionTopic"><span class="Tag-content"><a class="TopicLink" href="/topic/19552832"><div class="Popover"><div aria-expanded="false" aria-haspopup="true" aria-owns="null-content" id="null-toggle">Python</div></div></a></span></div><div class="Tag QuestionTopic"><span class="Tag-content"><a class="TopicLink" href="/topic/19556664"><div class="Popover"><div aria-expanded="false" aria-haspopup="true" aria-owns="null-content" id="null-toggle">科技</div></div></a></span></div><div class="Tag QuestionTopic"><span class="Tag-content"><a class="TopicLink" href="/topic/19813032"><div class="Popover"><div aria-expanded="false" aria-haspopup="true" aria-owns="null-content" id="null-toggle">深度学习（Deep Le

## 步骤7：将文件写入磁盘

In [37]:
import re

# Build the output file name from the title, replacing characters that are
# not allowed in file names (path separators and Windows-reserved characters).
filename = re.sub(r'[\\/:*?"<>|]', '_', title) + ".html"

# Write explicitly as UTF-8 so the file matches the <meta charset="UTF-8">
# declared in the page. The platform default encoding on Windows cannot encode
# every character, which previously had to be worked around by deleting
# characters such as U+00A0 and emoji from the text.
with open(filename, "w", encoding="utf-8") as f:
    f.write(html)

## (完)  备注：最终只截取到了两段回答内容，原因尚未确认——可能是因为知乎的其余回答由 JavaScript 动态加载，并不包含在初始返回的 HTML 中