# 从网页中截取文章内容

## 步骤1：网址

In [1]:
# Address of the article page to extract; later cells test this string for
# known host names to decide which CSS selectors to use.
url = 'http://blog.jobbole.com/105602/'

## 步骤2：查找相应的截取关键字

In [2]:
# Per-site CSS selectors as (host fragment, title selector, content selector).
# Order matters: the first host fragment found in the URL wins.
site_selectors = (
    ('blog.jobbole.com', '.entry-header', '.entry'),  # Jobbole blog
    ('blog.csdn.net', '.link_title', '#article_content'),  # CSDN
    ('www.codingpy.com', '.header h1', '.article-content'),  # codingpy
    ('www.infoq.com', '.title_canvas', '.text_info_article'),  # InfoQ
)
# Unrecognised sites fall back to empty selectors.
title_key, content_key = next(
    (
        (title_selector, content_selector)
        for host, title_selector, content_selector in site_selectors
        if host in url
    ),
    ('', ''),
)
print("标题提取键: " + title_key)
print("内容提取键: " + content_key)

标题提取键: .entry-header
内容提取键: .entry


## 步骤3：发出网页请求，接收响应

In [3]:
from urllib import request

# Fetch the page and decode it as UTF-8.
# The response is used as a context manager so the underlying connection is
# closed as soon as the body has been read, instead of being left open.
req = request.Request(url)
with request.urlopen(req) as res:
    html = res.read().decode('utf-8')

In [4]:
import re

# Keep only the <body>...</body> part of the page. If the page has no body
# element, fall back to the whole document instead of raising IndexError.
pattern = r'<body[\s\S]*?</body>'
body_match = re.search(pattern, html)
body = body_match.group(0) if body_match else html
# Remove script elements. The opening tag may or may not carry attributes,
# so both "<script>" and "<script src=...>" are matched.
pattern = r'<script(?:\s[^>]*)?>[\s\S]*?</script>'
body = re.sub(pattern, '', body)
print(body)

<body class="single single-post postid-105602 single-format-standard chrome">

		
	    <nav id="top-nav" class="menu-nav">

        <!-- BEGIN .container -->
        <div class="container">

            <div class="grid-7 hide-on-480 hide-on-767">
                <ul id="menu-main-menu" class="menu left">
                    <li class="menu-item">
                        <a href="http://www.jobbole.com">首页</a>
                    </li>
                    <li class="menu-item">
                        <a href="http://top.jobbole.com">资讯</a>
                    </li>
                    <li class="menu-item">
                        <span><a href="http://blog.jobbole.com">文章 <i class="fa fa-angle-double-down"></i></a></span>
                        <ul class="sub-menu sf-js-enabled">
                            <li class="menu-item"><a href="http://blog.jobbole.com">全部文章</a></li>
                            <li class="menu-item"><a href="http://web.jobbole.com">Web前端</a></li>
       

In [5]:
# Drop blank lines: strip surrounding whitespace from every line of the body
# and keep only the lines that still have text, each terminated by '\n'.
stripped_lines = (raw_line.strip() for raw_line in body.split('\n'))
array = [text + '\n' for text in stripped_lines if text]
new_body = ''.join(array)
print(new_body)

<body class="single single-post postid-105602 single-format-standard chrome">
<nav id="top-nav" class="menu-nav">
<!-- BEGIN .container -->
<div class="container">
<div class="grid-7 hide-on-480 hide-on-767">
<ul id="menu-main-menu" class="menu left">
<li class="menu-item">
<a href="http://www.jobbole.com">首页</a>
</li>
<li class="menu-item">
<a href="http://top.jobbole.com">资讯</a>
</li>
<li class="menu-item">
<span><a href="http://blog.jobbole.com">文章 <i class="fa fa-angle-double-down"></i></a></span>
<ul class="sub-menu sf-js-enabled">
<li class="menu-item"><a href="http://blog.jobbole.com">全部文章</a></li>
<li class="menu-item"><a href="http://web.jobbole.com">Web前端</a></li>
<li class="menu-item"><a href="http://python.jobbole.com">Python开发</a></li>
<li class="menu-item"><a href="http://www.importnew.com/?utm_source=home-top-nav">Java技术</a></li>
<li class="menu-item"><a href="http://android.jobbole.com">Android应用</a></li>
<li class="menu-item"><a href="http://ios.jobbole.com">iOS应用</a><

## 步骤4：从网页中提取文章标题和内容

In [6]:
from bs4 import BeautifulSoup

# Parse the cleaned-up body and take the first element matching each selector.
soup = BeautifulSoup(new_body, 'html.parser')

# Article title: text of the first match, without surrounding whitespace.
title_nodes = soup.select(title_key)
title = title_nodes[0].text.strip()
print('文章标题：', title)

# Article content: kept as a parsed element so later steps can serialise it.
content_nodes = soup.select(content_key)
content = content_nodes[0]

文章标题： TensorFlow深度学习，一篇文章就够了


## 步骤5：下载文章中的图片

### 5.1 哈希码生成函数，用于给图片重新命名

In [7]:
import hashlib
def md5(name):
    """Return the hexadecimal MD5 digest of ``name``.

    Values that are not ``str`` are converted with ``str()`` first; the
    resulting text is hashed as UTF-8.
    """
    text = name if isinstance(name, str) else str(name)
    return hashlib.md5(text.encode('utf-8')).hexdigest()

### 5.2 下载图片，并修改文章图片的超链接

In [None]:
import re
import os
from urllib.parse import urlparse

# Download every image referenced by the article into output/images and
# rewrite the article HTML so that it points at the local copies.
content = str(content)  # work on the HTML text of the extracted content
pattern = '<img .*?src=\"(.*?)\"'
re_image = re.compile(pattern)

# Create the target directories once, up front. This also guarantees that
# 'output' exists for the later step that writes the HTML file, even when the
# article contains no images.
os.makedirs('output/images', exist_ok=True)

# dict.fromkeys() removes duplicate links while keeping their order, so an
# image used several times in the article is downloaded only once.
for image_link in dict.fromkeys(re_image.findall(content)):
    try:
        # Take the extension from the URL path only, so a query string or
        # fragment (e.g. "a.jpg?w=100") does not end up in the file name.
        extension = os.path.splitext(urlparse(image_link).path)[-1]
        # Local name is the MD5 of the link, relative to the output directory.
        filename = 'images/' + md5(image_link) + extension
        request.urlretrieve(image_link, 'output/' + filename)
    except Exception as e:
        # Best effort: report the failure and keep the original link.
        print('图片出错', e)
    else:
        print('下载完成', filename)
        content = content.replace(image_link, filename)
print('== 完成 ==')

下载完成 images/3bfd4a249a13fddf184b9d375ea74564.jpg


## 步骤6：将截取的文章标题和内容重新组合成新的网页文件

In [None]:
from html import escape

# Skeleton of the generated page: a link back to the source article, the
# title, and the extracted article markup.
html_template = """<!DOCTYPE html>
<html><head><meta charset="UTF-8">
</head><body>
<p><a href="{origin}">原文链接</a></p>
<p><center><h1>{title}</h1></center></p>
    {content}
</body></html>"""
# The title and URL are plain text taken from the web, so they are escaped
# before being placed into markup; characters such as '<', '&' or '"' would
# otherwise break the page. The content is already HTML and is inserted as is.
html = html_template.format(
    origin=escape(url, quote=True),
    title=escape(title),
    content=content,
)
print(html)

## 步骤7：将文件写入磁盘

In [None]:
import codecs
import os
import re

# The title comes from the scraped page, so it may contain characters that
# are not allowed in file names (path separators, ':', '?', ...). Replace
# them, and fall back to a fixed name if nothing usable is left.
safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip() or 'untitled'
# Make sure the target directory exists before writing into it.
os.makedirs('output', exist_ok=True)
filename = 'output/' + safe_title + ".html"
with codecs.open(filename, "w", "utf-8") as f:
    f.write(html)
    # NOTE: the original author reported errors on Windows and worked around
    # them by replacing u'\xa0' and u'\U0001f60a' with spaces before writing.

## (完)