In [1]:
# 通过url返回网页的内容
import requests
def get_html(url, timeout=10):
    """Fetch a web page and return its text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The page text decoded with the encoding detected from the body,
        or an empty string when the request fails (network error, timeout,
        invalid URL, or an HTTP error status).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of returning an error page.
        response.raise_for_status()
        # The declared charset is not trusted; use the one sniffed from the body.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        # Only request-related errors are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        print('页面下载失败', url, e)
        return ''

In [2]:
# 从网页中获取url列表
from bs4 import BeautifulSoup

def get_urls(html):
    """Collect the absolute URLs of the tutorial pages listed in the nav menu.

    Args:
        html: Markup of a tutorial page that contains the navigation menu.

    Returns:
        A list of absolute URLs, one per navigation link that has an href.
        The list is empty when the menu is not found.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    # Navigation links of the maiziedu tutorial.
    links = soup.select('.wikiPageNavMenu > .navBar > li > a')

    # urljoin resolves root-relative, relative and already-absolute hrefs
    # correctly; anchors without an href are skipped instead of raising KeyError.
    return [
        urljoin('http://www.maiziedu.com', link['href'])
        for link in links
        if link.has_attr('href')
    ]

def get_file(html):
    """Extract the article from a tutorial page and save it locally.

    The article container (`.cont`) and its `h1` title are taken from the
    page, the images are downloaded via `download_images`, and the result is
    written with `download_file`.

    Args:
        html: Markup of one tutorial page. May be empty when the download
            failed; such pages are skipped.

    Returns:
        None.
    """
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.select_one('.cont')
    heading = soup.select_one('.cont h1')  # the title doubles as the file name
    if container is None or heading is None:
        # Empty or unexpected markup: skip this page rather than abort the crawl.
        print('页面解析失败，已跳过')
        return

    title = heading.text
    content = str(container)

    content = download_images(title, content)
    download_file(title, content)

In [3]:
# Convert an image URL into its MD5 hash (hex digest)
import hashlib

def md5(image_link):
    """Return the MD5 hex digest of an image link.

    Args:
        image_link: The link to hash. Values that are not `str` are
            converted with `str()` first.

    Returns:
        The 32-character hexadecimal MD5 digest of the UTF-8 encoded link.
    """
    if not isinstance(image_link, str):
        # The original assigned to an undefined name here, so non-str input
        # raised instead of being converted.
        image_link = str(image_link)
    digest = hashlib.md5()
    digest.update(image_link.encode('utf-8'))
    return digest.hexdigest()

In [4]:
# 下载网页中的图片
import re
import os

def download_images(title, content):
    """Download every image referenced in `content` and point the markup at the local copies.

    Each image is stored as `output/images/<md5 of the link>.png`, and every
    occurrence of the original link in `content` is replaced with the
    relative path `images/<md5>.png`. Images that fail to download keep
    their original link.

    Args:
        title: Title of the page being processed (not used here; kept for
            the existing call signature).
        content: HTML markup of the page as a string.

    Returns:
        The markup with the links of successfully downloaded images rewritten.
    """
    from urllib.parse import urljoin

    image_pattern = re.compile(r'<img .*?src="(.*?)"')
    image_dir = 'output/images/'
    # makedirs also creates the missing parent `output/` and tolerates reruns.
    os.makedirs(image_dir, exist_ok=True)

    # dict.fromkeys de-duplicates while keeping order: one download per link.
    for image_link in dict.fromkeys(image_pattern.findall(content)):
        if not image_link:
            # An empty src would make str.replace('') corrupt the whole page.
            continue
        filename = 'images/' + md5(image_link) + '.png'  # hashed name avoids collisions
        image_link_full = urljoin('http://www.maiziedu.com', image_link)
        try:
            response = requests.get(image_link_full, timeout=10)
            # Do not store an HTTP error page as if it were an image.
            response.raise_for_status()
            with open('output/' + filename, 'wb') as f:
                f.write(response.content)
            print('下载完成 >>> ', image_link_full)
            content = content.replace(image_link, filename)  # rewrite link to the local copy
        except Exception as e:
            # Best effort: report the failure and keep the original link.
            print('图片出错', e)
    return content

In [5]:
# 下载html文件
def download_file(title, content):
    """Wrap `content` in a minimal HTML page and save it as `output/<title>.html`.

    Args:
        title: Page title; shown as the heading and used as the file name.
            Path separators in the title are replaced with `_` in the file
            name only.
        content: Markup placed in the page body.

    Returns:
        None. Prints a completion message after the file is written.
    """
    import os

    html_template = """<!DOCTYPE html>
        <html><head><meta charset="UTF-8">
        </head><body>
        <p><center><h1>{title}</h1></center></p>
            {content}
        </body></html>"""
    html = html_template.format(title=title, content=content)

    # A separator inside the title would otherwise be read as a directory.
    safe_title = title
    for separator in (os.sep, os.altsep):
        if separator:
            safe_title = safe_title.replace(separator, '_')

    # The output directory may not exist yet when this is called directly.
    os.makedirs('output', exist_ok=True)
    filename = 'output/' + safe_title + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    print('== 当前网页下载完成 ==')

In [6]:
def main():
    """Crawl the tutorial: read the page list from the start page and save each page.

    Pages whose download failed (`get_html` returned an empty string) are
    reported and skipped so that one bad page does not stop the crawl.
    """
    start_url = 'http://www.maiziedu.com/wiki/crawler/introduce/'
    index_html = get_html(start_url)
    if not index_html:
        print('起始页下载失败', start_url)
        return

    for url in get_urls(index_html):
        page_html = get_html(url)
        if not page_html:
            print('跳过下载失败的页面', url)
            continue
        get_file(page_html)

# Run the crawl, then print a final "finished" message.
# (A disabled debug line `print(urls)` used to sit here; `urls` is local to main().)
main()
print('程序运行结束')

下载完成 >>>  http://www.maiziedu.com/uploads/new_img/DIMcAqJxJHQbCn3wgM.png
== 当前网页下载完成 ==
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/V5OnBLl4FXFxdHZrIi.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/u7Ptw7bqkkM8P1H8fv.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/LprnuX5W4jKnhAqqDH.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/d6vRaSmw5dOr8a4iop.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/I77UZxGTi5eyT37KeT.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/oMcVJf3T2OklcGCBLm.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/g0Co9yzBM4OcZ9tyxi.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/4SYFcd19b26CNPIoFy.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/1bJuM2EHLyqOsYFQDJ.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/d4TgGKaIHSIrenyYcq.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/ILV9uFYgbRpmmn9RxK.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/oXYJqCDAsoT019UEVV.png
下载完成 >>>  http://www.maiziedu.com/up

下载完成 >>>  http://www.maiziedu.com/uploads/new_img/l4a0AmxMMyeVhWZSK6.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/WFJrJnJjjimxthYPM9.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/5HMmUdaHYAdAgl93pc.png
== 当前网页下载完成 ==
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/0E0YYJnnXWgDsoKcSW.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/GLWkGPiKAGypvieLJX.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/5u5jYJATGYsR8ciwDG.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/RsPTUiE8OXcD8obTpj.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/fO2AdPmqJY2cf8z3OZ.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/rMTFKUSqvI0CO18sXc.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/34kOysA1r2aQQ3kd23.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/qVDeWhRcieOPlmXAqC.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/qoiVlLwwfI4kt0Ha7x.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/ZASfl2TA3KpkurBgSa.png
下载完成 >>>  http://www.maiziedu.com/up

下载完成 >>>  http://www.maiziedu.com/uploads/new_img/UG5nX2qalEKdv2mD7n.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/tOIIjYn3bNojR9ghCy.png
== 当前网页下载完成 ==
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/aZWxEVLEP7COHd91k8.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/f8wEZf1RGRY8CazVTx.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/95RVc5g18jcgDQUbGF.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/ytMG4j1An7D1FVYZva.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/VNf8aXpvymJwSthvuh.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/f1bN1apNrDcunDoLKq.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/yqku0m1ZBHDVEJY0Cj.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/G9wickfAwhNV6Skfaw.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/9RelyhyfPy1bJAyEx4.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/HUWTuW5EfNzVSMhAAu.png
== 当前网页下载完成 ==
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/DeWxFFePFHtAdkx32P.png
下载完成 >>>  http://www.

下载完成 >>>  http://www.maiziedu.com/uploads/new_img/FlrCgBx4MB17RBuR56.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/KKpcHZp7XYQIWQW4NP.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/50JC0G1U3ZJytss0bx.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/w7ov76AHTLFFE6ZoIZ.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/nR2oDjKlikkkThjT1B.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/2zm7cwHRnQFfSUyWqv.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/D43OBz45IMRJtVJ6mO.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/CS833mCerOgpc1Ory0.png
== 当前网页下载完成 ==
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/wk9mW3n3v4mM5pMVOU.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/ZpLHTnORDJLytLhWw9.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/5t1WrERNcoP2xJCJTi.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/0QWahMNWOm5H9P8ULO.png
下载完成 >>>  http://www.maiziedu.com/uploads/new_img/GViwQAuY8ohvQUME34.png
下载完成 >>>  http://www.maiziedu.com/up

In [10]:
# Download the table of contents: fetch the start page and keep its navigation menu.
start_url = 'http://www.maiziedu.com/wiki/crawler/introduce/'
html = get_html(start_url)  # empty string if the download failed
soup = BeautifulSoup(html, 'html.parser')
title = 'index'  # used as the title / file name when the menu is saved
# First element with class `wikiPageNavMenu`; raises IndexError if the page has none.
content = soup.select('.wikiPageNavMenu')[0]

In [15]:
# Point every menu link at the locally saved page "<link text>.html".
# Parsing from str(content) lets this cell be re-run after `content` has
# already been turned into a string.
index_soup = BeautifulSoup(str(content), 'html.parser')
for anchor in index_soup.select('a'):
    if anchor.has_attr('href'):
        # Set the attribute on this anchor only; a global string replace would
        # also rewrite other links that merely contain the same href text.
        anchor['href'] = anchor.text + '.html'
content = str(index_soup)
    

In [17]:
# Save the rewritten navigation menu; with title 'index' this writes output/index.html.
download_file(title, content)

== 当前网页下载完成 ==
