## 获取漫画列表

In [2]:
import requests
from bs4 import BeautifulSoup

def get_chapters(target_url, timeout=10):
    """Fetch the chapter titles and chapter page URLs of one comic.

    Args:
        target_url: URL of the comic's info page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(chapter_names, chapter_urls)`` tuple of two equally long lists.
        Both lists are in the reverse of the order in which the links
        appear inside the page's ``<ul class="list_con_li">`` element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``<ul class="list_con_li">``.
    """
    r = requests.get(url=target_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    bs = BeautifulSoup(r.text, 'lxml')
    list_con_li = bs.find('ul', class_="list_con_li")
    if list_con_li is None:
        raise ValueError(f'no chapter list (ul.list_con_li) found at {target_url}')

    chapter_names = []
    chapter_urls = []
    # Walk the links back to front so the result is in reversed page order
    # without paying for repeated insert(0, ...) calls.
    for comic in reversed(list_con_li.find_all('a')):
        chapter_names.append(comic.text)
        chapter_urls.append(comic.get('href'))
    return chapter_names, chapter_urls

In [3]:
target_url = "https://www.dmzj.com/info/yaoshenji.html"

# Fetch both lists, then print one "name url" line per chapter
chapter_names, chapter_urls = get_chapters(target_url)
for chapter_pair in zip(chapter_names, chapter_urls):
    print(*chapter_pair)

第01话 重生 https://www.dmzj.com/view/yaoshenji/41917.html
第02话 坐井观天 https://www.dmzj.com/view/yaoshenji/41919.html
第03话 兄弟 https://www.dmzj.com/view/yaoshenji/41952.html
第04话 钱可通神 https://www.dmzj.com/view/yaoshenji/41985.html
第05话 开始行动 https://www.dmzj.com/view/yaoshenji/42030.html
第06话 角羊杀手 https://www.dmzj.com/view/yaoshenji/42096.html
第07话 肖凝儿 https://www.dmzj.com/view/yaoshenji/42137.html
第08话 疗伤 https://www.dmzj.com/view/yaoshenji/42201.html
第09话 肖凝儿的态度 https://www.dmzj.com/view/yaoshenji/42278.html
第10话 课前 https://www.dmzj.com/view/yaoshenji/42327.html
第11话 意外？意外？ https://www.dmzj.com/view/yaoshenji/42363.html
第12话 赤焰炎爆符文 https://www.dmzj.com/view/yaoshenji/42405.html
第13话 抄袭？ https://www.dmzj.com/view/yaoshenji/42445.html
第14话 退学 https://www.dmzj.com/view/yaoshenji/42482.html
第15话 修炼！ https://www.dmzj.com/view/yaoshenji/42542.html
第16话 顶级功法！ https://www.dmzj.com/view/yaoshenji/42670.html
第17话 天道神诀 https://www.dmzj.com/view/yaoshenji/42737.html
第18话 偶遇 https://www.dmzj.com/view/yao

## 进入具体漫画下载页面
1. 此时会发现页面禁用了鼠标右键，无法直接查看网页源码；可以按 F12 打开开发者工具，或者在地址栏的 URL 前加上 `view-source:` 前缀，如：view-source:https://www.dmzj.com/view/yaoshenji/41917.html
2. 有些页面是通过 js 动态加载的
   1. 外部加载：
      ```html
      <script type="text/javascript" src="https://cuijiahua.com/call.js"></script>
      ```
   1. 内部加载：
      ```html
      <script type="text/javascript">
         var arr_img = new Array();
         var page = '';
         eval(function(p,a,c,k,e,d){...}))
      </script>
      ```

In [4]:
import requests
from bs4 import BeautifulSoup
import re

def get_imgs(chapter_url, timeout=10):
    """Build the image URLs of one chapter from the page's inline script.

    The text of the first ``<script>`` element of the page is scanned for:

    * every 13- or 14-digit number, used as an image file name;
    * the first 4-digit and the first 5-digit number enclosed in ``|``,
      used as the two directory components of the image path.

    Args:
        chapter_url: URL of the chapter's reading page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``https://images.dmzj.com/img/chapterpic/...jpg`` URLs,
        ordered by file name with 13-digit names compared as if they were
        right-padded with a ``0`` to 14 digits.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``<script>`` element or the script
            lacks the ``|dddd|`` / ``|ddddd|`` path components.
    """
    r = requests.get(url=chapter_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    html = BeautifulSoup(r.text, 'lxml')
    script_info = html.script
    if script_info is None:
        raise ValueError(f'no <script> element found at {chapter_url}')
    script_text = str(script_info)

    pics = re.findall(r'\d{13,14}', script_text)
    # Pad only inside the sort key. Padding the stored name and stripping a
    # trailing '0' afterwards would also truncate genuine 14-digit names
    # that happen to end in 0.
    pics.sort(key=lambda pic: int(pic.ljust(14, '0')))

    hou_match = re.search(r'\|(\d{5})\|', script_text)
    qian_match = re.search(r'\|(\d{4})\|', script_text)
    if hou_match is None or qian_match is None:
        raise ValueError(f'image path components not found in script of {chapter_url}')

    base_url = (
        'https://images.dmzj.com/img/chapterpic/'
        f'{qian_match.group(1)}/{hou_match.group(1)}/'
    )
    return [f'{base_url}{pic}.jpg' for pic in pics]

In [9]:
# Resolve the image URLs of the chapter at index 1 of the chapter list
chapter_url = chapter_urls[1]
imgs = get_imgs(chapter_url)
# Bare last expression so the notebook displays the resulting list
imgs

['https://images.dmzj.com/img/chapterpic/3059/14245/14395246506397.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246508628.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246514858.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246521947.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246525416.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246531383.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246538125.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246544205.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246550472.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/1439524655494.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246560172.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246566183.jpg',
 'https://images.dmzj.com/img/chapterpic/3059/14245/14395246571296.jpg']

## 下载单张图片

In [14]:
import pathlib
import requests
from contextlib import closing

def download_img(img_url, img_name, dir="temp", header=None, timeout=10):
    """Download one image and store it as ``<dir>/<img_name>.jpg``.

    Args:
        img_url: URL of the image.
        img_name: File name without extension; ``.jpg`` is appended.
        dir: Target directory (string or path object). It is created,
            including missing parents, when it does not exist yet.
        header: Optional dict of HTTP request headers.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. When the response status is not 200, a message is printed
        and no file is written.
    """
    save_dir = pathlib.Path(dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    pic_save_path = save_dir / f'{img_name}.jpg'

    chunk_size = 1024
    with closing(requests.get(img_url, headers=header, stream=True, timeout=timeout)) as response:
        # Check the status before touching anything else on the response;
        # the Content-Length header is not needed to stream the body and is
        # absent on chunked responses.
        if response.status_code != 200:
            print('链接异常')
            return
        with open(pic_save_path, "wb") as file:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)

In [16]:
from email import header
from pydoc import describe
from tqdm import tqdm
import time
temp_dir = pathlib.Path('./temp')
# The images were resolved from chapter_urls[1]; send that page as Referer so
# each image request looks as if it came from the chapter page. The dict is
# the same for every image, so it is built once outside the loop.
download_header = {
    'Referer': chapter_urls[1]
}
for index, img in enumerate(tqdm(imgs, desc='downloader')):
    download_img(img, index, temp_dir, header=download_header)

downloader: 100%|██████████| 11/11 [00:00<00:00, 12.23it/s]


## 汇总

In [17]:
from turtle import down
from tqdm import tqdm
import time

# Root directory that receives one sub-directory per chapter
root_dir = '妖神记'
target_url = "https://www.dmzj.com/info/yaoshenji.html"

# Fetch the chapter names and chapter page URLs of the comic
chapter_names, chapter_urls = get_chapters(target_url)

# Characters removed from a chapter name before it is used as a directory
# name: '.' plus path separators and the characters Windows rejects in names
unsafe_chars = str.maketrans('', '', '.\\/:*?"<>|')

# Download the comic, one directory per chapter
for name, url in tqdm(zip(chapter_names, chapter_urls), desc="downloading", total=len(chapter_names)):
    name = name.translate(unsafe_chars).strip()
    # Pretend the image requests come from the chapter page
    download_header = {
        'Referer': url
    }
    chapter_dir = pathlib.Path(root_dir) / name
    chapter_dir.mkdir(parents=True, exist_ok=True)
    imgs = get_imgs(url)
    for i, img in enumerate(imgs):
        download_img(img_url=img, img_name=i, dir=chapter_dir, header=download_header)
    # Pause between chapters (presumably to go easy on the server)
    time.sleep(5)

downloading:   0%|          | 1/253 [00:13<56:22, 13.42s/it]


KeyboardInterrupt: 