In [1]:
import math
import os
import re
import time
from urllib.parse import urljoin, urlsplit

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common import keys
from tqdm import tqdm

In [3]:
# Start a Selenium-controlled Chrome session. `driver` is the notebook-level
# browser handle that get_chap_images() and dl_manga() read as a global.
driver = webdriver.Chrome()
# Open one series' episode-list page in that browser.
url = 'https://web-ace.jp/youngaceup/contents/1000117/episode/'
driver.get(url)

In [3]:
# Site origin; prepended to the `href`/`src` values scraped from the pages
# (presumably site-relative paths such as "/youngaceup/..." — confirm).
base_url = 'https://web-ace.jp'
# Local root directory; dl_manga() saves to <dl_root>/<title>/<chapter index>/.
dl_root = 'web-ace'

def get_chap_images(url, n_scrolls=5, scroll_pause=0.2):
    """
    Collect the image URLs of a single chapter.

    Sample url: https://web-ace.jp/youngaceup/contents/1000117/episode/3711/

    Loads `url` in the notebook-level Selenium `driver`, presses END a few
    times so the page's JavaScript adds the lazily loaded images, then parses
    the rendered HTML.

    Tip: turn off image loading in the browser to reduce load time (the
    images are downloaded directly from their URLs later).

    Args:
        url: chapter page URL.
        n_scrolls: how many times END is sent to the page body.
        scroll_pause: seconds to wait after each END key press.

    Returns:
        list[str]: absolute URLs of the `img.viewerFixedImage` elements inside
        `div.lazy-container`, in document order.

    Raises:
        ValueError: if the page contains no `div.lazy-container`.
    """
    driver.get(url)
    # `find_element_by_tag_name` was removed in Selenium 4.3. The two-argument
    # form exists in both Selenium 3 and 4; 'tag name' is the value of By.TAG_NAME.
    body = driver.find_element('tag name', 'body')
    # Scroll to the end of the page to load all the images (the site inserts
    # them with JavaScript to save bandwidth).
    for _ in range(n_scrolls):
        body.send_keys(keys.Keys.END)
        time.sleep(scroll_pause)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    container = soup.find('div', attrs={'class': 'lazy-container'})
    if container is None:
        raise ValueError(f'No image container (div.lazy-container) found on {url}')
    img_list = container.find_all('img', attrs={'class': 'viewerFixedImage'})
    # urljoin gives the same result as `base_url + src` for "/path" values and
    # also copes with absolute or non-slash-prefixed `src` attributes.
    return [urljoin(base_url, img.attrs['src']) for img in img_list]

def dl_image(url, basename, root, timeout=30):
    """
    Download `url` to `<root>/<basename><ext>` unless that file already exists.

    The extension is taken from the path component of `url`, so a query
    string or fragment never ends up in the filename. The data is first
    written to `<file>.part` and renamed only after the download finished, so
    a failed or interrupted download never leaves a file that a later run
    would mistake for a completed one.

    Args:
        url: absolute URL of the image.
        basename: target filename without extension.
        root: target directory; created if missing.
        timeout: seconds passed to `requests.get` as its timeout.

    Returns:
        None. On a non-OK HTTP response the response is printed and no file
        is written (best effort: the caller keeps going).
    """
    os.makedirs(root, exist_ok=True)
    ext = os.path.splitext(urlsplit(url).path)[1]
    filename = basename + ext
    filepath = f'{root}/{filename}'
    if os.path.exists(filepath):
        return

    res = requests.get(url, stream=True, timeout=timeout)
    try:
        if not res.ok:
            # Do not save the error body: it would be skipped as "already
            # downloaded" on every later run.
            print(res)
            return
        part_path = filepath + '.part'
        try:
            with open(part_path, 'wb') as handle:
                for block in res.iter_content(1024):
                    handle.write(block)
            os.replace(part_path, filepath)
        finally:
            # After a successful rename the .part file is gone; this only
            # cleans up when the download was interrupted.
            if os.path.exists(part_path):
                os.remove(part_path)
    finally:
        res.close()

def _safe_dirname(name):
    """Turn scraped text into a single directory name valid on Windows and POSIX."""
    # Path separators and characters Windows forbids in filenames become '_'.
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', name)
    # Collapse newlines/indentation picked up from the HTML into single spaces.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned or '_'


def dl_manga(url):
    """
    Download all chapters of one series.

    Sample url: https://web-ace.jp/youngaceup/contents/1000117/episode/

    Loads the episode-list page in the notebook-level Selenium `driver`,
    collects every chapter link, resolves each chapter's image URLs with
    get_chap_images(), then saves the images with dl_image() to
    `<dl_root>/<title>/<chapter index>/<image index><ext>` (both indices are
    zero-based and zero-padded to two digits).

    The series title is sanitised before it is used as a directory name, so a
    title containing "/" (or characters such as ':' or '?') maps to one
    directory instead of nested or invalid paths.

    Args:
        url: episode-list page URL of the series.

    Returns:
        list[dict]: one dict per chapter with keys 'title', 'url' and
        'images' (list of image URLs), in the reverse of the order in which
        the chapters appear on the page.

    Raises:
        ValueError: if the title heading or the chapter list is not found.
    """
    driver.get(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    title_tag = soup.find('h2', attrs={'class': 'clr-young-ace-up'})
    chap_list = soup.find('ul', attrs={'class': 'table-view'})
    if title_tag is None or chap_list is None:
        raise ValueError(f'Could not find the series title or chapter list on {url}')
    title = _safe_dirname(title_tag.text)

    # Keep only the anchors that carry a bold label, i.e. the chapter entries.
    chap_dicts = []
    for anchor in chap_list.find_all('a'):
        label = anchor.find('p', attrs={'class': 'text-bold'})
        if label is not None:
            chap_dicts.append({
                'title': label.text,
                'url': urljoin(base_url, anchor.attrs['href']),
            })
    # Presumably the page lists the newest chapter first, so reversing makes
    # index 00 the first chapter — confirm against the site.
    chap_dicts.reverse()

    for chap_dict in tqdm(chap_dicts):
        chap_dict['images'] = get_chap_images(chap_dict['url'])
    print('Got all images. Starting to download.')

    for chap_idx, chap_dict in enumerate(tqdm(chap_dicts)):
        chap_root = f'{dl_root}/{title}/{str(chap_idx).zfill(2)}'
        for img_idx, img_url in enumerate(chap_dict['images']):
            dl_image(img_url, str(img_idx).zfill(2), chap_root)
    return chap_dicts

## Download manga

In [148]:
# Download every chapter of the series whose episode-list page is `url`.
# dl_manga() returns the list of per-chapter dicts (title, url, images).
url = 'https://web-ace.jp/youngaceup/contents/1000118/episode/'
chap_dicts = dl_manga(url)

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:28<00:00,  3.74s/it]


Got all images. Starting to download.


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [01:51<00:00, 13.19s/it]


In [147]:
# NOTE(review): same URL as the preceding cell, yet the output recorded below
# reports 40 chapters where the preceding cell's reports 9. Presumably this
# output is stale, from a run with a different URL — confirm, then drop
# whichever cell is redundant.
url = 'https://web-ace.jp/youngaceup/contents/1000118/episode/'
chap_dicts = dl_manga(url)

100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [01:57<00:00,  2.80s/it]


Got all images. Starting to download.


100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [06:12<00:00,  6.75s/it]


In [4]:
# Episode-list URLs of the series to download; commented-out entries are disabled.
manga_list = [
#     'https://web-ace.jp/youngaceup/contents/1000117/episode/', # The World's Finest Assassin Gets Reincarnated in Another World as an Aristocrat
    'https://web-ace.jp/youngaceup/contents/1000091/episode/', # I'm Quitting Heroing
#     'https://web-ace.jp/youngaceup/contents/1000118/episode/', # Fate/Grand Order -Epic of Remnant- Pseudo-Singularity EX: Deep Sea Cyber-Paradise SE.RA.PH
#     'https://web-ace.jp/youngaceup/contents/1000064/episode/', # literally "The Gofer Me and the Gang Leader in Love" (Pashiri na Boku to Koisuru Banchou-san)
]

# One entry per downloaded series. Despite the name `dl_dict`, dl_manga()
# returns a list of per-chapter dicts, so dl_hist is a list of lists.
dl_hist = []
for manga_url in manga_list:
    print(f'Downloading {manga_url}')
    dl_dict = dl_manga(manga_url)
    dl_hist.append(dl_dict)

Downloading https://web-ace.jp/youngaceup/contents/1000091/episode/


100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [02:18<00:00,  2.77s/it]


Got all images. Starting to download.


100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [00:06<00:00,  7.39it/s]
