In [1]:
import os
import re
import time

import matplotlib.pyplot as plt
from tqdm import tqdm

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Start a Chrome session with a packed extension loaded (presumably a CSS
# blocker, judging by the file name -- confirm against ./chrome_exts).
options = webdriver.ChromeOptions()
options.add_extension('./chrome_exts/css_block.crx')
# `chrome_options=` is deprecated in Selenium and rejected by recent releases;
# `options=` is the supported keyword for passing ChromeOptions.
driver = webdriver.Chrome(options=options)
# Alternative kept for reference: point Selenium at an explicit chromedriver binary.
# driver = webdriver.Chrome('D:/chromedriver_win32/chromedriver.exe')
# go to this site to disable javascript, images, etc.
driver.get('https://ncode.syosetu.com/n7756cy/')

  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
def normalize_filename(path):
    """Return `path` with each run of ':' and '/' characters replaced by a single '_'."""
    separator_run = re.compile(r'[:/]+')
    return separator_run.sub('_', path)

In [3]:
# Scheme and host of the novel site; prepended to the chapter hrefs read from
# a novel's index page when building chapter URLs.
base_url = 'https://ncode.syosetu.com'

def split_syosetu_url(url):
    """Split a URL on '/' and return the non-empty pieces in order.

    Example:
        'https://ncode.syosetu.com/n7756cy/'
        -> ['https:', 'ncode.syosetu.com', 'n7756cy']
    """
    return [piece for piece in url.split('/') if piece]

def dl_html(url, path):
    """Load `url` in the shared Selenium `driver` and save the page source to `path`.

    The HTML is written to a temporary sibling file (`<path>.part`) and then
    moved into place, so `path` only ever appears on disk with complete
    content. This matters because callers treat "file exists" as "already
    downloaded": a truncated file left by an interrupted write would never
    be fetched again.

    Args:
        url: Address of the page to load.
        path: Destination file; written as UTF-8 text.
    """
    driver.get(url)
    content = driver.page_source

    tmp_path = f'{path}.part'
    try:
        with open(tmp_path, mode='w', encoding='utf-8') as outf:
            outf.write(content)
        # os.replace overwrites the destination and is atomic on the same filesystem.
        os.replace(tmp_path, path)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the partial
        # file, then let the original error propagate.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def dl_chapter(url, save_dir='syosetu'):
    """Download a single chapter page unless it is already on disk.

    The novel id and chapter number are taken from the third and fourth
    non-empty '/'-separated components of `url`, and the page is stored at
    `<save_dir>/<novel_id>/<chapter>.html`.

    Args:
        url: Chapter URL, e.g. 'https://ncode.syosetu.com/n7756cy/1/'.
        save_dir: Root directory under which per-novel folders are created.

    Raises:
        IndexError: If `url` has fewer than four non-empty components
            (for example, a novel index URL without a chapter number).
    """
    url_comps = split_syosetu_url(url)
    novel_id = url_comps[2]
    chap_num = url_comps[3]
    novel_dir = f'{save_dir}/{novel_id}'
    # exist_ok=True replaces the racy "check, then create" sequence.
    os.makedirs(novel_dir, exist_ok=True)

    chap_path = f'{novel_dir}/{chap_num}.html'
    # Files already on disk act as a cache, so re-runs only fetch what is missing.
    if not os.path.exists(chap_path):
        dl_html(url, chap_path)

def dl_syosetu(url, save_dir='syosetu', dl_chapters=True):
    """Download a novel's index page and, optionally, every chapter it links to.

    The index page is saved as `<save_dir>/<novel_id>/index.html` (skipped if
    the file already exists). When `dl_chapters` is true, the saved index is
    parsed and each element with class 'novel_sublist2' is expected to contain
    an anchor whose href, appended to `base_url`, is the chapter URL.

    Args:
        url: Novel index URL, e.g. 'https://ncode.syosetu.com/n7756cy/'.
        save_dir: Root directory under which per-novel folders are created.
        dl_chapters: If false, only the index page is downloaded.
    """
    url_comps = split_syosetu_url(url)
    novel_id = url_comps[2]
    novel_dir = f'{save_dir}/{novel_id}'
    # exist_ok=True replaces the racy "check, then create" sequence.
    os.makedirs(novel_dir, exist_ok=True)
    hp_path = f'{novel_dir}/index.html'
    if not os.path.exists(hp_path):
        dl_html(url, hp_path)

    if not dl_chapters:
        return

    with open(hp_path, 'r', encoding='utf-8') as inpf:
        content = inpf.read()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between environments. 'html.parser' ships with Python.
    soup = BeautifulSoup(content, 'html.parser')
    chap_div_list = soup.find_all(attrs={'class': 'novel_sublist2'})
    for chap_div in tqdm(chap_div_list):
        chap_anchor = chap_div.find('a')
        chap_url = base_url + chap_anchor.attrs['href']
        dl_chapter(chap_url, save_dir=save_dir)

In [5]:
# Single-novel run. Exactly one `novel_url` assignment must be active;
# with all of them commented out the call below only works when the name is
# left over from an earlier kernel session and fails with NameError after
# Restart & Run All.
# novel_url = 'https://ncode.syosetu.com/n7756cy/' # 異世界建国記
# novel_url = 'http://ncode.syosetu.com/n1961bm/' # 女の子、買いました
# novel_url = 'https://ncode.syosetu.com/n7500bd/' # 異世界チート魔術師（マジシャン）
# novel_url = 'https://ncode.syosetu.com/n4701bu/' # 捨てられ勇者は帰宅中～隠しスキルで異世界を駆け抜ける
# novel_url = 'https://ncode.syosetu.com/n1094bz/' # 精霊幻想記
novel_url = 'http://ncode.syosetu.com/n1443bp/' # 異世界はスマートフォンとともに

dl_syosetu(novel_url)

100%|████████████████████████████████████████████████████████████████████████████████| 501/501 [02:15<00:00,  3.08it/s]


In [8]:
# Index-page URLs of the novels to download in one batch. The trailing comment
# on each entry is the work's original (Japanese) title.
novel_list = [
    'https://ncode.syosetu.com/n7756cy/', # 異世界建国記
    'http://ncode.syosetu.com/n1961bm/', # 女の子、買いました
    'https://ncode.syosetu.com/n7500bd/', # 異世界チート魔術師（マジシャン）
    'https://ncode.syosetu.com/n4701bu/', # 捨てられ勇者は帰宅中～隠しスキルで異世界を駆け抜ける
    'https://ncode.syosetu.com/n1094bz/', # 精霊幻想記
    'http://ncode.syosetu.com/n1443bp/', # 異世界はスマートフォンとともに
    'https://ncode.syosetu.com/n3275cf/', # 俺の幼馴染は女子高生で異世界の勇者なんだがそれだけじゃないっぽい
    'https://ncode.syosetu.com/n8611bv/', # ありふれた職業で世界最強
    'https://ncode.syosetu.com/n1222ci/', # 黒の召喚士
    'https://ncode.syosetu.com/n6312de/', # 転生勇者の成り上がり
    'https://ncode.syosetu.com/n7031bs/', # 金色の文字使い
]

In [9]:
# Download every novel in `novel_list`. The URL is printed first so that each
# progress bar in the output can be matched to its novel. Chapters whose HTML
# file already exists are not fetched again (see dl_chapter).
for novel_url in novel_list:
    print(novel_url)
    dl_syosetu(novel_url)

https://ncode.syosetu.com/n7756cy/


100%|██████████████████████████████████████████████████████████████████████████████| 305/305 [00:00<00:00, 1282.04it/s]


http://ncode.syosetu.com/n1961bm/


100%|████████████████████████████████████████████████████████████████████████████████| 91/91 [00:00<00:00, 1210.90it/s]


https://ncode.syosetu.com/n7500bd/


100%|██████████████████████████████████████████████████████████████████████████████| 149/149 [00:00<00:00, 1025.30it/s]


https://ncode.syosetu.com/n4701bu/


100%|████████████████████████████████████████████████████████████████████████████████| 66/66 [00:00<00:00, 2182.12it/s]


https://ncode.syosetu.com/n1094bz/


100%|██████████████████████████████████████████████████████████████████████████████| 206/206 [00:00<00:00, 1762.30it/s]


http://ncode.syosetu.com/n1443bp/


100%|██████████████████████████████████████████████████████████████████████████████| 501/501 [00:00<00:00, 1730.88it/s]


https://ncode.syosetu.com/n3275cf/


100%|████████████████████████████████████████████████████████████████████████████████| 87/87 [00:00<00:00, 1221.09it/s]


https://ncode.syosetu.com/n8611bv/


100%|██████████████████████████████████████████████████████████████████████████████| 358/358 [00:00<00:00, 2072.36it/s]


https://ncode.syosetu.com/n1222ci/


100%|██████████████████████████████████████████████████████████████████████████████| 634/634 [00:00<00:00, 1690.59it/s]


https://ncode.syosetu.com/n6312de/


100%|██████████████████████████████████████████████████████████████████████████████| 119/119 [00:00<00:00, 3048.13it/s]


https://ncode.syosetu.com/n7031bs/


100%|████████████████████████████████████████████████████████████████████████████████| 838/838 [05:40<00:00,  2.03it/s]
