In [None]:
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from ebooklib import epub

class Page:
    """One downloaded chapter page of the book being scraped.

    The constructor fetches ``url`` and parses the response; the ``get_*``
    methods then extract individual pieces from the parsed document.

    The selectors (``css_booktitle``, ``css_author``, ``css_chaptertitle``,
    ``css_content``, ``css_nexttext``) and the request ``headers`` are
    module-level settings that must be defined before a ``Page`` is created
    or queried.
    """

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the notebook forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, base_url):
        """Download and parse a page.

        Args:
            url: Address of the page to fetch.
            base_url: Prefix prepended to the site-relative "next chapter" link.
        """
        self.url = url
        self.base_url = base_url
        # `headers` has to be a keyword argument: the second positional
        # parameter of requests.get() is `params`, which would send the
        # header values as query-string parameters instead.
        req = requests.get(self.url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Name the parser explicitly so parsing does not depend on which
        # parsers happen to be installed (lxml is already required by ebooklib).
        self.soup = BeautifulSoup(req.content, 'lxml')

    def get_next_url(self):
        """Return the URL of the next chapter and store it on ``self.next_url``.

        The link is the first ``<a>`` whose text matches ``css_nexttext``.

        Raises:
            TypeError: If no such link exists on the page.
            KeyError: If the link has no ``href`` attribute.
        """
        link = self.soup.find('a', string=re.compile(css_nexttext))
        href = link['href']
        if re.match(r'https?://', href):
            # Already absolute: prefixing base_url would corrupt it.
            next_url = href
        else:
            # Use this page's own base_url rather than the notebook global.
            next_url = self.base_url + href
        self.next_url = next_url
        return next_url

    def get_booktitle(self):
        """Return the book title text, e.g. from ``soup.find('span', class_='shuming')``.

        The matched element is stored on ``self.booktitle``.

        Raises:
            AttributeError: If no element matches ``css_booktitle``.
        """
        title = self.soup.find(**css_booktitle)
        self.booktitle = title
        return title.text

    def get_author(self):
        """Return the author text, e.g. from ``soup.find('span', class_='author')``.

        The matched element is stored on ``self.author``.

        Raises:
            AttributeError: If no element matches ``css_author``.
        """
        author = self.soup.find(**css_author)
        self.author = author
        return author.text

    def get_chaptertitle(self):
        """Return the chapter title text, e.g. from ``soup.find('h1', id='timu')``.

        The matched element is stored on ``self.chaptertitle``.

        Raises:
            AttributeError: If no element matches ``css_chaptertitle``.
        """
        title = self.soup.find(**css_chaptertitle)
        self.chaptertitle = title
        return title.text

    def get_content(self):
        """Return the chapter body as cleaned plain text.

        Text is gathered from every element matching ``css_content``. If that
        yields fewer than 10 characters, the text of the first ``<div>`` whose
        class matches ``cont`` is used instead (when such a div exists).

        Ideographic and non-breaking spaces become line breaks, runs of line
        breaks are collapsed, and every line after the first is prefixed with
        a tab. The result is also stored on ``self.content``.
        """
        paragraphs = self.soup.find_all(**css_content)
        content = '\n'.join(x.get_text() for x in paragraphs)

        if len(content) < 10:
            fallback = self.soup.find('div', class_=re.compile('cont'))
            # Keep the short primary text rather than crash when the page has
            # no fallback container either.
            if fallback is not None:
                content = fallback.text

        content = content.replace('\u3000', '\n')
        content = content.replace('\xa0', '\n')
        content = content.replace('<br/>', '\n')
        content = re.sub(r'\n+', '\n', content)
        # splitlines(True) keeps the line endings, so joining with a tab
        # indents every line except the first.
        content = '\t'.join(content.splitlines(True))
        self.content = content
        return content

def run_test():
    """Fetch the configured start page and print what each selector extracts.

    Prints, in order: book title, author, chapter title, next-chapter URL,
    and the length of the cleaned chapter text.
    """
    probe = Page(url, base_url)
    extractors = (
        probe.get_booktitle,
        probe.get_author,
        probe.get_chaptertitle,
        probe.get_next_url,
    )
    for extract in extractors:
        print(extract())
    # Only the length is shown; the full text would flood the cell output.
    body = probe.get_content()
    print(len(body))

# Config

In [None]:
# --- Where to start ---------------------------------------------------------
# Paste the first chapter's address between the triple quotes; surrounding
# whitespace is stripped.
url = """
https://tw.uukanshu.com/b/130973/23802.html
""".strip()
# Prefix prepended to the "next chapter" href found on each page.
base_url = 'https://tw.uukanshu.com'
# Maximum number of chapters to scrape; 0 means no limit.
count = 0

# --- Selectors --------------------------------------------------------------
# Each dict is unpacked into soup.find() / soup.find_all() as keyword arguments.
css_booktitle = dict(name='span', class_='shuming')
css_author = dict(name='span', class_='author')
css_chaptertitle = dict(name='h1', id='timu')
css_content = dict(name='p')
# Regex matched against the link text of the "next chapter" anchor.
css_nexttext = '下一章'

# --- Output options ---------------------------------------------------------
language = 'zh'
# 'epub' builds an EPUB; any other value writes a plain-text file.
file_format = "epub"

# --- HTTP request headers ---------------------------------------------------
# NOTE(review): the Access-Control-* entries are CORS *response* headers and
# are probably unnecessary on an outgoing request - confirm before removing.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}

# Smoke test: fetch the start URL once and print what each selector extracts,
# so a wrong selector is caught before a long scrape is started.
run_test()

# Save as epub

In [None]:
# Fetch the start page once up front: the book title and author are read from
# it before any chapter is scraped.
page = Page(url, base_url)
continue_extract = True
counter = 0
booktitle = page.get_booktitle()

# --- declare file format ---
if file_format == 'epub':
    toc = []
    spine = []
    book = epub.EpubBook()
    book.set_title(booktitle)
    book.set_language(language)
    try:
        author = page.get_author()
        # EpubBook's method is add_author(); the former add_get_author() call
        # raised AttributeError, which the bare except hid, so the author was
        # never written to the book.
        book.add_author(author)
    except AttributeError:
        # get_author() raises AttributeError when the page has no author element.
        print('No author added')
else: # save as txt
    filename = booktitle + ".txt"
    # Explicit UTF-8 so non-ASCII chapter text does not depend on the
    # platform's default encoding.
    f = open(filename, "a+", encoding="utf-8")

# --- start scraping ---
try:
    # count == 0 means "no limit"; otherwise stop after `count` chapters.
    while url is not None and continue_extract and (counter < count or count == 0):
        counter += 1
        # Random pause between requests to avoid hammering the server.
        time.sleep(random.randint(3, 6))

        page = Page(url, base_url)
        chaptertitle = page.get_chaptertitle()
        content = page.get_content()
        print(chaptertitle, len(content))

        # save as epub or txt file
        if file_format == 'epub':
            # create chapter; the archive file name comes from the running
            # counter rather than the title, so duplicate titles or titles
            # with path characters cannot collide or break the archive
            chap = epub.EpubHtml(
                title=chaptertitle,
                file_name='chapter_{:05d}.xhtml'.format(counter),
            )
            chap.content = content.replace('\n', '<br>').replace('\t', '&nbsp;')

            # add chapter
            book.add_item(chap)
            spine.append(chap)

            # create toc (after add_item, which is when the chapter gets its id)
            link = epub.Link(chap.file_name, chap.title, chap.id)
            toc.append(link)
        else:
            chapter = chaptertitle + "\n\n\n\n" + content
            f.write(chapter)

        # get next url
        try:
            next_url = page.get_next_url()
            url = next_url
        except (TypeError, KeyError):
            # get_next_url() raises these when the page has no usable
            # "next chapter" link, i.e. the last chapter was reached.
            print("End of url")
            break
finally:
    # Release the text file even if a request or parse step fails mid-run.
    if file_format != 'epub':
        f.close()

# --- save file ---
if file_format == 'epub':
    # toc
    toc = tuple(toc)
    book.toc = toc

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    book.spine = ['nav', *spine]

    # write to the file; create the output directory first so a long scrape
    # is not lost to a missing 'book/' folder
    os.makedirs('book', exist_ok=True)
    epub.write_epub('book/' + booktitle + '.epub', book, {})

print("=====\nDONE")

# Test

In [None]:
# Fetch whatever `url` currently points at and print the cleaned chapter text
# for manual inspection. `page` stays bound for the cell below.
page = Page(url, base_url)
chapter_text = page.get_content()
print(chapter_text)

In [None]:
# Display the raw text of the first <div> whose class matches 'cont' - the
# same fallback lookup Page.get_content() uses when the primary selector
# returns fewer than 10 characters.
page.soup.find('div', class_=re.compile('cont')).text