In [None]:
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from ebooklib import epub #http://docs.sourcefabric.org/projects/ebooklib/en/latest/tutorial.html#creating-epub

In [None]:
class Page:
    """One downloaded chapter page plus the accessors used to scrape it.

    The selectors (``css_booktitle``, ``css_author``, ``css_chaptitle``,
    ``css_nexttext``) and the request ``headers`` are module-level
    configuration values that are looked up when each method runs.
    """

    # Seconds to wait for the server before giving up instead of hanging forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, base_url):
        """Download ``url`` and parse the response body.

        Args:
            url: Address of the page to fetch.
            base_url: Prefix that ``get_next_url`` prepends to the next-chapter href.
        """
        self.url = url
        self.base_url = base_url
        # `headers` has to be passed by keyword: the second positional argument
        # of requests.get is `params`, which would send the dict as a query
        # string and leave the User-Agent unset.
        req = requests.get(self.url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Name the parser explicitly so the parse tree does not depend on which
        # optional parsers happen to be installed.
        self.soup = BeautifulSoup(req.content, 'html.parser')

    def get_next_url(self):
        """Return ``self.base_url`` joined with the href of the next-chapter link.

        The link is the first ``<a>`` whose string matches ``css_nexttext``.

        Raises:
            TypeError: no matching link exists (the lookup returned ``None``).
            KeyError: the matching link has no ``href`` attribute.
        """
        link = self.soup.find('a', string=re.compile(css_nexttext))
        # Use the instance's own base URL rather than a module-level global.
        return self.base_url + link['href']

    def get_booktitle(self):
        """Return the cleaned text of the element selected by ``css_booktitle``."""
        title = self.soup.find(**css_booktitle).text
        return self.clean_text(title)

    def get_author(self):
        """Return the cleaned text of the element selected by ``css_author``."""
        author = self.soup.find(**css_author).text
        return self.clean_text(author)

    def get_chaptitle(self):
        """Return the cleaned text of the element selected by ``css_chaptitle``."""
        title = self.soup.find(**css_chaptitle).text
        return self.clean_text(title)

    def get_content(self):
        """Return the chapter as an HTML fragment, chapter title first.

        The text is taken from the first ``div`` whose class matches ``cont``.
        Line breaks become ``<br>`` and each body line is prefixed with
        ``&emsp;``. The chapter title and the body length are printed as a
        progress message.
        """
        content = self.soup.find('div', class_=re.compile('cont')).text

        # clean: treat ideographic / non-breaking spaces and literal <br/> as
        # line breaks, then collapse runs of blank lines
        content = content.strip()
        for marker in ('\u3000', '\xa0', '<br/>'):
            content = content.replace(marker, '\n')
        content = re.sub(r'\n+', '\n', content)
        # Prefix every line with a tab (turned into &emsp; below).
        content = '\t'.join(('\n' + content.lstrip()).splitlines(keepends=True))

        # add chaptitle
        chaptitle = self.get_chaptitle()
        print(chaptitle, len(content))
        content = chaptitle + '\n' + content

        # html
        return content.replace('\n', '<br>').replace('\t', '&emsp;')

    def clean_text(self, text):
        """Strip the '小說' / '作者' labels and colons from ``text`` and tidy spaces.

        Both full-width and ASCII colons become a space, runs of spaces are
        collapsed to one, and surrounding whitespace is removed.
        """
        for label in ('小說', '作者'):
            text = text.replace(label, '')
        for colon in ('：', ':'):
            text = text.replace(colon, ' ')
        text = re.sub(r' +', ' ', text)
        return text.strip()
    
def run_test():
    """Fetch the configured ``url`` and print every scraped field as a smoke test."""
    page = Page(url, base_url)
    # Each getter is called just before its line is printed, in this order.
    fields = (
        ('Booktitle:\t', page.get_booktitle),
        ('Author:\t\t', page.get_author),
        ('chap:\t', page.get_chaptitle),
        ('Next url:\t', page.get_next_url),
    )
    for label, getter in fields:
        print(label, getter())
    print('Content length:\t', len(page.get_content()))
    
def create_book(first_page, language='zh'):
    """Create an ``epub.EpubBook`` with title, language and (if found) author.

    Args:
        first_page: Page object providing ``get_booktitle`` and ``get_author``.
        language: Language code passed to ``set_language``.

    Returns:
        The new book object. Adding the author is best-effort: if it cannot be
        read from the page, the book is returned without one.
    """
    booktitle = first_page.get_booktitle()
    print('Booktitle:', booktitle)

    # define book
    book = epub.EpubBook()
    book.set_title(booktitle)
    book.set_language(language)
    try:
        author = first_page.get_author()
        print('Author:', author)
        book.add_author(author)
    except Exception:
        # Not a bare `except:` so that KeyboardInterrupt / SystemExit still
        # propagate; a missing author is the only thing tolerated here.
        print('No author added')
    return book

def create_chap(page):
    """Build an ``epub.EpubHtml`` chapter from ``page``.

    The chapter title is used both as the item title and, with an ``.xhtml``
    suffix, as its file name; the body comes from ``page.get_content()``.
    """
    title = page.get_chaptitle()
    chapter = epub.EpubHtml(title=title, file_name='{}.xhtml'.format(title))
    chapter.set_content(page.get_content())
    return chapter

def write_book(book, booktitle, toc, spine, out_dir='book'):
    """Finalize ``book`` and write it to ``<out_dir>/<booktitle>.epub``.

    Args:
        book: The ``epub.EpubBook`` to write.
        booktitle: Used as the output file name (without extension).
        toc: Iterable of table-of-contents entries; stored on the book as a tuple.
        spine: Iterable of chapter items; placed after the ``'nav'`` entry.
        out_dir: Directory that receives the file; created if it is missing.
    """
    # toc
    book.toc = tuple(toc)

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    book.spine = ['nav', *spine]

    # The output directory may not exist yet on a fresh checkout; without this
    # the write fails after the whole book has already been scraped.
    os.makedirs(out_dir, exist_ok=True)

    # write to the file
    epub.write_epub(os.path.join(out_dir, booktitle + '.epub'), book, {})

# Config

In [None]:
# Maximum number of chapters to fetch; the scraping loop treats 0 as "no limit".
count = 0

# Site root (prepended to relative next-chapter links) and the first chapter.
base_url = 'https://tw.uukanshu.com'
url = """
https://tw.uukanshu.com/b/150317/10451.html
""".strip()

# css: keyword arguments handed to soup.find(...) for each scraped field
css_booktitle = dict(name='span', class_='shuming')
css_author = dict(name='span', class_='author')
css_chaptitle = dict(name='h1', id='timu')
css_content = dict(name='p')
# Text of the "next chapter" link, compiled as a regular expression.
css_nexttext = '下一章'

# optional
file_format = "epub"
language = 'zh'

# NOTE(review): the Access-Control-* entries look like CORS *response* headers;
# presumably only User-Agent matters for the request — confirm before removing.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}

# Smoke-test the configuration against the first chapter.
run_test()

# Save book

In [None]:
# Fetch the first page once up front to obtain the book metadata.
first_page = Page(url, base_url)
booktitle = first_page.get_booktitle()
continue_extract = True
counter = 0

# Declare the output target for the selected file format.
f = None
if file_format == 'epub':
    book = create_book(first_page, language=language)
    # define toc
    toc = []
    spine = []
else:
    filename = booktitle + ".txt"
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # the chapter text.
    f = open(filename, "a+", encoding="utf-8")

# Start scraping: follow "next chapter" links until none is found or the
# chapter limit `count` is reached (count == 0 means no limit).
try:
    while url is not None and continue_extract and (counter < count or count == 0):
        counter += 1
        # Random pause between requests.
        time.sleep(random.randint(3, 6))

        page = Page(url, base_url)

        # save as epub or txt file
        if file_format == 'epub':
            chap = create_chap(page)

            # add chap
            book.add_item(chap)
            spine.append(chap)
            # create toc
            link = epub.Link(chap.file_name, chap.title, chap.id)
            toc.append(link)
        else:
            f.write(page.get_content())

        # get next url
        try:
            next_url = page.get_next_url()
            url = next_url
        except (TypeError, KeyError):
            # Raised when the page has no next-chapter link (or the link has
            # no href); anything else is a real error and should surface.
            print("End of url")
            break
finally:
    # Close the txt file even when a request or parse step fails mid-run.
    if f is not None:
        f.close()

# Save file (the txt output has already been flushed and closed above).
if file_format == 'epub':
    write_book(book, booktitle, toc, spine)

print("=====\nDONE")

# Test

In [None]:
# Re-fetch whatever `url` currently points at and show the generated chapter HTML.
page = Page(base_url=base_url, url=url)
print(page.get_content())

In [None]:
# Raw text of the content container, before any cleaning, for comparison.
page.soup.find(name='div', class_=re.compile('cont')).get_text()