In [None]:
import html
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from ebooklib import epub #http://docs.sourcefabric.org/projects/ebooklib/en/latest/tutorial.html#creating-epub
from IPython.display import display, clear_output

In [None]:
class Webpage:
    """One chapter page of an online novel, downloaded and parsed on construction.

    The selectors come from the notebook-level configuration (`css_booktitle`,
    `css_author`, `css_chaptitle`, `css_nexttext`) and the request headers from
    the notebook-level `headers` dict.

    Attributes:
        url: address of this page.
        base_url: site root that the next-chapter href is appended to.
        soup: parsed document.
        next_url: address of the next chapter. Only set when the page contains
            a next-chapter link with an href; on the last page the attribute
            is absent, so reading it raises AttributeError (the scraping loop
            relies on this to detect the end).
        author: cleaned author name, '' when the page has no author element.
        chaptitle: cleaned chapter title.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, base_url):
        """Download `url`, parse it and extract metadata and chapter text.

        Raises:
            requests.RequestException: the page could not be downloaded.
            AttributeError: a mandatory element (book title, chapter title or
                content container) is missing from the page.
        """
        self.url = url
        self.base_url = base_url
        # `headers` must be passed by keyword: the second positional parameter
        # of requests.get() is `params` (the query string), not the headers.
        response = requests.get(self.url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
        # installed; left as-is to avoid changing how pages are parsed.
        self.soup = BeautifulSoup(response.content)

        self.extract_webpage()
        self.extract_content()

    def extract_webpage(self):
        """Extract the next-chapter link, book title, author and chapter title."""
        # Link whose text matches `css_nexttext`. It may be missing (e.g. on the
        # last chapter); `next_url` is then left unset instead of crashing here.
        next_link = self.soup.find('a', string=re.compile(css_nexttext))
        if next_link is not None and next_link.get('href'):
            # Use the instance's own base url, not the notebook global.
            self.next_url = self.base_url + next_link['href']

        # With the notebook's config: soup.find('span', class_='shuming')
        booktitle = self.soup.find(**css_booktitle).text
        self._booktitle = self._clean_text(booktitle)
        self._add_booktitle = ''

        # With the notebook's config: soup.find('span', class_='author').
        # The author is optional; fall back to an empty string.
        author_tag = self.soup.find(**css_author)
        self.author = self._clean_text(author_tag.text) if author_tag is not None else ''

        # With the notebook's config: soup.find('h1', id='timu')
        chaptitle = self.soup.find(**css_chaptitle).text
        self.chaptitle = self._clean_text(chaptitle)

    def extract_content(self):
        """Extract the chapter body as plain text into `self._content`.

        The text is taken from the first <div> whose class matches 'cont'.
        The result has one paragraph per line; every line is prefixed with a
        tab and the whole text starts with a newline.
        """
        content = self.soup.find('div', class_=re.compile('cont')).get_text('\n')

        # Treat ideographic spaces, non-breaking spaces and literal '<br/>'
        # as paragraph breaks.
        content = content.strip()
        for marker in ('\u3000', '\xa0', '<br/>'):
            content = content.replace(marker, '\n')

        # Collapse runs of blank lines, then indent every line with a tab.
        content = re.sub(r'\n+', '\n', content)
        content = '\t'.join(('\n' + content.lstrip()).splitlines(keepends=True))

        self._content = content

    @property
    def booktitle(self):
        """Book title followed by the optional volume suffix."""
        return self._booktitle + self._add_booktitle

    @property
    def add_booktitle(self):
        """Suffix appended to the book title (used for the volume number)."""
        return self._add_booktitle

    @add_booktitle.setter
    def add_booktitle(self, add_booktitle):
        self._add_booktitle = add_booktitle

    @property
    def content(self):
        """Chapter title plus body, encoded as an HTML fragment.

        Side effect: reports "HH:MM  <chapter title> <body length>" through the
        global `content_display` handle when it is usable, otherwise via print.
        """
        chaptitle = self.chaptitle
        content = self._content

        # Progress report.
        current_time = time.strftime("%H:%M", time.localtime())
        content_update_info = f"{current_time}  {chaptitle} {len(content)}"
        try:
            content_display.update(content_update_info)
        except Exception:
            # No usable display handle (e.g. no Book has been created yet).
            print(content_update_info)

        # Escape the scraped text first so that '&', '<' and '>' in a chapter
        # cannot corrupt the markup, then add our own tags and entities.
        body = html.escape(chaptitle + '\n' + content, quote=False)
        body = body.replace('\n', '<br>')
        body = body.replace('\t', '&emsp;')
        return body

    def _clean_text(self, text):
        """Strip the '小說' / '作者' labels and colons, and normalise spaces."""
        text = text.replace('小說', '')
        text = text.replace('作者', '')
        text = text.replace('：', ' ')
        text = text.replace(':', ' ')
        text = re.sub(r' +', ' ', text)
        text = text.strip()
        return text
    
class Book:
    """An EPUB being assembled from scraped chapters.

    Attributes:
        booktitle, author: taken from the first page passed to the constructor.
        language: language code stored in the EPUB metadata.
        book: the underlying `epub.EpubBook`.
        toc: list of `epub.Link`, one per added chapter.
        spine: list of chapter items in reading order.
    """

    def __init__(self, first_webpage, language='zh'):
        """Create an empty book whose title and author come from `first_webpage`.

        Side effect: (re)creates the global `content_display` handle that
        `Webpage.content` uses to report the chapter being processed.
        """
        self.booktitle = first_webpage.booktitle
        self.author = first_webpage.author
        print('Booktitle:\t', self.booktitle)
        print('Author:\t\t', self.author)

        self.language = language
        # Display slot that is updated in place with the current chapter.
        global content_display
        content_display = display('Getting chapter content...', display_id=True)

        self.book = self._create_epub()

        # toc
        self.toc = []
        self.spine = []

    @staticmethod
    def _safe_filename(name):
        """Return `name` with characters that are invalid in file names replaced.

        Path separators, the characters Windows forbids and control characters
        become '_'; leading/trailing spaces and dots are removed. An empty
        result falls back to 'untitled'.
        """
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name).strip(' .')
        return cleaned or 'untitled'

    def _create_epub(self):
        """Build the `epub.EpubBook` carrying title, language and author."""
        book = epub.EpubBook()
        book.set_title(self.booktitle)
        book.set_language(self.language)
        if self.author:
            book.add_author(self.author)
        return book

    def add_chap(self, webpage):
        """Append the chapter held by `webpage` to the book, spine and TOC."""
        chap = self._create_epub_chap(webpage)

        # add chap to book
        self.book.add_item(chap)
        self.spine.append(chap)
        # The TOC link is created after add_item() so that `chap.id` is the
        # value the item carries once it belongs to the book.
        link = epub.Link(chap.file_name, chap.title, chap.id)
        self.toc.append(link)

    def _create_epub_chap(self, webpage):
        """Turn `webpage` into an `epub.EpubHtml` chapter item.

        The file name is derived from the chapter's position rather than its
        title: titles may contain characters that are unsafe in archive paths
        and hrefs ('/', '?', '#', spaces) and may repeat, which would make two
        chapters share one file.
        """
        chaptitle = webpage.chaptitle
        file_name = f'chap_{len(self.spine) + 1:04d}.xhtml'
        chap = epub.EpubHtml(title=chaptitle, file_name=file_name)
        chap.set_content(webpage.content)
        return chap

    def save(self, out_dir='book'):
        """Write the book to '<out_dir>/<booktitle>.epub'.

        Args:
            out_dir: target directory; created when it does not exist.
        """
        book = self.book
        # toc
        book.toc = tuple(self.toc)

        # add default NCX and Nav file
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        book.spine = ['nav', *self.spine]

        # Make sure the target directory exists and the title is a valid
        # file name before writing, so a long scrape is not lost at the end.
        os.makedirs(out_dir, exist_ok=True)
        path = os.path.join(out_dir, self._safe_filename(self.booktitle) + '.epub')
        epub.write_epub(path, book, {})
        print('saving {}...'.format(self.booktitle))
        print('-------')
    
"""
utils function
"""    
def run_test():
    """Fetch the configured start page once and print what was scraped from it."""
    page = Webpage(url, base_url)
    summary = (
        ('Booktitle:\t', page.booktitle),
        ('Author:\t\t', page.author),
        ('Chap:\t\t', page.chaptitle),
    )
    for label, value in summary:
        print(label, value)
    # Reading `content` reports the chapter title and body length itself, so
    # only the label is printed here and the value is discarded.
    print('Content length:\t', end=' ')
    _ = page.content
    print('Next url:\t', page.next_url)

# Config

In [None]:
# Maximum number of pages to fetch; 0 means no limit (the scraping loop keeps
# following next-chapter links).
page_limit = 0
# Chapter count the scraping loop uses to split the book into volumes.
vol_break = 1000
# https://tw.uukanshu.com/b/16893/52390.html
url = 'https://tw.uukanshu.com/b/62531/1389.html'
base_url = 'https://tw.uukanshu.com'

# css: keyword arguments for soup.find(), and the text of the next-chapter link
css_booktitle = dict(name='span', class_='shuming')
css_author = dict(name='span', class_='author')
css_chaptitle = dict(name='h1', id='timu')
css_content = dict(name='p')
css_nexttext = '下一章'

# Headers sent with every page request.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) '
        'Gecko/20100101 Firefox/52.0'
    ),
}

# Smoke-test the configuration against the start page.
run_test()

# Save book

In [None]:
first_webpage = Webpage(url, base_url)
book = Book(first_webpage)

# Start scraping: follow the next-chapter links starting at `url`, adding one
# chapter per page, until `page_limit` pages were fetched (0 = no limit), the
# links run out, or a page cannot be fetched/parsed.
cnt = 0
while page_limit == 0 or cnt < page_limit:
    # Every `vol_break` chapters: save the current volume and start a new one
    # whose title carries the volume number.
    # (The previous test `cnt > 1 & cnt % vol_break == 0` parsed as
    # `cnt > (1 & (cnt % vol_break)) == 0`, which is true for every cnt > 0
    # with an even remainder, i.e. it split the book every second chapter.)
    if vol_break > 0 and cnt > 0 and cnt % vol_break == 0:
        volume = cnt // vol_break
        book.save()

        first_webpage.add_booktitle = str(volume)
        book = Book(first_webpage)

    time.sleep(random.randint(1, 4))
    cnt += 1

    # add webpage to book
    try:
        # The first page was already downloaded above; do not fetch it twice.
        webpage = first_webpage if cnt == 1 else Webpage(url, base_url)
    except Exception as err:
        # Stop here but still save the chapters collected so far.
        print(f"Stopped at {url}: {err!r}")
        break
    book.add_chap(webpage)

    # get next url
    if page_limit == 1: break
    next_url = getattr(webpage, 'next_url', None)
    if not next_url:
        print("End of url")
        break
    url = next_url

# Skip writing an empty volume (possible when a fetch fails right after a
# volume break).
if book.spine:
    book.save()
print("DONE")

In [None]:
# NOTE(review): this assertion always fails, presumably on purpose so that
# "Run All" stops here and the test cells below are only run by hand —
# confirm before removing.
assert False, "breakpoint"

# Test

In [None]:
# Fetch the page at the current `url` and print its HTML-encoded chapter text.
# Reading `.content` also reports the chapter title and body length.
webpage = Webpage(url, base_url)
print(webpage.content)
# Alternative: a bare `webpage.content` as the last line shows the string's repr.

In [None]:
# Print the unprocessed text of the content container (first <div> whose class
# matches 'cont'), with text fragments joined by newlines, for comparison with
# the cleaned-up `webpage.content`.
print(webpage.soup.find('div', class_=re.compile('cont')).get_text("\n"))
# Alternative: drop the print() to see the string's repr instead.