## Preparation

In [491]:
import re
import pandas as pd
import ebooklib

# Reference (getting text from EPUB files in Python): https://andrew-muller.medium.com/getting-text-from-epub-files-in-python-fbfe5df5c2da
from ebooklib import epub
from bs4 import BeautifulSoup

## Book

In [498]:
# Base name (without extension) of the EPUB stored under ./data/;
# the same name is reused further down for the exported CSV file.
book_name = "1 gardens_of_the_moon"
epub_path = "./data/" + book_name + ".epub"

# Parse the EPUB and keep only its document-type items
# (ebooklib.ITEM_DOCUMENT), materialised as a list so they can be
# iterated and indexed by position later on.
book = epub.read_epub(epub_path)
book_items = [item for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)]



## Chapters and Paragraphs

In [493]:
def chapter_to_str(chapter):
    """Turn one EPUB document item into a single cleaned text string.

    The text of every ``<p>`` element in the item's body is extracted,
    normalised, and the results are joined with a single newline.

    Parameters
    ----------
    chapter
        Object exposing ``get_body_content()``; its return value is handed
        to BeautifulSoup with the ``'html.parser'`` backend.

    Returns
    -------
    str
        The cleaned paragraphs joined by ``'\\n'``. Because a newline is
        appended after every ``)``, a paragraph that ends with ``)`` is
        followed by a blank line in the result.
    """
    soup = BeautifulSoup(chapter.get_body_content(), 'html.parser')

    # Ordered (pattern, replacement) rules; each rule works on the output of
    # the previous one, so the order must not be changed.
    rewrite_rules = (
        # Insert a space after . , ? ! ) when the next character is not
        # whitespace, a caret, or an apostrophe.
        (r'(?<=[.,?!)])(?=[^\s^\'])', r' '),
        # Insert a newline after every closing parenthesis
        # (marks the end of a chapter poem).
        (r'(?<=[)])', r'\n'),
        # Drop a whitespace character sitting directly in front of a dot;
        # this also glues "..." back together after the first rule split it.
        (r'\s([.])', r'\1'),
    )

    cleaned_paragraphs = []
    for tag in soup.find_all('p'):
        # Newlines inside a paragraph are mere line wraps: flatten and trim.
        paragraph = tag.get_text().replace("\n", " ").strip()
        for pattern, replacement in rewrite_rules:
            paragraph = re.sub(pattern, replacement, paragraph)
        cleaned_paragraphs.append(paragraph)

    # One paragraph per line makes up the chapter string.
    return '\n'.join(cleaned_paragraphs)

In [494]:
# Build the chapter table: one row per document item of the book, holding the
# text sections of that document plus sentence and section counts.

# Documents with this many sentences or fewer are treated as front/back matter
# (dramatis personae, book poems, acknowledgment, greeting) and discarded.
MIN_SENTENCES = 5

# Collect plain records first and build the dataframe once at the end;
# growing a frame row by row with .loc re-allocates on every insertion.
chapter_records = []
for item in book_items:
    # get chapter string
    chapter_text = chapter_to_str(item)
    # split at blank lines; chapter_to_str produces one after every paragraph
    # that ends with ")" (the attribution line closing a chapter poem)
    sections = chapter_text.split('\n\n')
    # remove chapter poems: the section in front of such a blank line ends
    # with ")" and is dropped as a whole
    sections = [section for section in sections if not section.endswith(")")]
    # join the remaining sections again to count sentences
    body = "\n".join(sections)
    # A sentence boundary is ".", "!" or "?" followed by one or more spaces.
    # NOTE: newlines are not treated as boundaries, so a sentence that ends
    # a line is merged with the following one and the count is a lower bound.
    sentence_count = len(re.split('(?<=[.!?]) +', body))
    chapter_records.append({
        'paragraphs': sections,
        "#sentences": sentence_count,
        # number of sections kept after the poem filter
        "#paragraphs": len(sections),
    })

# Row labels are the positions of the documents in book_items (0, 1, 2, ...).
chapters_all = pd.DataFrame(
    chapter_records, columns=['paragraphs', "#sentences", "#paragraphs"]
)

# remove the irrelevant chapters (dramatis personae, book poems, acknowledgment, greeting)
chapters = chapters_all[chapters_all["#sentences"] > MIN_SENTENCES]
# The document with row label 2 is removed by hand; this label is specific to
# this book (presumably front matter that passes the sentence filter — verify
# when switching books). drop() raises KeyError if label 2 is not present.
chapters = chapters.drop(index=2)

In [499]:
# Export the chapter table next to the source EPUB. The row labels are left
# out (index=False); the list-valued 'paragraphs' cells are written as their
# string representation, so they need to be parsed again after reading back.
csv_path = f"data/{book_name}.csv"
chapters.to_csv(csv_path, index=False)