## preparation

In [1]:
from uti import remove_empty_rows, remove_bad_characters
from pathlib import Path

# Directory that the conversion cells scan for *.docx, *.pdf and *.epub files.
# NOTE(review): this name shadows the builtin `input()`; the loops below
# reference it, so a rename has to be done notebook-wide.
input = Path("./input")
# Directory that write_txt_file() places the converted .txt files in.
output = Path("./output")


def cleanup(txt: str) -> str:
    """Run the extracted text through both `uti` clean-up helpers.

    `remove_empty_rows` is applied first and `remove_bad_characters` is then
    applied to its result; the final string is returned.
    """
    without_empty_rows = remove_empty_rows(txt)
    return remove_bad_characters(without_empty_rows)

def write_txt_file(in_path: Path, txt: str, out_dir: "Path | None" = None):
    """Write ``txt`` to ``<out_dir>/<stem of in_path>.txt`` as UTF-8.

    Args:
        in_path: Path of the source document; only its stem (file name
            without the final extension) is used to name the output file.
        txt: Text content to write.
        out_dir: Directory to write into. When omitted, the notebook-level
            ``output`` directory is used.

    The target directory is created if it does not exist yet. An existing
    file with the same name is overwritten.
    """
    target_dir = output if out_dir is None else out_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    # Append ".txt" to the stem instead of calling with_suffix(): with_suffix
    # replaces the last dot-segment of the stem, so "book.v2.pdf" would become
    # "book.txt" and collide with "book.v1.pdf".
    out_path = target_dir / f"{in_path.stem}.txt"
    # Explicit encoding: the platform default may be unable to encode
    # characters found in the converted documents.
    out_path.write_text(txt, encoding="utf-8")


## convert docx

In [2]:
import docx2txt

for docx_path in input.glob("*.docx"):
    # Extract the document text, clean it, and save it under the same stem.
    raw_text = docx2txt.process(docx_path)
    write_txt_file(docx_path, cleanup(raw_text))


## convert pdf

In [5]:
from pypdf import PdfReader

for pdf_path in input.glob('*.pdf'):
    # Collect the extracted text of each page in page order.
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        page_texts.append(page.extract_text())

    # Pages are joined with a newline before the shared clean-up step.
    joined_text = '\n'.join(page_texts)
    write_txt_file(pdf_path, cleanup(joined_text))


## convert epub

In [3]:
from ebooklib import ITEM_DOCUMENT
from ebooklib.epub import read_epub
from bs4 import BeautifulSoup
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

In [4]:
for epub in input.glob("*.epub"):
    bk = read_epub(epub)

    # Fetch the body of every ITEM_DOCUMENT item exactly once. Calling
    # get_body_content() separately for the type check and for the decode
    # would do the same work twice per item.
    bodies = (
        item.get_body_content()
        for item in bk.get_items()
        if item.get_type() == ITEM_DOCUMENT
    )

    # Only bytes bodies are decoded; any other return value is skipped.
    html = "".join(
        body.decode("utf-8") for body in bodies if isinstance(body, bytes)
    )

    # Strip the markup, clean the text, and save it under the epub's stem.
    txt = BeautifulSoup(html, "html.parser").get_text()
    txt = cleanup(txt)
    write_txt_file(epub, txt)
