## How To: Use the `Libgen` Class to Download Books

In [43]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from autobook.libgen import Libgen

# Build a Libgen object for the given query string.
data = Libgen(q="the sun also rises")
# Call search(); the value bound to `df` here is replaced by the next statement.
df = data.search()
# NOTE(review): presumably narrows the search results to the given author and
# downloads the matches — confirm against autobook.libgen. `df` is rebound to
# whatever this call returns.
df = data.filtered_download("Ernest Hemingway")

In [46]:
# Last expression of the cell, so its value is rendered as the cell output.
# NOTE(review): `df` is the return value of `filtered_download`, not of
# `search` — confirm that object exposes `get_filtered_df`.
df.get_filtered_df()

Unnamed: 0,author(s),series,title,language,file,mirror1,mirror2,md5,topic,extension,size
0,"Hemingway, Ernest",,"Collected Works: A Farewell to Arms, The Sun A...",English,EPUB / 5.10 Mb,http://library.lol/fiction/774DCFA7F622848BBB2...,https://library.bz/fiction/edit/774DCFA7F62284...,774DCFA7F622848BBB2E642C8E51247D,fiction,epub,5.10 Mb


## How To: Use `EpubConvert` to Convert EPUB to Text



In [45]:
from autobook.epubconvert import EpubConvert

# Input EPUB and output text paths, relative to the current working directory.
fileIn = "./books/Anthony Doerr - Cloud Cuckoo Land.epub"
fileOut = "./books/txt/cloud_cuckoo_land.txt"

# NOTE(review): `convert` holds whatever `epub_to_txt` returns, not the
# EpubConvert instance — confirm that is intended. The recorded output of this
# cell is a FileNotFoundError for `fileIn`, so the EPUB must exist at that path
# before the cell is run.
convert = EpubConvert(fileIn).epub_to_txt(fileOut=fileOut)



FileNotFoundError: [Errno 2] No such file or directory: './books/Anthony Doerr - Cloud Cuckoo Land.epub'

## Version 1: Text Extraction (all document items, via `HTMLFilter`)

In [None]:
import ebooklib
from autobook.ebook_wrapper import HTMLFilter
from ebooklib import epub

# Source EPUB and destination text file for the whole-book extraction.
fileIn = "books/Anthony Doerr - Cloud Cuckoo Land.epub"
fileOut = "books/Anthony Doerr - Cloud Cuckoo Land.txt"

book = epub.read_epub(fileIn)

# Gather the text produced for each document item, then join once at the end.
pieces = []
for doc in book.get_items():
    if doc.get_type() != ebooklib.ITEM_DOCUMENT:
        continue  # only document items contribute text
    html_filter = HTMLFilter()  # fresh filter per item so text does not carry over
    html_filter.feed(doc.get_body_content().decode())
    pieces.append(html_filter.text)
content = "".join(pieces)

# Write the concatenated text as UTF-8.
with open(fileOut, mode="w", encoding="utf-8") as fout:
    fout.write(content)


## Version 2: Text Extraction (chapter items only, via BeautifulSoup)

In [None]:
import ebooklib
from ebooklib import epub

# EPUB whose chapter documents are extracted below.
file_name = "books/Anthony Doerr - Cloud Cuckoo Land.epub"

book = epub.read_epub(file_name)

# Materialise every document item, then keep those whose name contains "ch".
items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))
items_chapters = []
for doc in items:
    if "ch" in doc.get_name():
        items_chapters.append(doc)


In [None]:
# Print the name of every item selected as a chapter, one per line.
for chapter_item in items_chapters:
    print(chapter_item.get_name())


In [None]:
from bs4 import BeautifulSoup


def chapter_to_str(chapter):
    """Flatten one chapter's headings and paragraphs into a single string.

    The chapter's body content is parsed with BeautifulSoup. The result is the
    concatenation, with no extra separator, of: the ``h1`` texts, the ``h2``
    texts, the ``h3`` texts, and then every ``p`` text. Each non-empty heading
    group is wrapped in whitespace padding (leading spaces before it, trailing
    spaces or blank lines after it). If no ``h1`` text contains "Chapter",
    every ``h1`` text is prefixed with "Chapter ".

    Args:
        chapter: Object exposing ``get_body_content()`` with the chapter markup.

    Returns:
        str: The padded heading text followed by the paragraph text.
    """
    soup = BeautifulSoup(chapter.get_body_content(), "html.parser")

    def _texts(tag):
        # Text of every element with the given tag, in document order.
        return [node.get_text() for node in soup.find_all(tag)]

    lead_in = "      "  # padding placed before each heading group
    long_pause = "\n\n\n        "  # blank lines placed after h2 / h3 groups
    pieces = []

    titles = _texts("h1")
    if titles:
        if all("Chapter" not in title for title in titles):
            titles = ["Chapter " + title for title in titles]
        # Shorter trailing padding after the chapter title.
        pieces += [lead_in, *titles, "     "]

    # h2 and h3 groups are padded identically.
    for tag in ("h2", "h3"):
        headings = _texts(tag)
        if headings:
            pieces += [lead_in, *headings, long_pause]

    pieces += _texts("p")
    return "".join(pieces)


# chapter_to_str(items_chapters[16])


def chapters_to_txt(chapters: list, filename: str = None):
    """Convert chapter items to text and optionally save the combined result.

    Args:
        chapters: Items exposing ``get_name()`` that ``chapter_to_str`` accepts.
        filename: Path of the UTF-8 text file to write. When ``None`` (the
            default) nothing is written and only the mapping is returned.

    Returns:
        dict: Each chapter's name mapped to its extracted text, in input order.
        Chapters sharing a name overwrite one another.
    """
    texts = {}
    for chapter in chapters:
        texts[chapter.get_name()] = chapter_to_str(chapter)
    # The default filename is None and open(None, ...) raises TypeError, so
    # only write when the caller actually supplied a path.
    if filename is not None:
        compiled_text = "\n\n\n".join(texts.values())
        with open(filename, "w", encoding="utf-8") as f:
            f.write(compiled_text)
    return texts


# Pass the output path explicitly instead of relying on the `filename=None`
# default, which is not a usable path for writing.
texts = chapters_to_txt(items_chapters, filename="epubtxt.txt")


In [None]:
# Combine the per-chapter texts, separating consecutive chapters by three newlines.
chapter_separator = "\n\n\n"
compiled_text = chapter_separator.join(texts.values())

In [None]:
# Save the combined chapter text as a UTF-8 file.
with open("epubtxt.txt", mode="w", encoding="utf-8") as out_file:
    out_file.write(compiled_text)
