In [30]:
import json
import os
from collections import Counter
from dataclasses import asdict
from dataclasses import dataclass
from dataclasses import field
from pathlib import Path
from pprint import pprint
from typing import Self

import structlog
from lxml import etree
from lxml.etree import HTMLParser
from lxml.etree import _Element as Element
from structlog.stdlib import BoundLogger

# from data_adapter.standard_ebook_toc import Chapter
from utils.data_io import read_dict
from utils.data_io import read_xhtml
from utils.data_io import save_chunk
from utils.data_io import save_xhtml
from utils.logger_config import configure_logger

BOOK_DIR = Path(os.environ.get("BOOK_DIR", "/books"))
file_name_counter = Counter()


def grep_chapter_books():
    good_repos = []
    for repo_dir in list(BOOK_DIR.glob("*")):
        if repo_dir.is_dir():
            for file_path in repo_dir.glob("*"):
                if file_path.is_file() and file_path.name.startswith("chapter-1"):
                    good_repos.append(repo_dir)
                    break
    return good_repos


def get_max_nest_level(chapters):
    max_result = 0
    for chapter in chapters:
        if sub_chapters := chapter.get("subchapters"):
            max_result = max(get_max_nest_level(sub_chapters), max_result)
        else:
            max_result = max(chapter["nest_level"], max_result)
    return max_result


def grep_shallow_nested_books(repos: list[Path]):
    results = []
    for repo_dir in repos:
        chapters = read_dict(repo_dir / "toc.json")
        print(f"f{repo_dir} has {len(chapters)} chapters.")
        if not chapters:
            continue
        nest_level = get_max_nest_level(chapters)
        if 1 <= nest_level < 2:
            print(f"{repo_dir} append: {nest_level}")
            results.append(repo_dir)
        else:
            print(f"{repo_dir} has deep nest level: {nest_level}")
    return results


@dataclass
class Chapter:
    title: str
    number: str
    name: str
    href: str
    nest_level: int
    query: str
    url: str
    subchapters: list[Self]


def from_dict(chapter_data: dict) -> Chapter:
    """辞書データから `Chapter` オブジェクトを生成します。.

    Args:
        chapter_data (dict): 辞書データ

    Returns:
        Chapter: `Chapter` オブジェクト
    """
    subchapters = []
    if "subchapters" in chapter_data:
        for subchapter_data in chapter_data["subchapters"]:
            subchapters.append(from_dict(subchapter_data))

    return Chapter(
        title=chapter_data["title"],
        number=chapter_data["number"],
        name=chapter_data["name"],
        href=chapter_data["href"],
        nest_level=chapter_data["nest_level"],
        query=chapter_data["query"],
        url=chapter_data["url"],
        subchapters=subchapters,
    )


def load_chapters_from_json(json_path: str) -> list[Chapter]:
    """保存された JSON ファイルから `Chapter` オブジェクトのリストを読み込みます。.

    Args:
        json_path (str): JSON ファイルのパス

    Returns:
        List[Chapter]: `Chapter` オブジェクトのリスト
    """
    with open(json_path) as fp:
        data = json.load(fp)

    chapters = []
    for chapter_data in data:
        chapters.append(from_dict(chapter_data))

    return chapters


def get_content_from_xhtml(xhtml: str) -> str:
    # pass
    parser = HTMLParser()
    tree = etree.fromstring(xhtml, parser)
    content = tree.xpath("//body")[0]
    return etree.tostring(content, pretty_print=True).decode("utf-8")


def main():
    configure_logger()
    logger = structlog.get_logger(__name__)
    book_paths = read_dict(BOOK_DIR / "easy_readable_books.json", logger)
    target_book = book_paths[1]
    toc: list[Chapter] = load_chapters_from_json(target_book + "/toc.json")
    target_files = []
    for file in toc:
        match file.title:
            case "Titlepage":
                pass
            case "Imprint":
                pass
            case "Colophon":
                pass
            case "Uncopyright":
                pass
            case _:
                target_files.append(target_book + "/" + file.href.split("text/")[-1])
                pprint(file)
                # pprint(file.title)

                # read_xhtml(target_book + "/" + file.href.split("text/")[-1], logger))
    print(target_files)
    get_content_from_xhtml(read_xhtml(target_files[0]))


if __name__ == "__main__":
    main()

[2m2024-04-16 07:47.22[0m [[32m[1minfo     [0m] [1mReading data.                 [0m [[0m[1m[34m__main__[0m][0m [36mpath[0m=[35mPosixPath('/home/user/dev/kasi-x/akizora/books/easy_readable_books.json')[0m
[2m2024-04-16 07:47.22[0m [[32m[1minfo     [0m] [1mReading data.                 [0m [[0m[1m[34m__main__[0m][0m [36mpath[0m=[35mPosixPath('/home/user/dev/kasi-x/akizora/books/easy_readable_books.json')[0m
[2m2024-04-16 07:47.22[0m [[32m[1minfo     [0m] [1mReading data.                 [0m [[0m[1m[34m__main__[0m][0m [36mpath[0m=[35mPosixPath('/home/user/dev/kasi-x/akizora/books/easy_readable_books.json')[0m
[2m2024-04-16 07:47.22[0m [[32m[1minfo     [0m] [1mReading data.                 [0m [[0m[1m[34m__main__[0m][0m [36mpath[0m=[35mPosixPath('/home/user/dev/kasi-x/akizora/books/easy_readable_books.json')[0m
[2m2024-04-16 07:47.22[0m [[32m[1minfo     [0m] [1mReading data.                 [0m [[0m[1m[34m__main__[

ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.

In [11]:
target_file = '/home/user/dev/kasi-x/akizora/books/agatha-christie_the-mystery-of-the-blue-train/dedication.xhtml'

In [16]:
xhtml = read_xhtml(target_file).encode("utf-8")

In [17]:
parser = HTMLParser()
tree = etree.fromstring(xhtml, parser)
content = tree.xpath("//body")[0]
a = etree.tostring(content, pretty_print=True).decode("utf-8")


In [26]:
def show(element: Element | list[Element]) -> None:
    # This is for debugging.
    if len(element) == 0:
        print("No elements are found.")
    if isinstance(element, list):
        print(f"{len(element)} elements are found.")
        print("{:=^30}".format(" Show Data "))
        for _i, e in enumerate(element, 1):
            print(f"{_i:=^30}")
            show(e)
    if isinstance(element, Element):
        print(etree.tostring(element, pretty_print=True, encoding=str))
    else:
        print("No elements are found.")

In [27]:
show(tree)

<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-GB">
	<head>
		<title>Dedication</title>
		<link href="../css/core.css" rel="stylesheet" type="text/css"/>
		<link href="../css/local.css" rel="stylesheet" type="text/css"/>
	</head>
	<body epub:type="frontmatter z3998:fiction">
		<section id="dedication" epub:type="dedication">
			<p><span>Dedicated to</span><br/>
			two distinguished members<br/>
			of the <abbr class="eoc" epub:type="z3998:initialism">O.F.D.</abbr></p>
			<p>
				<b>Carlotta</b>
				<i>and</i>
				<b>Peter</b>
			</p>
		</section>
	</body>
</html>



In [None]:
//section[@id='dedication'][1]/@epub:type

In [29]:
show(tree.xpath("//section[@id='dedication'][1]/p"))

2 elements are found.
<p><span>Dedicated to</span><br/>
			two distinguished members<br/>
			of the <abbr class="eoc" epub:type="z3998:initialism">O.F.D.</abbr></p>
			

<p>
				<b>Carlotta</b>
				<i>and</i>
				<b>Peter</b>
			</p>
		

No elements are found.


In [15]:
file

{'title': 'Titlepage',
 'number': '',
 'name': 'Titlepage',
 'href': 'text/titlepage.xhtml',
 'nest_level': 0,
 'query': "//html/body/nav[@id='toc'][1]/ol/li[1]",
 'url': '',
 'subchapters': []}

In [16]:
file["href"].split("/")[-1]

'titlepage.xhtml'

In [20]:
a = 'text/chapter-35.xhtml'

In [21]:
a.split("/")[-1]

'chapter-35.xhtml'