In [26]:
import json
from dataclasses import asdict
from dataclasses import dataclass
from dataclasses import field
from pathlib import Path
from typing import Optional

from lxml import etree
from lxml.etree import Element
from lxml.etree import HTMLParser

# import structlog
# from structlog.stdlib import BoundLogger

namespaces = {"xhtml": "http://www.w3.org/1999/xhtml"}


def show(element, level=0):
    print(etree.tostring(element, pretty_print=True, encoding=str))


title = "john-maynard-keynes_the-economic-consequences-of-the-peace"
path = Path(f"/home/user/dev/kasi-x/akizora/books/{title}/toc.xhtml")

with open(path) as file:
    xml_data = file.read().rstrip().encode("utf-8")


def find_root(xml_data: bytes = xml_data) -> Element:
    # MEMO: HTMLParser is easy for me more than lxml.etree.XMLParser.
    parser = HTMLParser(encoding="UTF-8")
    # WHYNOT: if I use xml.etree.ElementTree, I can read from utf-8 string, but I don't want to do mix-usage of xml.etree.ElementTree and lxml.etree.
    return etree.fromstring(xml_data, parser)


def parse_from_xml_data(query: str, xml_data: bytes = xml_data) -> list[Element]:
    return find_root(xml_data).xpath(query)


def find_file_title() -> str | None:
    return parse_from_xml_data("//html/head/title")[0].text


def find_chapters(append=None) -> list[Element]:
    if append:
        return parse_from_xml_data("//html/body/nav[@id='toc'][1]/ol/li" + append)
    return parse_from_xml_data("//html/body/nav[@id='toc'][1]/ol/li")


@dataclass
class Chapter:
    """input data format is like this <a href="text/chapter-3.xhtml"><span epub:type="z3998:roman">III</span>: The Conference</a>."""

    title: str
    href: str
    nest_level: int
    span: str
    subchapters: list["Chapter"] = field(default_factory=list)
    self_query: str | None = None  # for debug

    def __post_init__(self):
        self.subchapters = process_raw_chapters_into_formated(
            self.subchapters,
            self.nest_level + 1,
            self.self_query + "/ol/li" if self.self_query else None,
        )

    def to_dict(self):
        if self.subchapters:
            self.subchapters = [subchapter.to_dict() for subchapter in self.subchapters]  # type: ignore

        return asdict(self)


def process_raw_chapters_into_formated(
    raw_chapters, nest_level=0, self_query=None
) -> list[Chapter]:
    return [
        Chapter(
            title=raw_chapter.xpath("a/text()")[0] if raw_chapter.xpath("a/text()") else "",
            href=raw_chapter.xpath("a/@href")[0] if raw_chapter.xpath("a/@href") else "",
            nest_level=nest_level,
            span=raw_chapter.xpath("a/span/text()")[0]
            if raw_chapter.xpath("a/span/text()")
            else "",
            subchapters=raw_chapter.xpath("ol/li"),
            self_query=self_query + f"[{index}]" if self_query else None,
        )
        for index, raw_chapter in enumerate(raw_chapters, start=1)
    ]


# parse_from_xml_data("//html/body/nav[@id='toc'][1]/ol/li")


def create_dict_formated_chapters(chapters):
    return [chapter.to_dict() for chapter in chapters]


def create_formated_chapters() -> list[Chapter]:
    return process_raw_chapters_into_formated(
        find_chapters(), self_query="//html/body/nav[@id='toc'][1]/ol/li"
    )

# This is for debugging.
def show(element):
    if len(element) == 0:
        print("No elements are found.")
    if isinstance(element, list):
        print(f"{len(element)} elements are found.")
        print("{:=^30}".format(" Show Data "))
        for _i, e in enumerate(element, 1):
            print(f"{_i:=^30}")
            show(e)
    else:
        print(etree.tostring(element, pretty_print=True, encoding=str))

chapters = create_formated_chapters()

l = []
for c in chapters:
    l.append(c.to_dict())

import json
with open("test.json", "w") as fp:
     json.dump(l, fp)

from pprint import pprint

pprint(chapters)

In [27]:
chapters = create_formated_chapters()

l = []
for c in chapters:
    l.append(c.to_dict())

import json
with open("test.json", "w") as fp:
     json.dump(l, fp)

from pprint import pprint

pprint(chapters)

In [28]:
from pprint import pprint

pprint(chapters)

In [44]:
with open(Path(f"/home/user/dev/kasi-x/akizora/books/{title}/info.json")) as info_file:
    json.load(info_file)

In [45]:
with open(Path(f"/home/user/dev/kasi-x/akizora/books/{title}/info.json")) as info_file:
    toc_file_info = [info for info in json.load(info_file) if info["path"] == "toc.xhtml"]

In [49]:
from scrayping.github_api import GithubAPI

In [50]:
toc_url = f"https://api.github.com/repos/standardebooks/{title}/contents/src/epub/toc.xhtml?ref=master"
toc_data = GithubAPI().get_single_file_content_data(toc_url)
print(toc_data["content"])


[2m2024-04-11 21:23:38[0m [[32m[1minfo     [0m] [1mAPI Request                   [0m [36mmethod[0m=[35mGET[0m [36mmodule[0m=[35mgithub_api[0m [36murl[0m=[35mhttps://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/contents/src/epub/toc.xhtml?ref=master[0m
[2m2024-04-11 21:23:39[0m [[32m[1mdebug    [0m] [1mAPI Request Successful        [0m [36mmodule[0m=[35mgithub_api[0m [36mstatus_code[0m=[35m200[0m [36murl[0m=[35mhttps://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/contents/src/epub/toc.xhtml?ref=master[0m
PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPGh0bWwg
eG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGh0bWwiIHhtbG5zOmVw
dWI9Imh0dHA6Ly93d3cuaWRwZi5vcmcvMjAwNy9vcHMiIGVwdWI6cHJlZml4
PSJ6Mzk5ODogaHR0cDovL3d3dy5kYWlzeS5vcmcvejM5OTgvMjAxMi92b2Nh
Yi9zdHJ1Y3R1cmUvLCBzZTogaHR0cHM6Ly9zdGFuZGFyZGVib29rcy5vcmcv
dm9jYWIvMS4wIiB4bWw6bGFuZz0iZW4tVVMiPgoJPGhlYWQ+

In [51]:
toc_data

{'name': 'toc.xhtml',
 'path': 'src/epub/toc.xhtml',
 'sha': '473246fdd33a25a230f1612f382dd62730f6855b',
 'size': 3601,
 'url': 'https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/contents/src/epub/toc.xhtml?ref=master',
 'html_url': 'https://github.com/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/blob/master/src/epub/toc.xhtml',
 'git_url': 'https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/blobs/473246fdd33a25a230f1612f382dd62730f6855b',
 'download_url': 'https://raw.githubusercontent.com/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/master/src/epub/toc.xhtml',
 'type': 'file',
 'content': 'PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPGh0bWwg\neG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGh0bWwiIHhtbG5zOmVw\ndWI9Imh0dHA6Ly93d3cuaWRwZi5vcmcvMjAwNy9vcHMiIGVwdWI6cHJlZml4\nPSJ6Mzk5ODogaHR0cDovL3d3dy5kYWlzeS5vcmcvejM5OTgv

In [46]:
toc_file_info

[]

In [29]:
pprint(chapters)

[Chapter(title='Titlepage',
         href='text/titlepage.xhtml',
         nest_level=0,
         span='',
         subchapters=[],
         self_query="//html/body/nav[@id='toc'][1]/ol/li[1]"),
 Chapter(title='Imprint',
         href='text/imprint.xhtml',
         nest_level=0,
         span='',
         subchapters=[],
         self_query="//html/body/nav[@id='toc'][1]/ol/li[2]"),
 Chapter(title='Preface',
         href='text/preface.xhtml',
         nest_level=0,
         span='',
         subchapters=[],
         self_query="//html/body/nav[@id='toc'][1]/ol/li[3]"),
 Chapter(title='The Economic Consequences of the Peace',
         href='text/halftitlepage.xhtml',
         nest_level=0,
         span='',
         subchapters=[Chapter(title=': Introductory',
                              href='text/chapter-1.xhtml',
                              nest_level=1,
                              span='I',
                              subchapters=[],
                              self_query

In [30]:
l = []
for c in chapters:
    l.append(c.to_dict())

import json
with open("test.json", "w") as fp:
     json.dump(l, fp)

In [37]:
import json
with open("test.json", "w") as fp:
     json.dump(l, fp)

In [35]:
l.dump()

AttributeError: 'list' object has no attribute 'dump'

In [None]:
chapters