In [200]:
show(root)

<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-US">
	<head>
		<title>Table of Contents</title>
	</head>
	<body epub:type="frontmatter">
		<nav id="toc" epub:type="toc">
			<h2 epub:type="title">Table of Contents</h2>
			<ol>
				<li>
					<a href="text/titlepage.xhtml">Titlepage</a>
				</li>
				<li>
					<a href="text/imprint.xhtml">Imprint</a>
				</li>
				<li>
					<a href="text/preface.xhtml">Preface</a>
				</li>
				<li>
					<a href="text/halftitlepage.xhtml">The Economic Consequences of the Peace</a>
					<ol>
						<li>
							<a href="text/chapter-1.xhtml"><span epub:type="z3998:roman">I</span>: Introductory</a>
						</li>
						<li>
							<a href="text/chapter-2.xhtml"><span epub:type="z3998:roman">II</span>: Europe Before the War</a>
							<ol>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-1">

In [221]:
import json
import xml.etree.ElementTree as ET
from dataclasses import asdict
from dataclasses import dataclass
from dataclasses import field
from pathlib import Path

from lxml.etree import XMLParser

# import structlog
# from structlog.stdlib import BoundLogger

namespaces = {"xhtml": "http://www.w3.org/1999/xhtml"}


@dataclass
class Chapter:
    title: str
    href: str
    subchapters: list["Chapter"] = field(default_factory=list)

def get_chapter_info(chapter_element):
  """章の情報 (タイトル、リンク) を取得.

  Args:
    chapter_element (lxml.etree.Element): 章 `<li>` 要素

  Returns:
    chapter_info (dict): タイトルとリンクを含む辞書
  """
  title_elements = chapter_element.xpath(".//xhtml:a/text()", namespaces=namespaces)
  title = title_elements[0] if title_elements else None
  href_elements = chapter_element.xpath(".//xhtml:a/@href", namespaces=namespaces)
  href = href_elements[0] if href_elements else None

  return {"title": title, "href": href}


def extract_chapters(toc_element):
  """再帰的にEPUB目次を解析し、ネスト構造を維持した章リストを返す.

  Args:
    toc_element (lxml.etree.Element): EPUB目次 `<nav>` 要素

  Returns:
    chapters (list): Chapter オブジェクトのリスト
  """
  chapters = []
  for chapter_element in toc_element.xpath(".//xhtml:li", namespaces=namespaces):
    chapter_info = get_chapter_info(chapter_element)

    subchapters_element = chapter_element.xpath(".//xhtml:ol", namespaces=namespaces)
    subchapters = []
    for subchapter_element in subchapters_element:
      subchapters.append(extract_chapters(subchapter_element))

    chapters.append(
      Chapter(
        title=chapter_info["title"], href=chapter_info["href"], subchapters=subchapters
      )
    )

  return chapters


# EPUB目次ファイルの読み込み
title = "john-maynard-keynes_the-economic-consequences-of-the-peace"
path = Path(f"/home/user/dev/kasi-x/akizora/books/{title}/toc.xhtml")

with open(path) as file:
  xml_data = file.read().rstrip()

parser = XMLParser(
  encoding="UTF-8", resolve_entities=False, strip_cdata=False, recover=True, ns_clean=True
)
root = ET.fromstring(xml_data, parser)


toc_element = root.xpath("//xhtml:nav[1]", namespaces=namespaces)[0]

# 目次を解析し、章リストを取得
chapters = parse_toc(toc_element)

# 再帰的に解析し、章リストを取得
chapters = extract_chapters(toc_element)

# 結果の確認
for chapter in chapters:
    print(f"- {chapter.title}: {chapter.href}")
    for subchapter in chapter.subchapters:
        print(len(subchapter))
        for a in subchapter:
        print(f"  ーーーー {a.title}")


chapters_json = json.dumps([asdict(chapter) for chapter in chapters], indent=4)

path = Path(f"/home/user/dev/kasi-x/akizora/parsed_data/{title}/toc.json")
path.parent.mkdir(parents=True, exist_ok=True)

#with open(path, "w") as f:
    #f.write(chapters_json)

- Titlepage: text/titlepage.xhtml
- Imprint: text/imprint.xhtml
- Preface: text/preface.xhtml
- The Economic Consequences of the Peace: text/halftitlepage.xhtml
  ーーーー : Introductory
  ーーーー : Europe Before the War
  ーーーー : Population
  ーーーー : Organization
  ーーーー : The Psychology of Society
  ーーーー : The Relation of the Old World to the New
  ーーーー : The Conference
  ーーーー : The Treaty
  ーーーー : Reparation
  ーーーー : Undertakings Given Prior to the Peace Negotiations
  ーーーー : The Conference and the Terms of the Treaty
  ーーーー : Germany’s Capacity to Pay
  ーーーー : The Reparation Commission
  ーーーー : The German Counterproposals
  ーーーー : Europe After the Treaty
  ーーーー : Remedies
- : Introductory: text/chapter-1.xhtml
- : Europe Before the War: text/chapter-2.xhtml
  ーーーー : Population
  ーーーー : Organization
  ーーーー : The Psychology of Society
  ーーーー : The Relation of the Old World to the New
- : Population: text/chapter-2.xhtml#chapter-2-1
- : Organization: text/chapter-2.xhtml#chapter-2-2
- : The Psy

In [318]:
import json
import xml.etree.ElementTree as ET
from dataclasses import asdict
from dataclasses import dataclass
from dataclasses import field
from pathlib import Path

from lxml.etree import XMLParser

# import structlog
# from structlog.stdlib import BoundLogger

namespaces = {"xhtml": "http://www.w3.org/1999/xhtml"}


@dataclass
class Chapter:
    title: str
    href: str
    subchapters: list["Chapter"] = field(default_factory=list)


def get_chapter_info(chapter_li):
    """章の情報 (タイトル、リンク) を取得.

    Args:
        chapter_li (lxml.etree.Element): 章 `<li>` 要素

    Returns:
        chapter_info (dict): タイトルとリンクを含む辞書
    """
    title_elements = chapter_li.xpath(".//xhtml:a/text()", namespaces=namespaces)
    title = title_elements[0] if title_elements else None
    href_elements = chapter_li.xpath(".//xhtml:a/@href", namespaces=namespaces)
    href = href_elements[0] if href_elements else None

    return {"title": title, "href": href}


def extract_chapters(toc_element):
    """再帰的にEPUB目次を解析し、ネスト構造を維持した章リストを返す.

    Args:
        toc_element (lxml.etree.Element): EPUB目次 `<nav>` 要素

    Returns:
        chapters (list): Chapter オブジェクトのリスト
    """
    chapters = []
    all_chapter_lis = toc_element.xpath(".//xhtml:li", namespaces=namespaces)
    for chapter_li in all_chapter_lis:
        chapter_li.xpath(".//xhtml:ol/li", namespaces=namespaces)



    for chapter_li in toc_element.xpath(".//xhtml:li", namespaces=namespaces):
        chapter_info = get_chapter_info(chapter_li)

        subchapters_ol = chapter_li.xpath(".//xhtml:ol", namespaces=namespaces)
        subchapters = extract_chapters(subchapters_ol[0]) if subchapters_ol else []

        chapters.append(
            Chapter(
                title=chapter_info["title"], href=chapter_info["href"], subchapters=subchapters
            )
        )

    return chapters


title = "john-maynard-keynes_the-economic-consequences-of-the-peace"
path = Path(f"/home/user/dev/kasi-x/akizora/books/{title}/toc.xhtml")

with open(path) as file:
    xml_data = file.read().rstrip()

parser = XMLParser(
    encoding="UTF-8", resolve_entities=False, strip_cdata=False, recover=True, ns_clean=True
)
root = ET.fromstring(xml_data, parser)

In [342]:
from lxml.etree import HTMLParser, XMLParser

In [None]:

xml_data = xml_data.encode("utf-8")

In [348]:
parser = HTMLParser(
    encoding="UTF-8", strip_cdata=False, recover=True,
)
root = ET.fromstring(xml_data, parser)

In [349]:
root

<Element html at 0x7fe7d6bb3e40>

In [350]:
x = root.xpath("//html", namespaces=namespaces)[0]

In [351]:
show(x)

<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-US">
	<head>
		<title>Table of Contents</title>
	</head>
	<body epub:type="frontmatter">
		<nav id="toc" epub:type="toc">
			<h2 epub:type="title">Table of Contents</h2>
			<ol>
				<li>
					<a href="text/titlepage.xhtml">Titlepage</a>
				</li>
				<li>
					<a href="text/imprint.xhtml">Imprint</a>
				</li>
				<li>
					<a href="text/preface.xhtml">Preface</a>
				</li>
				<li>
					<a href="text/halftitlepage.xhtml">The Economic Consequences of the Peace</a>
					<ol>
						<li>
							<a href="text/chapter-1.xhtml"><span epub:type="z3998:roman">I</span>: Introductory</a>
						</li>
						<li>
							<a href="text/chapter-2.xhtml"><span epub:type="z3998:roman">II</span>: Europe Before the War</a>
							<ol>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-1">

In [237]:
show(toc_element)

<nav xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" id="toc" epub:type="toc">
			<h2 epub:type="title">Table of Contents</h2>
			<ol>
				<li>
					<a href="text/titlepage.xhtml">Titlepage</a>
				</li>
				<li>
					<a href="text/imprint.xhtml">Imprint</a>
				</li>
				<li>
					<a href="text/preface.xhtml">Preface</a>
				</li>
				<li>
					<a href="text/halftitlepage.xhtml">The Economic Consequences of the Peace</a>
					<ol>
						<li>
							<a href="text/chapter-1.xhtml"><span epub:type="z3998:roman">I</span>: Introductory</a>
						</li>
						<li>
							<a href="text/chapter-2.xhtml"><span epub:type="z3998:roman">II</span>: Europe Before the War</a>
							<ol>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-1"><span epub:type="z3998:roman">I</span>: Population</a>
								</li>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-2"><span epub:type="z3998:roman">II</span>: Organization</a>
								</li>
								<li>
			

In [None]:
/html/body/main/nav[1]/ol/li[4]/a

In [262]:
show(root)

<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-US">
	<head>
		<title>Table of Contents</title>
	</head>
	<body epub:type="frontmatter">
		<nav id="toc" epub:type="toc">
			<h2 epub:type="title">Table of Contents</h2>
			<ol>
				<li>
					<a href="text/titlepage.xhtml">Titlepage</a>
				</li>
				<li>
					<a href="text/imprint.xhtml">Imprint</a>
				</li>
				<li>
					<a href="text/preface.xhtml">Preface</a>
				</li>
				<li>
					<a href="text/halftitlepage.xhtml">The Economic Consequences of the Peace</a>
					<ol>
						<li>
							<a href="text/chapter-1.xhtml"><span epub:type="z3998:roman">I</span>: Introductory</a>
						</li>
						<li>
							<a href="text/chapter-2.xhtml"><span epub:type="z3998:roman">II</span>: Europe Before the War</a>
							<ol>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-1">

In [301]:
x = root.xpath("//xhtml:html", namespaces=namespaces)

In [303]:
show(x[0])

<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-US">
	<head>
		<title>Table of Contents</title>
	</head>
	<body epub:type="frontmatter">
		<nav id="toc" epub:type="toc">
			<h2 epub:type="title">Table of Contents</h2>
			<ol>
				<li>
					<a href="text/titlepage.xhtml">Titlepage</a>
				</li>
				<li>
					<a href="text/imprint.xhtml">Imprint</a>
				</li>
				<li>
					<a href="text/preface.xhtml">Preface</a>
				</li>
				<li>
					<a href="text/halftitlepage.xhtml">The Economic Consequences of the Peace</a>
					<ol>
						<li>
							<a href="text/chapter-1.xhtml"><span epub:type="z3998:roman">I</span>: Introductory</a>
						</li>
						<li>
							<a href="text/chapter-2.xhtml"><span epub:type="z3998:roman">II</span>: Europe Before the War</a>
							<ol>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-1">

In [236]:
l

[{'title': 'Titlepage', 'href': 'text/titlepage.xhtml'},
 {'title': 'Imprint', 'href': 'text/imprint.xhtml'},
 {'title': 'Preface', 'href': 'text/preface.xhtml'},
 {'title': 'The Economic Consequences of the Peace',
  'href': 'text/halftitlepage.xhtml'},
 {'title': ': Introductory', 'href': 'text/chapter-1.xhtml'},
 {'title': ': Europe Before the War', 'href': 'text/chapter-2.xhtml'},
 {'title': ': Population', 'href': 'text/chapter-2.xhtml#chapter-2-1'},
 {'title': ': Organization', 'href': 'text/chapter-2.xhtml#chapter-2-2'},
 {'title': ': The Psychology of Society',
  'href': 'text/chapter-2.xhtml#chapter-2-3'},
 {'title': ': The Relation of the Old World to the New',
  'href': 'text/chapter-2.xhtml#chapter-2-4'},
 {'title': ': The Conference', 'href': 'text/chapter-3.xhtml'},
 {'title': ': The Treaty', 'href': 'text/chapter-4.xhtml'},
 {'title': ': Reparation', 'href': 'text/chapter-5.xhtml'},
 {'title': ': Undertakings Given Prior to the Peace Negotiations',
  'href': 'text/chapte

In [234]:
chapter_data

{'title': 'Uncopyright', 'href': 'text/uncopyright.xhtml'}

In [232]:
all_chapter_lis = toc_element.xpath(".//xhtml:li/following-sibling:ol", namespaces=namespaces)

subs = []
for chapter_li in all_chapter_lis:
    subs.append(chapter_li.xpath(".//xhtml:ol", namespaces=namespaces))


XPathEvalError: Undefined namespace prefix

In [231]:
show(subs[5][0])

<ol xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-1"><span epub:type="z3998:roman">I</span>: Population</a>
								</li>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-2"><span epub:type="z3998:roman">II</span>: Organization</a>
								</li>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-3"><span epub:type="z3998:roman">III</span>: The Psychology of Society</a>
								</li>
								<li>
									<a href="text/chapter-2.xhtml#chapter-2-4"><span epub:type="z3998:roman">IV</span>: The Relation of the Old World to the New</a>
								</li>
							</ol>
						



In [None]:

# 再帰的に解析し、章リストを取得
chapters = extract_chapters(toc_element)

# 結果の確認
for chapter in chapters:
    print(f"- {chapter.title}: {chapter.href}")
    for subchapter in chapter.subchapters:
        print(f"  - {subchapter.title}")


chapters_json = json.dumps([asdict(chapter) for chapter in chapters], indent=4)

path = Path(f"/home/user/dev/kasi-x/akizora/parsed_data/{title}/toc.json")
path.parent.mkdir(parents=True, exist_ok=True)

with open(path, "w") as f:
    f.write(chapters_json)

In [193]:
chapters

[Chapter(title='Titlepage', href='text/titlepage.xhtml', subchapters=[]),
 Chapter(title='Imprint', href='text/imprint.xhtml', subchapters=[]),
 Chapter(title='Preface', href='text/preface.xhtml', subchapters=[]),
 Chapter(title='The Economic Consequences of the Peace', href='text/halftitlepage.xhtml', subchapters=[Chapter(title=': Introductory', href='text/chapter-1.xhtml', subchapters=[]), Chapter(title=': Europe Before the War', href='text/chapter-2.xhtml', subchapters=[Chapter(title=': Population', href='text/chapter-2.xhtml#chapter-2-1', subchapters=[]), Chapter(title=': Organization', href='text/chapter-2.xhtml#chapter-2-2', subchapters=[]), Chapter(title=': The Psychology of Society', href='text/chapter-2.xhtml#chapter-2-3', subchapters=[]), Chapter(title=': The Relation of the Old World to the New', href='text/chapter-2.xhtml#chapter-2-4', subchapters=[])]), Chapter(title=': Population', href='text/chapter-2.xhtml#chapter-2-1', subchapters=[]), Chapter(title=': Organization', h