In [7]:
from dataclasses import dataclass
from dataclasses import field


def remove_noise_space(text: str) -> str:
    return text.strip()


def count_llm_tokens(text: str) -> int:
    return len(text)


@dataclass
class Sentence:
    text: str

    def __post_init__(self):
        self.text = remove_noise_space(self.text)

    @property
    def token_count(self):
        return len(self.text.split())

    @property
    def word_count(self):
        return count_llm_tokens(self.text)

    @property
    def char_count(self):
        return len(self.text)


@dataclass
class Section:
    sentences: list[Sentence] = field(default_factory=list)

    @property
    def token_count(self):
        return sum(sentence.token_count for sentence in self.sentences)

    @property
    def word_count(self):
        return sum(sentence.word_count for sentence in self.sentences)

    @property
    def char_count(self):
        return sum(sentence.char_count for sentence in self.sentences)

    @property
    def sentences_count(self):
        return len(self.sentences)


@dataclass
class Chapter:
    chapter_title: str = "NO_TITLE"
    sections: list[Section] = field(default_factory=list)

    @property
    def token_count(self):
        return sum(section.token_count for section in self.sections)

    @property
    def word_count(self):
        return sum(section.word_count for section in self.sections)

    @property
    def char_count(self):
        return sum(section.char_count for section in self.sections)

    @property
    def sentences_count(self):
        return sum(section.sentences_count for section in self.sections)

    @property
    def sections_count(self):
        return len(self.sections)


@dataclass
class Book:
    title: str = "NO_TITLE"
    chapters: list[Chapter] = field(default_factory=list)

    @property
    def token_count(self):
        return sum(chapter.token_count for chapter in self.chapters)

    @property
    def word_count(self):
        return sum(chapter.word_count for chapter in self.chapters)

    @property
    def char_count(self):
        return sum(chapter.char_count for chapter in self.chapters)

    @property
    def sentences_count(self):
        return sum(chapter.sentences_count for chapter in self.chapters)

    @property
    def sections_count(self):
        return sum(chapter.sections_count for chapter in self.chapters)

    @property
    def chapters_count(self):
        return len(self.chapters)

In [1]:
import re


def replace_single_newlines(text: str) -> str:
    """テキスト内の単独の改行を空白に置換し、タグを含む行はそのまま保持する関数.
    Some Book is inserted newline for A4 paper view, so it is not required to remove it.
    """
    SINGLE_NEWLINE_PATTERN = re.compile(r"(?<!\n)\n(?!\n)")

    def is_tag_line(line):
        """行内にHTMLタグが存在するかを判定する関数."""
        return "<" in line and ">" in line

    def replace_single_newline_with_space(line):
        """単独の改行を空白に置換する関数."""
        return SINGLE_NEWLINE_PATTERN.sub(" ", line)

    lines = text.split("\n")
    # HEADER tag is not so good, but it is convenient for now.
    # In some case, it can be wrong.
    lines.insert(0, "<HEADER>")

    modified_lines = []

    for line in lines:
        if is_tag_line(line):
            modified_lines.append(line)
        else:
            # NOTE: KEEP LEST DEPENDENT NEWLINE MAKR
            """if not alone newline mark, it would have meaning, not remove it.(sometime it is falte-positive)

             hello/n world -> hello world
             hello/n -> hello
             hello/n/n/n -> hello/n/n/n # it may have meaning.
             """
            line = replace_single_newline_with_space(line)
            modified_lines.append(line)

    return "\n".join(modified_lines)


def mark_sections(text: str) -> str:
    lines = text.split("\n")
    modified_lines = []

    for line in lines:
        line = line.strip()
        if re.match(r"(?i)^CHAPTER\s+\d+", line):
            modified_lines.append(f"<CHAPTER><CHAPTER_TITLE> {line} </CHAPTER_TITLE>")
        if re.match(r"(?i)^PART\s+\d+", line):
            modified_lines.append(f"<PART><PART_TITLE> {line} </PART_TITLE>")
        if re.match(r"(?i)^EPILOGUE", line):
            modified_lines.append(f"<EPILOGUE> {line}")
        if re.match(r"^THE END$", line):
            modified_lines.append(f"<THE_END> {line} </THE_END>")
        if re.match(r"(?i)^OPENING", line):
            modified_lines.append(f"<OPENING> {line}")
        if re.match(r"(?i)^PROLOGUE", line):
            modified_lines.append(f"<PROLOGUE><PROLOGUE_TITLE> {line} </PROLOGUE_TITLE>")
        if re.match(r"(?i)^TRANSCRIBER NOTES", line):
            modified_lines.append(f"<TRANSCRIBER_NOTES> {line} ")
        else:
            modified_lines.append(line)

    return "\n".join(modified_lines)


def auto_close_tags(text: str) -> str:
    # FUTURE: expand this priority later.
    """Maybe more priority is required later and in some special book.
    this priority is just for idea for me.
    for example, PART vs EPIOLOGUE, which is more important?
    In some book, each part has epilogue, so PART can be more important.
    """
    priority = {
        "PART": 3,
        "CHAPTER": 2,
        "TRANSCRIBER_NOTES": 4,
        "THE_END": 1,
        "EPILOGUE": 4,
        "PROLOGUE": 4,
        "DATA": 1,
        "HEADER": 2,
    }
    priority = dict(sorted(priority.items(), key=lambda item: item[1]))

    def sort_tags(tags):
        return sorted(tags, key=lambda tag: priority.get(tag, -1))

    def get_priority(tag):
        return priority.get(tag, -1)

    # Not yet used que. But you should change this to que.
    living_tag_stack = []
    contents = []
    remove_list = []

    tag_pattern = re.compile(r"</?([A-Z_]+)>")

    lines = text.split("\n")

    # REFACTOR: make more readable this function by separating into functions.
    for line in lines:
        tags_found = tag_pattern.findall(line)
        for new_tag in tags_found:
            start_tag = f"<{new_tag}>"
            end_tag = f"</{new_tag}>"
            if start_tag in line:
                if living_tag_stack:
                    for open_tag in living_tag_stack:
                        if get_priority(open_tag) <= get_priority(new_tag):
                            # OPTIMIZE: rewrite for change algorithm.
                            """Use stack and pop and change this algorithm.
                            We can use html tag relate idea. Never happen '<t><a> </t></a>', close tag must be in order.
                            In other words, when stack is '<chapter><hoge>' and </chapeter> event happend, <hoge> must be closed at previous point of close chapter.
                            Be careful to keep passing test case.
                            """

                            contents.append(f"</{open_tag}>")
                            remove_list.append(open_tag)
                    living_tag_stack = [
                        new_tag for new_tag in living_tag_stack if new_tag not in remove_list
                    ]
                    remove_list = []
                living_tag_stack.append(new_tag)
                living_tag_stack = sort_tags(living_tag_stack)

            if end_tag in line:
                living_tag_stack.remove(new_tag)
                if living_tag_stack:  # POINTLESS: not required. Because for loop is not executed when living_tag_stack is empty.
                    for open_tag in living_tag_stack:
                        if get_priority(open_tag) <= get_priority(new_tag):
                            contents.append(f"</{open_tag}>")
                            remove_list.append(open_tag)
                    living_tag_stack = [
                        new_tag for new_tag in living_tag_stack if new_tag not in remove_list
                    ]
                    remove_list = []
        contents.append(line)

    while living_tag_stack:
        contents.append(f"</{living_tag_stack.pop()}>")

    return "\n".join(contents)

In [5]:
from pprint import pprint

file = "horizon_trial/sample.txt"
text = open(file).read()
marked_text = mark_sections(text)
#pprint(marked_text)
marked_space_removed_text = replace_single_newlines(marked_text)
#pprint(marked_space_removed_text)
auto_closed_text = auto_close_tags(marked_space_removed_text)
#pprint(auto_closed_text)


In [21]:

import re
from dataclasses import dataclass, field

# Assuming previous class definitions for Sentence, Section, Chapter, Book

def remove_tags(text: str) -> str:
    """Remove HTML tags from a string."""
    return re.sub(r'<[^>]*>', '', text)

def parse_book(text: str) -> Book:
    parts = re.split(r'(<[^>]+>)', text)
    chapters = []
    current_chapter = Chapter()
    current_section = Section()
    title = "NO_TITLE"
    in_header = True

    for part in parts:
        if part.strip() == '':
            continue

        # Handle header as the first non-tag text
        if in_header and not part.startswith('<'):
            title = remove_tags(part).strip()
            in_header = False
            continue

        if part == '<CHAPTER>' or part == '<PROLOGUE>' or part == '<EPILOGUE>':
            if current_section.sentences:
                current_chapter.sections.append(current_section)
            current_section = Section()
            if current_chapter.sections:
                chapters.append(current_chapter)
            current_chapter = Chapter()
            continue

        if '<CHAPTER_TITLE>' in part or '<PROLOGUE_TITLE>' in part:
            # Ensure to capture only the content within the title tags
            title_match = re.search(r'>([^<]+)<', part)
            if title_match:
                chapter_title = title_match.group(1).strip()
                current_chapter.chapter_title = chapter_title
            continue

        if part.startswith('<') and part.endswith('>'):
            continue  # Skip standalone tags

        # Handling content
        sentences = [Sentence(text=remove_tags(sentence).strip()) for sentence in part.split('\n') if sentence.strip()]
        current_section.sentences.extend(sentences)

    # Ensure the last chapter and section are added
    if current_section.sentences:
        current_chapter.sections.append(current_section)
    if current_chapter.sections:
        chapters.append(current_chapter)

    return Book(title=title, chapters=chapters)


book = parse_book(auto_closed_text)
# print(f"Book Title: {book.title}")
for chapter in book.chapters:
    print(f"Chapter Title: {chapter.chapter_title}, Sentences: {chapter.sentences_count}, Sections: {chapter.sections_count}")


Chapter Title: NO_TITLE, Sentences: 3, Sections: 1
Chapter Title: NO_TITLE, Sentences: 3, Sections: 1
Chapter Title: NO_TITLE, Sentences: 28, Sections: 1
Chapter Title: NO_TITLE, Sentences: 34, Sections: 1


In [22]:
book

Book(title="=* A Distributed Proofreaders Canada eBook *=\n\nThis ebook is made available at no cost and with very few restrictions.\n\n_Title:_ Lost Horizon\n_Date of first publication:_ 1936\n_Author:_ James Hilton (1900-1954)\n_Date first posted:_ Nov. 6, 2018\n_Date last updated:_ Nov. 6, 2018\nFaded Page eBook #20181110\n\n\n\n\n\n\n\n[Cover Illustration]\n\n\n\n\n\n\nNOVELS BY JAMES HILTON\n————\nAND NOW GOODBYE\nCONTANGO\nKNIGHT WITHOUT ARMOUR\nMURDER AT SCHOOL\nCATHERINE HERSELF\nTHE SILVER FLAME\n\n\n\n\n\n\nL O S T   H O R I Z O N\n\n\n\nBY\nJAMES HILTON\n\nMACMILLAN  AND  CO.,  LIMITED\nST.  MARTIN'S  STREET,  LONDON\n1936\n\n\n\n\nCOPYRIGHT\n\n_First published 1933_\n_The Cottage Library 1936_\n\n\nPRINTED IN GREAT BRITAIN\nBY R. & R. CLARK, LIMITED, EDINBURGH", chapters=[Chapter(chapter_title='NO_TITLE', sections=[Section(sentences=[Sentence(text='PROLOGUE'), Sentence(text='PROLOGUE'), Sentence(text='Cigars had burned low, and we were beginning to sample the disillusionmen

In [24]:
book.chapters[0]

Chapter(chapter_title='NO_TITLE', sections=[Section(sentences=[Sentence(text='PROLOGUE'), Sentence(text='PROLOGUE'), Sentence(text='Cigars had burned low, and we were beginning to sample the disillusionment...')])])