In [86]:
%pip install -qU more_itertools

Note: you may need to restart the kernel to use updated packages.


## 1. State machine parser

Because the outline structure is so simple, it's feasible to implement a parser using a hand-coded state machine that
shifts between states according to the level of the outline. This may be a simpler approach than specifying a
BNF-style grammar for Lark or a similar parser generator, because in some cases outlines skip levels, which would
complicate the formal grammar.

In [100]:
from enum import Enum
from typing import List, Dict, Iterator
from dataclasses import dataclass, field
from itertools import tee
from more_itertools import spy
import re

In [88]:
class Level(Enum):
    H0 = 0 # top level (initial state)
    H1 = 1
    H2 = 2
    H3 = 3

@dataclass
class HeadingPattern:
    level: Level
    regex: str # pattern matching the first line of the heading
    multi_line: bool # whether the heading spans multiple lines

@dataclass
class Heading:
    level: Level
    # heading_type: str # e.g. "section", "subsection", "article", "chapter"
    enumeration: str # number or letter (e.g. "1", "a", "i", "A", "XVII")
    heading_text: str

@dataclass
class Segment:
    level: Level
    headings: Dict[Level, Heading|None] = field(default_factory=dict)
    body: List[str] = field(default_factory=list) # list of paragraphs

## For our purposes, a document is just a list of segments -- the structure is
## implicit in the headings, which will be uploaded to a relational database

In [89]:
def seek_first_nonempty_line(line_iter: Iterator[str]) -> Iterator[str]:
    """Advance the iterator until a non-empty line is found. The start of the iterator
    should be that non-empty line"""
    for i in range(5):
        try:
            (next_line,), line_iter = spy(line_iter) # peek at the next line
        except ValueError: # no more lines
            return iter([])
        if next_line.strip() == "":
            next(line_iter)
        else:
            break

    return line_iter

test_lines = ["foo", "bar"]
assert list(seek_first_nonempty_line(iter(test_lines))) == ["foo", "bar"]
test_lines2 = ["", "", "foo", "bar"]
assert list(seek_first_nonempty_line(iter(test_lines2))) == ["foo", "bar"]
test_empty_lines = ["", "", "", ""]
assert list(seek_first_nonempty_line(iter(test_empty_lines))) == []

In [90]:
def get_paragraph(line_iter: Iterator[str]) -> str | None:
    """Skip empty lines and return the next paragraph."""
    paragraph = ""

    line_iter = seek_first_nonempty_line(line_iter)
    try:
        line = next(line_iter)
    except StopIteration:
        return None

    # read the paragraph
    paragraph += line + "\n"
    while True:
        try:
            line = next(line_iter)
            line = line.strip()
        except StopIteration:
            break
        if line.strip() == "":
            break
        paragraph += line + "\n"

    #return paragraph if paragraph.strip() else None
    return paragraph

test_paragraphs = [
    "",
    "This is a",
    "paragraph.",
    "",
    "This is another"
]
assert get_paragraph(iter(test_paragraphs)) == "This is a\nparagraph.\n"
assert get_paragraph(iter([""])) == None

In [101]:
def match_heading(line_iter: Iterator[str], patterns: Dict[Level, HeadingPattern]) -> Heading | None:
    """For each patern in `patterns`, check if the next line matches (e.g., pattern r'^Chapter [IVXLC]+'
    matches 'Chapter VII'). If a match is found, return a Heading object. Otherwise, return None.
    The iterator should be advanced to the next line after the heading after a match."""
    
    line_iter, line_iter2 = tee(line_iter)
    line_iter2 = seek_first_nonempty_line(line_iter2)
    # peek at the next line
    try:
        (line,), line_iter2 = spy(line_iter2) # peek at the next line
        line = line.strip()
        print(f"spy: {line}")
    except ValueError: # no more lines
        print("no more lines")
        return None
    
    #try:
    #    line = next(line_iter)
    #    line = line.strip()
    #except StopIteration:
    #    return None
    
    for level, pattern in patterns.items():
        match = re.match(pattern.regex, line)
        print(f"pre-advance line: {line}, pattern: {pattern.regex}, match: {match}")
        if match:
            line_iter = seek_first_nonempty_line(line_iter)
            print("advancing...")
            line = next(line_iter) # have to advance the iterator since we used spy() above
            print(f"...line: {line}")
            if not pattern.multi_line:
                return Heading(level=level, enumeration=match.group(1), heading_text=match.group(2))
            else:
                text = get_paragraph(line_iter) # in the multi-line case, the heading text is the rest of the paragraph
                text = text.strip() if text else ""
                return Heading(level=level, enumeration=match.group(1), heading_text=text)


In [None]:
## Tests
test_doc1 = ["Chapter VII: The Final Chapter", "", "This is the first paragraph of the chapter."]
test_pattern1 = HeadingPattern(level=Level.H1, regex=r'^Chapter ([IVXLC]+): (.+)$', multi_line=False)
test_doc2 = ["", "Chapter 7:", "The Final Chapter", "", "This is the first paragraph of the chapter."]
test_pattern2 = HeadingPattern(level=Level.H1, regex=r'^Chapter (\d+):$', multi_line=True)

test_doc1_iter = iter(test_doc1)
assert match_heading(test_doc1_iter, {Level.H1: test_pattern1}) == \
    Heading(level=Level.H1, enumeration="VII", heading_text="The Final Chapter")
assert get_paragraph(test_doc1_iter) == "This is the first paragraph of the chapter.\n"

test_doc2_iter = iter(test_doc2)
assert match_heading(test_doc2_iter, {Level.H1: test_pattern2}) == \
    Heading(level=Level.H1, enumeration="7", heading_text="The Final Chapter")
print("X")
assert match_heading(test_doc2_iter, {Level.H1: test_pattern2}) == None
print("Y")
assert match_heading(test_doc2_iter, {Level.H1: test_pattern2}) == None
print("Z")
print(get_paragraph(test_doc2_iter)) # should print "This is the first paragraph of the chapter."

## FIXME - there's a bug here which is advancing the iterator after a failed match

spy: Chapter VII: The Final Chapter
pre-advance line: Chapter VII: The Final Chapter, pattern: ^Chapter ([IVXLC]+): (.+)$, match: <re.Match object; span=(0, 30), match='Chapter VII: The Final Chapter'>
advancing...
...line: Chapter VII: The Final Chapter
spy: Chapter 7:
pre-advance line: Chapter 7:, pattern: ^Chapter (\d+):$, match: <re.Match object; span=(0, 10), match='Chapter 7:'>
advancing...
...line: Chapter 7:
X
spy: This is the first paragraph of the chapter.
pre-advance line: This is the first paragraph of the chapter., pattern: ^Chapter (\d+):$, match: None
Y
no more lines
Z
None


In [37]:

class StateMachineParser:
    def __init__(self, document_name: str, heading_patterns: Dict[Level, HeadingPattern]):
        self.document = []
        self.heading_names = {Level.H0: document_name, Level.H1: None, Level.H2: None, Level.H3: None}
        self.patterns = heading_patterns
        self.state = Level.H0

    def parse(self, text):
        lines = text.split('\n')
        line_iter = iter(lines)

        segment_headings = {Level.H0: self.heading_names[Level.H0]}
        segment = Segment(level=Level.H0, headings=segment_headings, body=[]) # preamble

        while True:
            # check for a heading (skipping empty lines)
            match = match_heading(line_iter, self.patterns)
            print(f"* Checking for heading match: {match}")

            # no heading found, so add a paragraph to the current segment
            if not match:
                paragraph = get_paragraph(line_iter)
                if paragraph:
                    print(f"* Found paragraph: {paragraph}")
                    segment.body.append(paragraph)
                    continue
                else:
                    break

            # found a heading!
            print(f"* Found heading; appending segment {segment}\n")
            self.document.append(segment) # add the last segment to document

            print(f"* New heading: {match}")
            self.state = match.level
            new_headings = segment.headings.copy()
            new_headings[match.level] = match
            for level in Level: # have to delete the headings at higher levels in case of skips later
                if level.value in new_headings and level.value > match.level.value:
                    del new_headings[level]
            segment = Segment(level=self.state, headings=new_headings, body=[]) # start a new segment
    
    def _str_segment(self, segment: Segment) -> str:
        
        heading = segment.headings[segment.level]
        heading_str = f"Heading {heading.level} {heading.enumeration}: {heading.heading_text}" if heading else ""
        return f"{heading_str}\n\n{segment.body}"

    def __str__(self):
        return "\n\n".join([self._str_segment(segment) for segment in self.document])


## 2. Try it out

In [33]:
snippet2 = """
preamble paragraph

TITLE 1
Title to 1

Here is some text in 1
Second line of the first paragraph in 1

CHAPTER 1-1
Chapter 1-1

Here is some text in 1-1

1-1-010 Section 1-1-010

Here is some text in 1-1-010

Second paragraph of 1-1-010

1-1-020 Section 1-1-020

Here is some text in 1-1-020

Second paragraph of 1-1-020

CHAPTER 1-2
Chapter 1-2

Here is some text in 1-2

1-2-010 Section 1-2-010

Here is some text in 1-2-010

1-2-020 Section 1-2-020

Here is some text in 1-2-020

TITLE 2
Title to 2

Here is some text in 2

"""

In [34]:
patterns = {
    Level.H1: HeadingPattern(level=Level.H1, regex=r'^TITLE (\d+)$', multi_line=True),
    Level.H2: HeadingPattern(level=Level.H2, regex=r'^CHAPTER (\d+-\d+)$', multi_line=True),
    Level.H3: HeadingPattern(level=Level.H3, regex=r'^(\d+-\d+-\d+) (.+)', multi_line=False)
}

# TESTS: make sure the patterns are working
title1 = "TITLE 1\nTitle to 1\n\nHere is some text in 1\nSecond line of the first paragraph in 1"
chapter1_1 = "\nCHAPTER 1-1\nChapter 1-1\n\nHere is some text in 1-1"
section1_1_010 = "\n\n1-1-010 Section 1-1-010\n\nHere is some text in 1-1-010\n\nSecond paragraph of 1-1-010"

assert match_heading(iter(title1.split('\n')), {Level.H1: patterns[Level.H1]}) == \
    Heading(level=Level.H1, enumeration="1", heading_text="Title to 1")
assert match_heading(iter(chapter1_1.split('\n')), {Level.H2: patterns[Level.H2]}) == \
    Heading(level=Level.H2, enumeration="1-1", heading_text="Chapter 1-1")
assert match_heading(iter(section1_1_010.split('\n')), {Level.H3: patterns[Level.H3]}) == \
    Heading(level=Level.H3, enumeration="1-1-010", heading_text="Section 1-1-010")

In [46]:
lines2_iter = iter(snippet2.split('\n'))
match = match_heading(lines2_iter, patterns)
#print(list(lines2_iter)[1])
list(seek_first_nonempty_line(lines2_iter))[1]

'Title to 1'

In [35]:
# parse the whole snippet
parser = StateMachineParser("snippet2", patterns)
parser.parse(snippet2)

* Checking for heading match: None
* No heading found
* Found paragraph: TITLE 1
Title to 1

* Checking for heading match: None
* No heading found
* Found paragraph: Second line of the first paragraph in 1

* Checking for heading match: Heading(level=<Level.H2: 2>, enumeration='1-1', heading_text='Chapter 1-1')
* Found heading; appending segment Segment(level=<Level.H0: 0>, headings={<Level.H0: 0>: 'snippet2'}, body=['TITLE 1\nTitle to 1\n', 'Second line of the first paragraph in 1\n'])

* New heading: Heading(level=<Level.H2: 2>, enumeration='1-1', heading_text='Chapter 1-1')
* Checking for heading match: None
* No heading found
* Found paragraph: 1-1-010 Section 1-1-010

* Checking for heading match: None
* No heading found
* Found paragraph: Second paragraph of 1-1-010

* Checking for heading match: Heading(level=<Level.H3: 3>, enumeration='1-1-020', heading_text='Section 1-1-020')
* Found heading; appending segment Segment(level=<Level.H2: 2>, headings={<Level.H0: 0>: 'snippet2', <

In [36]:
for s in parser.document:
    print(s)
    print()

Segment(level=<Level.H0: 0>, headings={<Level.H0: 0>: 'snippet2'}, body=['TITLE 1\nTitle to 1\n', 'Second line of the first paragraph in 1\n'])

Segment(level=<Level.H2: 2>, headings={<Level.H0: 0>: 'snippet2', <Level.H2: 2>: Heading(level=<Level.H2: 2>, enumeration='1-1', heading_text='Chapter 1-1')}, body=['1-1-010 Section 1-1-010\n', 'Second paragraph of 1-1-010\n'])

Segment(level=<Level.H3: 3>, headings={<Level.H0: 0>: 'snippet2', <Level.H2: 2>: Heading(level=<Level.H2: 2>, enumeration='1-1', heading_text='Chapter 1-1'), <Level.H3: 3>: Heading(level=<Level.H3: 3>, enumeration='1-1-020', heading_text='Section 1-1-020')}, body=['Second paragraph of 1-1-020\n'])

