In [12]:
%pip install -qU more_itertools

Note: you may need to restart the kernel to use updated packages.


## 1. State machine parser

Because the outline structure is so simple, it's feasible to implement a parser using a hand-coded state machine that
shifts between states according to the level of the outline. This may be a simpler approach than specifying a
BNF-style grammar for Lark or a similar parser generator, because in some cases outlines skip levels, which would
complicate the formal grammar.

In [2]:
from enum import Enum
from typing import List, Dict, Iterator
from dataclasses import dataclass, field
import re

In [3]:
class Level(Enum):
    """Outline level of a heading or segment.

    H0 is the top of the outline (the document itself); H1 through H3 are
    successively deeper heading levels, ordered by their integer value.
    """
    H0 = 0 # top level (initial state)
    H1 = 1
    H2 = 2
    H3 = 3

@dataclass
class HeadingPattern:
    """Describes how to recognise a heading of one outline level.

    `match_heading` applies `regex` at the start of a stripped paragraph. Capture
    group 1 supplies the enumeration; for single-line headings, capture group 2
    supplies the heading text.
    """
    level: Level
    regex: str
    multi_line: bool # whether the heading spans multiple lines

@dataclass
class Heading:
    """A heading recognised in the document: its level, enumeration, and text."""
    level: Level
    # heading_type: str # e.g. "section", "subsection", "article", "chapter"
    enumeration: str # number or letter (e.g. "1", "a", "i", "A", "XVII")
    heading_text: str # the title part of the heading, without the enumeration

@dataclass
class Segment:
    """A run of body paragraphs together with the headings it sits under."""
    level: Level # level of the heading that opened this segment (H0 for the preamble)
    # Headings in effect for this segment, keyed by level.
    # NOTE(review): StateMachineParser.parse stores the document name (a plain str)
    # under Level.H0, which the annotation below does not cover.
    headings: Dict[Level, Heading|None] = field(default_factory=dict)
    body: List[str] = field(default_factory=list) # list of paragraphs

## For our purposes, a document is just a list of segments -- the structure is
## implicit in the headings, which will be uploaded to a relational database

In [4]:
def split_paragraph(paragraph: str) -> tuple[str, str]:
    """Split a paragraph into its first line and everything after it.

    Args:
        paragraph: Text that may contain newline characters.

    Returns:
        A ``(first_line, rest)`` tuple. The newline separating the two parts is
        dropped; any later newlines stay in ``rest``. When the paragraph contains
        no newline, ``rest`` is the empty string.
    """
    # str.partition always yields three parts, so no empty-result guard is needed
    # (a guard on the result of str.split('\n', 1) could never fire).
    first_line, _, rest_of_paragraph = paragraph.partition('\n')
    return first_line, rest_of_paragraph

# Sanity checks: an empty paragraph, and a split at the first newline only
assert split_paragraph("") == ("", "")
assert split_paragraph("This is a\nparagraph.\n") == ("This is a", "paragraph.\n")

In [5]:

def match_heading(paragraph: str, patterns: Dict[Level, HeadingPattern]) -> Heading | None:
    """Return the heading that `paragraph` matches, or None if no pattern matches.

    The paragraph is stripped of surrounding whitespace, then each pattern is tried
    in the iteration order of `patterns`, anchored at the start of the paragraph
    (e.g., pattern r'^Chapter ([IVXLC]+)' matches 'Chapter VII'). The first pattern
    that matches wins.

    Args:
        paragraph: One paragraph of the document.
        patterns: Heading patterns keyed by level. Capture group 1 of each regex
            must be the enumeration; for single-line patterns, capture group 2
            must be the heading text.

    Returns:
        A Heading whose level is the matching dict key. For a multi-line pattern
        the heading text is everything after the first line of the stripped
        paragraph; otherwise it is capture group 2. None when nothing matches.

    Raises:
        IndexError: if a matching regex lacks a capture group that is needed.
    """
    paragraph = paragraph.strip()

    for level, pattern in patterns.items():
        # DOTALL lets '.' span the newlines inside a paragraph
        match = re.match(pattern.regex, paragraph, re.DOTALL)
        if match is None:
            continue

        enumeration = match.group(1)
        if pattern.multi_line:
            # the first line carries the enumeration; the remaining lines are the title
            _, heading_text = split_paragraph(paragraph)
        else:
            heading_text = match.group(2)
        return Heading(level=level, enumeration=enumeration, heading_text=heading_text)

    return None

In [6]:
## Tests: one single-line heading and one multi-line heading
test_doc1 = "Chapter VII: The Final Chapter"
test_pattern1 = HeadingPattern(
    level=Level.H1,
    regex=r'^Chapter ([IVXLC]+): (.+)$',
    multi_line=False,
)

test_doc2 = "\n\nChapter 7:\nThe Final Chapter"
test_pattern2 = HeadingPattern(
    level=Level.H1,
    regex=r'^Chapter (\d+):',
    multi_line=True,
)

# single-line: enumeration and title both come from the regex groups
assert match_heading(test_doc1, {Level.H1: test_pattern1}) == Heading(
    level=Level.H1, enumeration="VII", heading_text="The Final Chapter"
)

# multi-line: the title is whatever follows the first line
assert match_heading(test_doc2, {Level.H1: test_pattern2}) == Heading(
    level=Level.H1, enumeration="7", heading_text="The Final Chapter"
)

In [7]:
class StateMachineParser:
    """Splits a plain-text outline into Segments by tracking the current heading level.

    Paragraphs are separated by blank lines. Each paragraph that `match_heading`
    recognises as a heading closes the current segment and opens a new one; every
    other paragraph is appended to the body of the current segment.

    Attributes:
        document: Segments produced so far, in document order. Repeated calls to
            `parse` keep appending to this list.
        heading_names: Mapping whose Level.H0 entry holds the document name.
        patterns: Heading patterns keyed by level, as passed to `match_heading`.
        state: Level of the most recently seen heading (Level.H0 initially).
    """

    def __init__(self, document_name: str, heading_patterns: Dict[Level, HeadingPattern]):
        """Create a parser for a document called `document_name`."""
        self.document = []
        self.heading_names = {Level.H0: document_name, Level.H1: None, Level.H2: None, Level.H3: None}
        self.patterns = heading_patterns
        self.state = Level.H0

    def parse(self, text):
        """Parse `text` and append the resulting segments to `self.document`.

        The text before the first heading becomes a Level.H0 "preamble" segment.
        Each segment records the headings in effect at its position: the new
        heading plus every shallower heading inherited from the previous segment.
        Whitespace-only paragraphs are skipped.
        """
        paragraphs = text.split('\n\n')

        segment_headings = {Level.H0: self.heading_names[Level.H0]}
        segment = Segment(level=Level.H0, headings=segment_headings, body=[]) # preamble

        for paragraph in paragraphs:
            # leading/trailing blank lines produce empty paragraphs; they carry no content
            if not paragraph.strip():
                continue

            match = match_heading(paragraph, self.patterns)

            # no heading found, so add paragraph to the current segment
            if match is None:
                segment.body.append(paragraph)
                continue

            # found a heading: the segment built so far is complete
            self.document.append(segment)
            self.state = match.level

            # Inherit only the headings shallower than the new one. Anything at the
            # same or a deeper level belongs to a section that has just ended (this
            # also covers outlines that skip levels). Keys are Level members, so the
            # comparison is done on the members' values.
            new_headings = {
                level: heading
                for level, heading in segment.headings.items()
                if level.value < match.level.value
            }
            new_headings[match.level] = match
            segment = Segment(level=self.state, headings=new_headings, body=[]) # start a new segment

        # the segment still open at the end of the text is part of the document too
        self.document.append(segment)

    def _str_segment(self, segment: Segment) -> str:
        """Render one segment as its heading line followed by the repr of its body."""
        heading = segment.headings.get(segment.level)
        if isinstance(heading, Heading):
            heading_str = f"Heading {heading.level} {heading.enumeration}: {heading.heading_text}"
        elif heading:
            # the H0 entry is the plain document name rather than a Heading
            heading_str = str(heading)
        else:
            heading_str = ""
        return f"{heading_str}\n\n{segment.body}"

    def __str__(self):
        """Render every parsed segment, separated by blank lines."""
        return "\n\n".join([self._str_segment(segment) for segment in self.document])


## 2. Try it out

In [8]:
# Sample outline with three heading styles (TITLE, CHAPTER, numbered section);
# paragraphs are separated by blank lines, and TITLE/CHAPTER headings span two lines.
snippet2 = """
preamble paragraph

TITLE 1
Title to 1

Here is some text in 1
Second line of the first paragraph in 1

CHAPTER 1-1
Chapter 1-1

Here is some text in 1-1

1-1-010 Section 1-1-010

Here is some text in 1-1-010

Second paragraph of 1-1-010

1-1-020 Section 1-1-020

Here is some text in 1-1-020

Second paragraph of 1-1-020

CHAPTER 1-2
Chapter 1-2

Here is some text in 1-2

1-2-010 Section 1-2-010

Here is some text in 1-2-010

1-2-020 Section 1-2-020

Here is some text in 1-2-020

TITLE 2
Title to 2

Here is some text in 2

"""

In [9]:
# Heading patterns for the sample outline, keyed by the level each one detects
patterns = {
    heading_pattern.level: heading_pattern
    for heading_pattern in (
        HeadingPattern(level=Level.H1, regex=r'^TITLE (\d+)', multi_line=True),
        HeadingPattern(level=Level.H2, regex=r'^CHAPTER (\d+-\d+)', multi_line=True),
        HeadingPattern(level=Level.H3, regex=r'^(\d+-\d+-\d+) (.+)', multi_line=False),
    )
}

# TESTS: make sure the patterns are working
title1 = "TITLE 1\nTitle to 1\n"
chapter1_1 = "CHAPTER 1-1\nChapter 1-1\n"
section1_1_010 = "\n1-1-010 Section 1-1-010\n"

# NOTE(review): `m` is not read again in this cell -- presumably kept for inspection
m = match_heading(title1, {Level.H1: patterns[Level.H1]})

# each sample heading is checked against the single pattern for its own level
assert match_heading(title1, {Level.H1: patterns[Level.H1]}) == Heading(
    level=Level.H1, enumeration="1", heading_text="Title to 1"
)
assert match_heading(chapter1_1, {Level.H2: patterns[Level.H2]}) == Heading(
    level=Level.H2, enumeration="1-1", heading_text="Chapter 1-1"
)
assert match_heading(section1_1_010, {Level.H3: patterns[Level.H3]}) == Heading(
    level=Level.H3, enumeration="1-1-010", heading_text="Section 1-1-010"
)

In [10]:
# parse the whole snippet; the resulting segments are collected in parser.document
parser = StateMachineParser("snippet2", patterns)
parser.parse(snippet2)

In [11]:
# Show each parsed segment, followed by a blank line
for s in parser.document:
    print(s, end="\n\n")

Segment(level=<Level.H0: 0>, headings={<Level.H0: 0>: 'snippet2'}, body=['\npreamble paragraph'])

Segment(level=<Level.H1: 1>, headings={<Level.H0: 0>: 'snippet2', <Level.H1: 1>: Heading(level=<Level.H1: 1>, enumeration='1', heading_text='Title to 1')}, body=['Here is some text in 1\nSecond line of the first paragraph in 1'])

Segment(level=<Level.H2: 2>, headings={<Level.H0: 0>: 'snippet2', <Level.H1: 1>: Heading(level=<Level.H1: 1>, enumeration='1', heading_text='Title to 1'), <Level.H2: 2>: Heading(level=<Level.H2: 2>, enumeration='1-1', heading_text='Chapter 1-1')}, body=['Here is some text in 1-1'])

Segment(level=<Level.H3: 3>, headings={<Level.H0: 0>: 'snippet2', <Level.H1: 1>: Heading(level=<Level.H1: 1>, enumeration='1', heading_text='Title to 1'), <Level.H2: 2>: Heading(level=<Level.H2: 2>, enumeration='1-1', heading_text='Chapter 1-1'), <Level.H3: 3>: Heading(level=<Level.H3: 3>, enumeration='1-1-010', heading_text='Section 1-1-010')}, body=['Here is some text in 1-1-010', 