Develop a Lark parser step-by-step that is able to parse typical legal codes (outlines organized by headings)

## 0. Imports and utilities

In [62]:
%pip install -q -U lark

Note: you may need to restart the kernel to use updated packages.


In [63]:
import lark

def parse(grammar, text):
    """Parse ``text`` with a parser built from ``grammar`` and show the result.

    A new ``lark.Lark`` instance is constructed from ``grammar`` on every
    call, ``text`` is parsed with it, and the tree's ``pretty()`` rendering
    is printed.

    Args:
        grammar: Grammar definition handed to ``lark.Lark``.
        text: Input handed to the parser's ``parse`` method.

    Returns:
        The object returned by ``parse`` (the same one whose ``pretty()``
        output was printed).
    """
    tree = lark.Lark(grammar).parse(text)
    print(tree.pretty())
    return tree

## 1. Parse simple outline, ignoring whitespace

In [64]:
# Fixture for step 1: an optional one-word preamble followed by TITLE / CHAPTER /
# SECTION headings, each carrying a single hyphenated identifier. The indentation
# is only for readability; grammar1 ignores all whitespace.
snippet1 = """
preamble
TITLE title-1
    CHAPTER chapter-1-1
        SECTION section-1-1-010
        SECTION section-1-1-020
    CHAPTER chapter-1-2
        SECTION section-1-2-010
        SECTION section-1-2-020
TITLE title-2
"""

# grammar1: three nested heading rules (h1 contains h2, h2 contains h3). Each
# heading is a keyword literal followed by one `myword`, which matches a run of
# lowercase letters, digits and hyphens. At least one h1 is required (`h1+`).
# `%ignore WS` discards all whitespace, so newlines and indentation carry no
# structure here. Of the imported common terminals, only WS is referenced by
# this grammar.
grammar1 = """
start: myword? h1+

h1: "TITLE" myword h2*
h2: "CHAPTER" myword h3*
h3: "SECTION" myword

myword: /[a-z0-9-]+/

%import common.NEWLINE
%import common.INT
%import common.DIGIT
%import common.LETTER
%import common.WS
%import common.WS_INLINE
%ignore WS
"""

# Prints the parse tree and keeps it for later inspection.
tree1 = parse(grammar1, snippet1)

start
  myword	preamble
  h1
    myword	title-1
    h2
      myword	chapter-1-1
      h3
        myword	section-1-1-010
      h3
        myword	section-1-1-020
    h2
      myword	chapter-1-2
      h3
        myword	section-1-2-010
      h3
        myword	section-1-2-020
  h1
    myword	title-2



## 2. Add multiple words in each text block

In [65]:
# Fixture for step 2: same outline as snippet1, but every heading line now has
# a second word after the identifier.
snippet2 = """
preamble
TITLE title-1 lemon
    CHAPTER chapter-1-1 apple
        SECTION section-1-1-010 orange
        SECTION section-1-1-020 banana
    CHAPTER chapter-1-2 grape
        SECTION section-1-2-010 pear
        SECTION section-1-2-020 cherry
TITLE title-2 strawberry
"""

# The change to grammar1 is to allow for multiple words in the text_block
# by matching any character except newline (implied by regex /.+/)
#
# Because TEXT is /.+/, it also takes in the space that separates the keyword
# from the words: in the recorded output the heading text blocks start with a
# leading space (e.g. " title-1 lemon").
grammar2 = """
start: text_block? h1+

h1: "TITLE" text_block h2*
h2: "CHAPTER" text_block h3*
h3: "SECTION" text_block

text_block: TEXT

TEXT: /.+/

%import common.NEWLINE
%import common.INT
%import common.DIGIT
%import common.LETTER
%import common.WS
%import common.WS_INLINE
%ignore WS
"""

# Prints the parse tree and keeps it for later inspection.
tree2 = parse(grammar2, snippet2)

start
  text_block	preamble
  h1
    text_block	 title-1 lemon
    h2
      text_block	 chapter-1-1 apple
      h3
        text_block	 section-1-1-010 orange
      h3
        text_block	 section-1-1-020 banana
    h2
      text_block	 chapter-1-2 grape
      h3
        text_block	 section-1-2-010 pear
      h3
        text_block	 section-1-2-020 cherry
  h1
    text_block	 title-2 strawberry



## 3. Parse simple outline with text blocks containing multiple paragraphs, separated by double-newlines

In [66]:
# Fixture for step 3: each heading keyword sits on its own line, and the text
# under a heading can span several lines and several paragraphs separated by a
# blank line.
snippet3 = """
preamble paragraph

TITLE

Here is 1
Second line of the first paragraph in 1

CHAPTER

Here is 1-1

SECTION

Here is 1-1-010

Second paragraph of 1-1-010

SECTION

Here is 1-1-020

Second paragraph of 1-1-020

CHAPTER

Here is 1-2

SECTION

Here is 1-2-010

SECTION

Here is 1-2-020

TITLE

Here is 2

"""

# grammar3: a text_block is one or more paragraphs joined by BLANK_LINE (two
# NEWLINE terminals); a paragraph is one or more LINE tokens. The text_block is
# optional after TITLE and CHAPTER but required after SECTION, and h1 itself is
# now optional (`h1*`).
# LINE starts with a negative lookahead so that a match can never begin with
# one of the heading keywords; without it /.+/ would also accept the keyword
# lines as body text.
grammar3 = """
start: text_block? h1*

h1: "TITLE" text_block? h2*
h2: "CHAPTER" text_block? h3*
h3: "SECTION" text_block

text_block: paragraph (BLANK_LINE paragraph)*

paragraph: LINE+

// Exclude the TITLE, CHAPTER, and SECTION keywords using a negative lookahead
LINE: /(?!(TITLE|CHAPTER|SECTION)).+/

BLANK_LINE: NEWLINE NEWLINE

%import common.NEWLINE
%import common.INT
%import common.DIGIT
%import common.WORD
%import common.WS
%import common.WS_INLINE
%ignore WS
"""

# Prints the parse tree and keeps it for later inspection.
tree3 = parse(grammar3, snippet3)

start
  text_block
    paragraph	preamble paragraph
  h1
    text_block
      paragraph
        Here is 1
        Second line of the first paragraph in 1
    h2
      text_block
        paragraph	Here is 1-1
      h3
        text_block
          paragraph	Here is 1-1-010
          


          paragraph	Second paragraph of 1-1-010
      h3
        text_block
          paragraph	Here is 1-1-020
          


          paragraph	Second paragraph of 1-1-020
    h2
      text_block
        paragraph	Here is 1-2
      h3
        text_block
          paragraph	Here is 1-2-010
      h3
        text_block
          paragraph	Here is 1-2-020
  h1
    text_block
      paragraph	Here is 2



## 4. Parse outline with sections matching an enumeration scheme, without a keyword

In [67]:
# Fixture for step 4: like snippet3, except sections are no longer introduced
# by the SECTION keyword but by an enumeration line of the form D-D-DDD.
snippet4 = """
preamble paragraph

TITLE

Here is 1
Second line of the first paragraph in 1

CHAPTER

Here is 1-1

1-1-010

Here is 1-1-010

Second paragraph of 1-1-010

1-1-020

Here is 1-1-020

Second paragraph of 1-1-020

CHAPTER

Here is 1-2

1-2-010

Here is 1-2-010

1-2-020

Here is 1-2-020

TITLE

Here is 2

"""

# grammar4: h3 is now recognised by its enumeration instead of a keyword. The
# enumeration is spelled out inside the rule as single DIGIT terminals
# separated by "-" literals (`DIGIT~3` is Lark's repeat operator: exactly three
# DIGITs). Since the digits are separate terminals in a rule, the recorded tree
# lists each one as its own child of h3.
# LINE's negative lookahead swaps SECTION for the same D-D-DDD shape so that an
# enumeration line is not consumed as body text. The backslashes are doubled
# because the grammar is a regular (non-raw) Python string.
grammar4 = """
start: text_block? h1*

h1: "TITLE" text_block? h2*
h2: "CHAPTER" text_block? h3*
h3: DIGIT "-" DIGIT "-" DIGIT~3 text_block

text_block: paragraph (BLANK_LINE paragraph)*

paragraph: LINE+

LINE: /(?!(TITLE|CHAPTER|\\d-\\d-\\d\\d\\d)).+/

BLANK_LINE: NEWLINE NEWLINE

%import common.NEWLINE
%import common.INT
%import common.DIGIT
%import common.WORD
%import common.WS
%import common.WS_INLINE
%ignore WS
"""

# Prints the parse tree and keeps it for later inspection.
tree4 = parse(grammar4, snippet4)

start
  text_block
    paragraph	preamble paragraph
  h1
    text_block
      paragraph
        Here is 1
        Second line of the first paragraph in 1
    h2
      text_block
        paragraph	Here is 1-1
      h3
        1
        1
        0
        1
        0
        text_block
          paragraph	Here is 1-1-010
          


          paragraph	Second paragraph of 1-1-010
      h3
        1
        1
        0
        2
        0
        text_block
          paragraph	Here is 1-1-020
          


          paragraph	Second paragraph of 1-1-020
    h2
      text_block
        paragraph	Here is 1-2
      h3
        1
        2
        0
        1
        0
        text_block
          paragraph	Here is 1-2-010
      h3
        1
        2
        0
        2
        0
        text_block
          paragraph	Here is 1-2-020
  h1
    text_block
      paragraph	Here is 2



## 5. Enumeration schemes for Titles, Chapters, and Sections, with verbose headings

In [68]:
# Fixture for step 5: every heading now carries an enumeration (TITLE 1,
# CHAPTER 1-1, 1-1-010) plus a line of heading text, followed by body
# paragraphs.
snippet5 = """
preamble paragraph

TITLE 1
Title to 1

Here is some text in 1
Second line of the first paragraph in 1

CHAPTER 1-1
Chapter 1-1

Here is some text in 1-1

1-1-010 Section 1-1-010

Here is some text in 1-1-010

Second paragraph of 1-1-010

1-1-020 Section 1-1-020

Here is some text in 1-1-020

Second paragraph of 1-1-020

CHAPTER 1-2
Chapter 1-2

Here is some text in 1-2

1-2-010 Section 1-2-010

Here is some text in 1-2-010

1-2-020 Section 1-2-020

Here is some text in 1-2-020

TITLE 2
Title to 2

Here is some text in 2

"""

# grammar5: each heading is <keyword?> <enumeration> <heading text> followed by
# its text_block. The enumerations are terminals now (H1_ENUM / H2_ENUM /
# H3_ENUM), so each one shows up as a single token in the recorded tree rather
# than digit by digit. H1_TEXT / H2_TEXT / H3_TEXT are aliases of LINE and hold
# the remainder of the heading; for h3 the recorded token begins with the space
# that follows the enumeration.
# The leading `!` on h1 and h2 tells Lark to keep every token of the rule,
# which is why the "TITLE" and "CHAPTER" keywords appear in the recorded tree.
grammar5 = """
start: text_block? h1*

!h1: "TITLE" H1_ENUM H1_TEXT text_block? h2*
!h2: "CHAPTER" H2_ENUM H2_TEXT text_block? h3*
h3: H3_ENUM H3_TEXT text_block

text_block: paragraph (BLANK_LINE paragraph)*

paragraph: LINE+

LINE: /(?!(TITLE|CHAPTER|\\d-\\d-\\d\\d\\d)).+/

BLANK_LINE: NEWLINE NEWLINE

H1_ENUM: DIGIT
H2_ENUM: DIGIT "-" DIGIT
H3_ENUM: DIGIT "-" DIGIT "-" DIGIT~3

H1_TEXT: LINE
H2_TEXT: LINE
H3_TEXT: LINE

%import common.NEWLINE
%import common.INT
%import common.DIGIT
%import common.WORD
%import common.WS
%import common.WS_INLINE
%ignore WS
"""

# Prints the parse tree and keeps it for later inspection.
tree5 = parse(grammar5, snippet5)

start
  text_block
    paragraph	preamble paragraph
  h1
    TITLE
    1
    Title to 1
    text_block
      paragraph
        Here is some text in 1
        Second line of the first paragraph in 1
    h2
      CHAPTER
      1-1
      Chapter 1-1
      text_block
        paragraph	Here is some text in 1-1
      h3
        1-1-010
         Section 1-1-010
        text_block
          paragraph	Here is some text in 1-1-010
          


          paragraph	Second paragraph of 1-1-010
      h3
        1-1-020
         Section 1-1-020
        text_block
          paragraph	Here is some text in 1-1-020
          


          paragraph	Second paragraph of 1-1-020
    h2
      CHAPTER
      1-2
      Chapter 1-2
      text_block
        paragraph	Here is some text in 1-2
      h3
        1-2-010
         Section 1-2-010
        text_block
          paragraph	Here is some text in 1-2-010
      h3
        1-2-020
         Section 1-2-020
        text_block
          paragraph	Here is some text i

## 6. Preprocess text to add marker keywords before segment divisions

(There must be a better way to do this, but for now it makes the negative lookahead easier to implement in the case where we will automatically determine the matching patterns for the headings.)

In [69]:
# Fixture for step 6: the same outline text as snippet5. It is passed through
# preprocess() before being parsed with grammar6.
snippet6 = """
preamble paragraph

TITLE 1
Title to 1

Here is some text in 1
Second line of the first paragraph in 1

CHAPTER 1-1
Chapter 1-1

Here is some text in 1-1

1-1-010 Section 1-1-010

Here is some text in 1-1-010

Second paragraph of 1-1-010

1-1-020 Section 1-1-020

Here is some text in 1-1-020

Second paragraph of 1-1-020

CHAPTER 1-2
Chapter 1-2

Here is some text in 1-2

1-2-010 Section 1-2-010

Here is some text in 1-2-010

1-2-020 Section 1-2-020

Here is some text in 1-2-020

TITLE 2
Title to 2

Here is some text in 2

"""

In [70]:
## Preprocessing kluge: split the text into lines, distinguishing lines matching one of the heading patterns.
## Insert marker keywords before the headings to facilitate the subsequent parsing steps.

import re

def preprocess(input_text: str) -> str:
    """Put a marker line in front of every heading line of ``input_text``.

    The text is cut into blank-line-separated paragraphs and each paragraph
    into lines. A line that begins with one of the heading patterns gets the
    matching marker (``H1_MARKER``, ``H2_MARKER`` or ``H3_MARKER``) emitted on
    its own line directly before it; the heading line itself and all other
    lines are passed through unchanged.

    Args:
        input_text: The outline text to annotate.

    Returns:
        The annotated text, reassembled with the same line and paragraph
        separators it was split on.
    """
    # Tried in this order; the first pattern that matches decides the marker.
    heading_markers = (
        (re.compile(r'^TITLE \d+'), 'H1_MARKER'),
        (re.compile(r'^CHAPTER \d+-\d+'), 'H2_MARKER'),
        (re.compile(r'^\d+-\d+-\d+ .+'), 'H3_MARKER'),
    )

    def mark_headings(block: str) -> str:
        """Return ``block`` with marker lines inserted before its headings."""
        marked = []
        for text_line in block.split('\n'):
            marker = next(
                (label for regex, label in heading_markers if regex.match(text_line)),
                None,
            )
            if marker is not None:
                marked.append(marker)
            marked.append(text_line)
        return '\n'.join(marked)

    return '\n\n'.join(mark_headings(block) for block in input_text.split('\n\n'))

# Run the preprocessor on the step-6 fixture and print the marked-up text that
# is later handed to grammar6.
snippet6_preprocessed = preprocess(snippet6)
print(snippet6_preprocessed)


preamble paragraph

H1_MARKER
TITLE 1
Title to 1

Here is some text in 1
Second line of the first paragraph in 1

H2_MARKER
CHAPTER 1-1
Chapter 1-1

Here is some text in 1-1

H3_MARKER
1-1-010 Section 1-1-010

Here is some text in 1-1-010

Second paragraph of 1-1-010

H3_MARKER
1-1-020 Section 1-1-020

Here is some text in 1-1-020

Second paragraph of 1-1-020

H2_MARKER
CHAPTER 1-2
Chapter 1-2

Here is some text in 1-2

H3_MARKER
1-2-010 Section 1-2-010

Here is some text in 1-2-010

H3_MARKER
1-2-020 Section 1-2-020

Here is some text in 1-2-020

H1_MARKER
TITLE 2
Title to 2

Here is some text in 2




In [71]:
# grammar6: headings are recognised by the marker lines that preprocess()
# inserts, so LINE's negative lookahead only has to exclude the three fixed
# marker words instead of the heading patterns themselves. The marker literals
# do not appear in the recorded tree; each heading contributes its *_PATTERN
# token and its *_TEXT token.
# Note: besides dropping the "^" anchor, H3_PATTERN also leaves out the
# trailing " .+" of the preprocessor's h3 regex; the rest of that line is
# captured by H3_TEXT.
grammar6 = """
start: text_block? h1*

h1: "H1_MARKER" H1_PATTERN H1_TEXT text_block? h2*
h2: "H2_MARKER" H2_PATTERN H2_TEXT text_block? h3*
h3: "H3_MARKER" H3_PATTERN H3_TEXT text_block

text_block: paragraph (BLANK_LINE paragraph)*

paragraph: LINE+

LINE: /(?!(H1_MARKER|H2_MARKER|H3_MARKER)).+/

BLANK_LINE: NEWLINE NEWLINE

// Use the same patterns as the preprocessor to match the headings, except omit
// the start-of-line anchor (^)

H1_PATTERN: /TITLE \\d+/
H2_PATTERN: /CHAPTER \\d+-\\d+/
H3_PATTERN: /\\d+-\\d+-\\d+/

H1_TEXT: LINE
H2_TEXT: LINE
H3_TEXT: LINE

%import common.NEWLINE
%import common.INT
%import common.DIGIT
%import common.WORD
%import common.WS
%import common.WS_INLINE
%ignore WS
"""

# Prints the parse tree and keeps it for later inspection.
tree6 = parse(grammar6, snippet6_preprocessed)

start
  text_block
    paragraph	preamble paragraph
  h1
    TITLE 1
    Title to 1
    text_block
      paragraph
        Here is some text in 1
        Second line of the first paragraph in 1
    h2
      CHAPTER 1-1
      Chapter 1-1
      text_block
        paragraph	Here is some text in 1-1
      h3
        1-1-010
         Section 1-1-010
        text_block
          paragraph	Here is some text in 1-1-010
          


          paragraph	Second paragraph of 1-1-010
      h3
        1-1-020
         Section 1-1-020
        text_block
          paragraph	Here is some text in 1-1-020
          


          paragraph	Second paragraph of 1-1-020
    h2
      CHAPTER 1-2
      Chapter 1-2
      text_block
        paragraph	Here is some text in 1-2
      h3
        1-2-010
         Section 1-2-010
        text_block
          paragraph	Here is some text in 1-2-010
      h3
        1-2-020
         Section 1-2-020
        text_block
          paragraph	Here is some text in 1-2-020
  h1
 