In [1]:
# Install (or upgrade, -U) the packages this notebook needs into the kernel's own
# environment; -q keeps pip's output quiet. Versions are not pinned here.
%pip install -qU openai marvin
%pip install -qU "psycopg[binary]"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Getting ready

Begin by:
1. creating a new directory `data/<jurisdiction>` and populating it with one or more
docx files containing the jurisdiction's municipal code
2. running `scripts/convert_docx.sh` to convert those files into a single text file
3. making a copy of `notebooks/template-workflow.ipynb` at `notebooks/<jurisdiction>.ipynb`
and continuing the processing in that notebook

In [2]:
import sys
# Put the parent directory first on the import path — presumably so the project's
# `muni` package (imported in later cells) resolves from the repo root; TODO confirm.
sys.path.insert(0, '..')

## set up auto-reloading for development
# (%autoreload 2 reloads imported modules before each cell executes, so edits to
# project code are picked up without restarting the kernel)
%reload_ext autoreload
%autoreload 2

In [15]:
from dataclasses import dataclass, field
import re

from muni.code import Level, HeadingPattern, Heading, Segment, Jurisdiction

## Specify heading patterns

Replace the entries of the `heading_examples` dict below with example headings copied from your jurisdiction's code

In [12]:
# A few sample headings per outline level, copied verbatim from the source text.
# They are handed to `infer_heading_patterns` in a later cell.
heading_examples = {
    Level.H1: [
        "TITLE 1\nGENERAL PROVISION\n",
        "TITLE 2\nCITY GOVERNMENT AND ADMINISTRATION\n",
        "TITLE 3\nREVENUE AND FINANCE\n",
    ],
    Level.H2: [
        "CHAPTER 1-4\nCODE ADOPTION - ORGANIZATION\n",
        "CHAPTER 1-8\nCITY SEAL AND FLAG\n",
        "CHAPTER 1-12\nCITY EMBLEMS\n",
    ],
    Level.H3: [
        "1-4-010 Municipal Code of Chicago adopted.\n",
        "2-1-020 Code to be kept up-to-date.\n",
        "3-4-030 Official copy on file.\n",
    ],
}

In [8]:
from muni.code import infer_heading_patterns, infer_level_names

In [13]:
## Infer one regular expression per outline level and print them for a visual check
heading_patterns = infer_heading_patterns(heading_examples)
for lvl in heading_patterns:
    print(f"{lvl.name}: r'{heading_patterns[lvl].regex}'")

print()

## Infer a human-readable name for each level and print those as well
level_names = infer_level_names(heading_patterns)
for lvl in level_names:
    print(f"{lvl.name}: {level_names[lvl]}")

H1: r'^TITLE (\d+)$'
H2: r'^CHAPTER (\d+-\d+)$'
H3: r'^(\d+-\d+-\d+) (.+)\.$'

H1: Title
H2: Chapter
H3: Section


## Specify the parameters of the jurisdiction and parse the code

In [None]:
# Describe the jurisdiction being processed: a display name, the heading patterns
# inferred above, the local text file to parse and the public source URL.
# The following cells all refer to this object as `chicago_mini`, so it must be
# bound under that name for the notebook to run top-to-bottom on a fresh kernel.
chicago_mini = Jurisdiction(
    name="Chicago Mini",
    patterns=heading_patterns,
    source_local="../data/chicago-mini/code.txt",
    source_url="https://www.chicago.gov/city/en/depts/doit/supp_info/municipal_code.html",
)
# Backward-compatible alias for any code that still uses the old name.
place = chicago_mini

In [29]:
# Display the heading patterns held by the jurisdiction object (one HeadingPattern per level).
chicago_mini.patterns

{<Level.H1: 1>: HeadingPattern(level=<Level.H1: 1>, regex='^TITLE (\\d+)$', multi_line=True),
 <Level.H2: 2>: HeadingPattern(level=<Level.H2: 2>, regex='^CHAPTER (\\d+-\\d+)$', multi_line=True),
 <Level.H3: 3>: HeadingPattern(level=<Level.H3: 3>, regex='^(\\d+-\\d+-\\d+) (.+)$', multi_line=False)}

In [30]:
# NOTE(review): `StateMachineParser` is not imported in any cell visible in this
# notebook — confirm which module provides it; a fresh kernel would raise NameError here.
# Build a parser from the jurisdiction's heading patterns and keep it on the jurisdiction object.
chicago_mini.parser = StateMachineParser(document_name="Chicago Mini Code", heading_patterns=chicago_mini.patterns)
# Summarize which headings the patterns match in the raw text, as a quick sanity check before parsing.
chicago_mini.parser.summarize_matches(chicago_mini.raw_text)

  H1 heading: 1 GENERAL PROVISIONS
    H2 heading: 1-4 CODE ADOPTION - ORGANIZATION*
      H3 heading: 1-4-010   Municipal Code of Chicago adopted.
      H3 heading: 1-4-020   Adoption of chapter and section numbers.
      H3 heading: 1-4-030   Effective date.
      H3 heading: 1-4-040   Administrative copies - City clerk powers
and duties.
      H3 heading: 1-4-050   Code revisions - Publication.
      H3 heading: 1-4-060   Publication of Code - Conditions.
      H3 heading: 1-4-070   Distribution of Code.
      H3 heading: 1-4-080   Numbering of Code sections - References
to former Code provisions.
      H3 heading: 1-4-090   Definitions for Code provisions.
      H3 heading: 1-4-100   Interpretation of language.
      H3 heading: 1-4-110   References to sections include penalty
references - Exceptions.
      H3 heading: 1-4-120   Penalty for violation of Code.
      H3 heading: 1-4-125   Restitution - License or permit
violations.
      H3 heading: 1-4-130   Maximum fine or penalty.

In [31]:
# Parse the raw text and store the result on the jurisdiction object; the cells
# below iterate over it as a sequence of segments.
chicago_mini.document = chicago_mini.parser.parse(chicago_mini.raw_text)

In [33]:
def summarize_document(document: list[Segment]):
    """Print one line per non-empty segment.

    Each line shows the segment's level, how many paragraphs it holds and the
    length of those paragraphs once joined with newlines. Segments without any
    paragraphs are skipped.
    """
    for seg in document:
        paragraph_count = len(seg.paragraphs)
        if paragraph_count:
            joined = '\n'.join(seg.paragraphs)
            print(f"{seg.level}: {paragraph_count} paragraphs, {len(joined)} characters")

# Report paragraph and character counts for every parsed segment that has text.
summarize_document(chicago_mini.document)

Level.H1: 12 paragraphs, 503 characters
Level.H2: 1 paragraphs, 178 characters
Level.H3: 2 paragraphs, 478 characters
Level.H3: 2 paragraphs, 463 characters
Level.H3: 2 paragraphs, 129 characters
Level.H3: 3 paragraphs, 514 characters
Level.H3: 2 paragraphs, 195 characters
Level.H3: 2 paragraphs, 322 characters
Level.H3: 3 paragraphs, 615 characters
Level.H3: 3 paragraphs, 987 characters
Level.H3: 15 paragraphs, 4096 characters
Level.H3: 9 paragraphs, 1606 characters
Level.H3: 2 paragraphs, 528 characters
Level.H3: 6 paragraphs, 1855 characters
Level.H3: 3 paragraphs, 889 characters
Level.H3: 2 paragraphs, 265 characters
Level.H3: 2 paragraphs, 301 characters
Level.H3: 1 paragraphs, 126 characters
Level.H3: 2 paragraphs, 259 characters
Level.H3: 3 paragraphs, 862 characters
Level.H3: 3 paragraphs, 381 characters
Level.H3: 2 paragraphs, 200 characters
Level.H3: 2 paragraphs, 232 characters
Level.H3: 2 paragraphs, 352 characters
Level.H3: 3 paragraphs, 1129 characters
Level.H3: 2 paragra

In [36]:
def chunkify_document(document: list[Segment], n: int):
    """Call ``chunkify(n)`` on every segment that has at least one paragraph.

    Segments with no paragraphs are left untouched.
    """
    for seg in (s for s in document if len(s.paragraphs) != 0):
        seg.chunkify(n)

def summarize_chunks(document: list[Segment]):
    """Print the length of every chunk of every segment, in document order.

    Chunk numbering restarts at 0 for each segment.
    """
    for seg in document:
        index = 0
        for piece in seg.chunks:
            print(f"{seg.level} chunk {index}: {len(piece)} characters")
            index += 1

# Chunk every non-empty segment with n=1000 (presumably a target chunk size in
# characters — confirm against Segment.chunkify), then list the chunk lengths.
chunkify_document(chicago_mini.document, 1000)
summarize_chunks(chicago_mini.document)

Level.H0 chunk 0: 0 characters
Level.H1 chunk 0: 504 characters
Level.H2 chunk 0: 179 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 0 characters
Level.H3 chunk 0: 479 characters
Level.H3 chunk 0: 464 characters
Level.H3 chunk 0: 130 characters
Level.H3 chunk 0: 515 characters
Level.H3 chunk 0: 196 characters
Level.H3 chunk 0: 323 characters
Level.H3 chunk 0: 616 c

In [None]:
from psycopg import connect
from muni.llm import create_embedding, summarize
from muni.structure import Node

# Set RESET to True to drop the existing muni tables before they are (re)created below.
RESET = False
# Embedding dimensionality, measured by embedding a throwaway string; it sizes the
# VECTOR column of the muni table.
EMBEDDING_LENGTH = len(create_embedding("test"))

def connection():
    """Open and return a new autocommit psycopg connection to the local ``regrag`` database."""
    target = {"dbname": "regrag", "host": "localhost", "port": "5432"}
    return connect(autocommit=True, **target)

# Client-side SQL composition is required for the VECTOR dimension: psycopg 3 binds
# parameters on the server, and PostgreSQL does not accept bound parameters in DDL.
from psycopg import sql as pg_sql

# Create (and optionally reset) the tables, generated column and index used by the
# rest of the notebook. Every statement is idempotent, so the cell can be re-run.
with connection() as conn:
    with conn.cursor() as cursor:
        # The VECTOR column type comes from the pgvector extension, so make sure it
        # exists on every run — not only when RESET is set.
        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")

        if RESET:
            cursor.execute("DROP TABLE IF EXISTS muni_associations;")
            cursor.execute("DROP TABLE IF EXISTS muni CASCADE;")

        # The named unique constraint is what the `ON CONFLICT (...)` clause of the
        # association insert relies on; without it that insert is rejected.
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS muni_associations (
                jurisdiction TEXT,
                association TEXT,
                left_id INTEGER,
                right_id INTEGER,
                CONSTRAINT unique_associations
                    UNIQUE (jurisdiction, association, left_id, right_id)
            );
            """)
        # One row per uploaded node: up to four heading levels, the node text and
        # its embedding. The dimension is inlined as a literal (see import note).
        cursor.execute(
            pg_sql.SQL(
                """
            CREATE TABLE IF NOT EXISTS muni (
                id SERIAL PRIMARY KEY,
                jurisdiction TEXT,
                L1_ref TEXT, L1_heading TEXT,
                L2_ref TEXT, L2_heading TEXT,
                L3_ref TEXT, L3_heading TEXT,
                L4_ref TEXT, L4_heading TEXT,
                segment INTEGER,
                text TEXT,
                embedding VECTOR({dimension})
            );
            """
            ).format(dimension=pg_sql.Literal(int(EMBEDDING_LENGTH)))
        )
        # Stored generated column holding the full-text document for each row:
        # jurisdiction, all headings and the text, NULLs treated as empty strings.
        cursor.execute(
            """
            ALTER TABLE muni
                ADD COLUMN IF NOT EXISTS textsearchable tsvector
                    GENERATED ALWAYS AS
                    (to_tsvector('english',
                        coalesce(jurisdiction, '') || ' ' ||
                        coalesce(L1_heading, '') || ' ' ||
                        coalesce(L2_heading, '') || ' ' ||
                        coalesce(L3_heading, '') || ' ' ||
                        coalesce(L4_heading, '') || ' ' ||
                        coalesce(text, '') || ' '))
                    STORED;
            """
        )
        # Rebuild the GIN index that backs the full-text queries.
        cursor.execute("DROP INDEX IF EXISTS muni_fulltext;")
        cursor.execute("CREATE INDEX muni_fulltext ON muni USING GIN (textsearchable);")

def node_embedding(node: Node) -> list[float]:
    """Build the embedding for a node from its headings and a summary of its text.

    The text that gets embedded is the node's heading values, one per line,
    followed (on its own line) by the summary of ``node.text`` when one is
    available. Previously the summary was glued directly onto the last heading
    with no separator, which merged the two into a single run of words.

    Args:
        node: node whose ``metadata['headings']`` mapping and ``text`` are used.

    Returns:
        Whatever ``create_embedding`` returns for the assembled text.
    """
    parts = list(node.metadata['headings'].values())
    summary = summarize(node.text)
    # `summarize` may return None; an empty summary adds nothing either.
    if summary:
        parts.append(summary)
    return create_embedding('\n'.join(parts))

def upload(node: Node, jurisdiction: str = "Chicago", conn=None) -> None:
    """Insert ``node`` and all of its descendants into the ``muni`` table.

    A row is written only for nodes that have text. Missing references or
    headings are stored as empty strings.

    Args:
        node: root of the (sub)tree to upload.
        jurisdiction: value stored in the ``jurisdiction`` column. Defaults to
            ``"Chicago"``, the value that used to be hard-coded.
        conn: an open database connection to reuse. When omitted, a single
            connection is opened for the whole subtree and closed afterwards,
            instead of opening a new connection for every node.
    """
    if conn is None:
        # Top-level call: share one connection across the entire recursion.
        with connection() as shared_conn:
            upload(node, jurisdiction, shared_conn)
        return

    if node.text:
        references = node.metadata['references']
        headings = node.metadata['headings']
        with conn.cursor() as cursor:
            cursor.execute(
                """
                INSERT INTO muni (
                    jurisdiction,
                    L1_ref, L1_heading,
                    L2_ref, L2_heading,
                    L3_ref, L3_heading,
                    L4_ref, L4_heading,
                    segment,
                    text,
                    embedding
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                """,
                (
                    jurisdiction,
                    references.get("title", ""),   headings.get("title", ""),   # L1
                    references.get("chapter", ""), headings.get("chapter", ""), # L2
                    references.get("article", ""), headings.get("article", ""), # L3
                    references.get("section", ""), headings.get("section", ""), # L4
                    0, # can add break-down segments later for large text blocks
                    node.text,
                    node_embedding(node),
                )
            )

    # `children` may be empty or None for leaf nodes.
    for child in node.children or ():
        upload(child, jurisdiction, conn)

## [**Action**] Upload municipal code

In [None]:
# Describe the full Chicago code as a four-level hierarchy (one regex per level),
# parse it into a tree and upload every node with text to the database.
# NOTE(review): this cell passes `hierarchy=` and calls `.parse()`, whereas the
# earlier "Chicago Mini" cell passes `patterns=` and uses a separate parser —
# confirm both styles are supported by the current `Jurisdiction` class.
chicago = Jurisdiction(
    name="Chicago",
    hierarchy={
        "title":   r"TITLE \d+",
        "chapter": r"CHAPTER \d+-\d+",
        # Inside a raw string a single backslash is enough to escape the period.
        # The previous `\\.` matched a literal backslash followed by any
        # character, so headings such as "ARTICLE I." could never match.
        "article": r"ARTICLE [IVX]+\.",
        "section": r"\d+-\d+-\d+",
    },
    source_local="../data/chicago/chicago.txt",
    # NOTE(review): this URL is a COVID-19 page, not the municipal-code page used
    # for "Chicago Mini" above — confirm it is the intended source reference.
    source_url="https://www.chicago.gov/city/en/sites/covid-19/home.html",
)
chicago_tree = chicago.parse()

upload(chicago_tree)

## [**Code**] Find associations among sections

In [None]:
# Go through rows in the muni database and identify definitions

from muni.llm import definition, analyze_context

# Fetch every row of muni (no WHERE clause, so all jurisdictions) with its id, the
# reference and heading of each of the four levels, and the text.
sql_select = """
    SELECT  id,
        L1_ref, L1_heading,
        L2_ref, L2_heading,
        L3_ref, L3_heading,
        L4_ref, L4_heading,
        text
    FROM muni;
    """

# Idempotently add the unique constraint that the `ON CONFLICT (...)` clause of the
# association insert depends on. The statement is wrapped in an anonymous DO block
# because IF ... THEN is PL/pgSQL and cannot run as plain SQL; the earlier version
# also had an unbalanced parenthesis and could not be executed at all.
# Run it once (e.g. `conn.execute(sql_unique)`) before inserting associations.
sql_unique = """
    DO $$
    BEGIN
        IF NOT EXISTS (
            SELECT 1 FROM pg_constraint
            WHERE conname = 'unique_associations'
            AND   conrelid = 'muni_associations'::regclass
        )
        THEN
            ALTER TABLE muni_associations
            ADD CONSTRAINT unique_associations UNIQUE (jurisdiction, association, left_id, right_id);
        END IF;
    END
    $$;
    """

# Insert one association row, silently skipping exact duplicates. The ON CONFLICT
# target only works if a unique constraint or index exists on these four columns.
sql_assoc = """
    INSERT INTO muni_associations (jurisdiction, association, left_id, right_id)
    VALUES (%s, %s, %s, %s)
    ON CONFLICT (jurisdiction, association, left_id, right_id) DO NOTHING;
    """

def scope_map(scope):
    """Return the muni columns two rows must share to fall within ``scope``.

    Each scope covers the jurisdiction plus every reference column down to its
    own level ('global' is the jurisdiction alone). Unknown scopes give None.
    """
    ordered_columns = ['jurisdiction', 'L1_ref', 'L2_ref', 'L3_ref', 'L4_ref']
    scope_names = ['global', 'title', 'chapter', 'article', 'section']
    columns_for = {
        name: ordered_columns[:depth]
        for depth, name in enumerate(scope_names, start=1)
    }
    return columns_for.get(scope)

In [None]:

def set_associations(conn, id_, scope, context_type):
    """Set associations with a row in muni with all rows matching the scope.

    Values are passed to the database as bound parameters rather than being
    interpolated into the SQL text, so references containing quotes (or any
    other special characters) are handled correctly. Only column names, which
    come from the fixed table in ``scope_map``, are placed in the query string.

    Args:
        conn: a connection to the database
        id_: the id of the row to associate
        scope: the scope of the association (e.g. 'title', 'chapter', 'article', 'section')
        context_type: the type of association (e.g. 'definition')

    Returns:
        None. Nothing is written when the scope is unknown or no row has ``id_``.
    """
    # get the columns that need to match
    columns = scope_map(scope)
    if not columns:
        return

    selected = ("jurisdiction", "L1_ref", "L2_ref", "L3_ref", "L4_ref")
    with conn.cursor() as cursor:
        # get the jurisdiction and the references of the source row
        cursor.execute(
            "SELECT jurisdiction, L1_ref, L2_ref, L3_ref, L4_ref FROM muni WHERE id = %s",
            (id_,),
        )
        source_row = cursor.fetchone()
        if source_row is None:
            return
        value_of = dict(zip(selected, source_row))
        jurisdiction = value_of["jurisdiction"]

        # get the rows that match the scope (excluding the source row itself)
        match_str = ' AND '.join(f"{col} = %s" for col in columns)
        match_values = [value_of[col] for col in columns]
        cursor.execute(
            f"SELECT id FROM muni WHERE {match_str} AND id != %s",
            (*match_values, id_),
        )
        rows = cursor.fetchall()

        # set the associations
        for row in rows:
            cursor.execute(sql_assoc, (jurisdiction, context_type, id_, row[0]))

def find_associations(conn):
    """Classify every muni row and record associations for the recognised kinds.

    Each row's text and headings are passed to ``analyze_context``. When it
    yields a (context type, scope) pair whose type is one of the allowed kinds,
    the row is reported and ``set_associations`` is called for it.

    Args:
        conn: a connection to the database
    """
    allowed_types = ('penalty', 'definition', 'interpretation', 'date')
    with conn.cursor() as cursor:
        cursor.execute(sql_select)
        for record in cursor.fetchall():
            (id_,
             _title_ref, title_heading,
             _chapter_ref, chapter_heading,
             _article_ref, article_heading,
             _section_ref, section_heading,
             text) = record
            headings = {
                'title': title_heading,
                'chapter': chapter_heading,
                'article': article_heading,
                'section': section_heading,
            }
            analysis = analyze_context(text, headings, model='gpt-4')
            if not analysis:
                continue
            context_type, scope = analysis
            if context_type not in allowed_types:
                continue
            print(f"* Setting associations for id {id_}")
            print(f"  Context type: {context_type}; Scope: {scope}")
            print("  --> %s ..." % text[:80].replace('\n', ' '))
            set_associations(conn, id_, scope, context_type)

## [**Code**] Hybrid search

In [None]:
def simple_semantic_query(conn, query, limit=10, jurisdiction='Chicago'):
    """Return the rows whose embeddings are closest to the embedding of ``query``.

    Args:
        conn: a connection to the database
        query: free-text query; it is embedded with ``create_embedding``
        limit: maximum number of rows to return
        jurisdiction: only rows of this jurisdiction are searched. Defaults to
            'Chicago', the value that used to be hard-coded in the SQL.

    Returns:
        A list of ``(id, L4_heading, text)`` rows ordered by increasing ``<=>``
        distance between the stored embedding and the query embedding.
    """
    query_embedding = create_embedding(query)
    statement = """
        SELECT id, L4_heading, text
        FROM muni
        WHERE jurisdiction = %s
        ORDER BY embedding <=> %s
        LIMIT %s;
        """
    with conn.cursor() as cursor:
        # The embedding is sent in its text form, as before.
        cursor.execute(statement, (jurisdiction, str(query_embedding), limit))
        return cursor.fetchall()
        
#with connection() as conn:        
#    results = simple_semantic_query(conn, 'drug paraphernalia')
#for r in results:
#    print(r)

In [None]:
def simple_full_text_query(conn, query, limit=10, jurisdiction='Chicago'):
    """Return the best full-text matches for ``query``.

    Results are ordered by descending ``ts_rank_cd`` so that ``LIMIT`` keeps the
    most relevant rows; the earlier ascending order returned the least relevant
    matches first.

    Args:
        conn: a connection to the database
        query: a query in ``to_tsquery`` syntax (e.g. 'drug & paraphernalia')
        limit: maximum number of rows to return
        jurisdiction: only rows of this jurisdiction are searched. Defaults to
            'Chicago', the value that used to be hard-coded in the SQL.

    Returns:
        A list of ``(id, L4_heading, text)`` rows, best match first.
    """
    statement = """
        WITH tsq AS (
            SELECT to_tsquery('english', %s) AS search
            )
        SELECT id, L4_heading, text
        FROM muni, tsq
        WHERE jurisdiction = %s
        AND textsearchable @@ tsq.search
        ORDER BY ts_rank_cd(textsearchable, tsq.search) DESC
        LIMIT %s;
        """
    with conn.cursor() as cursor:
        cursor.execute(statement, (query, jurisdiction, limit))
        return cursor.fetchall()

#with connection() as conn:        
#    results = simple_full_text_query(conn, 'drug & paraphernalia')
#for r in results:
#     print(r)

In [None]:
# Now we do a more complicated hybrid search, borrowing and adapting from 
# https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search_rrf.py

def hybrid_query(conn, query, limit=10, k=60, candidates=20):
    """Combine semantic and keyword search with reciprocal rank fusion.

    Two candidate lists are built — nearest embeddings and best full-text
    matches — and each row's score is the sum of ``1 / (k + rank)`` over the
    lists in which it appears (0 for a list that does not contain it).

    Args:
        conn: a connection to the database
        query: free-text query; embedded with ``create_embedding`` and also
            turned into a text-search query with ``plainto_tsquery``
        limit: maximum number of fused rows to return
        k: rank-fusion constant; defaults to 60, the value used previously
        candidates: how many rows each of the two searches contributes;
            defaults to 20, the value used previously

    Returns:
        A list of ``(id, score, L4_heading)`` rows, highest score first.
    """
    embedding = create_embedding(query)

    statement = """
    WITH semantic_search AS (
        SELECT id, L4_heading, RANK () OVER (ORDER BY embedding <=> %(embedding)s) AS rank
        FROM muni
        ORDER BY embedding <=> %(embedding)s
        LIMIT %(candidates)s
    ),
    keyword_search AS (
        SELECT id, L4_heading, RANK () OVER (ORDER BY ts_rank_cd(textsearchable, query) DESC) AS rank
        FROM muni, plainto_tsquery('english', %(query)s) query
        WHERE textsearchable @@ query
        ORDER BY ts_rank_cd(textsearchable, query) DESC
        LIMIT %(candidates)s
    )
    SELECT
        COALESCE(semantic_search.id, keyword_search.id) AS id,
        COALESCE(1.0 / (%(k)s + semantic_search.rank), 0.0) +
        COALESCE(1.0 / (%(k)s + keyword_search.rank), 0.0) AS score,
        COALESCE(semantic_search.L4_heading, keyword_search.L4_heading) AS L4_heading
    FROM semantic_search
    FULL OUTER JOIN keyword_search ON semantic_search.id = keyword_search.id
    ORDER BY score DESC
    LIMIT %(limit)s;
    """
    parameters = {
        'query': query,
        'embedding': str(embedding),  # sent in text form, as before
        'limit': limit,
        'k': k,
        'candidates': candidates,
    }
    return conn.execute(statement, parameters).fetchall()

#with connection() as conn:
#    results = hybrid_query(conn, 'drug paraphernalia')

#for row in results:
#    print(row)

In [None]:
# Try query augmentation using HyDE (Hypothetical Document Embeddings): generate
# synthetic replies in the format of the expected answers and embed those with the query
from muni.llm import augmented_embedding

def augmented_query(conn, query, limit=10, jurisdiction='Chicago', orig_weight=0.5):
    """Semantic search using an augmented embedding of ``query``.

    Identical to a plain nearest-embedding search except that the query vector
    comes from ``augmented_embedding`` instead of ``create_embedding``.

    Args:
        conn: a connection to the database
        query: free-text question
        limit: maximum number of rows to return
        jurisdiction: only rows of this jurisdiction are searched. Defaults to
            'Chicago', the value that used to be hard-coded in the SQL.
        orig_weight: forwarded to ``augmented_embedding``; defaults to 0.5, the
            value used previously (presumably the weight given to the original
            query relative to the generated text — confirm in ``muni.llm``).

    Returns:
        A list of ``(id, L4_heading, text)`` rows ordered by increasing ``<=>``
        distance to the augmented query embedding.
    """
    query_embedding = augmented_embedding(query, orig_weight=orig_weight)
    statement = """
        SELECT id, L4_heading, text
        FROM muni
        WHERE jurisdiction = %s
        ORDER BY embedding <=> %s
        LIMIT %s;
        """
    with conn.cursor() as cursor:
        cursor.execute(statement, (jurisdiction, str(query_embedding), limit))
        return cursor.fetchall()
        
#with connection() as conn:        
#    results = augmented_query(conn, 'Does the code restrict drug paraphernalia')
#for r in results:
#    print(r)