In [1]:
%pip install -qU openai
%pip install -qU "psycopg[binary]"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
# Put the parent directory first on the import path so the `muni.*` imports in
# the following cells resolve (assumes the `muni` package lives one level up
# from this notebook — confirm).
sys.path.insert(0, '..')

In [3]:
from muni.structure import Jurisdiction

# Start with a single city to keep parsing simple.
# The local text file holds the relevant portions of the Municipal Code of Chicago.

chicago = Jurisdiction(
    name="Chicago",
    # One regex per structural level, listed outermost-first. The keys are the
    # same ones `upload()` later reads from each node's metadata
    # (title -> L1, chapter -> L2, article -> L3, section -> L4).
    hierarchy={
        "title":   r"TITLE \d+",
        "chapter": r"CHAPTER \d+-\d+",
        # In a raw string a single backslash is enough to escape the period.
        # The previous `\\.` demanded a literal backslash followed by any
        # character, so "ARTICLE IV." could never match.
        "article": r"ARTICLE [IVX]+\.",
        "section": r"\d+-\d+-\d+",
    },
    source_local="../data/chicago/chicago.txt",
    # NOTE(review): this URL is a COVID-19 landing page, not the municipal
    # code itself — confirm the intended source link.
    source_url="https://www.chicago.gov/city/en/sites/covid-19/home.html",
)
chicago_tree = chicago.parse()

In [4]:
from psycopg import connect
from muni.llm import create_embedding, summarize
from muni.structure import Node

# When True, the schema setup below drops the `muni` and `muni_associations`
# tables before recreating them.
RESET = False
# Dimension of the embedding vectors, measured from a sample embedding instead
# of being hard-coded; it sizes the VECTOR column of the `muni` table.
# Note: this calls create_embedding() once whenever the cell is run.
EMBEDDING_LENGTH = len(create_embedding("test"))

def connection():
    """Open a new autocommit psycopg connection to the local `regrag` database.

    Returns whatever `psycopg.connect` returns; the callers in this notebook
    use it as a context manager.
    """
    settings = {
        "dbname": "regrag",
        "host": "localhost",
        "port": "5432",
    }
    return connect(autocommit=True, **settings)

from psycopg import sql

# Create (and optionally reset) the schema used by the rest of the notebook.
# Every statement is idempotent, so the cell can be re-run safely.
with connection() as conn, conn.cursor() as cursor:
    # The VECTOR column type comes from the pgvector extension, so it has to
    # exist before the table is created — not only when RESET is requested.
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")

    if RESET:
        cursor.execute("DROP TABLE IF EXISTS muni_associations;")
        cursor.execute("DROP TABLE IF EXISTS muni CASCADE;")

    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS muni_associations (
            jurisdiction TEXT,
            association TEXT,
            left_id INTEGER,
            right_id INTEGER
        );
        """)

    # Data-definition statements cannot take server-side bound parameters in
    # psycopg 3, so the vector dimension is composed into the statement
    # client-side as a properly quoted literal.
    cursor.execute(
        sql.SQL(
            """
            CREATE TABLE IF NOT EXISTS muni (
                id SERIAL PRIMARY KEY,
                jurisdiction TEXT,
                L1_ref TEXT, L1_heading TEXT,
                L2_ref TEXT, L2_heading TEXT,
                L3_ref TEXT, L3_heading TEXT,
                L4_ref TEXT, L4_heading TEXT,
                segment INTEGER,
                text TEXT,
                embedding VECTOR({dimension})
            );
            """
        ).format(dimension=sql.Literal(EMBEDDING_LENGTH))
    )

    # Generated full-text column covering the jurisdiction, all four heading
    # levels and the body text.
    cursor.execute(
        """
        ALTER TABLE muni
            ADD COLUMN IF NOT EXISTS textsearchable tsvector
                GENERATED ALWAYS AS
                (to_tsvector('english',
                    coalesce(jurisdiction, '') || ' ' ||
                    coalesce(L1_heading, '') || ' ' ||
                    coalesce(L2_heading, '') || ' ' ||
                    coalesce(L3_heading, '') || ' ' ||
                    coalesce(L4_heading, '') || ' ' ||
                    coalesce(text, '') || ' '))
                STORED;
        """
    )

    # Create the GIN index only when it is missing, rather than dropping and
    # rebuilding it on every run of this cell.
    cursor.execute(
        "CREATE INDEX IF NOT EXISTS muni_fulltext ON muni USING GIN (textsearchable);"
    )

def node_embedding(node: Node) -> list[float]:
    """Embed a node as its headings followed by a summary of its text.

    The values of ``node.metadata['headings']`` are placed one per line. If
    ``summarize(node.text)`` returns a summary it is appended on its own line;
    if it returns ``None`` only the headings are embedded.

    Returns:
        The result of ``create_embedding`` for the combined text.
    """
    parts = list(node.metadata['headings'].values())
    summary = summarize(node.text)
    if summary is not None:
        # Joining the summary as its own line keeps its first word from being
        # fused onto the end of the last heading.
        parts.append(summary)
    return create_embedding('\n'.join(parts))

def upload(node: Node, jurisdiction: str = "Chicago", conn=None) -> None:
    """Insert `node` and all of its descendants into the `muni` table.

    A row is written for every node whose ``text`` is truthy. Nodes without
    text are skipped, but their children are still visited.

    Args:
        node: Root of the (sub)tree to upload. Reads ``node.text``,
            ``node.children`` and the ``'references'`` / ``'headings'`` dicts
            in ``node.metadata``.
        jurisdiction: Value stored in the ``jurisdiction`` column. Defaults to
            ``"Chicago"``, the value that used to be hard-coded.
        conn: An open database connection to reuse. When omitted, a single
            connection from ``connection()`` is opened for the whole traversal
            and closed once it finishes. A caller-supplied connection is left
            open, and committing on it is the caller's responsibility.
    """
    if conn is None:
        # Entry call: share one connection across the whole recursion instead
        # of opening a new one for every node.
        with connection() as shared_conn:
            upload(node, jurisdiction, shared_conn)
        return

    if node.text:
        references = node.metadata['references']
        headings = node.metadata['headings']
        # Build the embedding (a summarize + embed round trip) before the
        # cursor is opened.
        embedding = node_embedding(node)
        with conn.cursor() as cursor:
            cursor.execute(
                """
                INSERT INTO muni (
                    jurisdiction,
                    L1_ref, L1_heading,
                    L2_ref, L2_heading,
                    L3_ref, L3_heading,
                    L4_ref, L4_heading,
                    segment,
                    text,
                    embedding
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                """,
                (
                    jurisdiction,
                    references.get("title", ""),   headings.get("title", ""),   # L1
                    references.get("chapter", ""), headings.get("chapter", ""), # L2
                    references.get("article", ""), headings.get("article", ""), # L3
                    references.get("section", ""), headings.get("section", ""), # L4
                    0, # can add break-down segments later for large text blocks
                    node.text,
                    embedding,
                )
            )

    # A falsy `children` (empty or None) simply ends the recursion here.
    for child in node.children or ():
        upload(child, jurisdiction, conn)

In [5]:
# Insert the parsed tree into the database. This calls summarize() and
# create_embedding() once for every node that has text, so it can be slow.
# Re-running the cell inserts the same rows again (the table has no uniqueness
# constraint beyond its serial id) unless the tables were dropped via RESET.
upload(chicago_tree)

In [7]:
# Smoke test: confirm the database responds and that vector similarity search
# returns plausible sections for a plain-language query.

query = 'rules about dogs'
query_embedding = create_embedding(query)
limit = 15

with connection() as conn, conn.cursor() as cursor:
    # Rank Chicago rows by pgvector's `<=>` (cosine distance) to the query
    # embedding, nearest first; the embedding is sent in its string form.
    cursor.execute(
        """
            SELECT id, L4_ref, L2_heading, L3_heading, L4_heading, text
            FROM muni
            WHERE jurisdiction = 'Chicago'
            ORDER BY embedding <=> %s
            LIMIT %s""",
        (str(query_embedding), limit)
    )
    # Iterating the cursor yields the result rows in order.
    for row in cursor:
        print(row)


(143, '7-12-330', 'ANIMAL CARE AND CONTROL', '', 'Burial of dead animals.', 'No person shall leave in or throw into any public way, public place\nor public theater, or offensively expose or bury within the city, the\nbody or any part thereof of any dead or fatally sick or injured animal;\nnor shall any person keep any dead animal in a place where it may be\ndangerous to the life or detrimental to the health of any other animal\nor person; provided, however, that the owner of any dead pet weighing\nnot more than 150 pounds may bury such animal on his premises; provided,\nfurther, that not more than one such animal shall be buried upon any\nhalf acre ground within two years, and such animal shall be placed at\nleast three feet below the surface of the soil surrounding and adjacent\nto the grave.\n\n(Prior code § 98-13)')
(157, '7-12-420', 'ANIMAL CARE AND CONTROL', '', 'Removal of excrement.', "No person shall appear with a pet upon the public ways or within\npublic places or upon the pr