In [19]:
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Load the .env file BEFORE any os.getenv() call. Previously the OpenAI client
# was built first, so a key defined only in the .env file was read as None.
# load_dotenv() does not override variables that are already set in the
# environment, so this reordering is safe when the key is exported elsewhere.
# NOTE(review): absolute, user-specific path; consider a repo-relative path.
load_dotenv('/Users/paigegiese/SYG/landproDATA_code/misc-work/.env')

# OpenAI client authenticated with the project-specific key.
oai_client = OpenAI(api_key=os.getenv("LANDPRO_OAI_KEY"))

# Database connection settings, all read from the environment.
username = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")

# Create SQLAlchemy engine bound to the `sandbox_paigeg` schema.
# NOTE(review): the password is interpolated into the URL as-is; a password
# containing characters such as '@' or '/' would need URL-escaping.
engine = create_engine(
    f"mysql+mysqlconnector://{username}:{password}@{host}:{port}/sandbox_paigeg",
    connect_args={"buffered": True},
)
# query = "SELECT * FROM instruments"
# df = pd.read_sql(query, con=engine)
# df.head()

# query = "SHOW TABLES;"
# tables_df = pd.read_sql(query, con=engine)
# print(tables_df)

In [18]:
# Close the engine's pooled database connections (SQLAlchemy Engine.dispose()).
engine.dispose()

In [7]:
# `tables_df` is otherwise only created by a commented-out exploration query,
# so build it here to keep this cell runnable on a fresh kernel.
tables_df = pd.read_sql("SHOW TABLES;", con=engine)

# Names of the tables in the sandbox schema that belong to the LLM config set.
[table for table in tables_df['Tables_in_sandbox_paigeg'] if table.startswith('llm')]

['llm_api_call_configs',
 'llm_api_call_versions',
 'llm_parameters',
 'llm_prompts',
 'llm_structured_output']

# Table Writes

In [None]:
from pydantic import BaseModel, Field
from typing import Annotated, Optional, List, Literal
import json
from sqlalchemy import text

## llm_api_call_configs

In [None]:
# Register the `document_ai_ocr` API-call configuration.
#
# Changes from the earlier version of this cell:
# * No `USE sandbox_paigeg;` prefix: the engine URL already selects that
#   schema, and sending two statements in one execute() is driver-dependent.
# * Values are passed as bound parameters instead of being inlined in the SQL.
# * engine.begin() commits on success and rolls back on error, so the insert
#   is persisted without relying on autocommit; the explicit conn.close()
#   inside the `with` block was redundant and is gone.
query = """
INSERT INTO llm_api_call_configs
(name, description, model, version, active, credential_key, created_by, updated_by)
VALUES
(:name, :description, :model, :version, :active, :credential_key, :created_by, :updated_by)
"""

config_row = {
    "name": "document_ai_ocr",
    "description": "Returns structured output for subdivision documents processed by DocumentAI",
    "model": "gpt-4o-mini",
    "version": 1,
    "active": 1,
    "credential_key": "OPENAI_API_KEY",
    "created_by": "landproDATA-PaigeG",
    "updated_by": "landproDATA-PaigeG",
}

with engine.begin() as conn:
    conn.execute(text(query), config_row)

## llm_structured_output

!! THIS NEEDS WORK - FINICKY WITH THE JSON DUMPING

In [None]:
# A person identified in the document; every attribute is a required string.
# (Kept as a comment, not a docstring: pydantic copies a class docstring into
# the generated JSON Schema as its "description".)
class Person(BaseModel):
    first_name: str = Field(description="A persons first name")
    last_name: str = Field(description="A persons last name")
    title: str = Field(description="A persons job title")
    license_number: str = Field(
        description="A persons license number, if applicable (likely for plats and surveys), where the person is a certified professional"
    )

# A non-person entity found in the document, with a category and a short
# rationale for why it appears.
# (Kept as a comment, not a docstring: pydantic copies a class docstring into
# the generated JSON Schema as its "description".)
class Entities(BaseModel):
    name: str = Field(
        description="The entity full name (including suffices like LLC, or inc)"
    )
    type: Literal['state', 'county', 'city', 'company', 'parcel_name', 'other'] = Field(
        description="Identify entities in the document and categorize by type. Use other if the existing types dont fit the context"
    )
    inferred_context: str = Field(
        description="A 200-character description of why this entity is present. Why are they referenced? What appears to be the purpose of their presence?"
    )

# Township / section / range triple, each stored as a required string.
# Field order (range, section, township) is kept so the schema is unchanged.
# (Kept as a comment, not a docstring: pydantic copies a class docstring into
# the generated JSON Schema as its "description".)
class TownshipSectionRange(BaseModel):
    range: str = Field(description="Range")
    section: str = Field(description="Section")
    township: str = Field(description="Township")

# Top-level structured-output model for one OCR'd document.
#
# Fixes relative to the earlier definition:
# * `people` / `entities` used Field("..."), which passes the text as the
#   positional `default` argument. The text became the field's default value
#   and no description reached the schema. It is now `description=`.
# * `township_section_range` is Optional, matching its own instruction to
#   return None when the information is absent.
# (Kept as a comment, not a docstring: pydantic copies a class docstring into
# the generated JSON Schema as its "description".)
class Document(BaseModel):
    people: Annotated[
        Optional[List[Person]],
        Field(description="Returns a list of people identified from the extracted text"),
    ]
    entities: Annotated[
        Optional[List[Entities]],
        Field(description="Returns a list of non-human entities identified from the extracted text"),
    ]
    township_section_range: Annotated[
        Optional[TownshipSectionRange],
        Field(description="Township, section, and range. Return None if not present."),
    ]
    legal_description: Annotated[
        str,
        Field(description="Legal description from the extracted text - it is critical this is extremely precise to the original text."),
    ]

# Generate JSON Schema
# Build the JSON Schema (a plain dict) for the Document model.
document_schema_json = Document.model_json_schema()

# Serialise the schema to a 2-space-indented JSON string and print it so it
# can be inspected or stored as text in the database.
schema_json_str = json.dumps(document_schema_json, indent=2)
print(schema_json_str)

## llm_prompts

!! ALSO FINICKY DUE TO JSON/TEXT/SINGLE QUOTES VS. MULTIPLE QUOTES

In [None]:
# Prompt template text (system + user message) for the OCR-interpretation call.
#
# Notes on how this literal behaves:
# * It is a plain triple-quoted string, NOT an f-string, so the `{text}`
#   placeholder and the literal `f"` prefix in the user message are stored
#   verbatim; nothing is substituted here.
# * A trailing backslash is a line continuation: the newline is dropped, but
#   the next line's leading indentation stays in the string.
# * The text contains single quotes (e.g. 'the point of beginning'), so it
#   cannot be pasted into a single-quoted SQL literal without escaping.
# * NOTE(review): the value is two JSON-like objects separated by a comma (no
#   enclosing list) and includes the `f"` prefix, so it will not parse as JSON
#   as-is; confirm how the consumer expects to read it.
prompt = """{"role": "system", "content": "You are a helpful assistant at interpreting raw text extracted from complex land and parcel documents and surveys. You will always be provided one document at a time. \
             ##Precise instructions:\
             1. Carefully analyze the extracted text from the OCR process. Each input will delineate the files with the words [FILE NAME], which indicates page separation within the same document. \
                Similarly, the OCR processor identifies different paragraphs, which are delineated with [PARAGRAPH]. This should help make logical assumptions about the request. \
                Do not use prior knowledge or information from outside the context to answer the questions. Only use the information provided in the context to answer the questions.\
             2. Review the required JSON structure for the response\
             3. Fulfill the request to the best of your ability\
             4.  Review the input text structure to ensure the Legal Description is as complete and precise as possible. \
             Hints to help find the legal description: \
             - Use paragraph and header/subtitle hints, like isolating text between headers.\
             - Legal descriptions often explain a geographic polygon that is able to be closed when drawn.\
             - Note language that describes an area of ownerhip\
             - Note language like 'the point of beginning', and subsequent measurements. e.g. South 00°38 16 West, 305.80 feet\
             - Note when a block/paragraph of text seemingly terminates with an acreage statement.\
            ### A complete example of a legal description \
            # [PARAGRAPH] # \
            A portion of Lots 25, 26, and 27, Roberts and Hill Subdivision as is filed in Book 4 of Plats at Page 159, records of Ada County, Idaho located in the\
            Southeast 1/4 of the Northeast 1/4 of Section 14, T.4N., R.1E., B.M., City of Boise, Ada County, Idaho more particularly described as follows:\
            # [PARAGRAPH] # \
            Commencing at the East 1/4 corner of said Section 14, from which the Center 1/4 corner of said Section 14 bears North 88°50 47 West, 2635.27 feet;\
            thence on the East boundary line of said Section 14, North 00°38 16 East, 1185.68 feet; thence leaving said East boundary line, North 88°35 49 West, 25.00\
            feet to the westerly right-of-way line of N. Bogart Lane and the REAL POINT OF BEGINNING;\
            # [PARAGRAPH] # \
            thence on said westerly right-of-way line the following seven (7) courses and distances:\
            # [PARAGRAPH] # \
            South 00°38 16  West, 305.80 feet;\
            South 07°14 08  West, 65.87 feet;\
            # [PARAGRAPH] # \
            South 03°53 20  West, 48.39 feet;\
            # [PARAGRAPH] # \
            71.67 feet along the arc of curve to the right having a radius of 60.00 feet, a central angle of 68°26 22  and a long chord which bears South\
            38°06 30  West, 67.48 feet;\
            # [PARAGRAPH] # \
            52.27 feet along the arc of curve to the right having a radius of 135.00 feet, a central angle of 22°11 07  and a long chord which bears South\
            83°25 15  West, 51.95 feet;\
            # [PARAGRAPH] # \
            81.83 feet along the arc of a curve to the left having a radius of 615.00 feet, a central angle of 07°37 26  and a long chord which bears North\
            89°17 54  West, 81.77 feet;\
            # [PARAGRAPH] #  \
            16.80 feet along the arc of a curve to the right having a radius of 585.00 feet, a central angle of 01°38 44  and a long chord which bears South\
            87°42 44  West, 16.80 feet to the northerly right-of-way line of W. Hill Road Parkway;\
            # [PARAGRAPH] # \
            thence on said northerly right-of-way line the following two (2) courses and distances:\
            # [PARAGRAPH] # \
            103.13 feet along the arc of a curve to the left having a radius of 1,193.92 feet, a central angle of 04°56 57  and a long chord which bears North\
            86°30 32  West, 103.10 feet;\
            # [PARAGRAPH] # \
            North 88°59 01  West, 659.42 feet to the west boundary line of said Lot 27, Roberts and Hill Subdivision;\
            # [PARAGRAPH] # \
            thence on said west boundary line, North 00°35 32  East, 608.79 feet to the north boundary line of said Roberts and Hill Subdivision;\
            thence on said north boundary line, South 88°35 49  East, 814.40 feet;\
            # [PARAGRAPH] # \
            thence leaving said north boundary line, South 00°38 16  West, 125.00 feet;\
            thence South 88°35 49  East, 150.00 feet to the REAL POINT OF BEGINNING.\
            # [PARAGRAPH] # \
            Containing 12.93 acres, more or less.\
            ## Formatting Instructions\
             Use as precise of language as possible. Do not include any explanation in the reply. Only include the extracted information in the reply.\
             Only in 'inferred_context' are you allowed to practice freedom of explanation."},
            {"role": "user", "content": f"Return your interpretation of the following OCR text using the structured output model provided: {text}"}"""

In [None]:
# Store the prompt template in `llm_prompts`.
#
# The prompt text is sent as a bound parameter instead of being formatted into
# a single-quoted SQL literal. The prompt itself contains single quotes and
# other special characters, which broke the interpolated statement (and would
# be an injection risk for any untrusted text).
#
# Also changed:
# * No `USE sandbox_paigeg;` prefix: the engine URL already selects that
#   schema, and two statements in one execute() is driver-dependent.
# * engine.begin() commits on success and rolls back on error, so the row is
#   persisted without relying on autocommit.
query = """
INSERT INTO llm_prompts (api_call_id, text, version, active, created_by, updated_by)
VALUES (:api_call_id, :prompt_text, :version, :active, :created_by, :updated_by)
"""

prompt_row = {
    "api_call_id": 2,
    "prompt_text": prompt,
    "version": 1,
    "active": 1,
    "created_by": "landproDATA-PaigeG",
    "updated_by": "landproDATA-PaigeG",
}

with engine.begin() as conn:
    conn.execute(text(query), prompt_row)


In [None]:
# Display the SQL currently held in `query` for visual inspection.
print(query)

## llm_parameters

In [10]:
from sqlalchemy import  text

In [8]:
# SQL text that would insert one `llm_parameters` row: key_name 'temperature'
# with value 1 for api_call_id 2, version 1, active 1.
# This cell only builds the string; it does not run it.
# NOTE(review): the string holds two statements (`USE ...;` then `INSERT`);
# whether a single execute() accepts that depends on the driver. The engine
# URL already targets `sandbox_paigeg`, so the `USE` is likely unnecessary.
query = """USE sandbox_paigeg;
INSERT INTO llm_parameters (api_call_id, key_name, value, version, active, created_by, updated_by) VALUES (
2,
'temperature',
1,
1,
1,
'landproDATA-PaigeG',
'landproDATA-PaigeG'
);"""

In [12]:
# Close the engine's pooled database connections (SQLAlchemy Engine.dispose()).
engine.dispose()

In [14]:
# Display the SQL currently held in `query` for visual inspection.
print(query)

USE sandbox_paigeg;
INSERT INTO llm_parameters (api_call_id, key_name, value, version, active, created_by, updated_by) VALUES (
2,
'temperature',
1,
1,
1,
'landproDATA-PaigeG',
'landproDATA-PaigeG'
);


In [17]:
# with engine.connect() as conn:
#     conn.execute(text(query))

## llm_api_call_versions

In [None]:
# Draft SQL for linking api_call_id 2 to prompt 1, schema 1 and parameters
# version 1 under the label 'v1'.
# This is a bare string expression: the cell only evaluates (and displays) the
# text. It is not assigned to a variable and nothing here executes it.
"""USE sandbox_paigeg;
INSERT INTO llm_api_call_versions (api_call_id, prompt_id, schema_id,parameters_version,version_label, active, created_by,updated_by) VALUES (
2,
1,
1,
1,
'v1',
1,
'landproDATA-PaigeG',
'landproDATA-PaigeG'
);
"""

# Recompile from the queries into a cURL request: