In [1]:
import os
import base64
import pandas as pd
import asyncio
import tiktoken
import time
import re
import json
import asyncio
import nest_asyncio

from datetime import datetime
from dotenv import load_dotenv
from openai import RateLimitError, APIStatusError
from llama_index.llms.azure_openai import AzureOpenAI

from llama_index.core import (
    PromptTemplate
)
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.core.query_pipeline import QueryPipeline, FnComponent
from mimetypes import guess_type

from dataclasses import dataclass


  from .autonotebook import tqdm as notebook_tqdm


To do:

- Improve prompt.
- Run on all images.

In [2]:
# Step up one directory (presumably from the notebook folder to the project
# root) exactly once per kernel. `CWD` only exists after this cell has run, so
# re-executing the cell no longer climbs one level higher each time, which
# would silently break every relative path below.
if "CWD" not in globals():
    os.chdir("../")
CWD = os.getcwd()

data_dir = os.path.join(CWD, 'data')

# Specify mode (working with a sample or all the files?)
sample_mode = True
sample_size = 20

# Specify file paths (relative to the working directory set above)
INPUT_FILE_PATH = os.path.join("data", "proc", "building_plans", "text", "bp_text.json")
METADATA_PATH = os.path.join("data", "proc", "building_plans", "metadata","building_plans_metadata.csv")

# Specify relevant column names
ID_COLUMN='filename'
TEXT_COLUMN='content'

# Read in data: extracted plan texts (JSON) and the plan metadata table (CSV)
bp_text = pd.read_json(INPUT_FILE_PATH)
metadata_df = pd.read_csv(METADATA_PATH)

In [4]:
# Keep only the metadata rows whose 'Planart' is one of the three
# Bebauungsplan variants listed below.
metadata_bps = metadata_df.loc[
    metadata_df['Planart'].isin([
        'qualifizierter Bebauungsplan',
        'einfacher Bebauungsplan',
        'vorhabenbezogener Bebauungsplan',
    ])
]

In [5]:
# Derive an integer `id` column from the first run of digits in `filename`
# that is directly followed by an underscore.
bp_text['id'] = (
    bp_text['filename']
    .str.extract(r'(\d+)_', expand=False)
    .astype(int)
)

In [6]:
# Inner-join the filtered metadata with the plan texts. No `on=` is given, so
# pandas joins on every column name the two frames have in common.
input_df = pd.merge(metadata_bps, bp_text, how="inner")

In [7]:
# Option 1: Use httpimport to load 'azure_authentication' package remotely from GitHub without installing it
# NOTE(review): this fetches and executes code from the repository's `main`
# branch at run time, so the cell needs network access and the imported code
# can change between runs — consider pinning a commit or installing the package.
import httpimport
with httpimport.remote_repo('https://raw.githubusercontent.com/soda-lmu/azure-auth-helper-python/main/src'
                            '/azure_authentication/'):
    # The import sits inside the `with` block; presumably the remote module is
    # only resolvable while the remote_repo context is active — confirm.
    from customized_azure_login import CredentialFactory

In [8]:
# Recommendation: Configure your own authentication workflow with environment variables, see the description at
# https://github.com/soda-lmu/azure-auth-helper-python/blob/main/AuthenticationWorkflowSetup.md
# Let the helper's factory choose a credential, then ask it for a token provider.
credential = CredentialFactory().select_credential()
# `token_provider` is invoked later as `token_provider()` when the LLM client is built.
token_provider = credential.get_login_token_to_azure_cognitive_services()

print("Instantiate Azure OpenAI Client")

Instantiate Azure OpenAI Client


In [9]:
# Azure OpenAI model (LlamaIndex wrapper) used as the "llm" module of the query pipelines in this notebook.
llm4 = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],  # KeyError if the environment variable is missing
    engine="gpt-4-1106-preview", 
    model="gpt-4-1106-preview",
    # NOTE(review): token_provider() is evaluated once, right here, so the client is
    # built with a single token value; presumably that token can expire during a
    # long run — confirm and consider a refreshing token provider if so.
    api_key=token_provider(),  # alternative: insert os.getenv("AZURE_OPENAI_API_KEY")
    api_version="2024-02-01",  # or use a preview version (e.g., "2024-03-01-preview") for the latest features.
    # api_version (How-To): https://stackoverflow.com/questions/76475419/how-can-i-select-the-proper-openai-api-version
    timeout=600.0,  # throw APITimeoutError after 10 minutes without a response (default behavior)
)

#MODEL_DEPLOYMENT_NAME = "gpt-4-turbo-vision-preview"

In [10]:

@dataclass(frozen=True)
class Llm_Extraction_Prompt:
    """
    Prompt (= query text) and response parser for extracting key figures
    from a building plan (BP).

    Strategy: We make a single query to extract relevant info from BP.
    The template has one placeholder, `{context_str}`, which receives the
    building-plan excerpt.
    """
    # Every instruction ends with "\n" (or a space) so that adjacent string
    # literals do not run together into one unreadable line in the final prompt.
    # NOTE(review): in German planning law GFZ usually denotes the
    # Geschossflächenzahl (floor-area ratio) and GRZ the Grundflächenzahl
    # (site-coverage ratio); describing GFZ as a "construction height" may
    # mislead the model — confirm the intended wording.
    query: str = (
        "You are a helpful environmental city planner. "  # city urban planner / spatial planner
        "Based on the excerpt from a building plan provided below, we would like to extract following information.\n"
        "1. Maximal construction height (GFZ): numeric value and unit of measurement.  \n"
        "2. Maximal floor coverage (GRZ): numeric value and unit of measurement.\n"  # Add unit of measurement
        "3. Types of building use (Art der baulichen Nutzung): list all that appear.\n"
        "4. Appearance of green areas (Grünflächen): True/False value.\n"
        "5. Firsthöhe length (FH): numeric value and unit of measurement.\n"
        "6. Traufhöhe length (TH): numeric value and unit of measurement.\n"
        "7. Company names mentioned: list all that appear.\n"
        "If a particular piece of information is not present, output 'Not specified'.\n\n"
        "Extract the information into appropriate JSON.\n"
        "---------------------\n"
        "\n\nHere is the excerpt:\n {context_str}\n\n"
    )

    @staticmethod
    def parse_gpt_output(gpt_question_output) -> pd.DataFrame:
        """
        Convert the raw LLM answer into a one-row table.

        The JSON object inside the first Markdown code fence (with or without
        a `json` tag) is decoded and its keys become columns. If the answer
        has no fence, the text between the first "{" and the last "}" is
        tried instead.

        Args:
            gpt_question_output: LLM answer; it is converted with `str()`.

        Returns:
            A DataFrame with exactly one row. It always contains the column
            `raw_llm_response`; the decoded JSON keys precede it when decoding
            succeeded. Answers without usable JSON therefore still yield a
            row instead of silently disappearing from the results.
        """
        raw_text = str(gpt_question_output)

        fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', raw_text, re.DOTALL | re.IGNORECASE)
        if fenced:
            candidate = fenced.group(1)
        else:
            # Fallback for answers that contain bare, unfenced JSON.
            start, end = raw_text.find("{"), raw_text.rfind("}")
            candidate = raw_text[start:end + 1] if start != -1 and end > start else None

        record = {}
        if candidate is not None:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                # Malformed JSON must not abort the whole run; the raw text
                # is kept below so the answer can be inspected afterwards.
                parsed = None
            if isinstance(parsed, dict):
                record.update(parsed)

        record["raw_llm_response"] = raw_text
        return pd.DataFrame([record])


@dataclass(frozen=True)
class Llm_Flooding_Prompt:
    """
    Prompt (= query text) and response parser for flood-related information
    in a building plan (BP).

    Strategy: We make a single query to extract relevant info from BP.
    The template has one placeholder, `{context_str}`, which receives the
    building-plan excerpt.
    """
    # The output-format sentence ends with "\n" so the separator line that
    # follows starts on its own line instead of being glued to the sentence.
    query: str = (
        "You are a helpful environmental city planner. "
        "Based on the excerpt from a building plan provided below, we would like to know if the building has taken measures against flooding risk.\n"
        "1. Identify if there are water bodies mentioned in the text: list all that appear. \n"
        "2. Flooding risk prevention measures: list all that appear.\n"
        "---------------------\n"
        "Present the output in a JSON format where the keys are 'water_bodies', 'flooding_risk_prevention_measures' and the values are a list with the results.\n"
        "---------------------\n"
        "\n\nHere is the excerpt:\n {context_str}\n\n"
    )

    @staticmethod
    def parse_gpt_output(gpt_question_output) -> pd.DataFrame:
        """
        Convert the raw LLM answer into a one-row table.

        The JSON object inside the first Markdown code fence (with or without
        a `json` tag) is decoded and its keys become columns. If the answer
        has no fence, the text between the first "{" and the last "}" is
        tried instead.

        Args:
            gpt_question_output: LLM answer; it is converted with `str()`.

        Returns:
            A DataFrame with exactly one row. It always contains the column
            `raw_llm_response`; the decoded JSON keys precede it when decoding
            succeeded. Answers without usable JSON therefore still yield a
            row instead of silently disappearing from the results.
        """
        raw_text = str(gpt_question_output)

        fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', raw_text, re.DOTALL | re.IGNORECASE)
        if fenced:
            candidate = fenced.group(1)
        else:
            # Fallback for answers that contain bare, unfenced JSON.
            start, end = raw_text.find("{"), raw_text.rfind("}")
            candidate = raw_text[start:end + 1] if start != -1 and end > start else None

        record = {}
        if candidate is not None:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                # Malformed JSON must not abort the whole run; the raw text
                # is kept below so the answer can be inspected afterwards.
                parsed = None
            if isinstance(parsed, dict):
                record.update(parsed)

        record["raw_llm_response"] = raw_text
        return pd.DataFrame([record])


In [20]:
def define_pipeline_query(prompt_selected):
    """
    Build the sequential query pipeline for one prompt object.

    The pipeline runs prompt template -> LLM -> string conversion -> parser.
    The LLM module is the notebook-level `llm4` client.

    Args:
        prompt_selected: object exposing a `query` template string and a
            `parse_gpt_output` callable that accepts the LLM answer as text.

    Returns:
        A QueryPipeline whose last module, "output_parser", emits the parsed
        result under the output key "res".
    """
    prompt_tmpl = PromptTemplate(prompt_selected.query)
    # Kept as a lambda rather than the bare builtin `str`; presumably
    # FnComponent inspects the callable's signature, which `str` does not
    # expose — confirm before simplifying.
    string_converter = FnComponent(fn=lambda x: str(x), output_key="gpt_question_output")
    output_parser = FnComponent(fn=prompt_selected.parse_gpt_output, output_key="res")

    p = QueryPipeline(
        modules={
            "llm_prompt": prompt_tmpl,
            "llm": llm4,
            "string_converter": string_converter,
            "output_parser": output_parser,
        },
        verbose=False,
    )
    # add_chain links every module to its successor, including
    # string_converter -> output_parser, so no separate add_link call is
    # needed (a second call would only register a duplicate edge).
    p.add_chain(["llm_prompt", "llm", "string_converter", "output_parser"])

    return p

async def run_pipeline_on_rows(str_text, 
                         pipeline):
    """
    Run the pipeline asynchronously on a single text.

    Args:
        str_text: text passed to the "llm_prompt" module as `context_str`.
        pipeline: object providing an awaitable `arun_multi(inputs)` method.

    Returns:
        Whatever `pipeline.arun_multi` returns for this input.
    """
    pipeline_inputs = {'llm_prompt': {'context_str': str_text}}
    return await pipeline.arun_multi(pipeline_inputs)

async def extract_bp_info(prompt_selected,
                    data):
    """
    Query the LLM once per row of `data` and collect the parsed tables.

    Rows are processed one after another: each pipeline call is awaited
    before the next row is started.

    Args:
        prompt_selected: prompt object handed to `define_pipeline_query`.
        data: DataFrame with the columns 'content', 'id' and 'filename'.

    Returns:
        A list with one table per row, i.e. the pipeline's
        ['output_parser']['res'] result with the row's 'id' and 'filename'
        added as columns.
    """
    pipeline = define_pipeline_query(prompt_selected)

    tables = []
    for _, record in data.iterrows():
        response = await run_pipeline_on_rows(record['content'], pipeline)
        parsed = response['output_parser']['res']
        # Tag the parsed table with the identifiers of its source document.
        parsed['id'] = record['id']
        parsed['filename'] = record['filename']
        tables.append(parsed)

    return tables

In [21]:
# Sample mode works on a random subset drawn with a fixed seed (same subset on
# every run); otherwise every merged plan is processed.
if sample_mode:
    # Clamp the request to the number of available rows: DataFrame.sample
    # raises ValueError when asked for more rows than exist (without replacement).
    run_data = input_df.sample(min(sample_size, len(input_df)), random_state=15)
else:
    run_data = input_df

In [22]:
# nest_asyncio patches asyncio; presumably this is needed so the awaits below
# can run inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

# The two prompts are run one after the other over the same documents.
flooding_results = await extract_bp_info(Llm_Flooding_Prompt(), run_data)
extraction_results = await extract_bp_info(Llm_Extraction_Prompt(), run_data)

In [23]:
# Stack the per-document tables into one frame per prompt. Each table is built
# separately and starts its index at 0, so a plain concat would give every row
# the same index label; ignore_index=True assigns a clean 0..n-1 index instead.
extraction_df = pd.concat(extraction_results, ignore_index=True)
flooding_df = pd.concat(flooding_results, ignore_index=True)

In [25]:
# NOTE(review): both files go to the "building_plans_sample" folder regardless
# of `sample_mode`; confirm this is also the intended target for a full run.
OUTPUT_FLOODING_FILE_PATH = os.path.join("data", "proc", "building_plans_sample", "features",  "flooding_data_extraction.csv")
OUTPUT_EXTRACTIONS_FILE_PATH = os.path.join("data", "proc", "building_plans_sample", "features",  "info_data_extraction.csv")

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing.
for output_path in (OUTPUT_EXTRACTIONS_FILE_PATH, OUTPUT_FLOODING_FILE_PATH):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

extraction_df.to_csv(OUTPUT_EXTRACTIONS_FILE_PATH)
flooding_df.to_csv(OUTPUT_FLOODING_FILE_PATH)