# Document Understanding with GPT-4o and Tool Use (aka Structured Generation)

NOTE: NOT ELIGIBLE FOR USE IN THE CHALLENGE

In [1]:
!pip install -q openai

In [1]:
import os
import json
import base64

import pandas as pd

In [2]:
from openai import OpenAI
# Read the API key from the environment instead of hard-coding it, so the
# secret never ends up in the notebook source or in version control.
# Raises KeyError if OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [3]:
def convert_image_to_base64(image_path: str):
    """Read the file at ``image_path`` and return its content as base64 text.

    The file is opened in binary mode and read in full; the base64-encoded
    bytes are decoded as UTF-8 so that a ``str`` is returned.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

In [47]:
def extract_key_from_question(question: str) -> str:
    r"""Derive the answer field name from a templated question.

    The question must start with ``"<image>\nWhat is the "`` and end with
    ``" in the image?"``; otherwise an ``AssertionError`` is raised. The text
    between those two markers is the key. One leading ``[`` or ``‘`` and one
    trailing ``?`` are stripped from it, spaces become underscores and ``=``
    becomes ``equals``.

    Returns:
        The normalized key; this is an empty string when nothing is left
        between the two markers.
    """
    prefix = "<image>\nWhat is the "
    suffix = " in the image?"
    assert question.startswith(prefix)
    assert question.endswith(suffix)

    key = question[len(prefix):-len(suffix)]
    if key.startswith(("[", "‘")):
        key = key[1:]
    if key.endswith("?"):
        key = key[:-1]

    # Order matters only in principle: spaces first, then the equals sign.
    for old, new in ((" ", "_"), ("=", "equals")):
        key = key.replace(old, new)
    return key

In [48]:
def build_doc_extraction_tool(key: str):
    """Build the function-tool definition used to extract one field.

    The tool's parameters are an object with two required properties: a free
    text ``reasoning`` string and the answer stored under ``key``. The answer
    is typed as an integer when ``key`` is ``"page"`` and as a string
    otherwise.

    Returns:
        A dict with ``"type": "function"`` and a ``"function"`` entry holding
        the tool name, description and parameter schema.
    """
    answer_type = "string"
    if key == "page":
        answer_type = "integer"

    # Insertion order is kept: "reasoning" first, then the answer field.
    properties = {"reasoning": {"type": "string"}}
    properties[key] = {
        "type": answer_type,
        "description": "Concise answer to the user question, exactly as it appears in the document.",
    }

    parameters = {
        "type": "object",
        "properties": properties,
        "required": ["reasoning", key],
    }
    return {
        "type": "function",
        "function": {
            "name": "doc_extraction_tool",
            "description": "Extract information from a document",
            "parameters": parameters,
        },
    }

In [49]:
# System message sent with every request; the adjacent string literals are
# concatenated by Python into a single prompt string.
SYSTEM_PROMPT = (
    "You are a document information extractor. "
    "You get an image as an input and you must answer the user's question "
    "from the data you extract from the image. "
    "Output in json format."
)

In [50]:
def run_inference(image_path, question, model="gpt-4o-2024-05-13", seed=0):
    """Ask the chat model one templated question about a document image.

    The image is base64-encoded and sent as a PNG data URL together with the
    question. A single extraction tool is built for the key derived from the
    question, and ``tool_choice`` names that tool so the reply arrives as a
    tool call.

    Args:
        image_path: Path of the image file to send.
        question: Question text; it is parsed by ``extract_key_from_question``.
        model: Name of the chat model to call.
        seed: Sampling seed forwarded to the API.

    Returns:
        The value stored under the derived key in the JSON-decoded tool-call
        arguments, or the string ``"This question is unanswerable."`` when the
        derived key is empty (no API call is made in that case).
    """
    image_base64 = convert_image_to_base64(image_path)
    key = extract_key_from_question(question)
    print(f"{key = }")
    if key == "":
        return "This question is unanswerable."

    tool = build_doc_extraction_tool(key)

    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{image_base64}"},
            },
            {"type": "text", "text": question},
        ],
    }
    # Name the only tool explicitly so the answer comes back as its arguments.
    forced_tool_choice = {
        "type": "function",
        "function": {"name": tool["function"]["name"]},
    }

    response = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=1,
        seed=seed,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        tools=[tool],
        tool_choice=forced_tool_choice,
    )

    first_tool_call = response.choices[0].message.tool_calls[0]
    response_args = first_tool_call.function.arguments
    print(f"{response_args = }")
    return json.loads(response_args)[key]

In [31]:
# Path of a single document image, used for the one-off run_inference call below.
image_path = "data/raw_datasets/mydoc/images/dba9cdb7f30849824f654f41b01a69893e1fde4ca9d6d5ecfb5427a0e3b4158e.png"

In [32]:
# Single-example check: the question follows the "<image>\nWhat is the ... in the image?"
# template that extract_key_from_question requires.
run_inference(image_path, "<image>\nWhat is the po number in the image?")

key = 'po_number'
response_args = '{"reasoning":"The PO Number is typically located in the summary sections of invoices or orders. This image contains a \'COMMERCIAL SUMMARY\' section where such details are provided.","po_number":"MULTI/3104"}'


'MULTI/3104'

In [33]:
# Name of the dataset folder under data/raw_datasets.
dataset_name = "mydoc"
# Annotation file of that dataset (presumably the annotations without
# ground-truth answers, going by the file name - confirm).
dataset_path = os.path.join("data/raw_datasets", dataset_name, "annot_wo_answer.json")
print(dataset_path)
# Stop here if the annotation file is missing.
assert os.path.exists(dataset_path)

# Load the annotations; later cells read the "id", "image" and
# "conversations" columns of each row.
df_data = pd.read_json(dataset_path)

data/raw_datasets/mydoc/annot_wo_answer.json


In [34]:
!mkdir -p inference_results/gpt-4o

In [None]:
# Run inference for every row of the dataset and write one answer file per
# sample. Row indices whose inference or file write raised are collected in
# failed_idx so they can be inspected and retried later.
failed_idx = set()
for idx, row in df_data.iterrows():
    # Named sample_id rather than `id` so the builtin id() is not shadowed
    # for the rest of the notebook session.
    sample_id = row["id"]
    answer_txt_path = f"inference_results/gpt-4o/{sample_id}.txt"

    image_path = f"data/raw_datasets/{dataset_name}/images/{row['image']}"
    question = row["conversations"][0]["value"]

    # Skip samples that already have an answer on disk, except questions
    # containing "page", which are always re-run.
    if os.path.exists(answer_txt_path) and "page" not in question:
        continue
    print(sample_id, idx, question)

    try:
        answer = run_inference(image_path, question)
        with open(answer_txt_path, "w") as f:
            # str() because the answer is not always a string: the tool
            # schema declares the "page" answer as an integer.
            f.write(str(answer))
    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next row.
        print("ERROR", idx, row, e)
        failed_idx.add(idx)
    print("---------")

In [37]:
# failed_idx was created as a set by the inference loop; set() here only
# makes a copy before displaying it.
failed_idx = set(failed_idx)
failed_idx

set()

In [51]:
# Re-scan the answer files and add to failed_idx every row whose answer file
# is missing or whose stored answer is MAX_ANSWER_CHARS characters or longer.
MAX_ANSWER_CHARS = 50
for idx, row in df_data.iterrows():
    # Named sample_id rather than `id` so the builtin id() is not shadowed.
    sample_id = row["id"]
    answer_txt_path = f"inference_results/gpt-4o/{sample_id}.txt"
    if not os.path.exists(answer_txt_path):
        failed_idx.add(idx)
        continue

    with open(answer_txt_path, "r") as f:
        answer = f.read()

    if len(answer) >= MAX_ANSWER_CHARS:
        failed_idx.add(idx)

In [52]:
# Display the row indices flagged for a retry.
failed_idx

{115, 220, 243, 251, 318, 335, 391}

In [None]:
# Inspect the flagged rows.
# NOTE(review): failed_idx holds the index values yielded by iterrows(),
# while iloc selects by position; the two agree only for a default 0..n-1
# index - confirm.
df_data.iloc[list(failed_idx)]

In [None]:
# Retry every flagged sample once with a different seed and rewrite its
# answer file. Rows that fail again are collected in failed_idx_2.
# NOTE(review): failed_idx holds the index values yielded by iterrows(),
# while iloc selects by position; the two agree only for a default 0..n-1
# index - confirm.
failed_idx_2 = set()
for idx, row in df_data.iloc[list(failed_idx)].iterrows():
    # Named sample_id rather than `id` so the builtin id() is not shadowed.
    sample_id = row["id"]
    answer_txt_path = f"inference_results/gpt-4o/{sample_id}.txt"
    # Remove the previous answer first, so a failed inference call does not
    # leave the old answer on disk.
    if os.path.exists(answer_txt_path):
        os.remove(answer_txt_path)
    print(sample_id, idx)

    image_path = f"data/raw_datasets/{dataset_name}/images/{row['image']}"
    question = row["conversations"][0]["value"]

    try:
        answer = run_inference(image_path, question, seed=42)
        with open(answer_txt_path, "w") as f:
            # run_inference can return a non-string (the tool schema declares
            # the "page" answer as an integer) and f.write() only accepts
            # str, so convert explicitly, as the first inference loop does.
            f.write(str(answer))
    except Exception as e:
        # Best-effort retry: report the failure and remember the row.
        print(idx, row, e)
        failed_idx_2.add(idx)

In [42]:
# Display the row indices that failed again during the retry pass.
failed_idx_2

{220}

In [None]:
# Image file name of the row at position 220, for manual inspection.
df_data.iloc[220]["image"]

In [None]:
# Conversation entries (including the question text) of the row at position 220.
df_data.iloc[220]["conversations"]