# WildReceipt with GPT-4o and Tool Use (aka Structured Generation)

**NOTE:** This notebook is NOT eligible for use in the challenge.

In [1]:
# !pip install -q openai

In [2]:
import os
import json

import pandas as pd
from PIL import Image

In [15]:
from openai import OpenAI
# Read the API key from the environment so the secret never lands in the
# notebook source or its saved outputs. A missing OPENAI_API_KEY raises
# KeyError right here instead of failing later on the first request.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [4]:
from notebooks_shared_utils import (
    extract_image_title_from_path,
    load_image_and_convert_to_base64,
)

In [5]:
# Path of the WildReceipt test-split JSON, relative to the working directory.
dataset_name = "wildreceipt"
dataset_path = os.path.join("data/processed_data", dataset_name, "converted_output_test.json")
print(dataset_path)
# Stop here if the file is missing rather than failing inside pandas.
assert os.path.exists(dataset_path)

# Later cells read the "id", "image" and "conversations" columns of this frame.
df_data = pd.read_json(dataset_path)

data/processed_data/wildreceipt/converted_output_test.json


In [6]:
# Every receipt field has a "<field>_value" and a "<field>_key" label, in that
# order; "Others" is the catch-all label at the end.
_RECEIPT_FIELDS = (
    "Store_name",
    "Store_addr",
    "Tel",
    "Date",
    "Time",
    "Prod_item",
    "Prod_quantity",
    "Prod_price",
    "Subtotal",
    "Tax",
    "Tips",
    "Total",
)
OPTIONS = [
    f"{field}_{role}" for field in _RECEIPT_FIELDS for role in ("value", "key")
] + ["Others"]
# Comma-separated, double-quoted rendering of the same labels.
OPTIONS_STR = ", ".join(f'"{label}"' for label in OPTIONS)

In [7]:
def build_tool():
    """Build the function-tool definition for ``question_and_answer_tool``.

    The tool declares two required arguments: a free-text ``reasoning``
    string and an ``answer`` whose allowed values are the module-level
    ``OPTIONS`` labels.

    Returns:
        dict: A ``{"type": "function", "function": {...}}`` tool definition.
    """
    answer_schema = {
        "enum": OPTIONS,
        "description": "Exact answer to the user question.",
    }
    parameters = {
        "type": "object",
        "properties": {
            "reasoning": {"type": "string"},
            "answer": answer_schema,
        },
        "required": ["reasoning", "answer"],
    }
    function_spec = {
        "name": "question_and_answer_tool",
        "description": "Question and Answer Tool",
        "parameters": parameters,
    }
    return {"type": "function", "function": function_spec}

In [8]:
# System message sent with every request; adjacent literals are concatenated
# into one single-line string.
SYSTEM_PROMPT = (
    "You are a question-and-answer tool. "
    "You get an image as an input and you must answer the user's question "
    "from the data you extract from the image. "
    "Output in json format."
)

In [9]:
def run_inference(
    image_path,
    question,
    infer_image_title: bool = False,
    model="gpt-4o-2024-05-13",
    seed=0,
):
    """Ask the model one question about one image and return its answer.

    The request forces a call to the tool returned by ``build_tool()`` and
    the answer is read from that tool call's JSON arguments.

    Args:
        image_path: Path of the image; passed to
            ``load_image_and_convert_to_base64``.
        question: Question text. A leading ``<image>`` placeholder line is
            removed when present; otherwise the text is used unchanged.
        infer_image_title: When True, a title is derived from ``image_path``
            and prepended to the question sent to the model.
        model: Model name sent with the request.
        seed: Sampling seed sent with the request.

    Returns:
        The ``"answer"`` entry of the tool-call arguments.

    Raises:
        RuntimeError: If the response contains no tool call.
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
        KeyError: If the arguments have no ``"answer"`` entry.
    """
    image_base64 = load_image_and_convert_to_base64(image_path)

    # Strip the placeholder only when it is really there; slicing blindly
    # would cut the first characters off a question that lacks it.
    image_placeholder = "<image>\n"
    if question.startswith(image_placeholder):
        question_trimmed = question[len(image_placeholder):]
    else:
        question_trimmed = question

    if infer_image_title:
        title = extract_image_title_from_path(image_path)
        print(f"{title = } | {question_trimmed = }")
        user_text = f"This infographic has the title {title}. {question_trimmed}"
    else:
        print(f"{question_trimmed = }")
        user_text = question_trimmed

    tool = build_tool()
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": SYSTEM_PROMPT,
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # NOTE(review): the data URL always declares image/png;
                        # confirm the helper really returns PNG-encoded bytes.
                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                    },
                    {
                        "type": "text",
                        "text": user_text,
                    }
                ]
            },
        ],
        temperature = 1,
        # max_tokens=256,
        seed=seed,
        top_p = 1,
        frequency_penalty = 0,
        presence_penalty = 0,
        tools = [tool],
        # Force the model to answer through the tool instead of free text.
        tool_choice = {
            "type": "function",
            "function": {"name": tool["function"]["name"]},
        },
    )

    # Fail with a clear message instead of an opaque "NoneType is not
    # subscriptable" when the response carries no tool call.
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        raise RuntimeError("Model response did not contain a tool call.")
    response_args = tool_calls[0].function.arguments
    print(f"{response_args = }")
    return json.loads(response_args)["answer"]

In [None]:
# Preview the image referenced by the first row of the dataset.
image_path = df_data.iloc[0]["image"]
print(image_path)
image = Image.open(image_path)
# Bare last expression: the notebook renders the image as the cell output.
image

In [None]:
# Smoke test: ask the first conversation turn of the first row about the image
# selected in the preview cell; the returned answer is shown as the cell output.
question = df_data.iloc[0]["conversations"][0]["value"]
run_inference(image_path, question)

In [12]:
# Create the results directory (and parents) in pure Python so the cell does
# not depend on a POSIX shell; an existing directory is not an error.
os.makedirs("inference_results/gpt-4o", exist_ok=True)

In [13]:
# Run inference for every sample that has no saved answer yet. Answers are
# written one file per sample, so an interrupted run resumes where it left off.
# Indices of samples whose inference failed are collected in `failed_idx`.
failed_idx = set()
for idx, row in df_data.iterrows():
    # Named `sample_id` rather than `id`: a global called `id` would shadow
    # the builtin for the rest of the kernel session.
    sample_id = row["id"]
    answer_txt_path = f"inference_results/gpt-4o/{dataset_name}_{sample_id}.txt"
    if os.path.exists(answer_txt_path):
        continue

    image_path = row['image']
    question = row["conversations"][0]["value"]
    print(sample_id, idx, image_path)

    try:
        answer = run_inference(image_path, question)
        # Validate before opening the file: "w" mode creates it immediately,
        # and an empty file left behind by a failed write would make every
        # later run skip this sample as if it were done.
        if not isinstance(answer, str):
            raise TypeError(f"expected a string answer, got {answer!r}")
        with open(answer_txt_path, "w") as f:
            f.write(answer)
    except Exception as e:
        # Best effort: log the failure and continue with the next sample.
        print(idx, row, e)
        failed_idx.add(idx)
    finally:
        print("=========================")

In [14]:
# `set(...)` makes a fresh copy of the failures collected by the batch loop;
# the bare name on the last line displays it as the cell output.
failed_idx = set(failed_idx)
failed_idx

set()