# Infographics Understanding with GPT-4o and Tool Use (aka Structured Generation)

NOTE: NOT ELIGIBLE FOR USE IN THE CHALLENGE

In [1]:
!pip install -q openai

In [2]:
import os
import json
import base64

import pandas as pd

In [3]:
from openai import OpenAI
# Read the API key from the environment rather than hard-coding a secret in
# the notebook; a missing variable fails immediately with a KeyError.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [4]:
def convert_image_to_base64(image_path: str):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

In [9]:
def format_title(image_path: str) -> str:
    """Derive a human-readable title from an image file path.

    Takes the final path component, drops its extension, and replaces
    hyphens and underscores with spaces.

    Args:
        image_path: Path to the image file.

    Returns:
        The file name without its extension, with ``-`` and ``_`` turned
        into spaces.
    """
    file_name = os.path.basename(image_path)
    # splitext removes only the last extension, so a name such as
    # "covid-19_vol.2.png" keeps "vol.2" instead of being cut at the first dot.
    stem, _extension = os.path.splitext(file_name)
    return stem.replace("-", " ").replace("_", " ")

In [10]:
def build_tool():
    """Return the function-tool definition used to structure the model's reply.

    The tool takes two required string arguments: ``reasoning`` and
    ``answer``. A new dict is built on every call.
    """
    answer_schema = {
        "type": "string",
        "description": "Concise answer to the user question.",
    }
    parameters = {
        "type": "object",
        "properties": {
            "reasoning": {"type": "string"},
            "answer": answer_schema,
        },
        "required": ["reasoning", "answer"],
    }
    function_spec = {
        "name": "infographic_explainer_tool",
        # NOTE(review): "Explair" looks like a typo for "Explainer"; kept
        # verbatim because this text is part of what is sent to the model.
        "description": "Infographic Explair Tool",
        "parameters": parameters,
    }
    return {"type": "function", "function": function_spec}

In [11]:
# System message sent with every request; adjacent literals are concatenated
# into a single string.
SYSTEM_PROMPT = (
    "You are an infographics explainer. "
    "You will receive an image as an input and you must answer the user's question based on the image. "
    "Be concise and limit responses to at most 3 sentences, preferably one sentence long. "
    "Respond in English."
)

In [12]:
def run_inference(image_path, question, model="gpt-4o-2024-05-13", seed=0):
    """Ask the chat model a question about one infographic and return its answer.

    The image is sent inline as a base64 data URL together with a title
    derived from the file name. The model is forced to reply through the tool
    returned by ``build_tool``, and the ``answer`` field of the tool-call
    arguments is returned.

    Args:
        image_path: Path to the image file on disk.
        question: Question text; a leading ``"<image>\\n"`` marker is removed
            when present.
        model: Name of the chat-completions model to call.
        seed: Seed forwarded to the API request.

    Returns:
        The value of the ``"answer"`` key in the JSON tool-call arguments.

    Raises:
        ValueError: If the response contains no tool call.
    """
    import mimetypes  # local import: only needed to label the data URL

    image_base64 = convert_image_to_base64(image_path)

    # Label the data URL with the file's actual image type (e.g. image/jpeg
    # for .jpg) instead of always claiming PNG; keep PNG as the fallback when
    # the extension is unknown.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    # Strip the marker only when it is really there; slicing unconditionally
    # would eat the first characters of a question that lacks it.
    image_marker = "<image>\n"
    if question.startswith(image_marker):
        question_trimmed = question[len(image_marker):]
    else:
        question_trimmed = question

    title = format_title(image_path)
    print(f"{title = } | {question_trimmed = }")
    tool = build_tool()
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": SYSTEM_PROMPT,
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
                    },
                    {
                        "type": "text",
                        "text": f"This infographics has the title {title}. {question_trimmed}",
                    },
                ],
            },
        ],
        temperature=1,
        seed=seed,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        tools=[tool],
        # Force the model to answer through the tool so the reply is structured.
        tool_choice={
            "type": "function",
            "function": {"name": tool["function"]["name"]},
        },
    )
    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        raise ValueError("model response did not contain a tool call")
    response_args = tool_calls[0].function.arguments
    print(f"{response_args = }")
    return json.loads(response_args)["answer"]

In [13]:
image_path = "data/raw_datasets/myinfographic/images/_53f9b07a43e61_w1500.jpg"

In [None]:
run_inference(image_path, "<image>\nWhat is the overall message conveyed by the different elements and quotes in the infographic?")

In [None]:
# Load the annotation file of the chosen dataset into a DataFrame.
dataset_name = "myinfographic"
dataset_path = os.path.join("data/raw_datasets", dataset_name, "annot_wo_answer.json")
print(dataset_path)
# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(dataset_path)

df_data = pd.read_json(dataset_path)

In [16]:
!mkdir -p inference_results/gpt-4o

In [None]:
# Run inference for every row that has no saved answer yet, writing one text
# file per row id. Rows whose processing raises are collected in `failed_idx`.
output_dir = "inference_results/gpt-4o"
os.makedirs(output_dir, exist_ok=True)  # do not rely on the mkdir cell having run

failed_idx = set()
for idx, row in df_data.iterrows():
    sample_id = row["id"]  # not named `id`, which would shadow the builtin
    answer_txt_path = f"{output_dir}/{sample_id}.txt"
    if os.path.exists(answer_txt_path):
        continue  # already answered in an earlier run
    print(sample_id, idx)

    image_path = f"data/raw_datasets/{dataset_name}/images/{row['image']}"
    question = row["conversations"][0]["value"]

    try:
        answer = run_inference(image_path, question)
        with open(answer_txt_path, "w") as f:
            f.write(answer)
    except Exception as e:
        print(idx, row, e)
        failed_idx.add(idx)
        # A failed write leaves an empty file behind, which the resume check
        # above would mistake for a finished answer on the next run. The file
        # did not exist before this iteration, so removing it is safe.
        if os.path.exists(answer_txt_path):
            os.remove(answer_txt_path)

In [None]:
failed_idx = set(failed_idx)
failed_idx

In [81]:
# Re-scan the saved answers and add to `failed_idx` every row whose answer
# file is missing or whose answer is MAX_ANSWER_CHARS characters or longer.
# NOTE(review): presumably long answers are retried because a concise answer
# is expected - confirm the threshold.
MAX_ANSWER_CHARS = 50

for idx, row in df_data.iterrows():
    sample_id = row["id"]  # not named `id`, which would shadow the builtin
    answer_txt_path = f"inference_results/gpt-4o/{sample_id}.txt"
    if not os.path.exists(answer_txt_path):
        failed_idx.add(idx)
        continue

    with open(answer_txt_path, "r") as f:
        answer = f.read()

    if len(answer) >= MAX_ANSWER_CHARS:
        failed_idx.add(idx)

In [None]:
failed_idx

In [None]:
# `failed_idx` holds index labels taken from `iterrows()`, so select by label
# (`.loc`); `.iloc` expects positions and only agrees for a default 0..n-1 index.
df_data.loc[list(failed_idx)]

In [None]:
# Retry every row in `failed_idx` with a different seed (42 instead of the
# default 0), replacing any previously saved answer. Rows that fail again are
# collected in `failed_idx_2`.
failed_idx_2 = set()
# `failed_idx` holds index labels from `iterrows()`, so select with `.loc`;
# `.iloc` would interpret them as positions.
for idx, row in df_data.loc[list(failed_idx)].iterrows():
    sample_id = row["id"]  # not named `id`, which would shadow the builtin
    answer_txt_path = f"inference_results/gpt-4o/{sample_id}.txt"
    if os.path.exists(answer_txt_path):
        os.remove(answer_txt_path)  # discard the old answer before retrying
    print(sample_id, idx)

    image_path = f"data/raw_datasets/{dataset_name}/images/{row['image']}"
    question = row["conversations"][0]["value"]

    try:
        answer = run_inference(image_path, question, seed=42)
        with open(answer_txt_path, "w") as f:
            f.write(answer)
    except Exception as e:
        print(idx, row, e)
        failed_idx_2.add(idx)
        # A failed write leaves an empty file behind that would later look
        # like a valid (short) answer. The old file was removed above, so
        # anything present here is that partial output.
        if os.path.exists(answer_txt_path):
            os.remove(answer_txt_path)

In [None]:
failed_idx_2