**[Pipeline Implementation]**

1. VLM Tool
2. Agentic object detection pipeline
3. Running the object detector
4. Why Numbered Batching for Inferencing
5. Critiquing and Refining the Query
6. Validating Bounding Box Predictions Using the VLM

### 1. VLM Tool



* Input
    * Image (path / base64)
    * User request
* Output
    * Objects: [str] (list of object names)

In [11]:
# Utility function
# Locate the project root directory
def find_project_root(marker_filename=".project-root"):
    """Return the nearest ancestor directory that contains the marker file.

    The search starts at the current working directory and moves up one
    parent at a time.

    Args:
        marker_filename: Name of the file whose presence identifies the
            project root.

    Returns:
        Absolute path of the first directory (starting from the current
        working directory) in which ``marker_filename`` is a regular file.

    Raises:
        FileNotFoundError: If the filesystem root is reached without
            finding the marker file.
    """
    candidate = os.path.abspath(os.getcwd())
    while not os.path.isfile(os.path.join(candidate, marker_filename)):
        parent = os.path.dirname(candidate)
        # dirname() of the filesystem root is the root itself: nowhere left to climb.
        if parent == candidate:
            raise FileNotFoundError(f"Could not find {marker_filename} in any parent directory.")
        candidate = parent
    return candidate

In [12]:
import base64

def encode_image(image_path):
    """Read a file and return its contents as a base64 text string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The file contents base64-encoded and decoded to a UTF-8 ``str``,
        or ``None`` if any exception occurs (the error is printed).
    """
    try:
        with open(image_path, "rb") as fh:
            raw_bytes = fh.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode("utf-8")
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error encoding image: {e}")
        return None

In [15]:
import base64
import json
from openai import OpenAI

# from utils.image_utils import encode_image

class VLMTool:
    """Small wrapper around the OpenAI chat-completions client.

    Provides a generic ``chat_completion`` call and a vision helper that asks
    the model which objects a user wants detected in an image.
    """

    # Models that are called with ``max_tokens`` and ``temperature``.
    _SAMPLING_MODELS = ("gpt-4o", "gpt-4o-mini")
    # Models that are called WITHOUT ``max_tokens`` / ``temperature``.
    _REASONING_MODELS = ("o1",)

    # System prompt for extract_objects_from_request().  Each fragment ends
    # with a space so that the implicitly concatenated sentences do not run
    # together (e.g. "request.If the user ...").
    _EXTRACTION_PROMPT = (
        "You are an AI vision assistant that extracts objects to be identified from a user's request. "
        "If the user wants to detect or semantically segment all objects in the image, return a comma-separated list of objects you can see. "
        "If the user wants to detect or semantically segment specific objects, extract only those mentioned explicitly in their request. "
        "Respond ONLY with the list of objects, separated by commas, and NOTHING ELSE. "
        "The objective here is only to understand the objects of interest that can be extracted from the image and the user's request. "
        "You are not actually required to perform or execute the user's request."
    )

    def __init__(self, api_key):
        """Create the underlying ``OpenAI`` client with the given API key."""
        self.client = OpenAI(api_key=api_key)

    def chat_completion(
        self,
        messages,
        model="o1",
        max_tokens=300,
        temperature=0.1,
        response_format=None
    ):
        """Call the chat-completions endpoint and return the reply text.

        Args:
            messages: Chat messages passed through to the API unchanged.
            model: One of "gpt-4o", "gpt-4o-mini" or "o1".
            max_tokens: Sent only for "gpt-4o" / "gpt-4o-mini"; ignored for "o1".
            temperature: Sent only for "gpt-4o" / "gpt-4o-mini"; ignored for "o1".
            response_format: Passed through when truthy; otherwise
                ``{"type": "text"}`` is used.

        Returns:
            The content of the first choice's message, or ``None`` if anything
            goes wrong.  Every exception is caught and printed, including the
            ``NotImplementedError`` raised for an unsupported ``model``, so an
            unsupported model also results in ``None`` rather than a raise.
        """
        try:
            request_kwargs = {
                "model": model,
                "messages": messages,
                "response_format": response_format if response_format else {"type": "text"},
            }
            if model in self._SAMPLING_MODELS:
                request_kwargs["max_tokens"] = max_tokens
                request_kwargs["temperature"] = temperature
            elif model not in self._REASONING_MODELS:
                raise NotImplementedError("This model is not supported")
            # NOTE(review): "o1" is called without max_tokens/temperature,
            # presumably because that model rejects them -- confirm against
            # the OpenAI API reference.

            response = self.client.chat.completions.create(**request_kwargs)
            return response.choices[0].message.content

        except Exception as e:
            # Best-effort: report the failure and let the caller handle None.
            print(f"Error calling LLM: {e}")
            return None

    def extract_objects_from_request(self, image_path, user_text, model="gpt-4o"):
        """Ask the model which objects the user wants detected/segmented.

        Args:
            image_path: Path of the image; encoded with ``encode_image``.
            user_text: The user's request, sent alongside the image.
            model: Model name forwarded to ``chat_completion``.

        Returns:
            A list of lower-cased, whitespace-stripped object names parsed
            from the model's comma-separated reply (empty entries dropped).
            Returns ``[]`` when the model call yields no text, and ``None``
            when the image could not be encoded.
        """
        base64_image = encode_image(image_path)
        if not base64_image:
            return None

        messages = [
            {"role": "system", "content": self._EXTRACTION_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_text},
                    {
                        "type": "image_url",
                        "image_url": {
                            # NOTE: the MIME type is fixed to image/jpeg
                            # regardless of the actual file format.
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high"
                        }
                    }
                ]
            }
        ]

        result = self.chat_completion(messages, model=model)
        if not result:
            return []

        # Split the comma-separated reply, normalise case/whitespace and
        # drop empty fragments such as those produced by trailing commas.
        return [
            obj.strip().lower()
            for obj in result.split(",")
            if obj.strip()
        ]

In [None]:
# Example: calling chat_completion() and printing its output
import os
from dotenv import load_dotenv

# Locate the project root so the .env file can be found regardless of the
# directory the notebook is run from.
project_root = find_project_root()
dotenv_path = os.path.join(project_root, ".env")
load_dotenv(dotenv_path) # load the variables defined in the .env file into os.environ

# The API key is read from the environment (populated by load_dotenv above).
api_key = os.getenv("OPENAI_API_KEY")
client = VLMTool(api_key=api_key)
chat_messages = [
    {"role": "system", "content": "You are a creative poet."},
    {"role": "user", "content": "Write a short poem about a lonely star."}
]

# "gpt-4o" is one of the models for which max_tokens and temperature are
# actually forwarded to the API.
response = client.chat_completion(
    messages=chat_messages,
    model="gpt-4o",
    max_tokens=150,
    temperature=0.8
)
print(response)

In the velvet cloak of night so deep,  
A solitary star begins to weep.  
It hangs alone in the vast expanse,  
Yearning for a kindred glance.  

Surrounded by an endless void,  
Its silent glow is unalloyed.  
A shimmering tear in the cosmic seam,  
A beacon lost in an endless dream.  

Oh, lonely star in the sky's embrace,  
Does solitude enhance your grace?  
Or do you long for a companion's light,  
To share the burden of the night?  

Yet in your solitude, you shine,  
A gem of hope in the dark divine.  
Though you wander the heavens afar,  
You teach us the strength of a lonely star.  


In [None]:
# Example: calling extract_objects_from_request() and printing its output
import os
from dotenv import load_dotenv

# Load environment variables from the project's .env file.
project_root = find_project_root()
dotenv_path = os.path.join(project_root, ".env")
load_dotenv(dotenv_path)

api_key = os.getenv("OPENAI_API_KEY")
client = VLMTool(api_key=api_key)

# Path of the image to test (replace with the path to a real image on your machine)
image_path = r"C:\Users\KIST\agentic-object-detection\data\image1.jpg"


# Test case 1: detect every object in the image
user_request_1 = "이 이미지에 있는 모든 객체를 찾아줘"
objects_1 = client.extract_objects_from_request(image_path, user_request_1)
print(f"요청: {user_request_1}")
print(f"결과: {objects_1}")
print()

### 2. Agentic object detection pipeline

### 3. Running the object detector

### 4. Why Numbered Batching for Inferencing

### 5. Critiquing and Refining the Query

### 6. Validating Bounding Box Predictions Using the VLM