**[Pipeline Implementation]**

1. VLM Tool
2. Agentic object detection pipeline
3. Running the object detector
4. Why Numbered Batching for Inferencing
5. Critiquing and Refining the Query
6. Validating Bounding Box predictions using the VLM

### 1. VLM Tool



In [5]:
import base64

def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    The file is read in binary mode and the base64 output is decoded as
    UTF-8 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64-encoded file contents as ``str``, or ``None`` if reading
        or encoding raised an exception (the error is printed, not raised).
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode("utf-8")
    except Exception as err:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Error encoding image: {err}")
        return None

In [None]:
import base64
import json
from openai import OpenAI

# from utils.image_utils import encode_image

class VLMTool:
    """Small wrapper around an ``OpenAI`` client for chat completions."""

    def __init__(self, api_key):
        """Create the underlying ``OpenAI`` client with the given API key."""
        self.client = OpenAI(api_key=api_key)

    def chat_completion(
        self,
        messages,
        model="o1",
        max_tokens=300,
        temperature=0.1,
        response_format=None
    ):
        """Send *messages* to the chat-completions endpoint and return the reply text.

        Args:
            messages: Message list forwarded unchanged to the client.
            model: One of ``"gpt-4o"``, ``"gpt-4o-mini"`` or ``"o1"``.
            max_tokens: Forwarded only for ``"gpt-4o"`` / ``"gpt-4o-mini"``.
            temperature: Forwarded only for ``"gpt-4o"`` / ``"gpt-4o-mini"``.
            response_format: Forwarded as given when truthy; otherwise
                ``{"type": "text"}`` is sent.

        Returns:
            The ``content`` of the first choice's message, or ``None`` when
            anything raises (including an unsupported *model*); the error is
            printed rather than propagated.
        """
        try:
            if model in ("gpt-4o", "gpt-4o-mini"):
                sampling_args = {
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                }
            elif model in ("o1",):
                # The "o1" request is sent without max_tokens / temperature.
                sampling_args = {}
            else:
                raise NotImplementedError("This model is not supported")

            completion = self.client.chat.completions.create(
                model=model,
                messages=messages,
                response_format=response_format or {"type": "text"},
                **sampling_args,
            )
            first_choice = completion.choices[0]
            return first_choice.message.content

        except Exception as err:
            print(f"Error calling LLM: {err}")
            return None

    def extract_objects_from_request(self, image_path, user_text, model="gpt-4o"):
        """Placeholder: not implemented yet; currently does nothing and returns ``None``."""
        pass

In [None]:
import os
import argparse
from dotenv import load_dotenv

# Find project root using .project-root marker file
def find_project_root(marker_filename=".project-root"):
    """Walk upward from the current working directory to find the project root.

    Args:
        marker_filename: Name of the file whose presence marks the root.

    Returns:
        Absolute path of the nearest directory (starting at the current
        working directory and moving to each parent) that contains a regular
        file named *marker_filename*.

    Raises:
        FileNotFoundError: If the filesystem root is reached without finding
            the marker file.
    """
    candidate = os.path.abspath(os.getcwd())
    while not os.path.isfile(os.path.join(candidate, marker_filename)):
        parent = os.path.dirname(candidate)
        # dirname() of the filesystem root is the root itself: nothing left to search.
        if parent == candidate:
            raise FileNotFoundError(f"Could not find {marker_filename} in any parent directory.")
        candidate = parent
    return candidate

# Allow the project root to be passed explicitly; parse_known_args() is used so
# that any other command-line arguments are ignored instead of raising an error.
parser = argparse.ArgumentParser()
parser.add_argument("--project_root", type=str, default=None, help="Path to project root (containing .project-root)")
args, _ = parser.parse_known_args()

# Fall back to searching upward for the marker file when no root was given.
if args.project_root:
    project_root = args.project_root
else:
    project_root = find_project_root()

# Load environment variables from the .env file in the project root.
# (Fixed: the original line ended with two stray backticks, a syntax error.)
dotenv_path = os.path.join(project_root, ".env")
load_dotenv(dotenv_path)

# Smoke test: send a simple text-only prompt through the VLM wrapper.
api_key = os.getenv("OPENAI_API_KEY")
client = VLMTool(api_key=api_key)
chat_messages = [
    {"role": "system", "content": "You are a creative poet."},
    {"role": "user", "content": "Write a short poem about a lonely star."}
]

response = client.chat_completion(
    messages=chat_messages,
    model="gpt-4o",
    max_tokens=150,
    temperature=0.8
)
# chat_completion() returns None on failure, in which case this prints "None".
print(response)

In the vast velvet sky so wide,  
A lonely star takes its nightly stride.  
With a shimmer soft, it bravely gleams,  
Whispering secrets through cosmic dreams.  

No constellations to call its kin,  
Yet in solitude, its light begins.  
A beacon bright in night's embrace,  
Graceful in its silent, solitary space.  

It watches worlds with a gentle sigh,  
Dreaming of friends in the endless sky.  
Though alone, it never fades away,  
Guiding lost souls to a brand new day.  


### 2. Agentic object detection pipeline

### 3. Running the object detector

### 4. Why Numbered Batching for Inferencing

### 5. Critiquing and Refining the Query

### 6. Validating Bounding Box predictions using the VLM