In [1]:
import random
from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context
)

## Structured output testing

In [2]:
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from llama_index.llms.openai import OpenAI as OpenAILlamaIndex
from llama_index.llms.ollama import Ollama as OllamaLlamaIndex
from typing import Literal
from typing import Optional
from typing import List
import json

# Load variables from a .env file into the process environment
# (presumably this is where the OpenAI API key comes from -- verify).
load_dotenv()

# Client for the direct OpenAI SDK calls below; constructed without explicit arguments.
client = OpenAI()

# Structured-output schema for what surrounds the target object.
# Every field is Optional with no default declared; the recorded output below
# shows null for relations the user did not mention.
# NOTE: documented with comments rather than a docstring on purpose -- this class
# is passed to the LLM as a response schema, so its definition is left untouched.
class ContextDescription(BaseModel):
    left_of_object: Optional[str]  # presumably: what is to the left of the target object
    right_of_object: Optional[str]  # presumably: what is to the right of the target object
    behind_object: Optional[str]  # presumably: what is behind the target object
    in_front_of_object: Optional[str]  # presumably: what is in front of the target object
    above_object: Optional[str]  # presumably: what is above the target object
    below_object: Optional[str]  # presumably: what is below the target object
    additional_information: Optional[List[str]]  # free-form extra statements, or None

# Structured-output schema for the target object itself.
# Every field is Optional with no default declared.
class ObjectDescription(BaseModel):
    name: Optional[str]  # e.g. "knife" in the recorded output below
    position: Optional[str]  # e.g. "on a table" in the recorded output below
    size: Optional[str]
    texture: Optional[str]
    color: Optional[str]
    additional_information: Optional[List[str]]  # free-form extra statements, or None

# Structured-output schema for the room the object is in.
class RoomDescription(BaseModel):
    # Restricted to exactly these four labels (or None when not determinable).
    room_type: Optional[Literal["Living room", "Kitchen", "Bedroom", "Bathroom"]]
    size: Optional[str]  # e.g. "medium sized" in the recorded output below
    additional_information: Optional[List[str]]  # free-form extra statements, or None

# Top-level structured-output schema: groups the object, context and room
# sub-schemas. This is the class handed to the LLM as the response format.
class InitialDescription(BaseModel):
    target_object_description: Optional[ObjectDescription]
    context_description: Optional[ContextDescription]
    room_description: Optional[RoomDescription]
    additional_information: Optional[List[str]]  # statements that fit none of the sub-schemas, or None
    
    
# Free-text description that both structured-output approaches are tested on.
user_description = """The object is a knife, it's on a table in a medium sized room. The table is far from the wall. It appears to be a kitchen or living room. Left of the table which the knife is one is a TV and on the right on the wall behind the table are two windows."""

# System instruction: extract only what the user actually said.
description_system_prompt = "Your task is to turn a user's description of an object, its context and the room type into a structured response. When information is missing from the user's description, do not make up parts of the description, go ONLY off of the user's description."

# Ask the OpenAI SDK to parse the reply directly into InitialDescription.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": description_system_prompt},
        {"role": "user", "content": user_description},
    ],
    response_format=InitialDescription,
)

# The parsed pydantic object lives on the first choice's message.
first_message = completion.choices[0].message
structured_description_struct_output = first_message.parsed

pretty_description = json.dumps(structured_description_struct_output.model_dump(), indent=4)
print(pretty_description)

{
    "target_object_description": {
        "name": "knife",
        "position": "on a table",
        "size": null,
        "texture": null,
        "color": null,
        "additional_information": null
    },
    "context_description": {
        "left_of_object": "TV",
        "right_of_object": "two windows",
        "behind_object": null,
        "in_front_of_object": null,
        "above_object": null,
        "below_object": null,
        "additional_information": null
    },
    "room_description": {
        "room_type": null,
        "size": "medium sized",
        "additional_information": [
            "It appears to be a kitchen or living room.",
            "The table is far from the wall."
        ]
    },
    "additional_information": null
}


In [3]:
from dotenv import load_dotenv

from llama_index.core.prompts import ChatPromptTemplate
from llama_index.core.llms import ChatMessage
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

from pydantic import BaseModel, Field
from typing import Literal, Optional, List
import json

# Load variables from a .env file again (the first structured-output cell already calls this).
load_dotenv()

# Re-declaration of ContextDescription with the same fields as the one in the
# first structured-output cell; after this cell runs, the name refers to this class.
# Every field is Optional with no default declared.
class ContextDescription(BaseModel):
    left_of_object: Optional[str]  # presumably: what is to the left of the target object
    right_of_object: Optional[str]  # presumably: what is to the right of the target object
    behind_object: Optional[str]  # presumably: what is behind the target object
    in_front_of_object: Optional[str]  # presumably: what is in front of the target object
    above_object: Optional[str]  # presumably: what is above the target object
    below_object: Optional[str]  # presumably: what is below the target object
    additional_information: Optional[List[str]]  # free-form extra statements, or None

# Re-declaration of ObjectDescription with the same fields as the one in the
# first structured-output cell. Every field is Optional with no default declared.
class ObjectDescription(BaseModel):
    name: Optional[str]
    position: Optional[str]
    size: Optional[str]
    texture: Optional[str]
    color: Optional[str]
    additional_information: Optional[List[str]]  # free-form extra statements, or None

# Re-declaration of RoomDescription with the same fields as the one in the
# first structured-output cell.
class RoomDescription(BaseModel):
    # Restricted to exactly these four labels (or None when not determinable).
    room_type: Optional[Literal["Living room", "Kitchen", "Bedroom", "Bathroom"]]
    size: Optional[str]
    additional_information: Optional[List[str]]  # free-form extra statements, or None

# Re-declaration of InitialDescription with the same fields as the one in the
# first structured-output cell; this is the class passed to structured_predict below.
class InitialDescription(BaseModel):
    target_object_description: Optional[ObjectDescription]
    context_description: Optional[ContextDescription]
    room_description: Optional[RoomDescription]
    additional_information: Optional[List[str]]  # statements that fit none of the sub-schemas, or None
    
# LlamaIndex wrapper around the OpenAI chat API.
# NOTE(review): no model is pinned here, so the library default is used, while the
# direct OpenAI call above pins "gpt-4o-2024-08-06". Confirm whether the comparison
# below is meant to compare libraries only, and pin the same model if so.
llm = OpenAILlamaIndex()

# Single user message; {user_description} is filled in by structured_predict.
chat_prompt_tmpl = ChatPromptTemplate(
    message_templates=[
        ChatMessage.from_str(
            "Your task is to turn a user's description of an object, its context and the room type into a structured response. When information is missing from the user's description, do not make up parts of the description, go ONLY off of the user's description. Here is the description:\n {user_description}", role="user"
        )
    ]
)

# Reuse the exact description given to the direct OpenAI call so that both
# approaches are compared on identical input (previously a duplicated literal).
structured_description_llamaindex = llm.structured_predict(
    InitialDescription, chat_prompt_tmpl, user_description=user_description
)

print(json.dumps(structured_description_llamaindex.model_dump(), indent=4))

{
    "target_object_description": {
        "name": "knife",
        "position": "on a table",
        "size": null,
        "texture": null,
        "color": null,
        "additional_information": null
    },
    "context_description": {
        "left_of_object": "TV",
        "right_of_object": "two windows",
        "behind_object": "table",
        "in_front_of_object": null,
        "above_object": null,
        "below_object": null,
        "additional_information": null
    },
    "room_description": {
        "room_type": "Kitchen",
        "size": "medium sized",
        "additional_information": null
    },
    "additional_information": null
}


In [4]:
# Compare structured_description_struct_output (direct OpenAI) and
# structured_description_llamaindex (LlamaIndex) by printing, for every top-level
# field, the field name followed by the OpenAI value and the LlamaIndex value.
openai_fields = structured_description_struct_output.model_dump()
# model_dump() on both sides: .dict() is the deprecated pydantic v1 spelling.
llamaindex_fields = structured_description_llamaindex.model_dump()

# Look the LlamaIndex value up by field name instead of relying on both dicts
# happening to iterate in the same order.
for field_name, openai_value in openai_fields.items():
    print(field_name, openai_value, llamaindex_fields[field_name])
    

target_object_description {'name': 'knife', 'position': 'on a table', 'size': None, 'texture': None, 'color': None, 'additional_information': None} {'name': 'knife', 'position': 'on a table', 'size': None, 'texture': None, 'color': None, 'additional_information': None}
context_description {'left_of_object': 'TV', 'right_of_object': 'two windows', 'behind_object': None, 'in_front_of_object': None, 'above_object': None, 'below_object': None, 'additional_information': None} {'left_of_object': 'TV', 'right_of_object': 'two windows', 'behind_object': 'table', 'in_front_of_object': None, 'above_object': None, 'below_object': None, 'additional_information': None}
room_description {'room_type': None, 'size': 'medium sized', 'additional_information': ['It appears to be a kitchen or living room.', 'The table is far from the wall.']} {'room_type': 'Kitchen', 'size': 'medium sized', 'additional_information': None}
additional_information None None


## AI2Thor wrapper

In [5]:
from ai2thor.controller import Controller
import ai2thor
import random
from PIL import Image
import base64
from openai import OpenAI
from llama_index.llms.openai import OpenAI as OpenAILlamaIndex
from llama_index.llms.ollama import Ollama as OllamaLlamaIndex
import time

In [6]:
def encode_image(image_path):
    """
    Reads a file and returns its contents base64-encoded.

    Parameters
    ----------
    image_path : str
        Path of the file to read (opened in binary mode).

    Returns
    -------
    str
        The base64 encoding of the file's bytes, decoded as UTF-8 text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')


class AI2ThorClient: 
    """
    An AI2Thor instance with methods wrapping its controller.
    """

    def __init__(self):
        self._controller = Controller(
            agentMode="default",
            visibilityDistance=1.5,
            scene="FloorPlan212",

            # step sizes
            
            gridSize=0.25,
            snapToGrid=True,
            rotateStepDegrees=90,

            # image modalities
            renderDepthImage=False,
            renderInstanceSegmentation=False,

            # camera properties
            width=512,
            height=512,
            fieldOfView=90
            )
        self._events = []
        self._llm_ollama = OllamaLlamaIndex(model="llama3.2", request_timeout=120.0)
        self._llm_openai = OpenAILlamaIndex(model="gpt-4o-2024-08-06")
        self._llm_openai_multimodal = OpenAI()
        
    def _get_image(self):
        return Image.fromarray(self._controller.last_event.frame)
    
    def _rotate_to_coordinates(self, x, y, z):
        """AI2ThorClient
        Rotates the agent to face the given coordinates.
        
        Parameters
        ----------
        x : float
            The x-coordinate to face.
        y : float
            The y-coordinate to face.
        z : float
            The z-coordinate to face.
            
        Returns None
        """
        pass
    
    def describe_scene_from_image(self):
        """
        Describes the scene using an image-to-text model.
    
        Returns
        -------
        str
            A string describing the current scene.
        """
        
        image = self._get_image()
        
        img_path  = f"log/img/{str(time.time())}.jpg"
        image.save(img_path)
        
        encoded_image = encode_image(img_path)
        
        response = self._llm_openai_multimodal.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Imagine this is your point-of-view. Describe what you see in this virtual environment. Write from the first perspective so start your message with 'I'. First, describe the objects, their colors, and their positions. Don't introduce your description. Start describing directly e.g. 'I currently see a <object> on a <surface> ...'. Be objective in your description! Finally describe the room type: it's either a living room, kitchen, bedroom, or bedroom. It can't be anything else. If you can't infer the room type, just say so.",
                        },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url":  f"data:image/jpeg;base64,{encoded_image}"
                            },
                        },
                    ],
                },
            ],
        )

        return response.choices[0].message.content
        
    
    def step(self, direction: str = "MoveAhead", magnitude: float = None) -> ai2thor.server.Event:
        """
        Robot takes one step in given direction.
    
        Returns None
        """
        self._controller.step(
            action=direction,
            moveMagnitude=magnitude)
        
        self._events.append(self._controller.last_event.metadata)
        
    def look(self, direction: str = "LookUp") -> None:
        """
        Robot looks up or down.
    
        Returns None
        """
        self._controller.step(
            action=direction,
            degrees=30
            )
        
        self._events.append(self._controller.last_event.metadata)
    
    def rotate(self, direction: str, degrees: float = None) -> None:
        """
        Robot turns in given direction.
        
        Parameters
        ----------
        direction : str
            Direction to turn in. Can be "RotateLeft" or "RotateRight".
        
        Returns None
        """
        self._controller.step(
            action=direction,
            degrees=degrees
            )
        
        self._events.append(self._controller.last_event.metadata)

    def crouch(self):
        """
        Robot crouches.
    
        Returns None
        """
        self._controller.step(action="Crouch")
        
        self._events.append(self._controller.last_event.metadata)
        
    def stand(self):
        """
        Robot stands.
    
        Returns None
        """
        self._controller.step(action="Stand")
        
        self._events.append(self._controller.last_event.metadata)
        
    def teleport(self, position: dict = None, rotation: dict = None, horizon: float = None, standing: bool = None, to_random: bool = False) -> None:
        """
        Robot teleports to random location.
    
        Returns None
        """
        
        if to_random:
            rotation = {"x": random.randint(0, 360), "y": random.randint(0, 360), "z": random.randint(0, 360)}
            positions = self._controller.step(action="GetReachablePositions").metadata["actionReturn"]
            position = random.choice(positions)
            
        self._controller.step(
            action="Teleport",
            position=position,
            rotation=rotation,
            horizon=horizon,
            standing=standing
        )
        
        self._events.append(self._controller.last_event.metadata)
    
    def done(self) -> None:
        """
        The Done action does nothing to the state of the environment. 
        But, it returns a cleaned up event with respect to the metadata.
    
        Returns None
        """
        self._controller.step(action="Done")
        
        self._events.append(self._controller.last_event.metadata)

In [7]:
# Create the wrapper; its constructor builds the AI2-THOR Controller
# (scene "FloorPlan212") and the Ollama / OpenAI clients.
thor = AI2ThorClient()

In [8]:
# Move the agent to a randomly chosen reachable position ...
thor.teleport(to_random=True)
# ... then turn right; degrees is left unset, so the controller decides the angle
# (presumably the rotateStepDegrees=90 configured in AI2ThorClient -- verify).
thor.rotate(direction="RotateRight")

In [9]:
# Send the current frame to the "gpt-4o" model and show its first-person description.
thor.describe_scene_from_image()

"I'm currently seeing a green sofa with three cushions on it. To the right of the sofa, there's a black remote placed on the seat. A red cushion is also set on the left cushion of the sofa. There are two more chairs, each with a green cushion and positioned on either side of the sofa. In the background, I notice two vertical windows allowing natural light to enter the room. I would identify this area as a living room."

In [10]:
# Teleport again so that a fresh event is appended to the wrapper's log.
thor.teleport(to_random=True)
# NOTE(review): reads the wrapper's private _events list directly.
last_event = thor._events[-1]
# The event metadata carries one entry per object under "objects".
objects = last_event["objects"]

# Print the id of every object in the metadata.
for obj in objects:
    print(obj["objectId"])

ArmChair|-00.27|+00.00|+01.87
ArmChair|+02.66|+00.00|+01.86
Boots|+04.00|+00.00|+01.70
Box|-00.47|+01.04|-00.71
CoffeeTable|+01.59|00.00|+00.45
CreditCard|+01.41|+00.47|+00.65
Drawer|+03.88|+00.77|+00.86
Floor|+00.00|+00.00|+00.00
FloorLamp|+03.61|+00.00|+02.16
GarbageCan|+03.83|-00.03|-00.50
HousePlant|+00.39|+00.80|-00.73
KeyChain|+01.50|+00.47|+00.53
Laptop|+01.80|+00.47|+00.50
LightSwitch|-01.40|+01.29|+01.84
Newspaper|+02.15|+00.41|-00.72
Painting|+04.07|+01.95|+00.85
Pen|+03.93|+00.87|+01.04
Pencil|+03.89|+00.87|+01.18
Pillow|+00.65|+00.39|+01.71
RemoteControl|+01.88|+00.33|+01.73
Shelf|-00.29|+00.59|-00.73
Shelf|+01.91|+00.20|-00.73
Shelf|-00.29|+00.20|-00.73
Shelf|+01.91|+00.59|-00.73
SideTable|+03.95|+00.00|+00.86
Sofa|+01.19|+00.01|+01.87
Statue|-00.09|+00.03|-00.70
Statue|-00.54|+00.40|-00.69
Television|+01.90|+01.28|-00.84
TissueBox|+03.92|+00.87|+00.68
TVStand|-00.29|00.00|-00.77
TVStand|+01.90|00.00|-00.77
WateringCan|+01.62|+00.02|-00.70
Window|+01.57|+02.07|+02.49
Windo

In [11]:
# Text-only completion against the Ollama "llama3.2" client held by the wrapper;
# no image is attached to this request.
thor._llm_ollama.complete("What do you see?")

CompletionResponse(text="I don't have the ability to see or perceive the physical world. I'm a text-based AI assistant, and my interactions are limited to processing and generating text. I can understand and respond to text-based inputs, but I don't have visual capabilities or direct access to sensory information.\n\nHowever, I can help facilitate discussions about what you see, provide information about visual topics, or assist with tasks that involve analyzing images or videos if you'd like.", additional_kwargs={'tool_calls': []}, raw={'model': 'llama3.2', 'created_at': '2024-11-15T09:08:54.133358Z', 'message': {'role': 'assistant', 'content': "I don't have the ability to see or perceive the physical world. I'm a text-based AI assistant, and my interactions are limited to processing and generating text. I can understand and respond to text-based inputs, but I don't have visual capabilities or direct access to sensory information.\n\nHowever, I can help facilitate discussions about wh

## Workflow

In [12]:
# Returned by evaluate_initial_description and clarify_initial_description;
# accepted by find_correct_room_type.
class InitialDescriptionComplete(Event):
    payload: str  # human-readable status message

# Returned by evaluate_initial_description; accepted by clarify_initial_description.
class InitialDescriptionIncomplete(Event):
    payload: str  # human-readable status message

# NOTE(review): no step of ThorFindsObject in the visible notebook returns or
# accepts this event (the workflow ends with StopEvent instead).
class ObjectFound(Event):
    payload: str  # human-readable status message

# Returned by suggest_object and accepted by suggest_object again (retry loop).
class WrongObjectSuggested(Event):
    payload: str  # human-readable status message

# Returned by find_correct_room_type; accepted by find_object_in_room.
class RoomCorrect(Event):
    payload: str  # human-readable status message

# Returned by find_correct_room_type and accepted by find_correct_room_type again (retry loop).
class RoomIncorrect(Event):
    payload: str  # human-readable status message

# Returned by find_object_in_room; accepted by suggest_object.
class ObjectInRoom(Event):
    payload: str  # human-readable status message

# Returned by find_object_in_room and suggest_object; accepted by
# find_correct_room_type, which sends the search back to room selection.
class ObjectNotInRoom(Event):
    payload: str  # human-readable status message

# NOTE(review): not referenced anywhere in the visible notebook -- confirm whether it is still needed.
number = 1

class ThorFindsObject(Workflow):
    """
    Skeleton of the object-search workflow.

    Each step currently decides its outcome with a random draw instead of
    real perception, so the class only exercises the event routing.
    """

    def __init__(self, timeout: int = 10, verbose: bool = False):
        super().__init__(timeout=timeout, verbose=verbose)
        # A dedicated AI2-THOR client is created for every workflow instance.
        self.thor = AI2ThorClient()

    @step
    async def evaluate_initial_description(self, ev: StartEvent) -> InitialDescriptionComplete | InitialDescriptionIncomplete:
        """Entry step: randomly reports the initial description as complete or incomplete."""
        description_is_complete = random.randint(0, 1) == 0
        if not description_is_complete:
            return InitialDescriptionIncomplete(payload="Initial description is incomplete.")
        return InitialDescriptionComplete(payload="Initial description is complete.")

    @step
    async def clarify_initial_description(self, ev: InitialDescriptionIncomplete) -> InitialDescriptionComplete:
        """Always turns an incomplete description into a complete one."""
        clarified = InitialDescriptionComplete(payload="Description clarified.")
        return clarified

    @step
    async def find_correct_room_type(self, ev: InitialDescriptionComplete | RoomIncorrect | ObjectNotInRoom) -> RoomCorrect | RoomIncorrect:
        """Randomly reports whether the current room is the right one."""
        # Hard-coded sample scene description; assigned but not used by the decision below.
        current_description = """a living room setup viewed from behind a dark-colored couch. The room has light-colored walls and a floor that seems to be a muted, earthy tone. The main items in the room include:
- A large, dark-colored sofa in the foreground facing a TV.
- A television placed on a small white TV stand, positioned along the far wall.
- A small side table with a blue vase and a decorative item beside the TV stand.
- A wooden shelf or cabinet off to the left side of the room."""

        room_matches = random.randint(0, 1) == 0
        if not room_matches:
            return RoomIncorrect(payload="Correct room is not found.")
        return RoomCorrect(payload="Correct room is found.")

    @step
    async def find_object_in_room(self, ev: RoomCorrect) -> ObjectInRoom | ObjectNotInRoom:
        """Randomly reports whether the object may be in the room (draw of 0-10 below 4)."""
        draw = random.randint(0, 10)
        if draw >= 4:
            return ObjectNotInRoom(payload="Object is not in this room.")
        return ObjectInRoom(payload="Object may be in this room.")

    @step
    async def suggest_object(self, ev: ObjectInRoom | WrongObjectSuggested) -> WrongObjectSuggested | ObjectNotInRoom | StopEvent:
        """Randomly ends the workflow, retries the suggestion, or gives up on the room."""
        # First draw: success ends the workflow.
        if random.randint(0, 1) == 0:
            return StopEvent(result="We found the object!")  # End the workflow

        # Second draw (only reached on failure): retry here or leave the room.
        if random.randint(0, 1) == 0:
            return WrongObjectSuggested(payload="Couldn't find object in this room.")
        return ObjectNotInRoom(payload="Object is not in this room.")

import asyncio

# Initialize and run the workflow.
# Constructing ThorFindsObject also constructs a new AI2ThorClient (see its __init__),
# separate from the `thor` instance created earlier in the notebook.
w = ThorFindsObject(timeout=10, verbose=False)
# Top-level await: relies on the notebook kernel running inside an event loop.
result = await w.run()
# The result is whatever the final StopEvent carried.
print(result)


We found the object!


In [13]:
from llama_index.utils.workflow import draw_all_possible_flows

# Render every step/event transition of the workflow class into possible_flows.html.
draw_all_possible_flows(ThorFindsObject, filename="possible_flows.html")

<class 'NoneType'>
<class '__main__.InitialDescriptionComplete'>
<class '__main__.InitialDescriptionComplete'>
<class '__main__.InitialDescriptionIncomplete'>
<class '__main__.RoomCorrect'>
<class '__main__.RoomIncorrect'>
<class '__main__.ObjectInRoom'>
<class '__main__.ObjectNotInRoom'>
<class '__main__.WrongObjectSuggested'>
<class '__main__.ObjectNotInRoom'>
<class 'llama_index.core.workflow.events.StopEvent'>
possible_flows.html


In [14]:
!pip install chainlit

