In [2]:
import random
from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context
)

## Structured output testing

In [7]:
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from typing import Literal
from typing import Optional
from typing import List
import json

# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

# Raw OpenAI SDK client. No credentials are passed explicitly here.
# NOTE(review): presumably the API key comes from the environment populated above — confirm.
client = OpenAI()

# What surrounds the target object, one nullable slot per direction.
# Fields carry no default: each one must be present in the parsed payload,
# but may be null when the user's description does not mention it.
# (Kept as `#` comments on purpose: a pydantic class docstring would be
# emitted as the schema description and change what is sent to the model.)
class ContextDescription(BaseModel):
    left_of_object: str | None
    right_of_object: str | None
    behind_object: str | None
    in_front_of_object: str | None
    above_object: str | None
    below_object: str | None
    additional_information: list[str] | None

# Properties of the target object itself. Every field is required-but-nullable,
# so attributes the user never mentioned are returned as null.
class ObjectDescription(BaseModel):
    name: str | None
    position: str | None
    size: str | None
    texture: str | None
    color: str | None
    additional_information: list[str] | None

# The room the object was seen in. `room_type` is restricted to the four
# listed literals (or null); anything else has to go into the free-text fields.
class RoomDescription(BaseModel):
    room_type: Literal["Living room", "Kitchen", "Bedroom", "Bathroom"] | None
    size: str | None
    additional_information: list[str] | None

# Top-level structured response: object, surrounding context and room,
# plus a catch-all list for information that fits none of the three.
class InitialDescription(BaseModel):
    target_object_description: ObjectDescription | None
    context_description: ContextDescription | None
    room_description: RoomDescription | None
    additional_information: list[str] | None
    
    
# Free-text description used as the test input for both structured-output approaches.
user_description= """The object is a knife, it's on a table in a medium sized room. The table is far from the wall. It appears to be a kitchen or living room. Left of the table which the knife is one is a TV and on the right on the wall behind the table are two windows."""

# Ask the model to fill in `InitialDescription`; the SDK validates the reply
# against the pydantic model passed as `response_format`.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": "Your task is to turn a user's description of an object, its context and the room type into a structured response. When information is missing from the user's description, do not make up parts of the description, go ONLY off of the user's description."},
        {"role": "user", "content": user_description}
    ],
    response_format=InitialDescription,
)

parsed_message = completion.choices[0].message

# `parsed` is None when the model refuses to answer; fail with the refusal text
# instead of an opaque AttributeError on `.model_dump()` below.
if parsed_message.parsed is None:
    raise RuntimeError(
        f"Model returned no structured output (refusal: {parsed_message.refusal!r})"
    )

structured_description_struct_output = parsed_message.parsed
print(json.dumps(structured_description_struct_output.model_dump(), indent=4))

{
    "target_object_description": {
        "name": "knife",
        "position": "on a table",
        "size": null,
        "texture": null,
        "color": null,
        "additional_information": null
    },
    "context_description": {
        "left_of_object": "TV",
        "right_of_object": null,
        "behind_object": "two windows",
        "in_front_of_object": null,
        "above_object": null,
        "below_object": null,
        "additional_information": [
            "The table is far from the wall."
        ]
    },
    "room_description": {
        "room_type": null,
        "size": "medium sized",
        "additional_information": [
            "It appears to be a kitchen or living room."
        ]
    },
    "additional_information": null
}


In [None]:
from dotenv import load_dotenv

from llama_index.core.prompts import ChatPromptTemplate
from llama_index.core.llms import ChatMessage
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

from pydantic import BaseModel, Field
from typing import Literal, Optional, List
import json

# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

# NOTE(review): the four models below repeat, field for field, the definitions in the
# previous code cell. Running this cell rebinds the same class names to new class
# objects. Consider defining them once and reusing them in both experiments.

# What surrounds the target object, one nullable slot per direction.
class ContextDescription(BaseModel):
    left_of_object: Optional[str]
    right_of_object: Optional[str]
    behind_object: Optional[str]
    in_front_of_object: Optional[str]
    above_object: Optional[str]
    below_object: Optional[str]
    additional_information: Optional[List[str]]

# Properties of the target object itself.
class ObjectDescription(BaseModel):
    name: Optional[str]
    position: Optional[str]
    size: Optional[str]
    texture: Optional[str]
    color: Optional[str]
    additional_information: Optional[List[str]]

# The room the object was seen in; `room_type` is limited to the listed literals or None.
class RoomDescription(BaseModel):
    room_type: Optional[Literal["Living room", "Kitchen", "Bedroom", "Bathroom"]]
    size: Optional[str]
    additional_information: Optional[List[str]]

# Top-level structured response combining object, context and room.
class InitialDescription(BaseModel):
    target_object_description: Optional[ObjectDescription]
    context_description: Optional[ContextDescription]
    room_description: Optional[RoomDescription]
    additional_information: Optional[List[str]]
    
# `structured_predict` is a method of the LlamaIndex LLM wrapper. The bare name
# `OpenAI` in this notebook is the raw `openai.OpenAI` SDK client, which has no
# such method, so import the wrapper explicitly under an unambiguous alias.
from llama_index.llms.openai import OpenAI as OpenAILlamaIndex

# Same model as the OpenAI-SDK experiment above, so the comparison between the
# two approaches is like for like.
llm = OpenAILlamaIndex(model="gpt-4o-2024-08-06")

# Single user message; `{user_description}` is filled in by `structured_predict`.
chat_prompt_tmpl = ChatPromptTemplate(
    message_templates=[
        ChatMessage.from_str(
            "Your task is to turn a user's description of an object, its context and the room type into a structured response. When information is missing from the user's description, do not make up parts of the description, go ONLY off of the user's description. Here is the description:\n {user_description}", role="user"
        )
    ]
)

# Keyword arguments after the prompt are used as template variables.
structured_description_llamaindex = llm.structured_predict(
    InitialDescription, chat_prompt_tmpl, user_description="The object is a knife, it's on a table in a medium sized room. The table is far from the wall. It appears to be a kitchen or living room. Left of the table which the knife is one is a TV and on the right on the wall behind the table are two windows."
)

print(json.dumps(structured_description_llamaindex.model_dump(), indent=4))

{
    "target_object_description": {
        "name": "knife",
        "position": "on a table",
        "size": null,
        "texture": null,
        "color": null,
        "additional_information": null
    },
    "context_description": {
        "left_of_object": "TV",
        "right_of_object": "two windows",
        "behind_object": "wall",
        "in_front_of_object": null,
        "above_object": null,
        "below_object": null,
        "additional_information": null
    },
    "room_description": {
        "room_type": "Kitchen",
        "size": "medium sized",
        "additional_information": null
    },
    "additional_information": null
}


In [9]:
# Compare structured_description_llamaindex and structured_description_struct_output by printing the values side by side
# Both results are dumped with `model_dump()` (`.dict()` is the deprecated pydantic v1 spelling)
# and paired by field name rather than by position, so a difference in field
# order cannot silently line up unrelated values.
openai_fields = structured_description_struct_output.model_dump()
llamaindex_fields = structured_description_llamaindex.model_dump()

for field_name, openai_value in openai_fields.items():
    print(field_name, openai_value, llamaindex_fields[field_name])
    

target_object_description {'name': 'knife', 'position': 'on a table', 'size': None, 'texture': None, 'color': None, 'additional_information': None} {'name': 'knife', 'position': 'on a table', 'size': None, 'texture': None, 'color': None, 'additional_information': None}
context_description {'left_of_object': 'TV', 'right_of_object': None, 'behind_object': 'two windows', 'in_front_of_object': None, 'above_object': None, 'below_object': None, 'additional_information': ['The table is far from the wall.']} {'left_of_object': 'TV', 'right_of_object': 'two windows', 'behind_object': 'wall', 'in_front_of_object': None, 'above_object': None, 'below_object': None, 'additional_information': None}
room_description {'room_type': None, 'size': 'medium sized', 'additional_information': ['It appears to be a kitchen or living room.']} {'room_type': 'Kitchen', 'size': 'medium sized', 'additional_information': None}
additional_information None None


## AI2Thor wrapper

In [41]:
from ai2thor.controller import Controller
import ai2thor
import random
from PIL import Image
import base64
from openai import OpenAI
from llama_index.llms.openai import OpenAI as OpenAILlamaIndex
from llama_index.llms.ollama import Ollama as OllamaLlamaIndex
import time

In [57]:
def encode_image(image_path):
    """
    Read a file and return its contents as base64 text.

    Parameters
    ----------
    image_path : str
        Path of the file to read; it is opened in binary mode.

    Returns
    -------
    str
        The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


class AI2ThorClient:
    """
    An AI2Thor instance with methods wrapping its controller.

    Every action method sends a single action to the controller and appends
    the metadata of the resulting event to ``self._metadata``.
    """

    # Directory the snapshots taken by `describe_scene_from_image` are saved to.
    _IMAGE_LOG_DIR = "log/img"

    def __init__(self):
        self._controller = Controller(
            agentMode="default",
            visibilityDistance=1.5,
            scene="FloorPlan212",

            # step sizes
            gridSize=0.25,
            snapToGrid=True,
            rotateStepDegrees=90,

            # image modalities
            renderDepthImage=False,
            renderInstanceSegmentation=False,

            # camera properties
            width=512,
            height=512,
            fieldOfView=90
            )
        # Metadata dict of every recorded action, oldest first.
        self._metadata = []
        self._llm_ollama = OllamaLlamaIndex(model="llama3.2", request_timeout=120.0)
        self._llm_openai = OpenAILlamaIndex(model="gpt-4o-2024-08-06")
        # Raw OpenAI SDK client (not a LlamaIndex wrapper); used for the image prompt.
        self._llm_openai_multimodal = OpenAI()

    def _act(self, **action_kwargs):
        """
        Send one action to the controller and record the resulting metadata.

        Parameters
        ----------
        **action_kwargs
            Passed unchanged to ``Controller.step``.

        Returns
        -------
        The controller's ``last_event`` after the action.
        """
        self._controller.step(**action_kwargs)
        event = self._controller.last_event
        self._metadata.append(event.metadata)
        return event

    def _get_image(self):
        """Return the controller's latest frame as a PIL image."""
        return Image.fromarray(self._controller.last_event.frame)

    def describe_scene_from_image(self):
        """
        Describes the scene using an image-to-text model.

        The current frame is saved as a JPEG under ``log/img/`` (the directory
        is created when missing), base64-encoded and sent to the model
        together with a fixed question.

        Returns
        -------
        str
            A string describing the current scene.
        """
        import os  # local import: only needed to create the log directory

        image = self._get_image()

        # Image.save does not create parent directories, so make sure the
        # target directory exists before writing the snapshot.
        os.makedirs(self._IMAGE_LOG_DIR, exist_ok=True)
        img_path = f"{self._IMAGE_LOG_DIR}/{str(time.time())}.jpg"
        image.save(img_path)

        encoded_image = encode_image(img_path)

        response = self._llm_openai_multimodal.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "What is in this image?",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{encoded_image}"
                            },
                        },
                    ],
                },
            ],
        )

        return response.choices[0].message.content

    def step(self, direction: str = "MoveAhead", magnitude: float = None) -> "ai2thor.server.Event":
        """
        Robot takes one step in given direction.

        Parameters
        ----------
        direction : str
            Name of the action to send to the controller.
        magnitude : float, optional
            Forwarded as ``moveMagnitude``; ``None`` is passed through as-is.

        Returns
        -------
        ai2thor.server.Event
            The event produced by the action.
        """
        return self._act(action=direction, moveMagnitude=magnitude)

    def look(self, direction: str = "LookUp") -> None:
        """
        Robot looks up or down.

        The action named by ``direction`` is sent with a fixed 30 degrees.

        Returns None
        """
        self._act(action=direction, degrees=30)

    def rotate(self, direction: str, degrees: float = None) -> None:
        """
        Robot turns in given direction.

        Parameters
        ----------
        direction : str
            Direction to turn in. Can be "RotateLeft" or "RotateRight".
        degrees : float, optional
            Forwarded as ``degrees``; ``None`` is passed through as-is.

        Returns None
        """
        self._act(action=direction, degrees=degrees)

    def crouch(self):
        """
        Robot crouches.

        Returns None
        """
        self._act(action="Crouch")

    def stand(self):
        """
        Robot stands.

        Returns None
        """
        self._act(action="Stand")

    def teleport(self, position: dict = None, rotation: dict = None, horizon: float = None, standing: bool = None, to_random: bool = False) -> None:
        """
        Robot teleports to the given pose, or to a random reachable one.

        Parameters
        ----------
        position, rotation, horizon, standing
            Forwarded unchanged to the "Teleport" action.
        to_random : bool
            When True, ``position`` and ``rotation`` are ignored and replaced
            by a randomly chosen reachable position and a random heading.

        Returns None
        """
        if to_random:
            # Randomise the heading (rotation about the vertical y axis) only.
            # x and z are kept at 0: randomising them would ask for a pitched
            # or rolled agent body, which is not a meaningful agent pose.
            rotation = {"x": 0, "y": random.randint(0, 360), "z": 0}
            # This query goes straight to the controller and is deliberately
            # not recorded in self._metadata; only the teleport itself is.
            positions = self._controller.step(action="GetReachablePositions").metadata["actionReturn"]
            position = random.choice(positions)

        self._act(
            action="Teleport",
            position=position,
            rotation=rotation,
            horizon=horizon,
            standing=standing
        )

    def done(self) -> None:
        """
        The Done action does nothing to the state of the environment.
        But, it returns a cleaned up event with respect to the metadata.

        Returns None
        """
        self._act(action="Done")

In [58]:
# Create the wrapper; its constructor builds the AI2-THOR Controller and the LLM clients.
thor = AI2ThorClient()

In [23]:
# Move the agent to a random reachable position, then turn it to the right.
# Both methods return None, so their result is not bound to a name.
thor.teleport(to_random=True)
thor.rotate(direction="RotateRight")

In [59]:
# Ask the vision model to describe the agent's current camera frame; the returned string is the cell output.
thor.describe_scene_from_image()

"The image shows an interior space with wooden flooring. There are several doors visible, each with white paneling and brass handles. In the foreground on the left, there's an open cardboard box on a red surface, and nearby there is an object that resembles a shoehorn. The walls are painted in light colors, contributing to a clean and minimalist appearance."

In [None]:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core.schema import ImageDocument
import base64
from openai import OpenAI
import time

# Grab the agent's current camera frame and save it as a JPEG in the working
# directory so it can be base64-encoded for the request below.
image = thor._get_image()
img_path = f"{str(time.time())}.jpg"
image.save(img_path)

# `encode_image` is already defined in the AI2Thor wrapper cell that `thor`
# comes from; it is reused here instead of being redefined, which would
# silently shadow that definition.
encoded_image = encode_image(img_path)

client = OpenAI()

# One user message carrying both the question and the image as a data URL.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What is in this image?",
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_image}"
                    },
                },
            ],
        }
    ],
)

print(response.choices[0].message.content)

The image depicts an interior space that appears to be a room or hallway. There are walls painted in different colors, with a light-colored wall on the left and a darker wall on the right. There are several closed doors visible, one of which is white. In the foreground, there is a wooden table or shelf with a cardboard box on it and what looks like a shoe or object next to the box. The flooring seems to be wooden. The overall atmosphere suggests a simple, possibly minimalistic interior design.


In [None]:
# Text-only completion against the Ollama-backed LLM held by the wrapper; no image is sent with this prompt.
thor._llm_ollama.complete("What do you see?")

CompletionResponse(text='I don\'t have a physical presence, so I don\'t see things like humans do. However, I can process and analyze vast amounts of information that is available to me through various inputs such as text data, images, or audio files.\n\nIn terms of visual perception, I can:\n\n1. **Recognize text**: I can understand and process written text, including emails, articles, documents, and more.\n2. **Process images**: While I don\'t "see" in the classical sense, I can analyze and understand images, including recognizing objects, scenes, and activities.\n3. **Understand audio**: I can transcribe and interpret spoken language from various sources like podcasts, audiobooks, or voice messages.\n\nPlease keep in mind that my perception is limited to text-based data and doesn\'t involve direct sensory inputs like sight, sound, touch, taste, or smell.\n\nHow can I assist you today?', additional_kwargs={'tool_calls': []}, raw={'model': 'llama3.2', 'created_at': '2024-11-10T16:18:0

## Workflow

In [18]:
# Events passed between the steps of the ThorFindsObject workflow.
# Each one carries a single string `payload`.

# Returned by ask_initial_description / clarify_initial_description; consumed by random_teleport.
class InitialDescriptionComplete(Event):
    payload: str

# Returned by ask_initial_description; consumed by clarify_initial_description.
class InitialDescriptionIncomplete(Event):
    payload: str

# NOTE(review): not returned or consumed by any step in the visible workflow — confirm whether it is still needed.
class ObjectFound(Event):
    payload: str

# Returned by suggest_object and consumed by suggest_object again.
class WrongObjectSuggested(Event):
    payload: str

# Returned by random_teleport; consumed by find_object_in_room.
class RoomCorrect(Event):
    payload: str

# Returned by random_teleport and consumed by random_teleport again.
class RoomIncorrect(Event):
    payload: str

# Returned by find_object_in_room; consumed by suggest_object.
class ObjectInRoom(Event):
    payload: str

# Returned by find_object_in_room / suggest_object; consumed by random_teleport.
class ObjectNotInRoom(Event):
    payload: str

# NOTE(review): `number` is not referenced anywhere in the visible part of the notebook — confirm before removing.
number = 1

class ThorFindsObject(Workflow):
    """
    Mock of the object-search workflow.

    No step does real work yet: each one picks its outcome with a random draw,
    prints what happened and returns the matching event.
    """

    @step
    async def ask_initial_description(self, ev: StartEvent) -> InitialDescriptionComplete | InitialDescriptionIncomplete:
        """Prompt for a description and randomly judge it complete or incomplete."""
        print("Thor: Tell me what you saw in detail. Describe the object, its context and the type of room you saw.")

        description_is_complete = random.randint(0, 1) == 0
        if not description_is_complete:
            print("Initial description is incomplete.")
            return InitialDescriptionIncomplete(payload="Initial description is incomplete.")

        print("Initial description is complete.")
        return InitialDescriptionComplete(payload="Initial description is complete.")

    @step
    async def clarify_initial_description(self, ev: InitialDescriptionIncomplete) -> InitialDescriptionComplete:
        """Turn an incomplete description into a complete one (always succeeds)."""
        print("Initial description clarified.")
        clarified = InitialDescriptionComplete(payload="Description clarified.")
        return clarified

    @step
    async def random_teleport(self, ev: InitialDescriptionComplete | RoomIncorrect | ObjectNotInRoom) -> RoomCorrect | RoomIncorrect:
        """Pretend to teleport; the room type is correct on a 0 from a 0/1 draw."""
        landed_in_correct_room = random.randint(0, 1) == 0
        if not landed_in_correct_room:
            print("Teleported to incorrect room type.")
            return RoomIncorrect(payload="Correct room is not found.")

        print("Teleported to correct room type.")
        return RoomCorrect(payload="Correct room is found.")

    @step
    async def find_object_in_room(self, ev: RoomCorrect) -> ObjectInRoom | ObjectNotInRoom:
        """Decide whether the object may be in the room (draw from 0..10 below 4)."""
        draw = random.randint(0, 10)
        if draw >= 4:
            print("Object is not in this room.")
            return ObjectNotInRoom(payload="Object is not in this room.")

        print("Object may be in this room.")
        return ObjectInRoom(payload="Object may be in this room.")

    @step
    async def suggest_object(self, ev: ObjectInRoom | WrongObjectSuggested) -> WrongObjectSuggested | ObjectNotInRoom | StopEvent:
        """Suggest a candidate object; may loop, give up on the room, or finish."""
        if random.randint(0, 10) < 8:
            print("Wrong object suggested.")
            return WrongObjectSuggested(payload="Couldn't find object in this room.")

        # A second, independent draw is made only when the first one was 8 or more.
        if random.randint(0, 10) >= 4:
            return ObjectNotInRoom(payload="Object is not in this room.")

        return StopEvent(result="We found the object!")  # End the workflow

# Initialize and run the workflow
# The workflow is created with a timeout of 10 and verbose output disabled.
w = ThorFindsObject(timeout=10, verbose=False)
# Top-level `await` relies on the notebook's running event loop.
result = await w.run()
# `result` is whatever the StopEvent carried (the string set in suggest_object).
print(result)


Thor: Tell me what you saw in detail. Describe the object, its context and the type of room you saw.
Initial description is incomplete.
Initial description clarified.
Teleported to correct room type.
Object is not in this room.
Teleported to correct room type.
Object is not in this room.
Teleported to incorrect room type.
Teleported to correct room type.
Object is not in this room.
Teleported to correct room type.
Object is not in this room.
Teleported to incorrect room type.
Teleported to incorrect room type.
Teleported to correct room type.
Object may be in this room.
Teleported to correct room type.
Object may be in this room.
Wrong object suggested.
Teleported to correct room type.
Object is not in this room.
Teleported to correct room type.
Object may be in this room.
Wrong object suggested.
Teleported to correct room type.
Object may be in this room.
Wrong object suggested.
Wrong object suggested.
Wrong object suggested.
We found the object!


In [10]:
from llama_index.utils.workflow import draw_all_possible_flows

# Render every possible path through the workflow's steps and events into an HTML file.
draw_all_possible_flows(ThorFindsObject, filename="possible_flows.html")

<class 'NoneType'>
<class '__main__.InitialDescriptionComplete'>
<class '__main__.InitialDescriptionIncomplete'>
<class '__main__.InitialDescriptionComplete'>
<class '__main__.ObjectInRoom'>
<class '__main__.ObjectNotInRoom'>
<class '__main__.RoomCorrect'>
<class '__main__.RoomIncorrect'>
<class '__main__.WrongObjectSuggested'>
<class 'llama_index.core.workflow.events.StopEvent'>
possible_flows.html


In [19]:
!pip install chainlit

Collecting chainlit
  Downloading chainlit-1.3.2-py3-none-any.whl.metadata (6.8 kB)
Collecting aiofiles<24.0.0,>=23.1.0 (from chainlit)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting asyncer<0.0.8,>=0.0.7 (from chainlit)
  Downloading asyncer-0.0.7-py3-none-any.whl.metadata (6.6 kB)
Collecting fastapi<0.116,>=0.115.3 (from chainlit)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting filetype<2.0.0,>=1.2.0 (from chainlit)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting lazify<0.5.0,>=0.4.0 (from chainlit)
  Using cached Lazify-0.4.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting literalai==0.0.623 (from chainlit)
  Downloading literalai-0.0.623.tar.gz (57 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting packaging<24.0,>=23.1 (from chainlit)
  Using cached packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Collecting python-multipart<0.0.10,>=0.0.9 (from chainlit)
  Using cached python_mu