In [2]:
import random
from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context
)

## Structured output testing

In [1]:
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
from llama_index.llms.openai import OpenAI as OpenAILlamaIndex
from llama_index.llms.ollama import Ollama as OllamaLlamaIndex
from typing import Literal
from typing import Optional
from typing import List
import json

load_dotenv()

client = OpenAI()

# Structured attributes of one described object.
# Every field is declared Optional (nullable) without a default value; the model is
# used as `response_format` for the OpenAI structured-output call in this cell.
# NOTE: comments are used instead of a class docstring on purpose - pydantic copies a
# class docstring into the generated JSON schema, which would change the request.
class ObjectDescription(BaseModel):
    name: Optional[str]
    position: Optional[str]
    size: Optional[str]
    texture: Optional[str]
    color: Optional[str]
    # presumably free-form remarks that fit none of the fields above
    additional_information: Optional[List[str]]
    
# An object mentioned alongside the target object: inherits every ObjectDescription
# field and adds its position relative to the target object.
class ContextObjectDescription(ObjectDescription):
    position_relative_to_target_object: Optional[str]

# Description of the room. `room_type` is restricted to the four listed literals
# (or None when the type is not known).
class RoomDescription(BaseModel):
    room_type: Optional[Literal["Living room", "Kitchen", "Bedroom", "Bathroom"]]
    size: Optional[str]
    additional_information: Optional[List[str]]

# Top-level structured description: the target object, the objects around it,
# the room, and any remaining remarks. Used as `response_format` below.
class InitialDescription(BaseModel):
    target_object: Optional[ObjectDescription]
    object_in_context: Optional[List[ContextObjectDescription]]
    room_description: Optional[RoomDescription]
    additional_information: Optional[List[str]]
    
    
# Free-text description used as the test input for the structured-output call.
user_description = (
    "The object is a knife, it's on a table in a medium sized room. "
    "The table is far from the wall. "
    "It appears to be a kitchen or living room. "
    "Left of the table which the knife is one is a TV and on the right "
    "on the wall behind the table are two windows."
)

# Instruction telling the model to structure the description without inventing details.
system_prompt = (
    "Your task is to turn a user's description of an object, its context and the room type "
    "into a structured response. "
    "When information is missing from the user's description, do not make up parts of the "
    "description, go ONLY off of the user's description."
)

# Ask the OpenAI client to parse the answer directly into an InitialDescription.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_description},
    ],
    response_format=InitialDescription,
)

# `parsed` holds the InitialDescription instance built from the model's reply.
structured_description_struct_output = completion.choices[0].message.parsed
print(json.dumps(structured_description_struct_output.model_dump(), indent=4))

{
    "target_object": {
        "name": "Knife",
        "position": "On the table",
        "size": null,
        "texture": null,
        "color": null,
        "additional_information": null
    },
    "object_in_context": [
        {
            "name": "Table",
            "position": "Far from the wall",
            "size": null,
            "texture": null,
            "color": null,
            "additional_information": null,
            "position_relative_to_target_object": "The knife is on the table"
        },
        {
            "name": "TV",
            "position": null,
            "size": null,
            "texture": null,
            "color": null,
            "additional_information": null,
            "position_relative_to_target_object": "Left of the table which the knife is on"
        },
        {
            "name": "Windows",
            "position": "On the wall behind the table",
            "size": null,
            "texture": null,
            "color": null

In [2]:
from dotenv import load_dotenv

from llama_index.core.prompts import ChatPromptTemplate
from llama_index.core.llms import ChatMessage
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

from pydantic import BaseModel, Field
from typing import Literal, Optional, List
import json

load_dotenv()

# Re-declares ObjectDescription with the same fields as the model of the same name
# in the earlier OpenAI structured-output cell; running this cell rebinds the name.
class ObjectDescription(BaseModel):
    name: Optional[str]
    position: Optional[str]
    size: Optional[str]
    texture: Optional[str]
    color: Optional[str]
    additional_information: Optional[List[str]]
    
# Re-declaration (same fields as in the earlier cell): a context object, i.e. an
# ObjectDescription plus its position relative to the target object.
class ContextObjectDescription(ObjectDescription):
    position_relative_to_target_object: Optional[str]

# Re-declaration (same fields as in the earlier cell): room description whose
# `room_type` is limited to the four listed literals or None.
class RoomDescription(BaseModel):
    room_type: Optional[Literal["Living room", "Kitchen", "Bedroom", "Bathroom"]]
    size: Optional[str]
    additional_information: Optional[List[str]]

# Re-declaration (same fields as in the earlier cell): the top-level model that
# `llm.structured_predict` is asked to populate further down in this cell.
class InitialDescription(BaseModel):
    target_object: Optional[ObjectDescription]
    object_in_context: Optional[List[ContextObjectDescription]]
    room_description: Optional[RoomDescription]
    additional_information: Optional[List[str]]
    
# This cell's own import list does not provide OpenAILlamaIndex (it was only imported
# in the previous cell), so import it here to keep the cell runnable on its own.
from llama_index.llms.openai import OpenAI as OpenAILlamaIndex

llm = OpenAILlamaIndex()

# Single user message; `{user_description}` is filled in by `structured_predict`.
# (The placeholder used to be called `movie_name`, which had nothing to do with
# the content that is actually substituted.)
chat_prompt_tmpl = ChatPromptTemplate(
    message_templates=[
        ChatMessage.from_str(
            "Your task is to turn a user's description of an object, its context and the room type into a structured response. When information is missing from the user's description, do not make up parts of the description, go ONLY off of the user's description. Here is the description:\n {user_description}", role="user"
        )
    ]
)

# Extra keyword arguments are used as template variables for the prompt above.
structured_description_llamaindex = llm.structured_predict(
    InitialDescription, chat_prompt_tmpl, user_description="The object is a knife, it's on a table in a medium sized room. The table is far from the wall. It appears to be a kitchen or living room. Left of the table which the knife is one is a TV and on the right on the wall behind the table are two windows."
)

print(structured_description_llamaindex)

target_object=ObjectDescription(name='knife', position='on a table', size='N/A', texture='N/A', color='N/A', additional_information=[]) object_in_context=[ContextObjectDescription(name='TV', position='left of the table', size='N/A', texture='N/A', color='N/A', additional_information=None, position_relative_to_target_object=None), ContextObjectDescription(name='windows', position='on the wall behind the table', size='N/A', texture='N/A', color='N/A', additional_information=None, position_relative_to_target_object=None)] room_description=RoomDescription(room_type='Kitchen', size='medium sized', additional_information=[]) additional_information=[]


In [4]:
# Compare structured_description_llamaindex and structured_description_struct_output by printing the values side by side.
# Both objects are dumped with `model_dump()` (the pydantic v2 replacement for the
# deprecated `.dict()`), and values are matched by field name rather than by position.
openai_fields = structured_description_struct_output.model_dump()
llama_fields = structured_description_llamaindex.model_dump()

for field_name, openai_value in openai_fields.items():
    print(field_name, openai_value, llama_fields.get(field_name))
    

target_object_description {'name': 'knife', 'position': 'on a table', 'size': None, 'texture': None, 'color': None, 'additional_information': None} {'name': 'knife', 'position': 'on a table', 'size': None, 'texture': None, 'color': None, 'additional_information': None}
context_description {'left_of_object': 'TV', 'right_of_object': 'two windows', 'behind_object': None, 'in_front_of_object': None, 'above_object': None, 'below_object': None, 'additional_information': None} {'left_of_object': 'TV', 'right_of_object': 'two windows', 'behind_object': 'table', 'in_front_of_object': None, 'above_object': None, 'below_object': None, 'additional_information': None}
room_description {'room_type': None, 'size': 'medium sized', 'additional_information': ['It appears to be a kitchen or living room.', 'The table is far from the wall.']} {'room_type': 'Kitchen', 'size': 'medium sized', 'additional_information': None}
additional_information None None


## AI2Thor wrapper

In [35]:
from ai2thor.controller import Controller
import ai2thor
import random
from PIL import Image
import base64
from openai import OpenAI
from llama_index.llms.openai import OpenAI as OpenAILlamaIndex
from llama_index.llms.ollama import Ollama as OllamaLlamaIndex
import time

In [174]:
from ai2thor.controller import Controller
from PIL import Image
from llama_index.llms.openai import OpenAI as OpenAILlamaIndex
from llama_index.llms.ollama import Ollama as OllamaLlamaIndex
from descriptions import InitialDescription, ViewDescription
from leolani_client import Action
from openai import OpenAI
import random
import base64
import json
import time
from thor_utils import ( 
                        encode_image, 
                        get_distance,
                        closest_objects
                       )

# Constants
# Passed to the AI2Thor Controller as `visibilityDistance`
# (presumably metres, as in the AI2-THOR documentation - TODO confirm).
VISIBILITY_DISTANCE = 1.5
# Passed to the AI2Thor Controller as `scene`.
SCENE = "FloorPlan212"

class AI2ThorClient:
    """
    An AI2Thor instance with methods wrapping its controller.

    The movement helpers (``_step``, ``_look``, ``_rotate``, ``_crouch``,
    ``_stand``, ``_teleport`` and ``_done``) each run one controller action and
    append the resulting event metadata to ``self._metadata``. Most of them also
    report the action to the Leolani client through ``_add_action``.
    """

    def __init__(self, leolaniClient, chat_mode):
        """
        Start the AI2Thor controller and create the LLM clients.

        Parameters
        ----------
        leolaniClient
            Client object whose ``_add_action`` method is called for executed actions.
        chat_mode
            Stored on the instance as ``self.chat_mode``; not otherwise used by this class.
        """
        self._controller = Controller(
            agentMode="default",
            visibilityDistance=VISIBILITY_DISTANCE,
            scene=SCENE,

            # step sizes
            gridSize=0.25,
            snapToGrid=True,
            rotateStepDegrees=90,

            # image modalities
            renderDepthImage=False,
            renderInstanceSegmentation=False,

            # camera properties
            width=512,
            height=512,
            fieldOfView=90
            )
        self._metadata = []
        self.descriptions = []
        self.unstructured_descriptions = []
        # Filled in by parse_unstructured_description(); None until then.
        self.structured_initial_description = None
        self.leolaniClient = leolaniClient
        self.chat_mode = chat_mode
        self._llm_ollama = OllamaLlamaIndex(model="llama3.2", request_timeout=120.0)
        self._llm_openai = OpenAILlamaIndex(model="gpt-4o-2024-08-06")
        self._llm_openai_multimodal = OpenAI()

    @staticmethod
    def _image_messages(prompt: str, encoded_image: str) -> list:
        """Build the single user message (text prompt + base64 image) for a vision request."""
        return [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                        },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_image}"
                            },
                        },
                    ],
                },
            ]

    @staticmethod
    def _resolve_action(direction: str):
        """Return the ``Action`` member named ``direction``; raise AttributeError if there is none."""
        action_attribute = getattr(Action, direction, None)
        if action_attribute is None:
            raise AttributeError(f"'Action' object has no attribute '{direction}'")
        return action_attribute

    def _record_metadata(self) -> None:
        """Append the metadata of the controller's last event to the history."""
        self._metadata.append(self._controller.last_event.metadata)

    def describe_view_from_image(self):
        """
        Describes the current view using an image-to-text model.

        The description is also appended to ``self.descriptions``.

        Returns
        -------
        str
            A string describing the current view.
        """
        encoded_image = encode_image(self._get_image())

        response = self._llm_openai_multimodal.chat.completions.create(
            model="gpt-4o",
            messages=self._image_messages(
                # The fourth room type is "bathroom"; the prompt used to list "bedroom" twice.
                "Imagine this is your point-of-view. Describe what you see in this virtual environment. Write from the first perspective so start your message with 'I'. First, describe the objects, their colors, and their positions. Don't introduce your description. Start describing directly e.g. 'I currently see a <object> on a <surface> ...'. Be objective in your description! Finally describe the room type: it's either a living room, kitchen, bedroom, or bathroom. It can't be anything else. If you can't infer the room type, just say so.",
                encoded_image,
            ),
        )

        description = response.choices[0].message.content
        self.descriptions.append(description)
        return description

    def describe_view_from_image_structured(self):
        """
        Describes the current view using an image-to-text model with structure.

        The parsed result is also appended to ``self.descriptions``.

        Returns
        -------
        ViewDescription
            A structured description of the current view.
        """
        encoded_image = encode_image(self._get_image())

        response = self._llm_openai_multimodal.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=self._image_messages(
                # The fourth room type is "bathroom"; the prompt used to list "bedroom" twice.
                "Imagine this is your point-of-view. Describe what you see in this virtual environment. Write from the first perspective. Describe the objects, their colors, and their positions. Be objective in your description! Describe the room type: it's either a living room, kitchen, bedroom, or bathroom. It can't be anything else.",
                encoded_image,
            ),
            response_format=ViewDescription,
        )

        view_description = response.choices[0].message.parsed
        self.descriptions.append(view_description)
        return view_description

    def infer_room_type(self, description: str) -> str:
        """
        Infers the room type the agent is in.

        Not implemented yet: the method currently does nothing and returns None.

        Planned inference is based on:
        - The image-to-text description of the view.
        - The objects in the metadata.
        - The AI2Thor object types mapping (https://ai2thor.allenai.org/ithor/documentation/objects/object-types).

        Returns
        -------
        Intended to return a string representing the likely room type.
        """
        # TODO: implement; kept as a no-op so existing callers keep receiving None.
        return None

    def parse_unstructured_description(self, description: str):
        """
        Parse an unstructured description into structured data.

        The result is stored in ``self.structured_initial_description`` and returned.

        Parameters
        ----------
        description : str
            The unstructured description to parse.

        Returns
        -------
        InitialDescription
            The parsed description as returned by the model.
        """
        # The system prompt is reproduced with its original line breaks and indentation.
        system_prompt = (
            "Your task is to turn a user's description of an object, its context and the room type into a structured response. \n"
            "                 When information is missing from the user's description, do not make up parts of the description, go ONLY off of the user's description. \n"
            "                 Only deviate from this rule when positions of objects in context are obvious, such as a floor (which is always below the target object) and a ceiling (which is above)."
        )

        response = self._llm_openai_multimodal.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": description}
            ],
            response_format=InitialDescription,
        )

        self.structured_initial_description = response.choices[0].message.parsed
        return self.structured_initial_description

    def _get_image(self):
        """Return the controller's last rendered frame as a PIL image."""
        image = Image.fromarray(self._controller.last_event.frame)
        # self.leolaniClient._add_image()
        return image

    def _step(self, direction: str = "MoveAhead", magnitude: float = None) -> None:
        """
        Robot takes one step in given direction. Options are:
            - MoveAhead
            - MoveBack
            - MoveLeft
            - MoveRight

        Raises
        ------
        AttributeError
            If ``Action`` has no member named ``direction``. The check happens
            before the controller is stepped, so a rejected direction no longer
            moves the robot without being logged.

        Returns None
        """
        action_attribute = self._resolve_action(direction)

        self._controller.step(
            action=direction,
            moveMagnitude=magnitude
            )

        self.leolaniClient._add_action(action_attribute)
        self._record_metadata()

    def _look(self, direction: str = "LookUp") -> None:
        """
        Robot looks up or down by 30 degrees. Options are:
        - LookUp
        - LookDown

        Raises
        ------
        AttributeError
            If ``Action`` has no member named ``direction``.

        Returns None
        """
        # Look the member up by name; `Action.direction` would read a literal
        # attribute called "direction" instead of the requested one.
        action_attribute = self._resolve_action(direction)

        self._controller.step(
            action=direction,
            degrees=30
            )

        self.leolaniClient._add_action(action_attribute)
        self._record_metadata()

    def _rotate(self, direction: str, degrees: float = None) -> None:
        """
        Robot turns in given direction (for optional degrees).

        Parameters
        ----------
        direction : str
            Direction to turn in. Can be "RotateLeft" or "RotateRight".
        degrees : float, optional
            Degrees to turn. Default is None.

        Returns None
        """
        self._controller.step(
            action=direction,
            degrees=degrees
            )

        # Only the two documented directions are reported to the Leolani client.
        if direction == "RotateLeft":
            self.leolaniClient._add_action(Action.RotateLeft)
        elif direction == "RotateRight":
            self.leolaniClient._add_action(Action.RotateRight)
        self._record_metadata()

    def _crouch(self):
        """
        Robot crouches.

        Returns None
        """
        self._controller.step(action="Crouch")

        self.leolaniClient._add_action(Action.Crouch)
        self._record_metadata()

    def _stand(self):
        """
        Robot stands.

        Returns None
        """
        self._controller.step(action="Stand")

        self.leolaniClient._add_action(Action.Stand)
        self._record_metadata()

    def _teleport(self, position: dict = None, rotation: dict = None, horizon: float = None, standing: bool = None, to_random: bool = False) -> None:
        """
        Robot teleports to the given location, or to a random reachable one.

        Parameters
        ----------
        position: dict
            The 'x', 'y', 'z' coordinates.
        rotation: dict
            The 'x', 'y', 'z' rotation of the agent's body. Only sent to the controller when given.
        horizon: float
            Look up or down. Negative values (e.g. -30) correspond to agent looking up, and vice versa.
            Only sent to the controller when given.
        standing: bool
            Forwarded to the controller's Teleport action when given
            (presumably True for standing and False for crouching - see the AI2-THOR docs).
        to_random: bool
            When True, ``position`` and ``rotation`` are replaced by a random reachable
            position and a random rotation around the y-axis.

        Returns None
        """

        if to_random:
            rotation = dict(x=0, y=random.randint(0, 360), z=0)
            # This query step is not recorded in the metadata history.
            reachable_positions = self._controller.step(action="GetReachablePositions").metadata["actionReturn"]
            position = random.choice(reachable_positions)

        params = {"action": "Teleport", "position": position}
        if rotation is not None:
            params["rotation"] = rotation
        if horizon is not None:
            params["horizon"] = horizon
        if standing is not None:
            params["standing"] = standing

        self._controller.step(**params)

        self.leolaniClient._add_action(Action.Teleport)
        self._record_metadata()

    def _find_objects_in_sight(self, object_type: str = None) -> list:
        """
        Finds objects in sight.

        Parameters
        ----------
        object_type : str
            The type of object to find. When omitted, all visible objects are returned.

        Returns
        -------
        list
            A list of objects in sight.
        """
        return [
            obj
            for obj in self._controller.last_event.metadata["objects"]
            if obj["visible"] and (not object_type or obj["objectType"] == object_type)
        ]

    def _find_all_rooms(self, number=None):
        """
        Create a list of all rooms (objects whose `objectType` is "Floor") in current scene.
        Sorted from nearest to furthest.

        Parameters
        ----------
        number : int, optional
            When given, only the ``number`` nearest rooms are returned.
            Default is None, which returns all rooms.

        Returns
        -------
        list
            The matching object metadata dictionaries.
        """
        rooms = sorted(
            (obj for obj in self._controller.last_event.metadata["objects"] if obj["objectType"] == "Floor"),
            key=lambda room: room['distance'],
        )
        # Slicing with None keeps the whole list.
        return rooms[:number]

    def _find_nearest_center_of_room(self):
        """
        Create a dictionary with "x", "y", "z" coordinates of nearest center of room(s).

        Returns
        -------
        dict
            The ``axisAlignedBoundingBox`` center of the nearest "Floor" object.

        Raises
        ------
        IndexError
            If the current metadata contains no "Floor" object.
        """
        rooms = self._find_all_rooms()
        if not rooms:
            raise IndexError("No object of type 'Floor' found in the current scene metadata")
        return rooms[0]['axisAlignedBoundingBox']['center']

    def _done(self) -> None:
        """
        The Done action does nothing to the state of the environment.
        But, it returns a cleaned up event with respect to the metadata.

        Returns None
        """
        self._controller.step(action="Done")

        self._record_metadata()


In [175]:
# Passed to LeolaniChatClient as `emissor_path` (presumably its output directory - TODO confirm).
EMISSOR_PATH = "./emissor"
# NOTE(review): these two values look swapped (AGENT is "Human", HUMAN is the
# AI2Thor client name, spelled "AI2ThorCLient") - confirm against LeolaniChatClient.
AGENT = "Human"
HUMAN = "AI2ThorCLient"
from leolani_client import LeolaniChatClient, Action
# Creates the AI2Thor wrapper; its constructor starts a new AI2Thor Controller.
thor = AI2ThorClient(leolaniClient=LeolaniChatClient(emissor_path=EMISSOR_PATH, agent=AGENT, human=HUMAN), chat_mode="production")

In [163]:
# Teleport along the diagonal (0,0) .. (3,3) at the agent's height, then to a random
# reachable position, and finally to the centre of the nearest room.
AGENT_HEIGHT = 0.9009991884231567
for offset in range(4):
    thor._teleport(position={"x": offset, "y": AGENT_HEIGHT, "z": offset})

thor._teleport(to_random=True)

nearest_room_center = thor._find_nearest_center_of_room()
thor._teleport(position=nearest_room_center)
# thor._rotate(direction="RotateRight")
# thor._step(direction="MoveAhead")

In [177]:
# NOTE(review): the return value of this call is discarded (it is neither assigned
# nor the last expression of the cell), so the line has no visible effect.
thor._find_objects_in_sight()
# Sends the current frame to the vision model and prints its free-text description.
print(thor.describe_view_from_image())

I currently see a brown cardboard box on a reddish-brown wooden surface, which appears to be a table or cabinet. There is a shiny, metallic object close to the box. To my right, there is a light gray wall with a white door that has a gold handle. Beyond this door, there is another room with a dark brown wall and a white door at the end. The floor is light wood, and the lighting is soft, creating a calm atmosphere. The space suggests it might be a living room.


In [160]:
# Turn left (no `degrees` given, so None is forwarded to the controller) and stand up.
thor._rotate(direction="RotateLeft")
thor._stand()

In [148]:
# Replay the recorded history: for every stored step, the executed action
# followed by the agent position on the next line.
for metadata in thor._metadata:
    print(metadata['lastAction'], metadata['agent']['position'], sep="\n")

Teleport
{'x': 0.0, 'y': 0.9009991884231567, 'z': 0.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': -4.0, 'y': 0.9009991884231567, 'z': -1.0}
Teleport
{'x': 0.0, 'y': 0.9009991884231567, 'z': 0.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': -2.0, 'y': 0.9009991884231567, 'z': -0.5}
Teleport
{'x': 0.0, 'y': 0.9009991884231567, 'z': 0.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': -0.5, 'y': 0.9009991884231567, 'z': 0.25}
Teleport
{'x': 0.0, 'y': 0.9009991884231567, 'z': 0.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Teleport
{'x': 1.0, 'y': 0.9009991884231567, 'z': 1.0}
Tele

In [39]:
# Last expression of the cell, so the returned description string is shown as cell output.
thor.describe_view_from_image()

"I currently see a dark blue couch positioned against a light-colored wall. There is a plant peeking into view from the corner of the image. On the wall directly in front of me, there's a colorful abstract painting. The floor is made of wooden planks, giving a warm, inviting appearance. In the background, I notice a white door and what seems to be a hallway or another room. The overall arrangement and items present suggest that this is a living room."

In [None]:
# "Floor" objects taken from the most recently recorded step, nearest first.
rooms = sorted(
    (obj for obj in thor._metadata[-1]["objects"] if obj["objectType"] == "Floor"),
    key=lambda room: room['distance'],
)

print(rooms)



[{'name': 'Floor_e3656fd3', 'position': {'x': 0.0, 'y': 0.0, 'z': 0.0}, 'rotation': {'x': -0.0, 'y': 0.0, 'z': 0.0}, 'visible': True, 'isInteractable': True, 'receptacle': True, 'toggleable': False, 'isToggled': False, 'breakable': False, 'isBroken': False, 'canFillWithLiquid': False, 'isFilledWithLiquid': False, 'fillLiquid': None, 'dirtyable': False, 'isDirty': False, 'canBeUsedUp': False, 'isUsedUp': False, 'cookable': False, 'isCooked': False, 'temperature': 'Cold', 'isHeatSource': False, 'isColdSource': False, 'sliceable': False, 'isSliced': False, 'openable': False, 'isOpen': False, 'openness': 0.0, 'pickupable': False, 'isPickedUp': False, 'moveable': False, 'mass': 0.0, 'salientMaterials': None, 'receptacleObjectIds': ['Box|-02.00|+00.24|-02.66', 'SideTable|-05.73|+00.00|-01.28', 'Bed|-04.98|+00.00|-02.36', 'BaseballBat|-05.76|+00.14|-00.41', 'Doorway|-02.24|-00.01|+00.04', 'Desk|-05.52|00.00|-00.30', 'Chair|-04.43|00.00|-00.34', 'Shelf|-05.52|+00.10|-00.30', 'BasketBall|-02.46

In [134]:
# Centre of the bounding box of the nearest room; relies on `rooms` from the cell above.
print(rooms[0]['axisAlignedBoundingBox']['center'])

{'x': 0.0, 'y': -0.1621541976928711, 'z': -0.0208432674407959}


In [10]:
# Teleport to a random reachable position and list the ids of all scene objects.
# The client exposes `_teleport` and records event metadata in `_metadata`;
# the previously used `teleport` / `_events` names do not exist on AI2ThorClient.
thor._teleport(to_random=True)
last_event = thor._metadata[-1]
objects = last_event["objects"]

for obj in objects:
    print(obj["objectId"])

ArmChair|-00.27|+00.00|+01.87
ArmChair|+02.66|+00.00|+01.86
Boots|+04.00|+00.00|+01.70
Box|-00.47|+01.04|-00.71
CoffeeTable|+01.59|00.00|+00.45
CreditCard|+01.41|+00.47|+00.65
Drawer|+03.88|+00.77|+00.86
Floor|+00.00|+00.00|+00.00
FloorLamp|+03.61|+00.00|+02.16
GarbageCan|+03.83|-00.03|-00.50
HousePlant|+00.39|+00.80|-00.73
KeyChain|+01.50|+00.47|+00.53
Laptop|+01.80|+00.47|+00.50
LightSwitch|-01.40|+01.29|+01.84
Newspaper|+02.15|+00.41|-00.72
Painting|+04.07|+01.95|+00.85
Pen|+03.93|+00.87|+01.04
Pencil|+03.89|+00.87|+01.18
Pillow|+00.65|+00.39|+01.71
RemoteControl|+01.88|+00.33|+01.73
Shelf|-00.29|+00.59|-00.73
Shelf|+01.91|+00.20|-00.73
Shelf|-00.29|+00.20|-00.73
Shelf|+01.91|+00.59|-00.73
SideTable|+03.95|+00.00|+00.86
Sofa|+01.19|+00.01|+01.87
Statue|-00.09|+00.03|-00.70
Statue|-00.54|+00.40|-00.69
Television|+01.90|+01.28|-00.84
TissueBox|+03.92|+00.87|+00.68
TVStand|-00.29|00.00|-00.77
TVStand|+01.90|00.00|-00.77
WateringCan|+01.62|+00.02|-00.70
Window|+01.57|+02.07|+02.49
Windo

In [11]:
# Text-only completion against the local Ollama model; no image is sent with this prompt.
thor._llm_ollama.complete("What do you see?")

CompletionResponse(text="I don't have the ability to see or perceive the physical world. I'm a text-based AI assistant, and my interactions are limited to processing and generating text. I can understand and respond to text-based inputs, but I don't have visual capabilities or direct access to sensory information.\n\nHowever, I can help facilitate discussions about what you see, provide information about visual topics, or assist with tasks that involve analyzing images or videos if you'd like.", additional_kwargs={'tool_calls': []}, raw={'model': 'llama3.2', 'created_at': '2024-11-15T09:08:54.133358Z', 'message': {'role': 'assistant', 'content': "I don't have the ability to see or perceive the physical world. I'm a text-based AI assistant, and my interactions are limited to processing and generating text. I can understand and respond to text-based inputs, but I don't have visual capabilities or direct access to sensory information.\n\nHowever, I can help facilitate discussions about wh

## Workflow

In [12]:
# Returned by `evaluate_initial_description` and `clarify_initial_description`;
# accepted by `find_correct_room_type`.
class InitialDescriptionComplete(Event):
    payload: str

# Returned by `evaluate_initial_description`; accepted by `clarify_initial_description`.
class InitialDescriptionIncomplete(Event):
    payload: str

# NOTE(review): no step of the ThorFindsObject workflow in this cell returns or
# accepts this event - possibly reserved for later use.
class ObjectFound(Event):
    payload: str

# Returned by `suggest_object` and accepted by that same step (retry loop).
class WrongObjectSuggested(Event):
    payload: str

# Returned by `find_correct_room_type`; accepted by `find_object_in_room`.
class RoomCorrect(Event):
    payload: str

# Returned by `find_correct_room_type` and accepted by that same step (retry loop).
class RoomIncorrect(Event):
    payload: str

# Returned by `find_object_in_room`; accepted by `suggest_object`.
class ObjectInRoom(Event):
    payload: str

# Returned by `find_object_in_room` and `suggest_object`; accepted by
# `find_correct_room_type`, which sends the search back to the room-finding step.
class ObjectNotInRoom(Event):
    payload: str

# NOTE(review): this module-level `number` does not appear to be read by any cell
# visible in this notebook - possibly a leftover; confirm before removing.
number = 1

class ThorFindsObject(Workflow):
    """
    Prototype workflow for finding an object with the AI2Thor agent.

    Every decision is currently simulated with `random`; the steps only route
    events, they do not yet use the AI2Thor client.
    """

    def __init__(self, timeout: int = 10, verbose: bool = False, thor=None):
        """
        Parameters
        ----------
        timeout : int
            Forwarded to `Workflow.__init__`.
        verbose : bool
            Forwarded to `Workflow.__init__`.
        thor : AI2ThorClient, optional
            Client to use. When omitted, a new client is created the same way as
            in the setup cell of this notebook (using EMISSOR_PATH, AGENT and HUMAN).
        """
        super().__init__(timeout=timeout, verbose=verbose)
        if thor is None:
            # AI2ThorClient requires both arguments; calling it without them raises TypeError.
            thor = AI2ThorClient(
                leolaniClient=LeolaniChatClient(emissor_path=EMISSOR_PATH, agent=AGENT, human=HUMAN),
                chat_mode="production",
            )
        self.thor = thor

    @step
    async def evaluate_initial_description(self, ev: StartEvent) -> InitialDescriptionComplete | InitialDescriptionIncomplete:
        """Entry step: randomly decides whether the initial description is complete."""
        if random.randint(0, 1) == 0:
            return InitialDescriptionComplete(payload="Initial description is complete.")
        return InitialDescriptionIncomplete(payload="Initial description is incomplete.")

    @step
    async def clarify_initial_description(self, ev: InitialDescriptionIncomplete) -> InitialDescriptionComplete:
        """Turns an incomplete description into a complete one (placeholder, always succeeds)."""
        return InitialDescriptionComplete(payload="Description clarified.")

    @step
    async def find_correct_room_type(self, ev: InitialDescriptionComplete | RoomIncorrect | ObjectNotInRoom) -> RoomCorrect | RoomIncorrect:
        """Randomly decides whether the correct room was found; re-entered on RoomIncorrect and ObjectNotInRoom."""
        # Placeholder view description; not used by the random decision below yet.
        current_description = """a living room setup viewed from behind a dark-colored couch. The room has light-colored walls and a floor that seems to be a muted, earthy tone. The main items in the room include:
- A large, dark-colored sofa in the foreground facing a TV.
- A television placed on a small white TV stand, positioned along the far wall.
- A small side table with a blue vase and a decorative item beside the TV stand.
- A wooden shelf or cabinet off to the left side of the room."""

        if random.randint(0, 1) == 0:
            return RoomCorrect(payload="Correct room is found.")
        return RoomIncorrect(payload="Correct room is not found.")

    @step
    async def find_object_in_room(self, ev: RoomCorrect) -> ObjectInRoom | ObjectNotInRoom:
        """Randomly decides whether the object may be in the current room (values 0-3 out of 0-10)."""
        if random.randint(0, 10) < 4:
            return ObjectInRoom(payload="Object may be in this room.")
        return ObjectNotInRoom(payload="Object is not in this room.")

    @step
    async def suggest_object(self, ev: ObjectInRoom | WrongObjectSuggested) -> WrongObjectSuggested | ObjectNotInRoom | StopEvent:
        """Randomly ends the workflow, retries with another suggestion, or gives up on this room."""
        if random.randint(0, 1) == 0:
            return StopEvent(result="We found the object!")  # End the workflow
        if random.randint(0, 1) == 0:
            return WrongObjectSuggested(payload="Couldn't find object in this room.")
        return ObjectNotInRoom(payload="Object is not in this room.")

import asyncio

# Initialize and run the workflow
# The top-level `await` relies on the notebook (IPython) running cells inside an event loop.
w = ThorFindsObject(timeout=10, verbose=False)
result = await w.run()
# `result` is the value passed to StopEvent by the final step.
print(result)


We found the object!


In [13]:
from llama_index.utils.workflow import draw_all_possible_flows

# Draws the workflow's possible step/event flows into the file possible_flows.html.
draw_all_possible_flows(ThorFindsObject, filename="possible_flows.html")

<class 'NoneType'>
<class '__main__.InitialDescriptionComplete'>
<class '__main__.InitialDescriptionComplete'>
<class '__main__.InitialDescriptionIncomplete'>
<class '__main__.RoomCorrect'>
<class '__main__.RoomIncorrect'>
<class '__main__.ObjectInRoom'>
<class '__main__.ObjectNotInRoom'>
<class '__main__.WrongObjectSuggested'>
<class '__main__.ObjectNotInRoom'>
<class 'llama_index.core.workflow.events.StopEvent'>
possible_flows.html


In [14]:
!pip install chainlit

