diff --git a/instill/helpers/ray_io.py b/instill/helpers/ray_io.py index 9fb48f0..24c175a 100644 --- a/instill/helpers/ray_io.py +++ b/instill/helpers/ray_io.py @@ -778,7 +778,7 @@ async def parse_task_chat_to_multimodal_chat_input( # messages and prompt images messages: List[Dict[str, str]] = [] images: List[List[Image.Image]] = [] - for message in data["messages"]: + for i, message in enumerate(data["messages"]): role = message["role"] content = message["content"] @@ -786,7 +786,11 @@ async def parse_task_chat_to_multimodal_chat_input( if role == PROMPT_ROLES[0]: for c in content: if c["type"] == "text": - messages.insert(0, {"role": role, "content": c["text"]}) + if len(messages) > i: + raise InvalidInputException( + "can only have single text from user in each round" + ) + messages.insert(i, {"role": role, "content": c["text"]}) elif c["type"] == "image-url": imgs.append(url_to_pil_image(c["image-url"])) elif c["type"] == "image-base64":