From 72f1654e0158d067c493459e68e121237b1c29b2 Mon Sep 17 00:00:00 2001 From: Heiru Wu Date: Thu, 12 Sep 2024 03:07:35 +0800 Subject: [PATCH] fix(ray): fix multimodal chat input --- instill/helpers/ray_io.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/instill/helpers/ray_io.py b/instill/helpers/ray_io.py index 9fb48f0..24c175a 100644 --- a/instill/helpers/ray_io.py +++ b/instill/helpers/ray_io.py @@ -778,7 +778,7 @@ async def parse_task_chat_to_multimodal_chat_input( # messages and prompt images messages: List[Dict[str, str]] = [] images: List[List[Image.Image]] = [] - for message in data["messages"]: + for i, message in enumerate(data["messages"]): role = message["role"] content = message["content"] @@ -786,7 +786,11 @@ async def parse_task_chat_to_multimodal_chat_input( if role == PROMPT_ROLES[0]: for c in content: if c["type"] == "text": - messages.insert(0, {"role": role, "content": c["text"]}) + if len(messages) > i: + raise InvalidInputException( + "can only have single text from user in each round" + ) + messages.insert(i, {"role": role, "content": c["text"]}) elif c["type"] == "image-url": imgs.append(url_to_pil_image(c["image-url"])) elif c["type"] == "image-base64":