From 352893c0c6c97a588fd93e129d96e2054c16974c Mon Sep 17 00:00:00 2001 From: Heiru Wu Date: Wed, 24 Jul 2024 04:48:49 +0800 Subject: [PATCH 1/3] feat(ray): add test io in parser --- instill/helpers/const.py | 4 +- instill/helpers/ray_io.py | 268 +++++++++++++++++++++++++++++++------- 2 files changed, 226 insertions(+), 46 deletions(-) diff --git a/instill/helpers/const.py b/instill/helpers/const.py index 18256a8..d7d4404 100644 --- a/instill/helpers/const.py +++ b/instill/helpers/const.py @@ -40,7 +40,7 @@ class ConversationInput: class ConversationMultiModelInput: conversation: List[Union[Dict[str, Union[str, Dict[str, str]]]]] - prompt_images: Union[List[np.ndarray], None] = None + prompt_images: Union[List[Image.Image], None] = None max_new_tokens: int = 100 temperature: float = 0.8 top_k: int = 1 @@ -60,7 +60,7 @@ class TextToImageInput: class ImageToImageInput: prompt = "" - prompt_image: Union[np.ndarray, None] = None + prompt_image: Union[Image.Image, None] = None steps: int = 5 cfg_scale: float = 7.5 seed: int = 0 diff --git a/instill/helpers/ray_io.py b/instill/helpers/ray_io.py index 2ef57db..cbea797 100644 --- a/instill/helpers/ray_io.py +++ b/instill/helpers/ray_io.py @@ -7,6 +7,7 @@ import requests from google.protobuf import json_format, struct_pb2 from PIL import Image +from starlette.requests import Request import instill.protogen.model.model.v1alpha.common_pb2 as commonpb import instill.protogen.model.model.v1alpha.model_pb2 as modelpb @@ -70,9 +71,18 @@ def protobuf_to_struct(pb_msg): return struct_pb -def parse_task_classification_to_vision_input( - request: CallRequest, +async def parse_task_classification_to_vision_input( + request: Union[CallRequest, Request], ) -> List[VisionInput]: + + # http test input + if isinstance(request, Request): + test_image_url: str = await request.json() + + inp = VisionInput() + inp.image = url_to_pil_image(test_image_url) + return [inp] + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["Classification"][ @@ -96,12 +106,19 @@ def parse_task_classification_to_vision_input( def construct_task_classification_output( + request: Union[CallRequest, Request], categories: List[str], scores: List[float], -) -> CallResponse: +) -> Union[CallResponse, Dict[str, List]]: if not len(categories) == len(scores): raise InvalidOutputShapeException + if isinstance(request, Request): + return { + "categories": categories, + "scores": scores, + } + task_outputs = [] for category, score in zip(categories, scores): @@ -118,9 +135,18 @@ def construct_task_classification_output( return CallResponse(task_outputs=task_outputs) -def parse_task_detection_to_vision_input( - request: CallRequest, +async def parse_task_detection_to_vision_input( + request: Union[CallRequest, Request], ) -> List[VisionInput]: + + # http test input + if isinstance(request, Request): + test_image_url: str = await request.json() + + inp = VisionInput() + inp.image = url_to_pil_image(test_image_url) + return [inp] + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["Detection"]["Type"] @@ -142,10 +168,11 @@ def parse_task_detection_to_vision_input( def construct_task_detection_output( + request: Union[CallRequest, Request], categories: List[List[str]], scores: List[List[float]], bounding_boxes: List[List[tuple]], -) -> CallResponse: +) -> Union[CallResponse, Dict[str, List]]: """Construct trigger output for detection task Args: @@ -157,6 +184,13 @@ def construct_task_detection_output( if not len(categories) == len(scores) == len(bounding_boxes): raise InvalidOutputShapeException + if isinstance(request, Request): + return { + "categories": categories, + "scores": scores, + "bounding_boxes": bounding_boxes, + } + task_outputs = [] for category, score, bbox in zip(categories, scores, bounding_boxes): objects = [] @@ -184,9 +218,18 @@ def construct_task_detection_output( return CallResponse(task_outputs=task_outputs) -def parse_task_ocr_to_vision_input( - request: CallRequest, +async def parse_task_ocr_to_vision_input( + request: Union[CallRequest, Request], ) -> List[VisionInput]: + + # http test input + if isinstance(request, Request): + test_image_url: str = await request.json() + + inp = VisionInput() + inp.image = url_to_pil_image(test_image_url) + return [inp] + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["Ocr"]["Type"] @@ -208,10 +251,11 @@ def parse_task_ocr_to_vision_input( def construct_task_ocr_output( + request: Union[CallRequest, Request], texts: List[List[str]], scores: List[List[float]], bounding_boxes: List[List[tuple]], -) -> CallResponse: +) -> Union[CallResponse, Dict[str, List]]: """Construct trigger output for ocr task Args: @@ -223,6 +267,13 @@ def construct_task_ocr_output( if not len(texts) == len(scores) == len(bounding_boxes): raise InvalidOutputShapeException + if isinstance(request, Request): + return { + "texts": texts, + "scores": scores, + "bounding_boxes": bounding_boxes, + } + task_outputs = [] for text, score, bbox in zip(texts, scores, bounding_boxes): objects = [] @@ -246,9 +297,18 @@ def construct_task_ocr_output( return CallResponse(task_outputs=task_outputs) -def parse_task_instance_segmentation_to_vision_input( - request: CallRequest, +async def parse_task_instance_segmentation_to_vision_input( + request: Union[CallRequest, Request], ) -> List[VisionInput]: + + # http test input + if isinstance(request, Request): + test_image_url: str = await request.json() + + inp = VisionInput() + inp.image = url_to_pil_image(test_image_url) + return [inp] + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["InstanceSegmentation"][ @@ -272,11 +332,12 @@ def parse_task_instance_segmentation_to_vision_input( def construct_task_instance_segmentation_output( + request: Union[CallRequest, Request], rles: List[List[str]], categories: List[List[str]], scores: List[List[float]], bounding_boxes: List[List[tuple]], -) -> CallResponse: +) -> Union[CallResponse, Dict[str, List]]: """Construct trigger output for instance segmentation task Args: @@ -289,6 +350,14 @@ def construct_task_instance_segmentation_output( if not len(rles) == len(categories) == len(scores) == len(bounding_boxes): raise InvalidOutputShapeException + if isinstance(request, Request): + return { + "rles": rles, + "categories": categories, + "scores": scores, + "bounding_boxes": bounding_boxes, + } + task_outputs = [] for rle, category, score, bbox in zip(rles, categories, scores, bounding_boxes): objects = [] @@ -319,9 +388,18 @@ def construct_task_instance_segmentation_output( return CallResponse(task_outputs=task_outputs) -def parse_task_semantic_segmentation_to_vision_input( - request: CallRequest, +async def parse_task_semantic_segmentation_to_vision_input( + request: Union[CallRequest, Request], ) -> List[VisionInput]: + + # http test input + if isinstance(request, Request): + test_image_url: str = await request.json() + + inp = VisionInput() + inp.image = url_to_pil_image(test_image_url) + return [inp] + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["SemanticSegmentation"][ @@ -345,9 +423,10 @@ def parse_task_semantic_segmentation_to_vision_input( def construct_task_semantic_segmentation_output( + request: Union[CallRequest, Request], rles: List[List[str]], categories: List[List[str]], -) -> CallResponse: +) -> Union[CallResponse, Dict[str, List]]: """Construct trigger output for semantic segmentation task Args: @@ -358,6 +437,12 @@ def construct_task_semantic_segmentation_output( if not len(rles) == len(categories): raise InvalidOutputShapeException + if isinstance(request, Request): + return { + "rles": rles, + "categories": categories, + } + task_outputs = [] for rle, category in zip(rles, categories): objects = [] @@ -381,9 +466,18 @@ def construct_task_semantic_segmentation_output( return CallResponse(task_outputs=task_outputs) -def parse_task_keypoint_to_vision_input( - request: CallRequest, +async def parse_task_keypoint_to_vision_input( + request: Union[CallRequest, Request], ) -> List[VisionInput]: + + # http test input + if isinstance(request, Request): + test_image_url: str = await request.json() + + inp = VisionInput() + inp.image = url_to_pil_image(test_image_url) + return [inp] + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["Keypoint"]["Type"] @@ -405,10 +499,11 @@ def parse_task_keypoint_to_vision_input( def construct_task_keypoint_output( + request: Union[CallRequest, Request], keypoints: List[List[List[tuple]]], scores: List[List[float]], bounding_boxes: List[List[tuple]], -) -> CallResponse: +) -> Union[CallResponse, Dict[str, List]]: """Construct trigger output for keypoint task Args: @@ -418,9 +513,17 @@ def construct_task_keypoint_output( bounding_boxes (List[List[tuple]]): for each image input, the list of detected object's bbox, with the format (top, left, width, height) """ + if not len(keypoints) == len(scores) == len(bounding_boxes): raise InvalidOutputShapeException + if isinstance(request, Request): + return { + "keypoints": keypoints, + "scores": scores, + "bounding_boxes": bounding_boxes, + } + task_outputs = [] for keypoint, score, bbox in zip(keypoints, scores, bounding_boxes): objects = [] @@ -455,12 +558,19 @@ def construct_task_keypoint_output( return CallResponse(task_outputs=task_outputs) -def parse_task_text_generation_to_conversation_input( - request: CallRequest, +async def parse_task_text_generation_to_conversation_input( + request: Union[CallRequest, Request], ) -> List[ConversationInput]: - input_list = [] + # http test input + if isinstance(request, Request): + test_prompt: str = await request.json() + inp = ConversationInput() + inp.conversation = [{"role": "user", "content": test_prompt}] + return [inp] + + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["TextGeneration"] @@ -569,9 +679,15 @@ def parse_task_text_generation_to_conversation_input( return input_list -def construct_task_text_generation_output(texts: List[str]) -> CallResponse: - task_outputs = [] +def construct_task_text_generation_output( + request: Union[CallRequest, Request], + texts: List[str], +) -> Union[CallResponse, List[str]]: + + if isinstance(request, Request): + return texts + task_outputs = [] for text in texts: task_outputs.append( protobuf_to_struct( @@ -584,12 +700,19 @@ def construct_task_text_generation_output(texts: List[str]) -> CallResponse: return CallResponse(task_outputs=task_outputs) -def parse_task_text_generation_chat_to_conversation_input( - request: CallRequest, +async def parse_task_text_generation_chat_to_conversation_input( + request: Union[CallRequest, Request], ) -> List[ConversationInput]: - input_list = [] + # http test input + if isinstance(request, Request): + test_prompt: str = await request.json() + inp = ConversationInput() + inp.conversation = [{"role": "user", "content": test_prompt}] + return [inp] + + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["TextGenerationChat"] @@ -693,14 +816,20 @@ def parse_task_text_generation_chat_to_conversation_input( if "seed" in task_input_dict: inp.seed = int(task_input_dict["seed"]) - input_list.append(inp) + input_list.append(inp) return input_list -def construct_task_text_generation_chat_output(texts: List[str]) -> CallResponse: - task_outputs = [] +def construct_task_text_generation_chat_output( + request: Union[CallRequest, Request], + texts: List[str], +) -> Union[CallResponse, List[str]]: + + if isinstance(request, Request): + return texts + task_outputs = [] for text in texts: task_outputs.append( protobuf_to_struct( @@ -715,12 +844,31 @@ def construct_task_text_generation_chat_output(texts: List[str]) -> CallResponse return CallResponse(task_outputs=task_outputs) -def parse_task_visual_question_answering_to_conversation_multimodal_input( - request: CallRequest, +async def parse_task_visual_question_answering_to_conversation_multimodal_input( + request: Union[CallRequest, Request], ) -> List[ConversationMultiModelInput]: - input_list = [] + # http test input + if isinstance(request, Request): + test_dict: dict = await request.json() + + test_prompt = test_dict["prompt"] + image_url = test_dict["image_url"] + inp = ConversationMultiModelInput() + inp.conversation = [ + { + "role": "user", + "content": { + "type": "text", + "content": test_prompt, + }, + } + ] + inp.prompt_images = [url_to_pil_image(image_url)] + return [inp] + + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)[ "VisualQuestionAnswering" @@ -813,10 +961,14 @@ def parse_task_visual_question_answering_to_conversation_multimodal_input( def construct_task_visual_question_answering_output( + request: Union[CallRequest, Request], texts: List[str], -) -> CallResponse: - task_outputs = [] +) -> Union[CallResponse, List[str]]: + + if isinstance(request, Request): + return texts + task_outputs = [] for text in texts: task_outputs.append( protobuf_to_struct( @@ -831,12 +983,20 @@ def construct_task_visual_question_answering_output( return CallResponse(task_outputs=task_outputs) -def parse_task_text_to_image_input( - request: CallRequest, +async def parse_task_text_to_image_input( + request: Union[CallRequest, Request], ) -> List[TextToImageInput]: - input_list = [] + # http test input + if isinstance(request, Request): + test_prompt: str = await request.json() + + inp = TextToImageInput() + inp.prompt = test_prompt + + return [inp] + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["TextToImage"] @@ -867,15 +1027,19 @@ def parse_task_text_to_image_input( def construct_task_text_to_image_output( + request: Union[CallRequest, Request], images: List[List[str]], -) -> CallResponse: +) -> Union[CallResponse, List[List[str]]]: """Construct trigger output for keypoint task Args: images (List[List[str]]): for each input prompt, the generated images with the length of `samples` """ - task_outputs = [] + if isinstance(request, Request): + return images + + task_outputs = [] for imgs in images: task_outputs.append( protobuf_to_struct( @@ -888,12 +1052,24 @@ def construct_task_text_to_image_output( return CallResponse(task_outputs=task_outputs) -def parse_task_image_to_image_input( - request: CallRequest, +async def parse_task_image_to_image_input( + request: Union[CallRequest, Request], ) -> List[ImageToImageInput]: - input_list = [] + # http test input + if isinstance(request, Request): + test_dict: dict = await request.json() + + test_prompt = test_dict["prompt"] + test_image_url = test_dict["image_url"] + + inp = ImageToImageInput() + inp.prompt = test_prompt + inp.prompt_image = url_to_pil_image(test_image_url) + return [inp] + + input_list = [] for task_input in request.task_inputs: task_input_dict = json_format.MessageToDict(task_input)["ImageToImage"] @@ -942,15 +1118,19 @@ def parse_task_image_to_image_input( def construct_task_image_to_image_output( + request: Union[CallRequest, Request], images: List[List[str]], -) -> CallResponse: +) -> Union[CallResponse, List[List[str]]]: """Construct trigger output for keypoint task Args: images (List[List[str]]): for each input prompt, the generated images with the length of `samples` """ - task_outputs = [] + if isinstance(request, Request): + return images + + task_outputs = [] for imgs in images: task_outputs.append( protobuf_to_struct( From 93bd7b2d0cfb74d7ffb2c509733180503cd49648 Mon Sep 17 00:00:00 2001 From: Heiru Wu Date: Wed, 24 Jul 2024 06:24:35 +0800 Subject: [PATCH 2/3] feat: add run command --- instill/helpers/cli.py | 86 +++++++++++++++++++++++++++++++++++++++ instill/helpers/ray_io.py | 48 +++++++++++----------- instill/helpers/test.py | 20 +++++++++ 3 files changed, 130 insertions(+), 24 deletions(-) create mode 100644 instill/helpers/test.py diff --git a/instill/helpers/cli.py b/instill/helpers/cli.py index 5bdc8c7..04e51e1 100644 --- a/instill/helpers/cli.py +++ b/instill/helpers/cli.py @@ -1,4 +1,6 @@ import argparse +import uuid +import time # import hashlib import os @@ -88,6 +90,27 @@ def cli(): required=False, ) + # run + run_parser = subcommands.add_parser("run", help="Run inference on model image") + run_parser.set_defaults(func=run) + run_parser.add_argument( + "name", + help="user and model namespace, in the format of /", + ) + run_parser.add_argument( + "-t", + "--tag", + help="tag for the model image, default to `latest`", + default="latest", + required=False, + ) + run_parser.add_argument( + "-i", + "--input", + help="inference input json", + required=True, + ) + args = parser.parse_args() args.func(args) @@ -227,5 +250,68 @@ def push(args): Logger.i("[Instill Builder] Done") +def run(args): + try: + name = uuid.uuid4() + + Logger.i("[Instill Builder] Starting model image...") + subprocess.run( + [ + "docker", + "run", + "--rm", + "-d", + "--name", + str(name), + f"{args.name}:{args.tag}", + "serve", + "run", + "_model:entrypoint", + ], + check=True, + ) + time.sleep(10) + subprocess.run( + [ + "docker", + "exec", + str(name), + "/bin/bash", + "-c", + "until serve status --name default | grep 'RUNNING: 1' > /dev/null; do sleep 1; done;", + ], + check=True, + ) + Logger.i("[Instill Builder] Model running") + subprocess.run( + [ + "docker", + "exec", + str(name), + "python", + "-m", + "instill.helpers.test", + "-i", + args.input, + ], + check=True, + ) + subprocess.run( + [ + "docker", + "stop", + str(name), + ], + check=True, + ) + except subprocess.CalledProcessError: + Logger.e("[Instill Builder] Run failed") + except Exception as e: + Logger.e("[Instill Builder] Prepare failed") + Logger.e(e) + finally: + Logger.i("[Instill Builder] Done") + + if __name__ == "__main__": cli() diff --git a/instill/helpers/ray_io.py b/instill/helpers/ray_io.py index cbea797..3793e9c 100644 --- a/instill/helpers/ray_io.py +++ b/instill/helpers/ray_io.py @@ -77,10 +77,10 @@ async def parse_task_classification_to_vision_input( # http test input if isinstance(request, Request): - test_image_url: str = await request.json() + data: dict = await request.json() inp = VisionInput() - inp.image = url_to_pil_image(test_image_url) + inp.image = url_to_pil_image(data["image_url"]) return [inp] input_list = [] @@ -141,10 +141,10 @@ async def parse_task_detection_to_vision_input( # http test input if isinstance(request, Request): - test_image_url: str = await request.json() + data: dict = await request.json() inp = VisionInput() - inp.image = url_to_pil_image(test_image_url) + inp.image = url_to_pil_image(data["image_url"]) return [inp] input_list = [] @@ -224,10 +224,10 @@ async def parse_task_ocr_to_vision_input( # http test input if isinstance(request, Request): - test_image_url: str = await request.json() + data: dict = await request.json() inp = VisionInput() - inp.image = url_to_pil_image(test_image_url) + inp.image = url_to_pil_image(data["image_url"]) return [inp] input_list = [] @@ -303,10 +303,10 @@ async def parse_task_instance_segmentation_to_vision_input( # http test input if isinstance(request, Request): - test_image_url: str = await request.json() + data: dict = await request.json() inp = VisionInput() - inp.image = url_to_pil_image(test_image_url) + inp.image = url_to_pil_image(data["image_url"]) return [inp] input_list = [] @@ -394,10 +394,10 @@ async def parse_task_semantic_segmentation_to_vision_input( # http test input if isinstance(request, Request): - test_image_url: str = await request.json() + data: dict = await request.json() inp = VisionInput() - inp.image = url_to_pil_image(test_image_url) + inp.image = url_to_pil_image(data["image_url"]) return [inp] input_list = [] @@ -472,10 +472,10 @@ async def parse_task_keypoint_to_vision_input( # http test input if isinstance(request, Request): - test_image_url: str = await request.json() + data: dict = await request.json() inp = VisionInput() - inp.image = url_to_pil_image(test_image_url) + inp.image = url_to_pil_image(data["image_url"]) return [inp] input_list = [] @@ -564,10 +564,10 @@ async def parse_task_text_generation_to_conversation_input( # http test input if isinstance(request, Request): - test_prompt: str = await request.json() + data: dict = await request.json() inp = ConversationInput() - inp.conversation = [{"role": "user", "content": test_prompt}] + inp.conversation = [{"role": "user", "content": data["prompt"]}] return [inp] input_list = [] @@ -706,10 +706,10 @@ async def parse_task_text_generation_chat_to_conversation_input( # http test input if isinstance(request, Request): - test_prompt: str = await request.json() + data: dict = await request.json() inp = ConversationInput() - inp.conversation = [{"role": "user", "content": test_prompt}] + inp.conversation = [{"role": "user", "content": data["prompt"]}] return [inp] input_list = [] @@ -850,10 +850,10 @@ async def parse_task_visual_question_answering_to_conversation_multimodal_input( # http test input if isinstance(request, Request): - test_dict: dict = await request.json() + data: dict = await request.json() - test_prompt = test_dict["prompt"] - image_url = test_dict["image_url"] + test_prompt = data["prompt"] + image_url = data["image_url"] inp = ConversationMultiModelInput() inp.conversation = [ @@ -989,10 +989,10 @@ async def parse_task_text_to_image_input( # http test input if isinstance(request, Request): - test_prompt: str = await request.json() + data: dict = await request.json() inp = TextToImageInput() - inp.prompt = test_prompt + inp.prompt = data["prompt"] return [inp] @@ -1058,10 +1058,10 @@ async def parse_task_image_to_image_input( # http test input if isinstance(request, Request): - test_dict: dict = await request.json() + data: dict = await request.json() - test_prompt = test_dict["prompt"] - test_image_url = test_dict["image_url"] + test_prompt = data["prompt"] + test_image_url = data["image_url"] inp = ImageToImageInput() inp.prompt = test_prompt diff --git a/instill/helpers/test.py b/instill/helpers/test.py new file mode 100644 index 0000000..8891de7 --- /dev/null +++ b/instill/helpers/test.py @@ -0,0 +1,20 @@ +import argparse +import json +import requests +import pprint + +parser = argparse.ArgumentParser() + +parser.add_argument( + "-i", + "--input", + help="inference input json", + required=True, +) + +args = parser.parse_args() +pprint.pprint( + requests.post( + "http://127.0.0.1:8000/", json=json.loads(args.input), timeout=600 + ).text +) From 6f58084c43093dc8d15ab6bae93441940ab64bbf Mon Sep 17 00:00:00 2001 From: Heiru Wu Date: Wed, 24 Jul 2024 06:26:22 +0800 Subject: [PATCH 3/3] chore: fix static check --- instill/helpers/cli.py | 4 ++-- instill/helpers/const.py | 1 - instill/helpers/test.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/instill/helpers/cli.py b/instill/helpers/cli.py index 04e51e1..b39d80a 100644 --- a/instill/helpers/cli.py +++ b/instill/helpers/cli.py @@ -1,6 +1,4 @@ import argparse -import uuid -import time # import hashlib import os @@ -8,6 +6,8 @@ import shutil import subprocess import tempfile +import time +import uuid import ray import yaml diff --git a/instill/helpers/const.py b/instill/helpers/const.py index d7d4404..98e22fb 100644 --- a/instill/helpers/const.py +++ b/instill/helpers/const.py @@ -2,7 +2,6 @@ from enum import Enum from typing import Any, Dict, List, Union -import numpy as np from PIL import Image PROMPT_ROLES = ["user", "assistant", "system"] diff --git a/instill/helpers/test.py b/instill/helpers/test.py index 8891de7..6518ce5 100644 --- a/instill/helpers/test.py +++ b/instill/helpers/test.py @@ -1,8 +1,9 @@ import argparse import json -import requests import pprint +import requests + parser = argparse.ArgumentParser() parser.add_argument(