Tune Ray Serve request limits and add per-deployment overrides.

Summary of changes
  * instill/helpers/const.py
      DEFAULT_MAX_ONGOING_REQUESTS  6   -> 4
      DEFAULT_MAX_QUEUED_REQUESTS   100 -> 1000
  * instill/helpers/ray_config.py
      - __init__: when is_high_scale_model is "true" (case-insensitive), it
        now also calls _update_target_ongoing_requests(4) and
        _update_max_concurrent_requests(6), in addition to the existing
        upscale/downscale delay updates.
      - New helper _update_target_ongoing_requests: stores the value in
        self._autoscaling_config under both
        "target_num_ongoing_requests_per_replica" and
        "target_ongoing_requests", then rebuilds self._deployment via
        .options(autoscaling_config=...).
      - New helper _update_max_concurrent_requests: rebuilds
        self._deployment via .options(...) passing the value as both
        max_concurrent_queries and max_ongoing_requests.

Review notes
  * NOTE(review): each new helper sets the same value under two names;
    presumably this is for compatibility across Ray Serve releases --
    confirm the pinned Ray version accepts both names in one call.
  * NOTE(review): the context for _update_max_replicas ends with
    `return self`, while the two new helpers return None -- confirm whether
    they are meant to be chainable as well.
  * NOTE(review): the literals 4 and 6 in __init__ are hard-coded; const.py
    defines DEFAULT_MAX_ONGOING_REQUESTS = 4 -- confirm whether the constant
    should be used instead.
  * Line structure and leading whitespace were restored from a copy whose
    newlines had been collapsed; run `git apply --check` before applying.

diff --git a/instill/helpers/const.py b/instill/helpers/const.py
index cf1aae7..f2b4f91 100644
--- a/instill/helpers/const.py
+++ b/instill/helpers/const.py
@@ -125,8 +125,8 @@ class ImageToImageInput:
         "PYTHONPATH": os.getcwd(),
     },
 }
-DEFAULT_MAX_ONGOING_REQUESTS = 6
-DEFAULT_MAX_QUEUED_REQUESTS = 100
+DEFAULT_MAX_ONGOING_REQUESTS = 4
+DEFAULT_MAX_QUEUED_REQUESTS = 1000
 
 RAM_MINIMUM_RESERVE = 1  # GB
 RAM_UPSCALE_FACTOR = 1.25
diff --git a/instill/helpers/ray_config.py b/instill/helpers/ray_config.py
index 274bda8..dc55b01 100644
--- a/instill/helpers/ray_config.py
+++ b/instill/helpers/ray_config.py
@@ -84,6 +84,8 @@ def __init__(self, deployable: Deployment) -> None:
         if is_high_scale_model is not None and is_high_scale_model.lower() == "true":
             self._update_upscale_delay(120)
             self._update_downscale_delay(600)
+            self._update_target_ongoing_requests(4)
+            self._update_max_concurrent_requests(6)
 
     def _determine_vram_usage(self, model_path: str, total_vram: str):
         warn(
@@ -189,6 +191,21 @@ def _update_max_replicas(self, num_replicas: int):
 
         return self
 
+    def _update_target_ongoing_requests(self, target_ongoing_requests: int):
+        self._autoscaling_config["target_num_ongoing_requests_per_replica"] = (
+            target_ongoing_requests
+        )
+        self._autoscaling_config["target_ongoing_requests"] = target_ongoing_requests
+        self._deployment = self._deployment.options(
+            autoscaling_config=self._autoscaling_config
+        )
+
+    def _update_max_concurrent_requests(self, max_concurrent_requests: int):
+        self._deployment = self._deployment.options(
+            max_concurrent_queries=max_concurrent_requests,
+            max_ongoing_requests=max_concurrent_requests,
+        )
+
     def _update_upscale_delay(self, upscale_delay_s: int):
         self._autoscaling_config["upscale_delay_s"] = upscale_delay_s
         self._deployment = self._deployment.options(