diff --git a/dashboard/modules/serve/serve_agent.py b/dashboard/modules/serve/serve_agent.py index 8ce5d2888ecc75..236f55ee378b67 100644 --- a/dashboard/modules/serve/serve_agent.py +++ b/dashboard/modules/serve/serve_agent.py @@ -251,51 +251,35 @@ async def put_all_applications(self, req: Request) -> Response: from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag try: - config = ServeDeploySchema.parse_obj(await req.json()) + config: ServeDeploySchema = ServeDeploySchema.parse_obj(await req.json()) except ValidationError as e: return Response( status=400, text=repr(e), ) + config_http_options = config.http_options.dict() + full_http_options = dict( + {"location": config.proxy_location}, **config_http_options + ) + async with self._controller_start_lock: client = await serve_start_async( detached=True, - http_options={ - "host": config.http_options.host, - "port": config.http_options.port, - "root_path": config.http_options.root_path, - "location": config.proxy_location, - }, + http_options=full_http_options, ) - # Check HTTP Host - host_conflict = self.check_http_options( - "host", client.http_config.host, config.http_options.host - ) - if host_conflict is not None: - return host_conflict - - # Check HTTP Port - port_conflict = self.check_http_options( - "port", client.http_config.port, config.http_options.port - ) - if port_conflict is not None: - return port_conflict - - # Check HTTP root path - root_path_conflict = self.check_http_options( - "root path", client.http_config.root_path, config.http_options.root_path - ) - if root_path_conflict is not None: - return root_path_conflict - - # Check HTTP location - location_conflict = self.check_http_options( - "location", client.http_config.location, config.proxy_location - ) - if location_conflict is not None: - return location_conflict + # Serve ignores HTTP options if it was already running when + # serve_start_async() is called. We explicitly return an error response + # here if Serve's HTTP options don't match the config's options. + for option, requested_value in full_http_options.items(): + conflict_response = self.check_http_options( + option, + getattr(client.http_config, option), + requested_value, + ) + if conflict_response is not None: + return conflict_response try: client.deploy_apps(config) diff --git a/doc/source/serve/advanced-guides/performance.md b/doc/source/serve/advanced-guides/performance.md index 40004c7ff0103f..89f26420d036cb 100644 --- a/doc/source/serve/advanced-guides/performance.md +++ b/doc/source/serve/advanced-guides/performance.md @@ -121,11 +121,6 @@ proper backpressure. You can increase the value in the deployment decorator; e.g (serve-performance-e2e-timeout)= ### Set an end-to-end request timeout -By default, Serve lets client HTTP requests run to completion no matter how long they take. However, slow requests could bottleneck the replica processing, blocking other requests that are waiting. It's recommended that you set an end-to-end timeout, so slow requests can be terminated and retried at another replica. +By default, Serve lets client HTTP requests run to completion no matter how long they take. However, slow requests could bottleneck the replica processing, blocking other requests that are waiting. It's recommended that you set an end-to-end timeout, so slow requests can be terminated and retried. -You can set an end-to-end timeout for HTTP requests by setting the `request_timeout_s` in the `http_options` field of the Serve config. HTTP Proxies will wait for that many seconds before terminating an HTTP request and retrying it at another replica. This config is global to your Ray cluster, and it cannot be updated during runtime. - -(serve-performance-http-retry)= -### Set request retry times - -By default, the Serve HTTP proxy retries up to `10` times when a response is not received due to failures (e.g. network disconnect, request timeout, etc.). +You can set an end-to-end timeout for HTTP requests by setting the `request_timeout_s` in the `http_options` field of the Serve config. HTTP Proxies will wait for that many seconds before terminating an HTTP request. This config is global to your Ray cluster, and it cannot be updated during runtime. Use [client-side retries](serve-best-practices-http-requests) to retry requests that time out due to transient failures. diff --git a/doc/source/serve/production-guide/best-practices.md b/doc/source/serve/production-guide/best-practices.md index dc4181c382f6f1..9677f9523e9944 100644 --- a/doc/source/serve/production-guide/best-practices.md +++ b/doc/source/serve/production-guide/best-practices.md @@ -102,6 +102,8 @@ deployment_statuses: For Kubernetes deployments with KubeRay, tighter integrations of `serve status` with Kubernetes are available. See [Getting the status of Serve applications in Kubernetes](serve-getting-status-kubernetes). +(serve-best-practices-http-requests)= + ## Best practices for HTTP requests Most examples in these docs use straightforward `get` or `post` requests using Python's `requests` library, such as: diff --git a/python/ray/serve/tests/test_request_timeout.py b/python/ray/serve/tests/test_request_timeout.py index 5791881ad98eb7..6393cef3ca4883 100644 --- a/python/ray/serve/tests/test_request_timeout.py +++ b/python/ray/serve/tests/test_request_timeout.py @@ -7,9 +7,12 @@ import requests import ray -from ray._private.test_utils import SignalActor +from ray._private.test_utils import SignalActor, wait_for_condition +from ray.dashboard.modules.serve.sdk import ServeSubmissionClient from ray import serve +from ray.serve.schema import ServeInstanceDetails +from ray.serve._private.common import ApplicationStatus from ray.serve._private.constants import RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING @@ -102,6 +105,66 @@ async def __call__(self): ray.get(signal_actor.send.remote()) +@serve.deployment(graceful_shutdown_timeout_s=0) +class HangsOnFirstRequest: + def __init__(self): + self._saw_first_request = False + self.signal_actor = SignalActor.remote() + + async def __call__(self): + if not self._saw_first_request: + self._saw_first_request = True + await self.signal_actor.wait.remote() + else: + ray.get(self.signal_actor.send.remote()) + return "Success!" + + +hangs_on_first_request_app = HangsOnFirstRequest.bind() + + +def test_with_rest_api(ray_start_stop): + """Verify the REST API can configure the request timeout.""" + config = { + "proxy_location": "EveryNode", + "http_options": {"request_timeout_s": 1}, + "applications": [ + { + "name": "app", + "route_prefix": "/", + "import_path": ( + "ray.serve.tests." "test_request_timeout:hangs_on_first_request_app" + ), + } + ], + } + ServeSubmissionClient("http://localhost:52365").deploy_applications(config) + + def application_running(): + response = requests.get( + "http://localhost:52365/api/serve/applications/", timeout=15 + ) + assert response.status_code == 200 + + serve_details = ServeInstanceDetails(**response.json()) + return serve_details.applications["app"].status == ApplicationStatus.RUNNING + + wait_for_condition(application_running, timeout=15) + print("Application has started running. Testing requests...") + + response = requests.get("http://localhost:8000") + if RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING: + assert response.status_code == 500 + else: + assert response.status_code == 200 + assert response.text == "Success!" + + response = requests.get("http://localhost:8000") + assert response.status_code == 200 + print("Requests succeeded! Deleting application.") + ServeSubmissionClient("http://localhost:52365").delete_applications() + + @pytest.mark.parametrize( "ray_instance", [