Skip to content

Commit

Permalink
feat: graceful shutdown (#4682)
Browse files Browse the repository at this point in the history
* feat: retry cancelled requests

* feat: bump grpc grace time to 1s

* fix: add comment for retries
  • Loading branch information
jacobowitz committed Apr 26, 2022
1 parent 655213d commit 25b8df1
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 12 deletions.
13 changes: 11 additions & 2 deletions jina/serve/networking.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,14 +636,23 @@ async def task_wrapper(
timeout=timeout,
)
except AioRpcError as e:
if e.code() != grpc.StatusCode.UNAVAILABLE:
# connection failures and cancelled requests should be retried
# all other cases should not be retried and will be raised immediately
# connection failures have the code grpc.StatusCode.UNAVAILABLE
# cancelled requests have the code grpc.StatusCode.CANCELLED
# requests usually gets cancelled when the server shuts down
# retries for cancelled requests will hit another replica in K8s
if (
e.code() != grpc.StatusCode.UNAVAILABLE
and e.code() != grpc.StatusCode.CANCELLED
):
raise
elif e.code() == grpc.StatusCode.UNAVAILABLE and i == 2:
self._logger.debug(f'GRPC call failed, retries exhausted')
raise
else:
self._logger.debug(
f'GRPC call failed with StatusCode.UNAVAILABLE, retry attempt {i+1}/3'
f'GRPC call failed with code {e.code()}, retry attempt {i+1}/3'
)

return asyncio.create_task(task_wrapper(requests, connection, endpoint))
Expand Down
4 changes: 2 additions & 2 deletions jina/serve/runtimes/worker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ async def async_cancel(self):
self.logger.debug('Cancel WorkerRuntime')

# 0.5 gives the runtime some time to complete outstanding responses
# this should be handled better, 0.5 is a rather random number
await self._grpc_server.stop(0.5)
# this should be handled better, 1.0 is a rather random number
await self._grpc_server.stop(1.0)
self.logger.debug('Stopped GRPC Server')

async def async_teardown(self):
Expand Down
8 changes: 0 additions & 8 deletions tests/k8s/test_graceful_request_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,6 @@ async def async_inputs():


@pytest.mark.asyncio
@pytest.mark.skipif(
'GITHUB_WORKFLOW' in os.environ,
reason='this actually does not work, there are messages lost when shutting down k8s pods',
)
@pytest.mark.parametrize(
'docker_images', [['slow-process-executor', 'jinaai/jina']], indirect=True
)
Expand Down Expand Up @@ -238,10 +234,6 @@ async def test_no_message_lost_during_scaling(logger, docker_images, tmpdir):


@pytest.mark.asyncio
@pytest.mark.skipif(
'GITHUB_WORKFLOW' in os.environ,
reason='this actually does not work, there are messages lost when shutting down k8s pods',
)
@pytest.mark.parametrize(
'docker_images', [['slow-process-executor', 'jinaai/jina']], indirect=True
)
Expand Down

0 comments on commit 25b8df1

Please sign in to comment.