Skip to content

Commit

Permalink
fix: Add timeout to prepare()'s PersistentTaskGroup (#514)
Browse files Browse the repository at this point in the history
* Add `async_timeout` when calling `TaskGroup`
* Change `prepare()`'s `TaskGroup` to `PersistentTaskGroup`
  • Loading branch information
kyujin-cho committed Jul 5, 2022
1 parent 088b0aa commit f559a45
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 1 deletion.
1 change: 1 addition & 0 deletions changes/514.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `prepare()` not running when `start_session()` call hangs without raising Exception
8 changes: 7 additions & 1 deletion src/ai/backend/manager/scheduler/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)

import aiotools
import async_timeout
import sqlalchemy as sa
from dateutil.tz import tzutc
from sqlalchemy.engine.row import Row
Expand Down Expand Up @@ -781,7 +782,10 @@ async def _mark_session_preparing() -> Sequence[PendingSession]:

scheduled_sessions = await execute_with_retry(_mark_session_preparing)
log.debug("prepare(): preparing {} session(s)", len(scheduled_sessions))
async with aiotools.TaskGroup() as tg:
async with (
async_timeout.timeout(delay=50.0),
aiotools.PersistentTaskGroup() as tg,
):
for scheduled_session in scheduled_sessions:
await self.registry.event_producer.produce_event(
SessionPreparingEvent(
Expand All @@ -800,6 +804,8 @@ async def _mark_session_preparing() -> Sequence[PendingSession]:
"maybe another prepare() call is still running")
raise asyncio.CancelledError()
raise
except asyncio.TimeoutError:
log.warn('prepare(): timeout while executing start_session()')

async def start_session(
self,
Expand Down

0 comments on commit f559a45

Please sign in to comment.