Skip to content

Commit

Permalink
feature: add some exceptions when schedule computing session (#1401, #…
Browse files Browse the repository at this point in the history
…1887)

Backported-from: main (24.03)
Backported-to: 23.09
Co-authored-by: Joongi Kim <joongi@lablup.com>
  • Loading branch information
minseokey and achimnol committed Feb 23, 2024
1 parent 4be4913 commit 911927e
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 7 deletions.
1 change: 1 addition & 0 deletions changes/1887.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Enhance logging by raising more detailed exceptions when scheduling sessions, covering conditions such as a session with missing kernels or no agents being available at all for the selected pending session.
34 changes: 27 additions & 7 deletions src/ai/backend/manager/scheduler/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,12 +715,16 @@ async def _schedule_single_node_session(
raise GenericBadRequest(
"Cannot assign multiple kernels with different architectures' single node session",
)
if not sess_ctx.kernels:
raise GenericBadRequest(f"The session {sess_ctx.id!r} does not have any child kernel.")
requested_architecture = requested_architectures.pop()
compatible_candidate_agents = [
ag for ag in candidate_agents if ag.architecture == requested_architecture
]

try:
if not candidate_agents:
raise InstanceNotAvailable(extra_msg="No agents are available for scheduling")
if not compatible_candidate_agents:
raise InstanceNotAvailable(
extra_msg=(
Expand Down Expand Up @@ -1011,20 +1015,32 @@ async def _schedule_multi_node_session(
agent: Optional[AgentRow] = kernel.agent_row
if agent is not None:
# Check the resource availability of the manually designated agent
query = sa.select(AgentRow.available_slots).where(AgentRow.id == agent.id)
available_agent_slots = (await agent_db_sess.execute(query)).scalar()
if available_agent_slots is None:
raise GenericBadRequest(f"No such agent: {agent.id}")
for key in available_agent_slots:
if available_agent_slots[key] >= kernel.requested_slots[key]:
result = (
await agent_db_sess.execute(
sa.select([
AgentRow.available_slots,
AgentRow.occupied_slots,
]).where(AgentRow.id == agent.id)
)
).fetchall()[0]

if result is None:
raise GenericBadRequest(f"No such agent exist in DB: {agent_id}")
available_slots, occupied_slots = result

for key in available_slots.keys():
if (
available_slots[key] - occupied_slots[key]
>= kernel.requested_slots[key]
):
continue
else:
raise InstanceNotAvailable(
extra_msg=(
f"The designated agent ({agent.id}) does not have "
f"the enough remaining capacity ({key}, "
f"requested: {sess_ctx.requested_slots[key]}, "
f"available: {available_agent_slots[key]})."
f"remaining: {available_slots[key] - occupied_slots[key]})."
),
)
agent_id = agent.id
Expand All @@ -1033,6 +1049,10 @@ async def _schedule_multi_node_session(
compatible_candidate_agents = [
ag for ag in candidate_agents if ag.architecture == kernel.architecture
]
if not candidate_agents:
raise InstanceNotAvailable(
extra_msg="No agents are available for scheduling"
)
if not compatible_candidate_agents:
raise InstanceNotAvailable(
extra_msg=(
Expand Down

0 comments on commit 911927e

Please sign in to comment.