Skip to content

Commit

Permalink
fix: Always dump kernel registry information upon agent termination (#450)
Browse files Browse the repository at this point in the history
  • Loading branch information
adrysn committed Jun 8, 2022
1 parent 94aacb5 commit 9d3e226
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 4 deletions.
1 change: 1 addition & 0 deletions changes/450.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Always dump kernel registry information to a file upon agent termination.
6 changes: 3 additions & 3 deletions src/ai/backend/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -893,7 +893,7 @@ async def lifecycle_task_exception_handler(
while True:
ev = await self.container_lifecycle_queue.get()
if isinstance(ev, Sentinel):
await self.save_last_registry()
await self.save_last_registry(force=True)
return
# attr currently does not support customizing getstate/setstate dunder methods
# until the next release.
Expand Down Expand Up @@ -1802,8 +1802,8 @@ async def download_file(self, kernel_id: KernelId, filepath: str):
async def list_files(self, kernel_id: KernelId, path: str):
return await self.kernel_registry[kernel_id].list_files(path)

async def save_last_registry(self) -> None:
if now := time.monotonic() <= self.last_registry_written_time + 60:
async def save_last_registry(self, force=False) -> None:
if (not force) and (now := time.monotonic() <= self.last_registry_written_time + 60):
return # don't save too frequently
try:
ipc_base_path = self.local_config["agent"]["ipc-base-path"]
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/agent/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ async def collect_container_stat(
try:
cid = info['container_id']
except KeyError:
log.warning('collect_container_stat(): no container for kernel {}}', kid)
log.warning('collect_container_stat(): no container for kernel {}', kid)
kernel_id_map[ContainerId(cid)] = kid
unused_kernel_ids = set(self.kernel_metrics.keys()) - set(kernel_id_map.values())
for unused_kernel_id in unused_kernel_ids:
Expand Down

0 comments on commit 9d3e226

Please sign in to comment.