Skip to content

Commit

Permalink
fix: resource usage calculated incorrectly (#2062)
Browse files Browse the repository at this point in the history
  • Loading branch information
fregataa committed Apr 28, 2024
1 parent d4d5613 commit d0e0b89
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 12 deletions.
1 change: 1 addition & 0 deletions changes/2062.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix incorrect calculation of resource usage.
34 changes: 22 additions & 12 deletions src/ai/backend/manager/models/resource_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ def to_json(self) -> Mapping[str, Any]:
"gpu_allocated": self.gpu_allocated,
}

def copy(self) -> ResourceUsage:
    """Return a copy of this usage record that shares no set with the original.

    ``attrs.evolve`` builds a new instance from the current field values with
    the given changes applied, so any field not overridden here would refer
    to the very same object as on ``self``. Each set-valued field is
    therefore replaced by a fresh set so that later in-place updates on the
    copy (or on the original) cannot leak into the other instance.

    Returns:
        A new ``ResourceUsage`` with equal values and independent sets.
    """
    return attrs.evolve(
        self,
        # NOTE(review): assumes ``agent_ids`` is always a set, as it is where
        # ResourceUsage is constructed with ``agent_ids={kernel.agent}`` --
        # confirm against the field's declared default.
        agent_ids={*self.agent_ids},
        nfs={*self.nfs},
        device_type={*self.device_type},
    )


def to_str(val: Any) -> Optional[str]:
    """Convert ``val`` to its ``str()`` form, passing ``None`` through as ``None``.

    Only ``None`` is treated specially; other falsy values such as ``0`` or
    an empty string are still converted with ``str()``.
    """
    if val is None:
        return None
    return str(val)
Expand Down Expand Up @@ -178,21 +181,21 @@ def to_map(self) -> dict[str, Any]:
"project_id": self.project_id,
"project_name": self.project_name,
"kernel_id": self.kernel_id,
"container_ids": self.container_ids,
"container_ids": self.container_ids.copy() if self.container_ids is not None else None,
"session_id": self.session_id,
"session_name": self.session_name,
"domain_name": self.domain_name,
"last_stat": self.last_stat,
"extra_info": self.extra_info,
"last_stat": {**self.last_stat} if self.last_stat is not None else None,
"extra_info": {**self.extra_info} if self.extra_info is not None else None,
"full_name": self.full_name,
"images": self.images,
"agents": self.agents,
"images": self.images.copy() if self.images is not None else None,
"agents": self.agents.copy() if self.agents is not None else None,
"status": self.status,
"status_info": self.status_info,
"status_history": self.status_history,
"cluster_mode": self.cluster_mode,
"scheduled_at": self.scheduled_at,
"total_usage": self.total_usage,
"total_usage": self.total_usage.copy(),
}

def to_json_base(self) -> dict[str, Any]:
Expand Down Expand Up @@ -238,7 +241,7 @@ class KernelResourceUsage(BaseResourceUsageGroup):
def to_json(self, child: bool = False) -> dict[str, Any]:
return {
**self.to_json_base(),
"agents": list(self.total_usage.agent_ids),
"agents": list(self.agents) if self.agents is not None else [],
"agent": self.agent,
"group_unit": self.group_unit.value,
"total_usage": self.total_usage.to_json(),
Expand All @@ -264,7 +267,10 @@ def from_base_usage_group(cls, usage_group: BaseResourceUsageGroup) -> KernelRes
session_row=usage_group.session_row,
kernel_row=usage_group.kernel_row,
agent=usage_group.kernel_row.agent,
**usage_group.to_map(),
**{
**usage_group.to_map(),
"total_usage": ResourceUsage(),
},
)

def register_resource_group(
Expand All @@ -285,7 +291,7 @@ class SessionResourceUsage(BaseResourceUsageGroup):
def to_json(self, child: bool = False) -> dict[str, Any]:
return_val = {
**self.to_json_base(),
"agents": list(self.total_usage.agent_ids),
"agents": list(self.agents) if self.agents is not None else [],
"group_unit": self.group_unit.value,
"total_usage": self.total_usage.to_json(),
}
Expand All @@ -312,7 +318,10 @@ def from_base_usage_group(cls, usage_group: BaseResourceUsageGroup) -> SessionRe
return cls(
project_row=usage_group.project_row,
session_row=usage_group.session_row,
**usage_group.to_map(),
**{
**usage_group.to_map(),
"total_usage": ResourceUsage(),
},
)

def register_resource_group(self, other: BaseResourceUsageGroup) -> bool:
Expand Down Expand Up @@ -378,6 +387,7 @@ def from_base_usage_group(cls, usage_group: BaseResourceUsageGroup) -> ProjectRe
"cluster_mode": None,
"scheduled_at": None,
"terminated_at": None,
"total_usage": ResourceUsage(),
}
return cls(
project_row=usage_group.project_row,
Expand Down Expand Up @@ -465,7 +475,7 @@ def parse_resource_usage(

return ResourceUsage(
agent_ids={kernel.agent},
nfs=nfs,
nfs={*nfs},
cpu_allocated=float(kernel.occupied_slots.get("cpu", 0)),
cpu_used=float(nmget(last_stat, "cpu_used.current", 0)),
mem_allocated=int(kernel.occupied_slots.get("mem", 0)),
Expand All @@ -475,7 +485,7 @@ def parse_resource_usage(
disk_used=int(nmget(last_stat, "io_scratch_size/stats.max", 0, "/")),
io_read=int(nmget(last_stat, "io_read.current", 0)),
io_write=int(nmget(last_stat, "io_write.current", 0)),
device_type=device_type,
device_type={*device_type},
smp=float(smp),
gpu_mem_allocated=float(gpu_mem_allocated),
gpu_allocated=float(gpu_allocated),
Expand Down

0 comments on commit d0e0b89

Please sign in to comment.