Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update replica gauge for endpoint and collectors #341

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@
ray_actor_handle=replica_actor_handle,
)

async def status(self):
    """Refresh the replica gauge, then return the pool's status.

    When the pool holds at least one replica, the first replica's actor is
    asked for its ``num_replicas`` value and that number is written to
    ``num_replicas_gauge``. With no replicas the gauge is left untouched.
    """
    if len(self.replicas) == 0:
        return self._status
    first_replica = self.replicas[0]
    replica_count = await first_replica.ray_actor_handle.num_replicas.remote()
    self.num_replicas_gauge.set(replica_count)
    return self._status

Check warning on line 102 in buildflow/core/app/runtime/actors/collector_pattern/collector_pool.py

View check run for this annotation

Codecov / codecov/patch

buildflow/core/app/runtime/actors/collector_pattern/collector_pool.py#L99-L102

Added lines #L99 - L102 were not covered by tests

async def snapshot(self) -> ProcessorGroupSnapshot:
parent_snapshot: ProcessorGroupSnapshot = await super().snapshot()
num_replicas = 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,26 +150,29 @@
async def status(self) -> RuntimeStatus:
    """Return the status currently stored on this runtime."""
    return self._status

async def num_replicas(self) -> int:
    """Return how many replicas of the collector deployment are RUNNING.

    The count is read from ``serve.status()``: the application is looked up
    by the processor group's ``group_id`` and the deployment by the
    collector deployment's name.

    Returns:
        The ``"RUNNING"`` entry of the deployment's ``replica_states``, or
        ``0`` when no collector deployment is set, or when the application,
        the deployment, or the ``"RUNNING"`` entry is missing.
    """
    if self.collector_deployment is None:
        return 0
    application = serve.status().applications.get(self.processor_group.group_id)
    if application is None:
        return 0
    # Do not fall back to ``{}`` for a missing deployment: a plain dict has
    # no ``replica_states`` attribute, so that default raises AttributeError
    # instead of yielding a zero count.
    deployment = application.deployments.get(self.collector_deployment.name)
    if deployment is None:
        return 0
    return deployment.replica_states.get("RUNNING", 0)

Check warning on line 164 in buildflow/core/app/runtime/actors/collector_pattern/receive_process_push_ack.py

View check run for this annotation

Codecov / codecov/patch

buildflow/core/app/runtime/actors/collector_pattern/receive_process_push_ack.py#L163-L164

Added lines #L163 - L164 were not covered by tests

async def snapshot(self) -> Snapshot:
    """Build a snapshot of this runtime's status and replica count.

    Every processor in the processor group gets an
    ``IndividualProcessorMetrics`` entry with both metrics set to ``0``.
    The replica count comes from ``self.num_replicas()`` so that there is a
    single place that queries the deployment state.

    Returns:
        A ``ReceiveProcessPushSnapshot`` carrying the current status, the
        current timestamp in milliseconds, the per-processor metrics and
        the replica count.
    """
    processor_snapshots = {
        processor.processor_id: IndividualProcessorMetrics(
            events_processed_per_sec=0,
            avg_process_time_millis=0,
        )
        for processor in self.processor_group.processors
    }
    return ReceiveProcessPushSnapshot(
        status=self._status,
        timestamp_millis=utils.timestamp_millis(),
        processor_snapshots=processor_snapshots,
        num_replicas=await self.num_replicas(),
    )
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@
ray_actor_handle=replica_actor_handle,
)

async def status(self):
    """Refresh the replica gauge, then return the pool's status.

    When the pool holds at least one replica, the first replica's actor is
    asked for its ``num_replicas`` value and that number is written to
    ``num_replicas_gauge``. With no replicas the gauge is left untouched.
    """
    if len(self.replicas) == 0:
        return self._status
    first_replica = self.replicas[0]
    replica_count = await first_replica.ray_actor_handle.num_replicas.remote()
    self.num_replicas_gauge.set(replica_count)
    return self._status

Check warning on line 103 in buildflow/core/app/runtime/actors/endpoint_pattern/endpoint_pool.py

View check run for this annotation

Codecov / codecov/patch

buildflow/core/app/runtime/actors/endpoint_pattern/endpoint_pool.py#L100-L103

Added lines #L100 - L103 were not covered by tests

async def snapshot(self) -> ProcessorGroupSnapshot:
parent_snapshot: ProcessorGroupSnapshot = await super().snapshot()
num_replicas = 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,19 @@
async def status(self) -> RuntimeStatus:
    """Return the status currently stored on this runtime."""
    return self._status

async def num_replicas(self) -> int:
    """Return how many replicas of the endpoint deployment are RUNNING.

    The count is read from ``serve.status()``: the application is looked up
    by the processor group's ``group_id`` and the deployment by the
    endpoint deployment's name.

    Returns:
        The ``"RUNNING"`` entry of the deployment's ``replica_states``, or
        ``0`` when no endpoint deployment is set, or when the application,
        the deployment, or the ``"RUNNING"`` entry is missing.
    """
    if self.endpoint_deployment is None:
        return 0
    application = serve.status().applications.get(self.processor_group.group_id)
    if application is None:
        return 0
    # Do not fall back to ``{}`` for a missing deployment: a plain dict has
    # no ``replica_states`` attribute, so that default raises AttributeError
    # instead of yielding a zero count.
    deployment = application.deployments.get(self.endpoint_deployment.name)
    if deployment is None:
        return 0
    return deployment.replica_states.get("RUNNING", 0)

Check warning on line 151 in buildflow/core/app/runtime/actors/endpoint_pattern/receive_process_respond.py

View check run for this annotation

Codecov / codecov/patch

buildflow/core/app/runtime/actors/endpoint_pattern/receive_process_respond.py#L150-L151

Added lines #L150 - L151 were not covered by tests

async def snapshot(self) -> Snapshot:
processor_snapshots = {}
# TODO: need to figure out local metrics
Expand All @@ -145,19 +158,10 @@
events_processed_per_sec=0,
avg_process_time_millis=0,
)
if self.endpoint_deployment is not None:
num_replicas = (
serve.status()
.applications.get(self.processor_group.group_id, {})
.deployments.get(self.endpoint_deployment.name, {})
.replica_states.get("RUNNING", 0)
)
else:
num_replicas = 0

return ReceiveProcessRespondSnapshot(
status=self._status,
timestamp_millis=utils.timestamp_millis(),
processor_snapshots=processor_snapshots,
num_replicas=num_replicas,
num_replicas=await self.num_replicas(),
)
Loading