Merged
15 changes: 14 additions & 1 deletion src/huggingface_inference_toolkit/webservice_starlette.py
@@ -7,7 +7,7 @@
 from starlette.responses import PlainTextResponse, Response
 from starlette.routing import Route
 
-from huggingface_inference_toolkit.async_utils import async_handler_call
+from huggingface_inference_toolkit.async_utils import MAX_CONCURRENT_THREADS, MAX_THREADS_GUARD, async_handler_call
 from huggingface_inference_toolkit.const import (
     HF_FRAMEWORK,
     HF_HUB_TOKEN,
@@ -69,6 +69,18 @@ async def health(request):
     return PlainTextResponse("Ok")
 
 
+# Report Prometheus metrics
+# inf_batch_current_size: Current number of requests being processed
+# inf_queue_size: Number of requests waiting in the queue
+async def metrics(request):
+    batch_current_size = MAX_CONCURRENT_THREADS - MAX_THREADS_GUARD.value
+    queue_size = MAX_THREADS_GUARD.statistics().tasks_waiting
+    return PlainTextResponse(
+        f"inf_batch_current_size {batch_current_size}\n" +
+        f"inf_queue_size {queue_size}\n"
Comment on lines +79 to +80

Contributor Author:
Not sure the naming is the best; I copied TGI.

Contributor:
Are we okay that this will always be 1?

Contributor Author:
If it bothers you or makes it unclear for customers, I can remove inf_batch_current_size.
+    )
+
+
 async def predict(request):
     try:
         # extracts content from request
@@ -143,6 +155,7 @@ async def predict(request):
Route("/health", health, methods=["GET"]),
Route("/", predict, methods=["POST"]),
Route("/predict", predict, methods=["POST"]),
Route("/metrics", metrics, methods=["GET"]),
],
on_startup=[prepare_model_artifacts],
)
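
For context on where these numbers come from: both `.value` and `.statistics().tasks_waiting` match the `anyio.Semaphore` API, so the sketch below assumes `MAX_THREADS_GUARD` is such a semaphore (an assumption inferred from the diff, not stated in it). Holding the semaphore during a prediction lowers `.value`, so the subtraction counts in-flight requests, while `tasks_waiting` counts requests queued behind them.

```python
# Standalone sketch, assuming MAX_THREADS_GUARD is an anyio.Semaphore
# (both .value and .statistics().tasks_waiting exist on that class).
import anyio

MAX_CONCURRENT_THREADS = 4  # illustrative value, not the toolkit's default
MAX_THREADS_GUARD = anyio.Semaphore(MAX_CONCURRENT_THREADS)

async def main() -> None:
    async with MAX_THREADS_GUARD:  # simulate one in-flight prediction
        in_flight = MAX_CONCURRENT_THREADS - MAX_THREADS_GUARD.value
        queued = MAX_THREADS_GUARD.statistics().tasks_waiting
        print(f"inf_batch_current_size {in_flight}")  # -> 1
        print(f"inf_queue_size {queued}")             # -> 0

anyio.run(main)
```

This also speaks to the review question above: with a guard of size 1, any request observed in flight makes the gauge read 1, so the metric only becomes informative when the semaphore allows more concurrency.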
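A quick way to exercise the new route once the service is up; the host and port here are assumptions, adjust to your deployment:

```python
# Minimal scrape of the new endpoint; localhost:5000 is an assumption.
import requests

resp = requests.get("http://localhost:5000/metrics")
resp.raise_for_status()
# Body is two plain-text lines, e.g.:
#   inf_batch_current_size 0
#   inf_queue_size 0
for line in resp.text.strip().splitlines():
    name, value = line.rsplit(" ", 1)
    print(name, "=", int(value))
```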