Skip to content

Commit

Permalink
Prometheus http and kernel startup/shutdown metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
maico committed Mar 27, 2024
1 parent d01e84a commit 27d5472
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 3 deletions.
18 changes: 17 additions & 1 deletion enterprise_gateway/base/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import List

import jupyter_server._version
import prometheus_client
from jupyter_server.base.handlers import APIHandler
from tornado import web

Expand All @@ -31,6 +32,17 @@ def get(self):
)


class PrometheusMetricsHandler(CORSMixin, web.RequestHandler):
"""
Return prometheus metrics from this enterprise gateway
"""

def get(self):
"""Get the latest state of the Prometheus' metrics."""
self.set_header("Content-Type", prometheus_client.CONTENT_TYPE_LATEST)
self.write(prometheus_client.generate_latest(prometheus_client.REGISTRY))


class NotFoundHandler(JSONErrorsMixin, web.RequestHandler):
"""
Catches all requests and responds with 404 JSON messages.
Expand All @@ -48,4 +60,8 @@ def prepare(self):
raise web.HTTPError(404)


default_handlers: List[tuple] = [(r"/api", APIVersionHandler), (r"/(.*)", NotFoundHandler)]
default_handlers: List[tuple] = [
(r"/api", APIVersionHandler),
(r"/metrics", PrometheusMetricsHandler),
(r"/(.*)", NotFoundHandler),
]
3 changes: 2 additions & 1 deletion enterprise_gateway/enterprisegatewayapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
WebhookKernelSessionManager,
)
from .services.sessions.sessionmanager import SessionManager
from .webapp import EnterpriseGatewayWebApp

try:
from jupyter_server.auth.authorizer import AllowAllAuthorizer
Expand Down Expand Up @@ -219,7 +220,7 @@ def init_webapp(self) -> None:

handlers = self._create_request_handlers()

self.web_app = web.Application(
self.web_app = EnterpriseGatewayWebApp(
handlers=handlers,
kernel_manager=self.kernel_manager,
session_manager=self.session_manager,
Expand Down
30 changes: 30 additions & 0 deletions enterprise_gateway/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Collection of all the metrics used by the Enterprise Gateway"""

import os

from prometheus_client import Histogram

metrics_prefix = os.environ.get("EG_METRICS_PREFIX", "enterprise_gateway")

HTTP_REQUEST_DURATION_SECONDS = Histogram(
'http_request_duration_seconds',
'Request duration for all HTTP requests',
['method', 'handler', 'status_code'],
namespace=metrics_prefix,
)

KERNEL_START_DURATION_SECONDS = Histogram(
'kernel_start_duration_seconds',
'Kernel startup duration',
['kernel_name', 'process_proxy'],
buckets=[0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0],
namespace=metrics_prefix,
)

KERNEL_SHUTDOWN_DURATION_SECONDS = Histogram(
'kernel_shutdown_duration_seconds',
'Kernel startup duration for all HTTP requests',
['kernel_name', 'process_proxy'],
buckets=[0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0],
namespace=metrics_prefix,
)
33 changes: 32 additions & 1 deletion enterprise_gateway/services/kernels/remotemanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from enterprise_gateway.mixins import EnterpriseGatewayConfigMixin

from ...metrics import KERNEL_SHUTDOWN_DURATION_SECONDS, KERNEL_START_DURATION_SECONDS
from ..processproxies.processproxy import BaseProcessProxyABC, LocalProcessProxy, RemoteProcessProxy
from ..sessions.kernelsessionmanager import KernelSessionManager

Expand Down Expand Up @@ -501,7 +502,12 @@ async def start_kernel(self, **kwargs: dict[str, Any] | None):
"""
self._get_process_proxy()
self._capture_user_overrides(**kwargs)
await super().start_kernel(**kwargs)
with KERNEL_START_DURATION_SECONDS.time() as timer:
timer.labels(
kernel_name=self.kernel_name,
process_proxy=f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}',
)
await super().start_kernel(**kwargs)

def _capture_user_overrides(self, **kwargs: dict[str, Any] | None) -> None:
"""
Expand Down Expand Up @@ -588,6 +594,31 @@ def request_shutdown(self, restart: bool = False) -> None:
if isinstance(self.process_proxy, RemoteProcessProxy):
self.process_proxy.shutdown_listener()

async def shutdown_kernel(self, now: bool = False, restart: bool = False):
"""Attempts to stop the kernel process cleanly.
This attempts to shutdown the kernels cleanly by:
1. Sending it a shutdown message over the control channel.
2. If that fails, the kernel is shutdown forcibly by sending it
a signal.
Parameters
----------
now : bool
Should the kernel be forcible killed *now*. This skips the
first, nice shutdown attempt.
restart: bool
Will this kernel be restarted after it is shutdown. When this
is True, connection files will not be cleaned up.
"""
with KERNEL_SHUTDOWN_DURATION_SECONDS.time() as timer:
timer.labels(
kernel_name=self.kernel_name,
process_proxy=f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}',
)
await super().shutdown_kernel(now=now, restart=restart)

async def restart_kernel(self, now: bool = False, **kwargs: dict[str, Any] | None) -> None:
"""
Restarts a kernel with the arguments that were used to launch it.
Expand Down
6 changes: 6 additions & 0 deletions enterprise_gateway/tests/test_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,12 @@ def test_kernel_env_auth_token(self):
if ws:
ws.close()

@gen_test
def test_get_metrics(self):
"""Getting the swagger.json spec should be ok"""
response = yield self.http_client.fetch(self.get_url("/metrics"))
self.assertEqual(response.code, 200)


class TestCustomDefaultKernel(TestHandlers):
"""Tests gateway behavior when setting a custom default kernelspec."""
Expand Down
36 changes: 36 additions & 0 deletions enterprise_gateway/webapp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Tornado web app for enterprise_gateway."""

from tornado import web
from tornado.web import RequestHandler

from enterprise_gateway.metrics import HTTP_REQUEST_DURATION_SECONDS


class EnterpriseGatewayWebApp(web.Application):
"""
Custom Tornado web application that handles all HTTP traffic for the Enterprise Gateway.
"""

def log_request(self, handler: RequestHandler) -> None:
"""
Tornado log handler for recording RED metrics.
We record the following metrics:
Rate: the number of requests, per second, your services are serving.
Errors: the number of failed requests per second.
Duration: the amount of time each request takes expressed as a time interval.
We use a fully qualified name of the handler as a label,
rather than every url path to reduce cardinality.
This function should be either the value of or called from a function
that is the 'log_function' tornado setting. This makes it get called
at the end of every request, allowing us to record the metrics we need.
"""
super().log_request(handler)

HTTP_REQUEST_DURATION_SECONDS.labels(
method=handler.request.method,
handler=f'{handler.__class__.__module__}.{type(handler).__name__}',
status_code=handler.get_status(),
).observe(handler.request.request_time())

0 comments on commit 27d5472

Please sign in to comment.