diff --git a/enterprise_gateway/base/handlers.py b/enterprise_gateway/base/handlers.py index bda75cac..681ef73b 100644 --- a/enterprise_gateway/base/handlers.py +++ b/enterprise_gateway/base/handlers.py @@ -8,6 +8,7 @@ from typing import List import jupyter_server._version +import prometheus_client from jupyter_server.base.handlers import APIHandler from tornado import web @@ -31,6 +32,17 @@ def get(self): ) +class PrometheusMetricsHandler(CORSMixin, web.RequestHandler): + """ + Return prometheus metrics from this enterprise gateway + """ + + def get(self): + """Get the latest state of the Prometheus' metrics.""" + self.set_header("Content-Type", prometheus_client.CONTENT_TYPE_LATEST) + self.write(prometheus_client.generate_latest(prometheus_client.REGISTRY)) + + class NotFoundHandler(JSONErrorsMixin, web.RequestHandler): """ Catches all requests and responds with 404 JSON messages. @@ -48,4 +60,8 @@ def prepare(self): raise web.HTTPError(404) -default_handlers: List[tuple] = [(r"/api", APIVersionHandler), (r"/(.*)", NotFoundHandler)] +default_handlers: List[tuple] = [ + (r"/api", APIVersionHandler), + (r"/metrics", PrometheusMetricsHandler), + (r"/(.*)", NotFoundHandler), +] diff --git a/enterprise_gateway/enterprisegatewayapp.py b/enterprise_gateway/enterprisegatewayapp.py index 3b10fe3f..aa4eb39a 100644 --- a/enterprise_gateway/enterprisegatewayapp.py +++ b/enterprise_gateway/enterprisegatewayapp.py @@ -37,6 +37,7 @@ WebhookKernelSessionManager, ) from .services.sessions.sessionmanager import SessionManager +from .webapp import EnterpriseGatewayWebApp try: from jupyter_server.auth.authorizer import AllowAllAuthorizer @@ -219,7 +220,7 @@ def init_webapp(self) -> None: handlers = self._create_request_handlers() - self.web_app = web.Application( + self.web_app = EnterpriseGatewayWebApp( handlers=handlers, kernel_manager=self.kernel_manager, session_manager=self.session_manager, diff --git a/enterprise_gateway/metrics.py b/enterprise_gateway/metrics.py new file mode 100644 index 00000000..8a23202b --- /dev/null +++ b/enterprise_gateway/metrics.py @@ -0,0 +1,30 @@ +"""Collection of all the metrics used by the Enterprise Gateway""" + +import os + +from prometheus_client import Histogram + +metrics_prefix = os.environ.get("EG_METRICS_PREFIX", "enterprise_gateway") + +HTTP_REQUEST_DURATION_SECONDS = Histogram( + 'http_request_duration_seconds', + 'Request duration for all HTTP requests', + ['method', 'handler', 'status_code'], + namespace=metrics_prefix, +) + +KERNEL_START_DURATION_SECONDS = Histogram( + 'kernel_start_duration_seconds', + 'Kernel startup duration', + ['kernel_name', 'process_proxy'], + buckets=[0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0], + namespace=metrics_prefix, +) + +KERNEL_SHUTDOWN_DURATION_SECONDS = Histogram( + 'kernel_shutdown_duration_seconds', + 'Kernel startup duration for all HTTP requests', + ['kernel_name', 'process_proxy'], + buckets=[0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0], + namespace=metrics_prefix, +) diff --git a/enterprise_gateway/services/kernels/remotemanager.py b/enterprise_gateway/services/kernels/remotemanager.py index f906d68f..687290b3 100644 --- a/enterprise_gateway/services/kernels/remotemanager.py +++ b/enterprise_gateway/services/kernels/remotemanager.py @@ -23,6 +23,7 @@ from enterprise_gateway.mixins import EnterpriseGatewayConfigMixin +from ...metrics import KERNEL_SHUTDOWN_DURATION_SECONDS, KERNEL_START_DURATION_SECONDS from ..processproxies.processproxy import BaseProcessProxyABC, LocalProcessProxy, RemoteProcessProxy from ..sessions.kernelsessionmanager import KernelSessionManager @@ -501,7 +502,12 @@ async def start_kernel(self, **kwargs: dict[str, Any] | None): """ self._get_process_proxy() self._capture_user_overrides(**kwargs) - await super().start_kernel(**kwargs) + with KERNEL_START_DURATION_SECONDS.time() as timer: + timer.labels( + kernel_name=self.kernel_name, + process_proxy=f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}', + ) + await super().start_kernel(**kwargs) def _capture_user_overrides(self, **kwargs: dict[str, Any] | None) -> None: """ @@ -588,6 +594,31 @@ def request_shutdown(self, restart: bool = False) -> None: if isinstance(self.process_proxy, RemoteProcessProxy): self.process_proxy.shutdown_listener() + async def shutdown_kernel(self, now: bool = False, restart: bool = False): + """Attempts to stop the kernel process cleanly. + + This attempts to shutdown the kernels cleanly by: + + 1. Sending it a shutdown message over the control channel. + 2. If that fails, the kernel is shutdown forcibly by sending it + a signal. + + Parameters + ---------- + now : bool + Should the kernel be forcible killed *now*. This skips the + first, nice shutdown attempt. + restart: bool + Will this kernel be restarted after it is shutdown. When this + is True, connection files will not be cleaned up. + """ + with KERNEL_SHUTDOWN_DURATION_SECONDS.time() as timer: + timer.labels( + kernel_name=self.kernel_name, + process_proxy=f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}', + ) + await super().shutdown_kernel(now=now, restart=restart) + async def restart_kernel(self, now: bool = False, **kwargs: dict[str, Any] | None) -> None: """ Restarts a kernel with the arguments that were used to launch it. diff --git a/enterprise_gateway/tests/test_handlers.py b/enterprise_gateway/tests/test_handlers.py index 19f568c3..dbca9c86 100644 --- a/enterprise_gateway/tests/test_handlers.py +++ b/enterprise_gateway/tests/test_handlers.py @@ -557,6 +557,12 @@ def test_kernel_env_auth_token(self): if ws: ws.close() + @gen_test + def test_get_metrics(self): + """Getting the swagger.json spec should be ok""" + response = yield self.http_client.fetch(self.get_url("/metrics")) + self.assertEqual(response.code, 200) + class TestCustomDefaultKernel(TestHandlers): """Tests gateway behavior when setting a custom default kernelspec.""" diff --git a/enterprise_gateway/webapp.py b/enterprise_gateway/webapp.py new file mode 100644 index 00000000..674ec5d9 --- /dev/null +++ b/enterprise_gateway/webapp.py @@ -0,0 +1,36 @@ +"""Tornado web app for enterprise_gateway.""" + +from tornado import web +from tornado.web import RequestHandler + +from enterprise_gateway.metrics import HTTP_REQUEST_DURATION_SECONDS + + +class EnterpriseGatewayWebApp(web.Application): + """ + Custom Tornado web application that handles all HTTP traffic for the Enterprise Gateway. + """ + + def log_request(self, handler: RequestHandler) -> None: + """ + Tornado log handler for recording RED metrics. + + We record the following metrics: + Rate: the number of requests, per second, your services are serving. + Errors: the number of failed requests per second. + Duration: the amount of time each request takes expressed as a time interval. + + We use a fully qualified name of the handler as a label, + rather than every url path to reduce cardinality. + + This function should be either the value of or called from a function + that is the 'log_function' tornado setting. This makes it get called + at the end of every request, allowing us to record the metrics we need. + """ + super().log_request(handler) + + HTTP_REQUEST_DURATION_SECONDS.labels( + method=handler.request.method, + handler=f'{handler.__class__.__module__}.{type(handler).__name__}', + status_code=handler.get_status(), + ).observe(handler.request.request_time())