Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cull idle kernels #2215

Merged
merged 4 commits into from Apr 5, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
73 changes: 71 additions & 2 deletions notebook/services/kernels/kernelmanager.py
Expand Up @@ -11,15 +11,17 @@

from tornado import gen, web
from tornado.concurrent import Future
from tornado.ioloop import IOLoop
from tornado.ioloop import IOLoop, PeriodicCallback

from jupyter_client.multikernelmanager import MultiKernelManager
from traitlets import Dict, List, Unicode, TraitError, default, validate
from traitlets import Dict, List, Unicode, TraitError, Integer, default, validate

from notebook.utils import to_os_path
from notebook._tz import utcnow, isoformat
from ipython_genutils.py3compat import getcwd

from datetime import datetime, timedelta


class MappingKernelManager(MultiKernelManager):
"""A KernelManager that handles notebook mapping and HTTP error handling"""
Expand All @@ -34,6 +36,10 @@ def _default_kernel_manager_class(self):

_kernel_connections = Dict()

_culler_callback = None

_initialized_culler = False

@default('root_dir')
def _default_root_dir(self):
try:
Expand All @@ -52,6 +58,18 @@ def _update_root_dir(self, proposal):
raise TraitError("kernel root dir %r is not a directory" % value)
return value

cull_idle_timeout_minimum = 300 # 5 minutes
cull_idle_timeout = Integer(0, config=True,
help="""Timeout (in seconds) after which a kernel is considered idle and ready to be culled. Values of 0 or
lower disable culling. The minimum timeout is 300 seconds (5 minutes). Positive values less than the minimum value
will be set to the minimum."""
)

cull_interval_default = 300 # 5 minutes
cull_interval = Integer(cull_interval_default, config=True,
help="""The interval (in seconds) on which to check for idle kernels exceeding the cull timeout value."""
)

#-------------------------------------------------------------------------
# Methods for managing kernels and sessions
#-------------------------------------------------------------------------
Expand Down Expand Up @@ -105,6 +123,11 @@ def start_kernel(self, kernel_id=None, path=None, **kwargs):
else:
self._check_kernel_id(kernel_id)
self.log.info("Using existing kernel: %s" % kernel_id)

# Initialize culling if not already
if not self._initialized_culler:
self.initialize_culler()

# py2-compat
raise gen.Return(kernel_id)

Expand Down Expand Up @@ -225,3 +248,49 @@ def record_activity(msg_list):

kernel._activity_stream.on_recv(record_activity)

def initialize_culler(self):
"""Start idle culler if 'cull_idle_timeout' is greater than zero.

Regardless of that value, set flag that we've been here.
"""
if not self._initialized_culler and self.cull_idle_timeout > 0:
if self._culler_callback is None:
if self.cull_idle_timeout < self.cull_idle_timeout_minimum:
self.log.warning("'cull_idle_timeout' (%s) is less than the minimum value (%s) and has been set to the minimum.",
self.cull_idle_timeout, self.cull_idle_timeout_minimum)
self.cull_idle_timeout = self.cull_idle_timeout_minimum
loop = IOLoop.current()
if self.cull_interval <= 0: #handle case where user set invalid value
self.log.warning("Invalid value for 'cull_interval' detected (%s) - using default value (%s).",
self.cull_interval, self.cull_interval_default)
self.cull_interval = self.cull_interval_default
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also warn for cull_interval < 60s ? it's technically not wrong, but it's likely that user might have thoughs it was minutes. Even anything below 300s seem it can be a mistake.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree. I thought implementing a minimum timeout would be good (now that I've tested it :-)). 300 strikes me as good minimum. If folks are making changes in these areas and need shorter timeouts for testing, they can temporarily adjust the constant.

Positive values less than the minimum would result in a warning and auto-adjustment to the minimum (similar to what happens if the interval is <= 0 and culling is enabled).

So, to your comment above, the description would be...

"Timeout (in seconds) after which a kernel is considered idle and ready to be culled. Values of 0 or lower disable culling. The minimum timeout is 300 seconds. Positive values less than the minimum will be adjusted to the minimum."

(or is that too verbose?)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Carreau a warning is a good idea.

@kevin-bates that sounds perfect. You might add (5 minutes) next to 300 seconds.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do.

self._culler_callback = PeriodicCallback(
self.cull_kernels, 1000*self.cull_interval, loop)
self.log.info("Culling kernels with idle durations > %s seconds at %s second intervals ...",
self.cull_idle_timeout, self.cull_interval)
self._culler_callback.start()

self._initialized_culler = True

def cull_kernels(self):
self.log.debug("Polling every %s seconds for kernels idle > %s seconds...",
self.cull_interval, self.cull_idle_timeout)
"""Create a separate list of kernels to avoid conflicting updates while iterating"""
for kernel_id in list(self._kernels):
try:
self.cull_kernel_if_idle(kernel_id)
except Exception as e:
self.log.exception("The following exception was encountered while checking the idle duration of kernel %s: %s",
kernel_id, e)

def cull_kernel_if_idle(self, kernel_id):
kernel = self._kernels[kernel_id]
self.log.debug("kernel_id=%s, kernel_name=%s, last_activity=%s", kernel_id, kernel.kernel_name, kernel.last_activity)
if kernel.last_activity is not None:
dt_now = utcnow()
dt_idle = dt_now - kernel.last_activity
if dt_idle > timedelta(seconds=self.cull_idle_timeout): # exceeds timeout, can be culled
idle_duration = int(dt_idle.total_seconds())
self.log.warning("Culling kernel '%s' (%s) due to %s seconds of inactivity.", kernel.kernel_name, kernel_id, idle_duration)
self.shutdown_kernel(kernel_id)