# Binder pod health check

This notebook uses the Kubernetes and JupyterHub APIs
to check the health of a Binder deployment.

This notebook:

- retrieves the user list from jupyterhub
- retrieves user pods from kubernetes
- looks for pods that don't map to users (orphaned pods)
- creates a summary report for current pods (status and orphan)
- deletes orphaned pods, if desired

## Step 1: Connect to Kubernetes

In [None]:
import kubernetes.config
import kubernetes.client

Enter the name of the Kubernetes context and the namespace you want to check:

The context could be something like `gke_binder-prod_us-central1-a_prod-a`.

In [None]:
# Name of the kubeconfig context to connect with, and the Kubernetes
# namespace to inspect. Edit these two values before running the notebook.
context = 'prod-a'
namespace = 'prod'

In [None]:
# Load the credentials for the chosen kubeconfig context, then create the
# CoreV1Api client that the rest of the notebook uses as `kube`.
kubernetes.config.load_kube_config(context=context)
kube = kubernetes.client.CoreV1Api()

In [None]:
# Show the names of all namespaces returned by the cluster;
# a quick check that the connection works.
[ ns.metadata.name for ns in kube.list_namespace().items ]

## Step 2: get current state

- fetch userlist from JupyterHub
- fetch user pods from kubernetes
  (we only care about user pods, not the rest)

In [None]:
import base64
from operator import itemgetter
import requests

# Read the two objects we need by name instead of listing every secret and
# config map in the namespace: less (and less sensitive) data is transferred,
# and a missing object surfaces as a 404 ApiException
# rather than an opaque IndexError.
hub_secret = kube.read_namespaced_secret('hub', namespace)
binder_config = kube.read_namespaced_config_map('binder-config', namespace)

# Secret values are base64-encoded; decode to get the API token
# that is sent in the Authorization header of Hub API requests.
b64_hub_token = hub_secret.data['hub.services.binder.apiToken']
hub_api_token = base64.b64decode(b64_hub_token).decode('ascii')
# Strip trailing slashes so that paths can be appended to the URL.
hub_url = binder_config.data['binder.hub-url'].rstrip('/')

def hub_api_request(path, verb='GET'):
    """Make an API request to jupyterhub

    path is relative to the Hub's `/hub/api/` endpoint;
    a leading slash is accepted and ignored.
    verb is the HTTP method to use.

    Returns the decoded JSON body of the response.
    On an error response (status >= 400) the response body is printed
    and the error is raised via `raise_for_status`.
    """
    # strip leading slashes so that '/users' and 'users' both map to
    # '.../hub/api/users' rather than '.../hub/api//users'
    url = hub_url + '/hub/api/' + path.lstrip('/')
    r = requests.request(
        verb,
        url,
        headers={'Authorization': f'token {hub_api_token}'},
    )
    if r.status_code >= 400:
        # show the Hub's error message before raising
        print(r.text)
    r.raise_for_status()
    return r.json()

def get_hub_users():
    """Return the list of user models from the Hub

    The list is sorted by `last_activity`, oldest first.
    Users without a `last_activity` value sort before all others.
    """
    # NOTE(review): last_activity is presumably null for users with no
    # recorded activity; fall back to '' so that None is never compared
    # with a timestamp string while sorting.
    return sorted(
        hub_api_request('users'),
        key=lambda user: user['last_activity'] or '',
    )


In [None]:
def get_jupyter_pods():
    """Get all of the pods that should map to users

    These are the pods in `namespace` carrying both the
    `component=singleuser-server` and `heritage=jupyterhub` labels.
    """
    # Filter on the API server with a label selector instead of fetching
    # every pod in the namespace; this also avoids calling `.get` on the
    # labels of a pod that has no labels at all (where `labels` is None).
    return kube.list_namespaced_pod(
        namespace=namespace,
        label_selector='component=singleuser-server,heritage=jupyterhub',
    ).items

Fetch users and pods at (nearly) the same time to minimize
the chance of races.

Get users after pods to ensure that a race doesn't look like an orphan.

In [None]:
# Order matters: pods are fetched first and users second.
pods = get_jupyter_pods()
users = get_hub_users()

In [None]:
from datetime import datetime, timezone, timedelta
from dateutil.parser import parse as parse_date

# Timezone-aware current time in UTC, so that it can be compared with the
# timezone-aware timestamps parsed from the Hub API
# (a naive utcnow() value cannot be compared with aware datetimes).
now = datetime.now(timezone.utc)

In [None]:
# Users whose model has a truthy 'pending' field
# (presumably a spawn or stop in progress; confirm against the Hub API),
# followed by how many there are.
pending_users = [u for u in users if u['pending']]
len(pending_users)

In [None]:
def _as_utc(dt):
    """Return dt as a timezone-aware UTC datetime (naive values are taken to be UTC)"""
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)

# Users count as idle when their last activity is more than 20 minutes old.
# Both sides are normalized to aware UTC datetimes, because naive and aware
# datetimes cannot be compared with each other.
idle_cutoff = _as_utc(now) - timedelta(minutes=20)
idle_users = [
    u for u in users
    if not u['pending']
    # no timestamp to compare: skip rather than crash in parse_date
    and u['last_activity']
    and _as_utc(parse_date(u['last_activity'])) < idle_cutoff
]
# sanity check: make sure not all users are idle
assert len(idle_users) != len(users), "All users appear to be idle!"
print(f"{len(idle_users)} apparently idle users:")
[ (u['name'], u['last_activity']) for u in idle_users ]

## Step 3. identify orphaned pods

Construct a set of usernames of active users (ignoring those with stopped servers,
either because they haven't started yet, or have shut down for some reason).

In [None]:
# filter to active users
active_users = [ u for u in users if u['pending'] or u['server'] ]
# create set of names for active users
usernames = {user['name'] for user in active_users}

Identify the pods that have been orphaned (i.e. do not map to a user)

In [None]:
# a pod is orphaned when the user named in its annotation is not active
orphaned_pods = list(filter(
    lambda pod: pod.metadata.annotations['hub.jupyter.org/username'] not in usernames,
    pods,
))
# sanity check: if every pod looks orphaned, the classification is suspect
assert len(pods) != len(orphaned_pods), "All pods appear to be orphans!"
print(f"{len(orphaned_pods)} apparently orphaned pods:")
[ orphan.metadata.name for orphan in orphaned_pods ]

## Step 4. report!

This builds a simple report of the current pods and their statuses.
Orphaned pods have a big ❌ next to them

In [None]:
from collections import Counter
import datetime

import jinja2
from IPython.display import Markdown

def get_pod_status(pod):
    """Get the pod status

    Looks at the state of the pod's first container and returns:

    - the waiting reason, if the container is waiting
    - the terminated reason (or 'Terminated'), if the container has terminated
    - 'Running' otherwise

    If the pod reports no container statuses,
    the pod-level phase is returned instead (or 'Unknown' if unset).
    """
    container_statuses = pod.status.container_statuses
    if not container_statuses:
        # nothing to inspect at the container level (the list is None or
        # empty), so fall back to the pod phase instead of failing on [0]
        return pod.status.phase or 'Unknown'

    container_state = container_statuses[0].state
    if container_state.waiting:
        return container_state.waiting.reason
    if container_state.terminated:
        # a terminated container must not be reported as 'Running'
        return container_state.terminated.reason or 'Terminated'
    return 'Running'

# sort pods by creation date
pods = sorted(pods, key=lambda pod: pod.metadata.creation_timestamp)
# Identify orphans by pod name (unique within a namespace) instead of
# `pod not in orphaned_pods`, which compares pod objects pairwise
# and is quadratic in the number of pods.
orphaned_pod_names = {pod.metadata.name for pod in orphaned_pods}
active_pods = [pod for pod in pods if pod.metadata.name not in orphaned_pod_names]
# number of pods per status, for all pods and for orphaned pods only
status_counter = Counter(get_pod_status(pod) for pod in pods)
orphan_status_counter = Counter(get_pod_status(pod) for pod in orphaned_pods)

def relative_date(dt):
    """Render a datetime as a concise relative date

    dt must be a timezone-aware datetime (it is subtracted from an aware "now").

    Returns the age of dt as a short string: whole days ('3d') if at least
    one day old, else whole hours ('5h') if at least one hour old,
    else whole minutes ('12m').
    Timestamps in the future (e.g. due to clock skew) render as '0m'.
    """
    td = datetime.datetime.now(tz=datetime.timezone.utc) - dt
    if td < datetime.timedelta(0):
        # a negative timedelta is normalized to days=-1 plus positive seconds,
        # which would otherwise render as a misleading '-1d'
        return "0m"
    if td.days:
        return f"{td.days}d"
    if td.seconds >= 3600:
        return f"{td.seconds // 3600}h"
    return f"{td.seconds // 60}m"

# Template environment for the report.
jinja_env = jinja2.Environment()

# Expose the two helpers as template filters so the template can write
# `pod | get_pod_status` and `timestamp | relative_date`.
jinja_env.filters['get_pod_status'] = get_pod_status
jinja_env.filters['relative_date'] = relative_date

# Report template: a summary line, a warning when the number of active pods
# differs from the number of active users, per-status counts, and an HTML
# table with one row per pod (orphaned pods are marked).
tpl = jinja_env.from_string("""
We have {{pods | length}} pods, of which {{orphaned_pods | length}} are orphaned.

{% if active_pods | length != active_users | length %}
**From the users list, we would expect {{active_users | length}} active pods,
but found {{active_pods | length}}.
Our orphan classification may be incorrect!**
{% endif %}

Pods are in the following states:

{% for status, count in status_counter.items() %}
- {{status}}: {{count}} pods
  {%- if orphan_status_counter[status] %}
  ({{orphan_status_counter[status]}}/{{count}} orphaned)
  {%- endif %}
{% endfor %}

<table>
  <tr>
    <th>orphaned</th>
    <th>pod</th>
    <th>status</th>
    <th>age</th>
  </tr>

  {% for pod in pods %}
  <tr>
    <td>
    {% if pod in orphaned_pods %}
    ❌
    {% endif %}
    </td>
    <td>
    {{ pod.metadata.name }}
    </td>
    <td>
    {{ pod | get_pod_status }}
    </td>
    <td>
    {{ pod.metadata.creation_timestamp | relative_date }}
    </td>
  </tr>
  {% endfor %}
</table>
""")

# Render with every notebook global as a template variable (pods,
# orphaned_pods, active_pods, active_users, the counters, ...) and display
# the result as Markdown.
Markdown(tpl.render(**globals()))


## Step 5: cleanup orphaned pods

Now that we've looked at that report,
we can start to clean up any pods that shouldn't be there.

We start with a sanity check to ensure we don't proceed beyond this point
if our orphan check isn't trustworthy.

In [None]:
# Every pod should be either orphaned or accounted for by one active user,
# so the two counts must add up to the total number of pods.
actual = len(orphaned_pods) + len(active_users)
expected = len(pods)
print(f"orphaned pods + active users = {actual} =? {expected}")
print(f"{len(orphaned_pods):13} + {len(active_users):12} = {actual} =? {expected}")
if actual != expected:
    # refuse to continue: the orphan classification cannot be trusted
    raise ValueError("Some of the orphaned could be mislabled!")

Now we can delete the orphaned pods,
if it really looks like they won't be cleaned up by normal Hub means.

In [None]:
from kubernetes.client.rest import ApiException

def delete_pod(name):
    """Delete a single pod

    name is the name of a pod in `namespace`.
    The pod is deleted with a grace period of 0 seconds.

    ignore 404 for already deleted pods;
    any other API error is re-raised.
    """
    try:
        kube.delete_namespaced_pod(
            name, namespace,
            # Pass the delete options by keyword: current releases of the
            # kubernetes client no longer accept `body` as a third
            # positional argument, while the keyword form works in both.
            body=kubernetes.client.V1DeleteOptions(grace_period_seconds=0),
        )
    except ApiException as e:
        if e.status != 404:
            raise
        print(f"Already deleted pod {name}")
    
def delete_orphaned_pods(noconfirm=False):
    """Delete all orphaned pods

    with confirmation to avoid triggering on Run All

    Pass noconfirm=True to skip the interactive prompt.
    Does nothing (and does not prompt) when there are no orphaned pods.
    """
    if not orphaned_pods:
        # same behaviour as delete_pods_older_than:
        # don't ask for confirmation to delete 0 pods
        print("No pods to delete")
        return

    if not noconfirm:
        r = input(f"Are you sure you want to delete {len(orphaned_pods)} pod(s)? [y/N] ")
        if not r.lower().startswith('y'):
            print("Cancelled")
            return

    for pod in orphaned_pods:
        name = pod.metadata.name
        print(f"Deleting orphaned pod {name}")
        delete_pod(name)

In [None]:
# delete_orphaned_pods()

We shouldn't generally have to do this one,
but sometimes a user pod might be left running for a really long time.

This could be intentional, or it could be a window left open,
or it could be an activity-tracking bug.


In [None]:
def delete_pods_older_than(hours, *, noconfirm=False):
    """Delete the pods in `pods` created more than `hours` hours ago

    `hours` may also be a timedelta, to express other time intervals.
    Asks for confirmation first, unless noconfirm is True.
    """
    if isinstance(hours, datetime.timedelta):
        age_limit = hours
    else:
        age_limit = datetime.timedelta(hours=hours)
    cutoff = datetime.datetime.now(tz=datetime.timezone.utc) - age_limit

    old_pods = [pod for pod in pods if pod.metadata.creation_timestamp < cutoff]
    if not old_pods:
        print("No pods to delete")
        return

    if not noconfirm:
        answer = input(f"Are you sure you want to delete {len(old_pods)} pod(s) older than {relative_date(cutoff)}? [y/N] ")
        if not answer.lower().startswith('y'):
            print("Cancelled")
            return

    # old_pods holds exactly the pods that are older than the cutoff
    for pod in old_pods:
        name = pod.metadata.name
        created = pod.metadata.creation_timestamp
        print(f"Deleting {relative_date(created):3} old pod {name}")
        delete_pod(name)

In [None]:
# Delete user pods older than 6 hours; prompts for confirmation first
# because noconfirm is left at its default.
delete_pods_older_than(6)