# Demonstration Notebook for the Kubeflow-enabled `dask-kubernetes` package

In [1]:
import os
from datetime import datetime, timedelta
from time import perf_counter, sleep

import dask.array as da
from dask.distributed import Client
from dask_kubernetes import KubeCluster
from distributed.core import Status

## Helper Functions

In [2]:
# Print the current working directory; 'worker-spec.yaml' is passed to KubeCluster as a relative path
print(f'working dir: {os.getcwd()}')

def run_test_case(client, array):
    """Time ``array.mean().compute()`` and print a short summary block.

    Args:
        client: Dask client; its cluster's running workers are counted
            (via ``count_ready_workers``) before the computation starts.
        array: Object exposing ``mean().compute()`` (a dask array in this
            notebook).

    Returns:
        None. The worker count, elapsed time and computed answer are printed.
    """
    number_of_workers = count_ready_workers(client)
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments the way a datetime.now() difference can.
    start = perf_counter()
    answer = array.mean().compute()
    # Wrap in timedelta to keep the H:MM:SS.ffffff rendering in the output.
    elapsed_time = timedelta(seconds=perf_counter() - start)
    print(
        '***\n'
        f'* number of workers: {number_of_workers}, '
        f'elapsed time: {elapsed_time}, answer: {answer}'
        '\n***'
    )

def count_ready_workers(client):
    """Return how many of the cluster's workers report ``Status.running``.

    Args:
        client: Object exposing ``client.cluster.workers``, a mapping whose
            values carry a ``status`` attribute.

    Returns:
        int: Number of workers whose status equals ``Status.running``.
    """
    workers = client.cluster.workers
    return sum(
        1 for name in workers
        if workers[name].status == Status.running
    )

working dir: /home/jovyan/dask_kubeflow/tests


## Setup for the test: start the Dask KubeCluster

Test case based on [`dask-kubernetes` Quickstart Example](https://kubernetes.dask.org/en/latest/kubecluster.html#quickstart)

In [3]:
# Start the cluster with a single worker, using the worker spec file 'worker-spec.yaml'
cluster = KubeCluster(
    'worker-spec.yaml', 
    n_workers=1, 
    enable_kubeflow=True  # new parameter to tell Dask that the cluster is running inside Kubeflow
)
# Connect a client to the cluster; the bare `client` expression displays its summary
client = Client(cluster)
client

Creating scheduler pod on cluster. This may take some time.



+-------------+-----------+------------------------+---------+
| Package     | client    | scheduler              | workers |
+-------------+-----------+------------------------+---------+
| blosc       | None      | 1.10.2                 | None    |
| cloudpickle | 1.6.0     | 2.0.0                  | None    |
| distributed | 2021.12.0 | 2021.12.0+21.gd1cf1d45 | None    |
| lz4         | None      | 3.1.10                 | None    |
| toolz       | 0.11.1    | 0.11.2                 | None    |
+-------------+-----------+------------------------+---------+


0,1
Connection method: Cluster object,Cluster type: dask_kubernetes.KubeCluster
Dashboard: http://dask-jovyan-a2742d03-2.kubeflow-user:8787/status,

0,1
Dashboard: http://dask-jovyan-a2742d03-2.kubeflow-user:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.42.1.108:8786,Workers: 0
Dashboard: http://10.42.1.108:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [4]:
# Define the large dask array shared by all test cases; its mean is computed later, inside run_test_case
array = da.ones((10000, 1000, 1000), chunks=100)

## Initial test case with 1 worker

In [5]:
%%time
# Baseline measurement, taken before any scaling (the cluster was started with n_workers=1)
print("test case for one worker")
run_test_case(client, array)

test case for one worker
***
* number of workers: 1, elapsed time: 0:00:15.787276, answer: 1.0
***
CPU times: user 1.12 s, sys: 108 ms, total: 1.23 s
Wall time: 15.8 s


## Test case with 3 workers

In [6]:
%%time
# scale up
print("test case for three workers")
cluster.scale(3)
counter = 0
while True:
    ready_worker_count = count_ready_workers(client) 
    sleep(1)
    counter += 1
    print(f'scaling...ready workers: {ready_worker_count}...waiting {counter} seconds')
    if ready_worker_count == 3:
        break
    elif counter > 60:
        raise RuntimeError('Scale up operation, did not complete in required time.')
run_test_case(client, array)

test case for three workers
scaling...ready workers: 1...waiting 1 seconds
scaling...ready workers: 3...waiting 2 seconds
***
* number of workers: 3, elapsed time: 0:00:14.940067, answer: 1.0
***
CPU times: user 1.11 s, sys: 47.3 ms, total: 1.15 s
Wall time: 16.9 s


In [7]:
# Display the client summary again to inspect the cluster after scaling up
client

0,1
Connection method: Cluster object,Cluster type: dask_kubernetes.KubeCluster
Dashboard: http://dask-jovyan-a2742d03-2.kubeflow-user:8787/status,

0,1
Dashboard: http://dask-jovyan-a2742d03-2.kubeflow-user:8787/status,Workers: 3
Total threads: 3,Total memory: 2.79 GiB

0,1
Comm: tcp://10.42.1.108:8786,Workers: 3
Dashboard: http://10.42.1.108:8787/status,Total threads: 3
Started: 1 minute ago,Total memory: 2.79 GiB

0,1
Comm: tcp://10.42.0.111:43013,Total threads: 1
Dashboard: http://10.42.0.111:32897/status,Memory: 0.93 GiB
Nanny: tcp://10.42.0.111:36811,
Local directory: /dask-worker-space/worker-xmti943w,Local directory: /dask-worker-space/worker-xmti943w

0,1
Comm: tcp://10.42.0.112:43269,Total threads: 1
Dashboard: http://10.42.0.112:45207/status,Memory: 0.93 GiB
Nanny: tcp://10.42.0.112:37623,
Local directory: /dask-worker-space/worker-6bqh0a2i,Local directory: /dask-worker-space/worker-6bqh0a2i

0,1
Comm: tcp://10.42.1.109:39085,Total threads: 1
Dashboard: http://10.42.1.109:38609/status,Memory: 0.93 GiB
Nanny: tcp://10.42.1.109:33237,
Local directory: /dask-worker-space/worker-3s1hooid,Local directory: /dask-worker-space/worker-3s1hooid


## Test case with 2 workers

In [8]:
%%time
# scale down
print("test case for two workers")
cluster.scale(2)
counter = 0
while True:
    ready_worker_count = count_ready_workers(client) 
    sleep(1)
    counter += 1
    print(f'scaling...ready workers: {ready_worker_count}...waiting {counter} seconds')
    if ready_worker_count == 2:
        break
    elif counter > 60:
        raise RuntimeError('Scale up operation, did not complete in required time.')
run_test_case(client, array)

test case for two workers
scaling...ready workers: 3...waiting 1 seconds
scaling...ready workers: 2...waiting 2 seconds
***
* number of workers: 2, elapsed time: 0:00:09.055950, answer: 1.0
***
CPU times: user 1.02 s, sys: 46.2 ms, total: 1.06 s
Wall time: 11.1 s


## Shutdown cluster

In [9]:
# Close the Dask client connection first, then tear down the KubeCluster.
# try/finally guarantees cluster.close() still runs if closing the client
# raises, so the cluster is not left behind after a failed shutdown.
try:
    client.close()
finally:
    cluster.close()

print('all done')

all done
