# Demonstration Notebook for the Kubeflow-enabled `dask-kubernetes` package

In [1]:
import os
from time import sleep
from datetime import datetime

from dask_kubernetes import KubeCluster
from dask.distributed import Client
from distributed.core import Status
import dask.array as da

## Show versions of Dask-related packages

In [2]:
# Shell escape: list installed packages whose pip entry contains "dask" or "distributed".
!pip list | grep "\(dask\|distributed\)"

dask                     2021.12.0
dask-kubernetes          kubeflow-enablement-0.2.0-dev1
dask-labextension        5.1.0
distributed              2021.12.0


## Helper Functions

In [3]:
# Report the directory the notebook kernel is currently running in.
print('working dir: ' + os.getcwd())

def run_test_case(client, array):
    """Time ``array.mean().compute()`` and print a short report.

    The number of ready workers is sampled once, before the timer starts,
    via ``count_ready_workers(client)``. The elapsed wall-clock time is the
    difference between two ``datetime.now()`` readings taken around the
    computation.

    Parameters
    ----------
    client
        Object passed straight to ``count_ready_workers``.
    array
        Object exposing ``mean()`` whose result exposes ``compute()``.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    ready_workers = count_ready_workers(client)

    began_at = datetime.now()
    answer = array.mean().compute()
    elapsed_time = datetime.now() - began_at

    summary = (
        f'* number of workers: {ready_workers}, '
        f'elapsed time: {elapsed_time}, answer: {answer}'
    )
    # Frame the summary line between two '***' rulers, one item per line.
    print('***', summary, '***', sep='\n')

def count_ready_workers(client):
    """Return how many workers of the client's cluster are running.

    Every key of ``client.cluster.workers`` is looked up and the entry is
    counted when its ``status`` equals ``Status.running``. An empty
    collection yields ``0``.
    """
    workers = client.cluster.workers
    return sum(
        1
        for name in workers
        if workers[name].status == Status.running
    )

working dir: /home/jovyan/dask_kubeflow/tests


## Set up the test: start a Dask KubeCluster

Test case based on [`dask-kubernetes` Quickstart Example](https://kubernetes.dask.org/en/latest/kubecluster.html#quickstart)

In [4]:
# Start the cluster with a single worker.
# 'worker-spec.yaml' is resolved relative to the working directory;
# presumably it holds the worker pod template -- its contents are not shown here.
cluster = KubeCluster(
    'worker-spec.yaml', 
    n_workers=1, 
    enable_kubeflow=True  # new parameter: tells dask-kubernetes the cluster runs inside Kubeflow
)
# Attach a client to the cluster; later cells use both `cluster` and `client`.
client = Client(cluster)
# Bare last expression so the notebook displays the client summary.
client

Creating scheduler pod on cluster. This may take some time.



+-------------+-----------+------------------------+---------+
| Package     | client    | scheduler              | workers |
+-------------+-----------+------------------------+---------+
| blosc       | None      | 1.10.2                 | None    |
| cloudpickle | 1.6.0     | 2.0.0                  | None    |
| distributed | 2021.12.0 | 2021.12.0+22.g96ee7f7b | None    |
| lz4         | None      | 3.1.10                 | None    |
| toolz       | 0.11.1    | 0.11.2                 | None    |
+-------------+-----------+------------------------+---------+


0,1
Connection method: Cluster object,Cluster type: dask_kubernetes.KubeCluster
Dashboard: http://dask-jovyan-cc36198e-d.kubeflow-user:8787/status,

0,1
Dashboard: http://dask-jovyan-cc36198e-d.kubeflow-user:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.42.1.66:8786,Workers: 0
Dashboard: http://10.42.1.66:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [5]:
# Define the large test array; its mean is computed later by run_test_case.
ARRAY_SHAPE = (10000, 1000, 1000)
CHUNK_SIZE = 100
array = da.ones(ARRAY_SHAPE, chunks=CHUNK_SIZE)

## Initial test case with 1 worker

In [6]:
%%time
# Baseline run: time the mean computation with the cluster as started (n_workers=1).
print("test case for one worker")
run_test_case(client, array)

test case for one worker
***
* number of workers: 1, elapsed time: 0:00:42.192557, answer: 1.0
***
CPU times: user 1.46 s, sys: 507 ms, total: 1.96 s
Wall time: 42.2 s


## Test case with 3 workers

In [7]:
%%time
# Scale the cluster up to three workers, wait for them, then re-run the test.
print("test case for three workers")
cluster.scale(3)
# Poll roughly once per second; give up after 61 polls without success.
for counter in range(1, 62):
    ready_worker_count = count_ready_workers(client)
    sleep(1)
    print(f'scaling...ready workers: {ready_worker_count}...waiting {counter} seconds')
    if ready_worker_count == 3:
        break
else:
    # Loop ran to exhaustion without ever seeing three ready workers.
    raise RuntimeError('Scale up operation, did not complete in required time.')
run_test_case(client, array)

test case for three workers
scaling...ready workers: 1...waiting 1 seconds
scaling...ready workers: 1...waiting 2 seconds
scaling...ready workers: 3...waiting 3 seconds
***
* number of workers: 3, elapsed time: 0:00:52.051416, answer: 1.0
***
CPU times: user 2.24 s, sys: 623 ms, total: 2.86 s
Wall time: 55.1 s


## Test case with 2 workers

In [8]:
%%time
# Scale the cluster down to two workers, wait for the change, then re-run the test.
print("test case for two workers")
cluster.scale(2)
# Poll roughly once per second; give up after 61 polls without success.
for counter in range(1, 62):
    ready_worker_count = count_ready_workers(client)
    sleep(1)
    print(f'scaling...ready workers: {ready_worker_count}...waiting {counter} seconds')
    if ready_worker_count == 2:
        break
else:
    # Loop ran to exhaustion without the ready-worker count reaching two.
    # The message names the scale-down operation this cell actually performs.
    raise RuntimeError('Scale down operation, did not complete in required time.')
run_test_case(client, array)

test case for two workers
scaling...ready workers: 3...waiting 1 seconds
scaling...ready workers: 2...waiting 2 seconds
***
* number of workers: 2, elapsed time: 0:00:22.240787, answer: 1.0
***
CPU times: user 1.22 s, sys: 39.8 ms, total: 1.26 s
Wall time: 24.2 s


## Shutdown cluster

In [9]:
# Close the Dask client connection first, then the KubeCluster itself.
# The cluster is closed in a `finally` clause so that a failure while closing
# the client does not leave the cluster running.
try:
    client.close()
finally:
    cluster.close()

print('all done')

all done
