# Demonstration Notebook for the Kubeflow-enabled `dask-kubernetes` package

In [1]:
import os
from time import sleep
from datetime import datetime

from dask_kubernetes import KubeCluster
from dask.distributed import Client
from distributed.core import Status
import dask.array as da

## Helper Functions

In [2]:
# Show the directory the notebook kernel is currently running in
print(f'working dir: {os.getcwd()}')

def run_test_case(client, array):
    """Time the computation of ``array.mean()`` and print a short report.

    The number of ready workers is sampled (via ``count_ready_workers``)
    before the computation starts; the report shows that count, the
    elapsed wall-clock time and the computed mean.

    Args:
        client: client object handed to ``count_ready_workers``.
        array: object whose ``.mean().compute()`` is evaluated and timed.
    """
    ready_workers = count_ready_workers(client)

    started_at = datetime.now()
    mean_value = array.mean().compute()
    elapsed = datetime.now() - started_at

    report = (
        '***\n'
        f'* number of workers: {ready_workers}, '
        f'elapsed time: {elapsed}, answer: {mean_value}'
        '\n***'
    )
    print(report)

def count_ready_workers(client):
    """Return how many workers of the client's cluster are running.

    Every entry of ``client.cluster.workers`` is inspected and counted
    when its ``status`` equals ``Status.running``.
    """
    workers = client.cluster.workers
    return sum(
        1
        for worker in workers.values()
        if worker.status == Status.running
    )

working dir: /home/jovyan/dask_kubeflow/tests


## Set up the test: start the Dask KubeCluster

Test case based on [`dask-kubernetes` Quickstart Example](https://kubernetes.dask.org/en/latest/kubecluster.html#quickstart)

In [3]:
# Start the cluster with a single worker, using the spec in 'worker-spec.yaml'
cluster = KubeCluster(
    'worker-spec.yaml', 
    n_workers=1, 
    enable_kubeflow=True  # new parameter: tells Dask the cluster runs inside Kubeflow
)
# Connect a Dask client to the cluster; the later cells pass it to run_test_case
client = Client(cluster)

# Build a large array of ones; its mean is computed later by run_test_case
array = da.ones((10000, 1000, 1000), chunks=100)

Creating scheduler pod on cluster. This may take some time.



+-------------+-----------+------------------------+---------+
| Package     | client    | scheduler              | workers |
+-------------+-----------+------------------------+---------+
| blosc       | None      | 1.10.2                 | None    |
| cloudpickle | 1.6.0     | 2.0.0                  | None    |
| distributed | 2021.12.0 | 2021.12.0+21.gd1cf1d45 | None    |
| lz4         | None      | 3.1.10                 | None    |
| toolz       | 0.11.1    | 0.11.2                 | None    |
+-------------+-----------+------------------------+---------+


## Initial test case with 1 worker

In [4]:
%%time
# Baseline run: time the mean computation on the cluster started with n_workers=1
print("test case for one worker")
run_test_case(client, array)

test case for one worker
***
* number of workers: 1, elapsed time: 0:00:46.238427, answer: 1.0
***
CPU times: user 1.44 s, sys: 181 ms, total: 1.63 s
Wall time: 46.2 s


## Test case with 3 workers

In [5]:
%%time
# scale up: request three workers, then wait until all three are running
print("test case for three workers")
cluster.scale(3)

counter = 0
ready_worker_count = None  # nothing polled yet
while ready_worker_count != 3:
    # give up once more than 60 one-second polls have gone by
    if counter > 60:
        raise RuntimeError('Scale up operation, did not complete in required time.')
    ready_worker_count = count_ready_workers(client)
    sleep(1)
    counter += 1
    print(f'scaling...ready workers: {ready_worker_count}...waiting {counter} seconds')

# repeat the timed computation on the scaled-up cluster
run_test_case(client, array)

test case for three workers
scaling...ready workers: 1...waiting 1 seconds
scaling...ready workers: 3...waiting 2 seconds
***
* number of workers: 3, elapsed time: 0:00:25.679998, answer: 1.0
***
CPU times: user 1.74 s, sys: 565 ms, total: 2.31 s
Wall time: 27.7 s


## Test case with 2 workers

In [6]:
%%time
# scale down: request two workers, then wait until exactly two are running
print("test case for two workers")
cluster.scale(2)

# Poll the ready-worker count once per second; fail after more than 60 polls.
counter = 0
while True:
    ready_worker_count = count_ready_workers(client)
    sleep(1)
    counter += 1
    print(f'scaling...ready workers: {ready_worker_count}...waiting {counter} seconds')
    if ready_worker_count == 2:
        break
    if counter > 60:
        # This cell scales the cluster down, so the timeout error names that operation
        raise RuntimeError('Scale down operation, did not complete in required time.')

# repeat the timed computation on the scaled-down cluster
run_test_case(client, array)

test case for two workers
scaling...ready workers: 3...waiting 1 seconds
scaling...ready workers: 2...waiting 2 seconds
***
* number of workers: 2, elapsed time: 0:00:13.466606, answer: 1.0
***
CPU times: user 1.25 s, sys: 50.4 ms, total: 1.3 s
Wall time: 15.5 s


## Shutdown cluster

In [7]:
# Close the Dask client connection first
client.close()

# Then close the KubeCluster that the client was attached to
cluster.close()

print('all done')

all done
