# split data into train, validation and test sets

In [19]:
import mlrun
import os
import numpy as np
# Point the MLRun client configuration at the MLRun API endpoint used by this notebook.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

## parameters

**Please be sure to run the notebooks [1. remote archive to local parquet](1.%20remote%20archive%20to%20local%20parquet.ipynb) and [2. parquet to dask cluster](2.%20parquet%20to%20dask%20cluster.ipynb) before running this one.**

Since our data is already loaded into a Dask cluster, we use that cluster as our data source.

In [20]:
# Free-text description of what this notebook's task does.
DESCRIPTION = 'split data into train, validation and test splits'

# Function settings: container image, runtime kind, and the task name.
IMAGE = 'yjbds/mlrun-dask:dev'
JOB_KIND = 'job'
TASK_NAME = 'user-task-data-splits'

# Values forwarded to the handler through the task's `params` dict.
TARGET_PATH = '/User/repos/demos/dask/artifacts'
DASK_CLIENT = 'scheduler.json'
DASK_KEY = 'airlines'
LABEL_COLUMN = 'ArrDelay'
CATEGORIES = ['UniqueCarrier', 'Origin', 'Dest']

# Run id copied by hand from the MLRun db -- replace with the id of your own run.
MLRUN_DB_UID = '0bb6c8794695400ea79c28900737277b'

# Seed passed to the handler as `random_state`.
RNG = 1

In [21]:
# Name of the handler that the task below asks the splitter function to run.
HANDLER = 'splitter_labelencode'

## split the data

In [22]:
# Create the MLRun function object from the local splitter script,
# using the image and runtime kind configured in the parameters cell.
splitter = mlrun.new_function(
    command='/User/repos/demos/dask/code/splitter-labelencode.py',
    image=IMAGE,
    kind=JOB_KIND,
)

# Set the build spec's image to the same image.
splitter.spec.build.image = IMAGE

# Write the function spec out as YAML so it can be re-imported later.
splitter.export('/User/repos/demos/dask/yaml/splitter-labelencode.yaml')
# Alternative: load a previously exported spec instead of building one.
# NOTE(review): the path below differs from the export path above -- looks stale, confirm before use.
# splitter = mlrun.import_function('/User/repos/dask/yaml/train_valid_test_splitter-airlines.yaml')

[mlrun] 2020-02-16 14:38:12,464 function spec saved to path: /User/repos/demos/dask/yaml/splitter-labelencode.yaml


In [9]:
# Apply the v3io mount modifier to the function, then deploy it.
# NOTE(review): `skip_deployed=True` presumably avoids rebuilding an already
# deployed function, going by the flag name -- confirm against the MLRun docs.
splitter.apply(mlrun.mount_v3io())
splitter.deploy(skip_deployed=True, with_mlrun=False)

# Describe the run: task name, handler, and the parameters the handler receives.
task_ = mlrun.NewTask(
    TASK_NAME,
    handler=HANDLER,
    params={
        'dask_client'   : DASK_CLIENT,
        'dask_key'      : DASK_KEY,
        'label_column'  : LABEL_COLUMN,
        'categories'    : CATEGORIES,
        'target_path'   : TARGET_PATH,
        'random_state'  : RNG,
    })

# Use the HANDLER constant rather than repeating the literal, so the task
# definition and the run call cannot drift apart.
tsk2 = splitter.run(task_, handler=HANDLER)

[mlrun] 2020-02-16 13:10:38,701 function spec saved to path: /User/repos/demos/dask/yaml/splitter-labelencode.yaml
[mlrun] 2020-02-16 13:10:38,711 starting run user-task-data-splits uid=c04adfd9c616416f91915d30c39abb71  -> http://mlrun-api:8080
[mlrun] 2020-02-16 13:10:38,784 Job is running in the background, pod: user-task-data-splits-dbqck


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-7bdc5452cbd9>", line 27, in <module>
    tsk2 = splitter.run(task_, handler='splitter_labelencode')
  File "/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/mlrun/runtimes/base.py", line 264, in run
    runspec.logs(True, self._get_db())
  File "/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/mlrun/model.py", line 352, in logs
    watch=watch)
  File "/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/mlrun/db/httpdb.py", line 131, in watch_log
    time.sleep(3)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/User/.pythonlibs/jupyter-1/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2044, in showtraceback
    stb = value._render_tra

KeyboardInterrupt: 

## tests

In [5]:
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

# Connect to the Dask cluster through its scheduler file. The path is built
# from the parameters-cell constants so it stays in sync with what the
# splitter task was given.
client = Client(scheduler_file=os.path.join(TARGET_PATH, DASK_CLIENT))

# Fetch the published dataset names once, then both check and display them.
published_datasets = client.list_datasets()

# Compare as sets: the check is about which datasets exist, not the order in
# which the scheduler happens to list them.
expected_datasets = {DASK_KEY, 'xtrain', 'xvalid', 'ytrain', 'yvalid'}
assert set(published_datasets) == expected_datasets, (
    'unexpected datasets on the cluster: '
    f'expected {sorted(expected_datasets)}, got {sorted(published_datasets)}'
)

published_datasets


lz4
+--------------------------+---------+
|                          | version |
+--------------------------+---------+
| client                   | 3.0.2   |
| scheduler                | 2.2.1   |
| tcp://10.233.64.53:36132 | 2.2.1   |
| tcp://10.233.64.54:41331 | 2.2.1   |
| tcp://10.233.64.55:37728 | 2.2.1   |
| tcp://10.233.64.56:43233 | 2.2.1   |
| tcp://10.233.64.57:44692 | 2.2.1   |
| tcp://10.233.64.58:45015 | 2.2.1   |
| tcp://10.233.64.59:36351 | 2.2.1   |
| tcp://10.233.64.60:40581 | 2.2.1   |
+--------------------------+---------+

msgpack
+--------------------------+---------+
|                          | version |
+--------------------------+---------+
| client                   | 0.6.1   |
| scheduler                | 0.6.2   |
| tcp://10.233.64.53:36132 | 0.6.2   |
| tcp://10.233.64.54:41331 | 0.6.2   |
| tcp://10.233.64.55:37728 | 0.6.2   |
| tcp://10.233.64.56:43233 | 0.6.2   |
| tcp://10.233.64.57:44692 | 0.6.2   |
| tcp://10.233.64.58:45015 | 0.6.2   |
| tcp://10.

('airlines', 'xtrain', 'xvalid', 'ytrain', 'yvalid')

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/conda/lib/python3.6/site-packages/distributed/utils.py", line 662, in log_errors
    yield
  File "/conda/lib/python3.6/site-packages/distributed/client.py", line 1306, in _close
    await asyncio.wait_for(asyncio.gather(*coroutines), 2)
  File "/conda/lib/python3.6/asyncio/tasks.py", line 351, in wait_for
    yield from waiter
  File "/conda/lib/python3.6/asyncio/futures.py", line 327, in __iter__
    yield self  # This tells Task to wait for completion.
  File "/conda/lib/python3.6/asyncio/tasks.py", line 250, in _wakeup
    future.result()
  File "/conda/lib/python3.6/asyncio/futures.py", line 238, in result
    raise CancelledError
concurrent.futures._base.CancelledError
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/conda/lib/python3.6/site-packages/distributed/utils.py", line 662, in lo