# split data into train, validation and test sets

In [1]:
import mlrun
import os
import numpy as np
# Point the MLRun client at the API service. A non-empty MLRUN_DBPATH already
# present in the environment takes precedence, so the notebook can target a
# different deployment without editing this cell; otherwise the in-cluster
# default URL is used.
mlrun.mlconf.dbpath = os.environ.get('MLRUN_DBPATH') or 'http://mlrun-api:8080'

## parameters

**Please be sure to run the notebooks [1. remote archive to local parquet](1.%20remote%20archive%20to%20local%20parquet.ipynb) and [2. parquet to dask cluster](2.%20parquet%20to%20dask%20cluster.ipynb) before running this one.**

Since our data is already loaded into a Dask cluster, we use the cluster as our data source.

In [2]:
# --- identity (not referenced by the other cells of this notebook) ---
FUNCTION = 'train_valid_test'
DESCRIPTION = 'split data into train, validation and test splits'

# --- runtime: container image, MLRun function kind and task name ---
IMAGE = 'yjbds/mlrun-dask:dev'
JOB_KIND = 'job'
TASK_NAME = 'user-task-data-splits'

# --- values handed to the splitter task as run parameters ---
TARGET_PATH = '/User/repos/dask/artifacts'
DASK_CLIENT = 'scheduler.json'
DASK_KEY = 'airlines'
LABEL_COLUMN = 'ArrDelay'
CATEGORIES = [
    'UniqueCarrier',
    'Origin',
    'Dest',
]

# run id taken from the MLRun db; replace with the id of the relevant run
MLRUN_DB_UID = '2eab3b2dfad042479b40efba3b96e538'

# passed to the splitter task as `random_state`
RNG = 1

## split the data

In [10]:
# Create an MLRun function object from the local splitter script, using the
# image and function kind defined in the parameters cell.
splitter = mlrun.new_function(command='/User/repos/dask/train-valid-test-splitter-airlines.py', 
                              image=IMAGE,
                              kind=JOB_KIND)

# Record the same image on the function's build spec as well.
splitter.spec.build.image = IMAGE

# Export the function spec to yaml. To reuse a previously exported spec
# instead, uncomment the import_function line (it points at the same file).
splitter.export('/User/repos/dask/yaml/train-valid-test-splitter-airlines.yaml')
# splitter = mlrun.import_function('/User/repos/dask/yaml/train-valid-test-splitter-airlines.yaml')

# Attach the v3io mount, then deploy the function.
# NOTE(review): the mount is applied after the export above, so the exported
# yaml presumably does not contain it — confirm this is intended.
splitter.apply(mlrun.mount_v3io())
splitter.deploy(skip_deployed=True, with_mlrun=False)

# Describe the run: the task name plus the parameters passed to the handler
# (all values come from the parameters cell).
task_ = mlrun.NewTask(
    TASK_NAME,
    params={
        'dask_client'   : DASK_CLIENT,
        'dask_key'      : DASK_KEY,
        'label_column'  : LABEL_COLUMN,
        'categories'    : CATEGORIES,
        'target_path'   : TARGET_PATH,
        'random_state'  : RNG,
    })

# Run the script's `train_valid_test_splitter` handler with that task and keep
# the returned run object.
tsk2 = splitter.run(task_, handler='train_valid_test_splitter')

[mlrun] 2020-02-09 23:34:04,366 function spec saved to path: /User/ecolab/yaml/train-valid-test-splitter-airlines.yaml
[mlrun] 2020-02-09 23:34:04,374 starting run user-task-data-splits uid=e1dfeea63f864641acde116b9d26a459  -> http://mlrun-api:8080
[mlrun] 2020-02-09 23:34:04,444 Job is running in the background, pod: user-task-data-splits-4jgbm
[mlrun] 2020-02-09 23:35:51,497 N FEATURES:
before {before_encoding_shape}
after  {after_encoding_shape}
[mlrun] 2020-02-09 23:35:52,319 log artifact header at /User/ecolab/artifacts/header.pkl, size: None, db: Y
dataset xtrain exists, unpublishing
dataset xvalid exists, unpublishing
dataset ytrain exists, unpublishing
dataset yvalid exists, unpublishing
[mlrun] 2020-02-09 23:40:36,944 log artifact test_set at /User/ecolab/artifacts/test_set.pqt, size: None, db: Y

[mlrun] 2020-02-09 23:40:37,037 run executed, status=completed
Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...26a459,0,Feb 09 23:34:13,completed,train-valid-test-splitter-airlines,host=user-task-data-splits-4jgbmkind=jobowner=admin,,"categories=['UniqueCarrier', 'Origin', 'Dest']dask_client=scheduler.jsondask_key=airlineslabel_column=ArrDelayrandom_state=1target_path=/User/ecolab/artifacts",,headertest_set


to track results use .show() or .logs() or in CLI: 
!mlrun get run e1dfeea63f864641acde116b9d26a459  , !mlrun logs e1dfeea63f864641acde116b9d26a459 
[mlrun] 2020-02-09 23:40:45,577 run executed, status=completed


## tests

In [None]:
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [None]:
# Connect to the running Dask cluster through its scheduler file. The path is
# built from TARGET_PATH and DASK_CLIENT (parameters cell) rather than repeated
# as a literal, so it stays in sync when those values are edited.
client = Client(scheduler_file=os.path.join(TARGET_PATH, DASK_CLIENT))

In [None]:
# Fetch the dataset published on the cluster under the name 'ytrain'
# (one of the names the splitter's run log mentions: xtrain, xvalid, ytrain, yvalid).
df = client.get_dataset('ytrain')

In [None]:
# List the names of the datasets currently published on the cluster; the bare
# expression on the last line displays them as the cell output.
published_datasets = client.list_datasets()
published_datasets

In [None]:
# Row count of the fetched dataset, evaluated via `.compute()`.
# NOTE(review): assumes `df` is a Dask DataFrame/Series whose shape[0] is lazy — confirm.
df.shape[0].compute()

In [None]:
# Display the dataset's underlying values.
# NOTE(review): presumably a lazy Dask array rather than materialised data — confirm.
df.values