## Creating a multi-stage KubeFlow Pipeline

* **[Acquire](1.%20remote%20archive%20t%20local%20parquet-airlines.ipynb)** airlines data from a remote site and save it locally as a Parquet dataset
* **[Load dataset](2.%20parquet%20to%20dask%20cluster-airlines.ipynb)** into persistent Dask cluster
* Feature engineering and **[data splits](3.%20generate%20train%20and%20test%20sets-airlines.ipynb)** (train/validation/test)
* **[Train](4.%20lightgbm%20on%20dask%20cluster.ipynb)** Dask-LGBM classifier
* **Evaluate** the trained model
* Wrap functionality into a KubeFlow pipeline

In [4]:
import mlrun
# Set the MLRun DB path to the `mlrun-api` service endpoint (port 8080).
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

In [5]:
import kfp
from kfp import dsl

## jobs

In [7]:
# Function specs (YAML files), listed in the same order as the names they
# are unpacked into below.
_job_specs = (
    'yaml/arc_to_parquet.yaml',
    'yaml/parquet-to-dask.yaml',
    'yaml/train-valid-test-splitter.yaml',
    'yaml/clf_lgbm_dask.yaml',
    'yaml/clf_logreg_dask.yaml',
    'yaml/clf_logreg_daal4py.yaml',
)

# Import every function spec, then bind each one to its own name so the
# pipeline definition can refer to the steps individually.
jobs = [mlrun.import_function(spec) for spec in _job_specs]
(acquire_job, dask_job, split_job,
 lgbm_job, logreg_job, daal4py_job) = jobs

# Apply the v3io mount to each function, then deploy it.
for job in jobs:
    job.apply(mlrun.mount_v3io())
    job.deploy(skip_deployed=True, with_mlrun=False)

In [8]:
# Model-serving function built from the `lgbm_server.ipynb` notebook; the
# pipeline's final step deploys it with the trained model.
srvfn = mlrun.new_model_server(
    'flight-delay-server',
    model_class='ClassifierModel',
    filename='/User/repos/demos/dask/lgbm_server.ipynb',
)

# Apply the v3io mount (kept as the cell's last expression so the runtime
# object is still echoed as the cell output).
srvfn.apply(mlrun.mount_v3io())

<mlrun.runtimes.function.RemoteRuntime at 0x7f39aa261630>

In [10]:
# Dump the serving function spec as YAML for inspection.
# NOTE(review): the printed spec includes the V3IO access key (volume options
# and the V3IO_ACCESS_KEY env var) -- clear this cell's output before sharing
# or committing the notebook.
print(srvfn.to_yaml())

kind: remote
metadata:
  name: flight-delay-server
  project: default
spec:
  command: ''
  args: []
  image: ''
  description: ''
  volumes:
  - flexVolume:
      driver: v3io/fuse
      options:
        accessKey: eaed2dc3-4fe8-4ba0-a490-13e505cdce9a
        container: users
        subPath: /admin
    name: v3io
  volume_mounts:
  - mountPath: /User
    name: v3io
  env:
  - name: MODEL_CLASS
    value: ClassifierModel
  - name: ENABLE_EXPLAINER
    value: 'False'
  - name: V3IO_API
    value: v3io-webapi.default-tenant.svc:8081
  - name: V3IO_USERNAME
    value: admin
  - name: V3IO_ACCESS_KEY
    value: eaed2dc3-4fe8-4ba0-a490-13e505cdce9a
  config:
    spec.triggers.http:
      kind: http
      maxWorkers: 8
      attributes:
        ingresses: {}
      annotations: {}
  base_spec:
    apiVersion: nuclio.io/v1
    kind: nuclio:serving
    metadata:
      annotations:
        nuclio.io/generated_by: function generated at 12-02-2020 by admin from /User/repos/demos/dask/lgbm_server.

In [22]:
# Artifacts directory: used as `out_path` for the pipeline steps and passed as
# the `target_path` parameter to some of them.
TARGET_PATH = '/User/repos/demos/dask/artifacts'

In [23]:
@dsl.pipeline(
    name='My Dask-LiightGBM training pipeline',
    description='Shows how to use mlrun with dask and a booster model.'
)
def dask_pipeline(
    max_depth = [3, 4, 5], 
    learning_rate = [0.01, 0.1, 0.5],
):
    """Define the acquire -> load -> split / train -> deploy pipeline.

    Steps, in declaration order:

    1. ``acquire``: runs ``arc_to_parquet`` and exposes the ``airlines`` output.
    2. ``todask``: runs ``parquet_to_dask`` on the ``airlines`` output and
       exposes the ``scheduler`` output.
    3. ``split``: runs ``train_valid_test`` against the ``scheduler`` output.
    4. ``train``: runs ``clf_lgbm_dask`` against the ``scheduler`` output and
       exposes the ``lgbm-model`` output.
    5. ``deploy``: deploys ``srvfn`` with the trained model as ``dask_v1``.

    Args:
        max_depth: Pipeline parameter. NOTE(review): currently not referenced
            by any step; the train step hard-codes ``max_depth`` to 3.
        learning_rate: Pipeline parameter. NOTE(review): currently not
            referenced by any step; the train step hard-codes
            ``learning_rate`` to 0.1.
    """
    # Step 1: fetch the airlines archive and write it as a parquet dataset.
    acquire = acquire_job.as_step(
        name='dask_pipeline_acquire',
        # image='yjbds/mlrun-base:dev',
        handler='arc_to_parquet',
        params={
            'target_path': '/User/repos/demos/dask/dataset',
            'name'       : 'airlines.pqt',
            'key'        : 'airlines',
            'archive_url': "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv",
            'dataset'    : 'partitions',
            'part_cols'  : ['Year', 'Month'],
            'encoding'   : 'latin-1',
            'inc_cols'   : ['Year','Month','DayofMonth','DayOfWeek', 'CRSDepTime', 'UniqueCarrier', 'ArrDelay', 'Origin', 'Dest', 'Distance'],
            'dtype'      : {
                'Distance'   : 'float32',
                'ArrDelay'   : 'float64',
                'CRSDepTime' : 'float32'}},
        outputs=['airlines'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # Step 2: load the parquet dataset produced by `acquire`; the resulting
    # `scheduler` output is what the downstream steps take as `dask_client`.
    todask = dask_job.as_step(
        name='dask_pipeline_todask',
        # image='yjbds/mlrun-dask:dev',
        handler='parquet_to_dask',
        params={
            'sample'           : 0.02,
            'shards'           : 8,
            'threads_per'      : 8,
            'memory_limit'     : '5GB',
            'dask_key'         : 'airlines',
            'target_path'      : TARGET_PATH},
        inputs={'parquet_url': acquire.outputs['airlines']},
        outputs=['scheduler'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # Step 3: generate the train/validation/test splits.
    split = split_job.as_step(
        name='dask_pipeline_splitter',
        # image='yjbds/mlrun-dask:dev',
        handler='train_valid_test',
        inputs={'dask_client': todask.outputs['scheduler']},
        outputs=['header', 'test_set'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # Step 4: train the LightGBM classifier. The function was imported as
    # `lgbm_job` (from 'yaml/clf_lgbm_dask.yaml'); the previous reference to
    # an undefined `train_job` raised NameError when the pipeline was built.
    #
    # NOTE(review): this step only declares a dependency on `todask`, not on
    # `split`, yet it refers to 'xtrain'/'ytrain'/'xvalid'/'yvalid' --
    # presumably published by the splitter. If so, add `train.after(split)`
    # to enforce ordering; confirm against the splitter/trainer handlers.
    train = lgbm_job.as_step(
        name='dask_pipeline_train',
        # image='yjbds/mlrun-dask:dev',
        handler='clf_lgbm_dask',
        params={
            'train_set'  : ('xtrain', 'ytrain'),
            'valid_set'  : ('xvalid', 'yvalid'),
            'target_path': TARGET_PATH,
            'name'       : 'lgbm-model.pkl',
            'key'        : 'lgbm-model',
            'params'     : {
                'max_depth'        : 3,
                'learning_rate'    : 0.1,
                'n_estimators'     : 3,
                'reg_alpha'        : 0.,
                'reg_lambda'       : 0.,
                'random_state'     : 1,
                'tree_learner'     : 'data',
                'silent'           : False}},
        inputs={'dask_client': todask.outputs['scheduler']},
        outputs=['lgbm-model'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # Step 5: deploy the model server with the freshly trained model.
    deploy = srvfn.deploy_step(project = 'dask', models={'dask_v1': train.outputs['lgbm-model']})

In [24]:
# For debugging only: compile the pipeline function into a YAML file that can
# be inspected. The run below is created directly from the pipeline function,
# not from this file.
kfp.compiler.Compiler().compile(dask_pipeline, 'yaml/lgbm-dask-pipeline.yaml')



In [25]:
# Kubeflow Pipelines client used below to submit the run.
client = kfp.Client(namespace='default-tenant')

In [26]:
# No overrides: the pipeline runs with its declared default parameters.
arguments = {}

# Submit the pipeline function as a run under the 'dask' experiment.
run_result = client.create_run_from_pipeline_func(
    pipeline_func=dask_pipeline,
    arguments=arguments,
    experiment_name='dask',
)