## Creating a multi-stage KubeFlow Pipeline

* **[Acquire](1.%20remote%20archive%20t%20local%20parquet-airlines.ipynb)** airlines data from remote site and save locally as parquet dataset
* **[Load dataset](2.%20parquet%20to%20dask%20cluster-airlines.ipynb)** into persistent Dask cluster
* Feature engineering and **[data splits](3.%20generate%20train%20and%20test%20sets-airlines.ipynb)** (train/validation/test)
* **[Train](4.%20lightgbm%20on%20dask%20cluster.ipynb)** Dask-LGBM classifier
* **Evaluate** trained model
* Wrap functionality into a KubeFlow pipeline

In [1]:
import mlrun
# Configure MLRun through its shared config object before any function is
# imported or deployed.
# NOTE(review): dbpath looks like the MLRun API endpoint and kfp_image the
# container image used for pipeline steps -- confirm against the MLRun docs.
mlconf = mlrun.mlconf
mlconf.dbpath = 'http://mlrun-api:8080'
mlconf.kfp_image = 'yjbds/mlrun-dask:dev'

In [2]:
import kfp
from kfp import dsl

## jobs

In [3]:
# YAML specs of the MLRun functions used by the pipeline, in the order they
# are unpacked into named job objects below.
_JOB_SPECS = (
    'yaml/arc_to_parquet.yaml',
    'yaml/parquet-to-dask.yaml',
    'yaml/describe.yaml',
    'yaml/splitter-labelencode.yaml',
    'yaml/clf_lgbm_dask.yaml',
    'yaml/clf_xgboost_dask.yaml',
)

# Import every function first, then bind each one to the name the pipeline
# definition refers to.
jobs = [mlrun.import_function(spec) for spec in _JOB_SPECS]
acquire_job, dask_job, sum_job, split_job, lgbm_job, xgb_job = jobs

# Attach the v3io mount to each function and deploy it.
# NOTE(review): skip_deployed=True presumably avoids redeploying functions
# that already exist -- confirm.
for job in jobs:
    job.apply(mlrun.mount_v3io())
    job.deploy(skip_deployed=True, with_mlrun=False)

In [4]:
# Model-serving function 'fdsvr', created from the model_server notebook and
# using its 'ClassifierModel' class.
srvfn = mlrun.new_model_server(
    'fdsvr',
    filename='/User/repos/demos/dask/model_server.ipynb',
    model_class='ClassifierModel',
)

# Attach the v3io mount. Kept as the last expression of the cell so the
# notebook still echoes the object returned by apply().
srvfn.apply(mlrun.mount_v3io())

<mlrun.runtimes.function.RemoteRuntime at 0x7f28dde61400>

In [5]:
# print(srvfn.to_yaml())

In [6]:
# Directory passed to the pipeline steps as their `out_path` and/or
# `target_path` parameter.
TARGET_PATH = '/User/repos/demos/dask/artifacts'

In [7]:
@dsl.pipeline(
    name='My Dask training pipeline',
    description='Shows how to use mlrun with dask and a booster model.'
)
def dask_pipeline(
    max_depth = [3, 4, 5], 
    learning_rate = [0.01, 0.1, 0.5],
):
    # Pipeline graph built below:
    #
    #   acquire -> todask -> summarize
    #                     -> splitter -> lgbm -> deploy (model server)
    #                                 -> xgb
    #
    # `max_depth` and `learning_rate` are declared as pipeline parameters but
    # are not referenced anywhere in this body; both boosters are trained with
    # the fixed hyperparameters written inline.
    #
    # NOTE(review): documentation is kept in comments and the signature is left
    # un-annotated on purpose -- KFP inspects pipeline function signatures, so
    # confirm the effect on the compiled pipeline before adding annotations.

    # Key under which the dataset is referenced on the Dask cluster by every
    # downstream step.
    dask_key = 'airlines'

    # Download the remote CSV archive and store it as a parquet dataset
    # partitioned by Year/Month.
    acquire_op = acquire_job.as_step(
        name='acquire',
        handler='arc_to_parquet',
        params={
            'target_path': '/User/repos/demos/dask/dataset',
            'name'       : 'airlines.pqt',
            'key'        : 'airlines',
            'archive_url': "https://s3.amazonaws.com/h2o-airlines-unpacked/allyears.csv",
            'dataset'    : 'partitions',
            'part_cols'  : ['Year', 'Month'],
            'encoding'   : 'latin-1',
            'inc_cols'   : ['Year','Month','DayofMonth','DayOfWeek', 'CRSDepTime', 'UniqueCarrier', 'ArrDelay', 'Origin', 'Dest', 'Distance'],
            'dtype'      : {
                'Distance'   : 'float32',
                'ArrDelay'   : 'float64',
                'CRSDepTime' : 'float32'}},
        outputs=['airlines'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # Load the parquet output of the acquire step into the Dask cluster and
    # expose the scheduler as this step's output.
    todask_op = dask_job.as_step(
        name='todask',
        handler='parquet_to_dask',
        params={
            'sample'           : 0.01,
            'shards'           : 8,
            'threads_per'      : 8,
            'memory_limit'     : '5GB',
            'dask_key'         : dask_key,
            'target_path'      : TARGET_PATH},
        inputs={'parquet_url': acquire_op.outputs['airlines']},
        outputs=['scheduler'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # Every remaining step takes the scheduler output as its `dask_client`
    # input, which also makes each of them depend on the todask step.
    scheduler = todask_op.outputs['scheduler']

    # NOTE(review): unlike the other steps, no `out_path` is passed here --
    # confirm whether that is intentional.
    sum_op = sum_job.as_step(
        name='summarize',
        handler='table_summary',
        inputs={'dask_client': scheduler},
        params={
            'dask_key'    : dask_key,
            'target_path' : TARGET_PATH,
            'name'        : 'table-summary.csv',
            'key'         : 'table-summary'},
        outputs=['table-summary']).apply(mlrun.mount_v3io())

    # Label-encode the categorical columns and split the data set.
    split_op = split_job.as_step(
        name='splitter',
        handler='splitter_labelencode',
        inputs={'dask_client': scheduler},
        params={
            'dask_key'     : dask_key,
            'label_column' : 'ArrDelay',
            'categories'   : ['UniqueCarrier', 'Origin', 'Dest'],
            'target_path'  : TARGET_PATH},
        outputs=['header', 'test_set'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # LightGBM classifier trained on the Dask cluster.
    lgbm_op = lgbm_job.as_step(
        name='lgbm',
        handler='clf_lgbm_dask',
        params={
            'train_set'  : ('xtrain', 'ytrain'),
            'valid_set'  : ('xvalid', 'yvalid'),
            'target_path': TARGET_PATH,
            'name'       : 'lgbm-model.pkl',
            'key'        : 'lgbm-model',
            'params'     : {
                'max_depth'        : 3,
                'learning_rate'    : 0.1,
                'n_estimators'     : 3,

                'reg_alpha'        : 0.,
                'reg_lambda'       : 0.,
                'random_state'     : 1,
                'tree_learner'     : 'data',
                'silent'           : False}},
        inputs={'dask_client': scheduler},
        outputs=['lgbm-model'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # XGBoost classifier trained on the Dask cluster.
    xgb_op = xgb_job.as_step(
        name='xgb',
        handler='clf_xgboost_dask',
        params={
            'train_set'  : ('xtrain', 'ytrain'),
            'valid_set'  : ('xvalid', 'yvalid'),
            'target_path': TARGET_PATH,
            'name'       : 'xgb-model.pkl',
            'key'        : 'xgb-model',
            'params'     : {
                'max_depth'          : 3,
                'num_boost_round'    : 3,
                'eta'                : 1,
                'objective'          : 'binary:logistic',
                'eval_metric'        : ['auc', 'ams@0'],
                'evals'              : [('xvalid', 'yvalid')],
                'silent'             : False,
                'verbose_eval'       : True}},
        inputs={'dask_client': scheduler},
        outputs=['xgb-model'],
        out_path=TARGET_PATH).apply(mlrun.mount_v3io())

    # The splitter and both trainers declare only the scheduler as an input,
    # so the data dependency alone would let all three start together. The
    # explicit .after() calls make each trainer wait for the splitter.
    # (Presumably the splitter publishes the 'xtrain'/'ytrain'/'xvalid'/
    # 'yvalid' sets the trainers refer to -- confirm.) The two trainers are
    # not ordered relative to each other.
    lgbm_op.after(split_op)
    xgb_op.after(split_op)

    # Deploy the serving function with the LightGBM model only; the XGBoost
    # model is trained but not served. Consuming the lgbm output makes the
    # deploy step depend on the lgbm step.
    srvfn.deploy_step(project='dask',
                      models={'dask_v1': lgbm_op.outputs['lgbm-model']})

In [8]:
# Debugging aid: compile the pipeline function to a YAML package so the
# generated DSL can be inspected.
compiler = kfp.compiler.Compiler()
compiler.compile(dask_pipeline, 'yaml/lgbm-dask-pipeline.yaml')



In [9]:
# Kubeflow Pipelines client created with the 'default-tenant' namespace;
# used below to submit the pipeline run.
client = kfp.Client(namespace='default-tenant')

In [10]:
# No pipeline arguments are supplied, so the defaults declared on
# dask_pipeline apply.
arguments = dict()

# Submit the pipeline function as a run under the 'dask' experiment.
run_result = client.create_run_from_pipeline_func(
    pipeline_func=dask_pipeline,
    arguments=arguments,
    experiment_name='dask',
)