# LightGBM
Training a classifier with Microsoft's LightGBM on a Dask cluster.

In [1]:
import mlrun

# Set MLRun's configured `dbpath` to the HTTP endpoint below.
# NOTE(review): the host name is hard-coded — presumably the in-cluster MLRun
# API service; confirm it matches your deployment before running.
mlrun.mlconf.dbpath = 'http://mlrun-api:8080'

## parameters


In [2]:
# Free-text description of what this notebook's job does.
DESCRIPTION = 'train a LightGBM classifier on a Dask cluster'

# Container image (a custom one) and runtime kind used when the function is created.
IMAGE = 'yjbds/mlrun-dask:dev'
JOB_KIND = 'job'

# Name given to the task; any identifier will do.
TASK_NAME = 'user-task-train-lgbm-dask'

# Handed to the handler as its `target_path` parameter.
ARTIFACTS_PATH = '/User/repos/demos/dask/artifacts'

# Handed to the handler as its `dask_client` parameter.
DASK_CLIENT = 'scheduler.json'

# Names under which the datasets are published in the cluster.
# NOTE(review): presumably (features, labels) pairs — confirm against the handler.
TRAIN_SET = ('xtrain', 'ytrain')
VALID_SET = ('xvalid', 'yvalid')

## load and configure function

In [3]:
# Python source the function is created from, and the YAML file the
# function spec is exported to.
FUNC_PY = '/User/repos/demos/dask/code/clf_lgbm_dask.py'
FUNC_YAML = '/User/repos/demos/dask/yaml/clf_lgbm_dask.yaml'

# Handler name passed to the task.
HANDLER = 'clf_lgbm_dask'

**On the first run, create the function and export it as YAML; on later runs, import it from the exported YAML file instead:**

In [4]:
# Create the function object from the local Python source file.
clf_lgbm = mlrun.new_function(command=FUNC_PY, image=IMAGE, kind=JOB_KIND)

# The build spec's image is set to the same image that was passed above.
clf_lgbm.spec.build.image = IMAGE

# Write the function spec to YAML. Once the file exists, the function can be
# loaded from it instead of being re-created:
#     clf_lgbm = mlrun.import_function(FUNC_YAML)
clf_lgbm.export(FUNC_YAML)

[mlrun] 2020-02-18 09:42:41,279 function spec saved to path: /User/repos/demos/dask/yaml/clf_lgbm_dask.yaml


In [None]:
# Apply the v3io mount modifier to the function, then deploy it.
# NOTE(review): skip_deployed=True presumably avoids redeploying an already
# deployed function, and with_mlrun=False presumably leaves mlrun out of the
# build — confirm against the MLRun version in use.
clf_lgbm.apply(mlrun.mount_v3io())
clf_lgbm.deploy(skip_deployed=True, with_mlrun=False)

# Describe the training task: which handler to call and the parameters it gets.
train_clf_lgbm_task = mlrun.NewTask(
    TASK_NAME,
    handler=HANDLER,
    params={
        # cluster connection and published dataset names
        'dask_client': DASK_CLIENT,
        'train_set': TRAIN_SET,
        'valid_set': VALID_SET,
        # output location and naming of the model
        'target_path': ARTIFACTS_PATH,
        'name': 'lgbm-model.pkl',
        'key': 'lgbm-model',
        # nested hyperparameter dict handed to the handler
        'params': {
            'max_depth': 3,
            'learning_rate': 0.1,
            'n_estimators': 3,
            'reg_alpha': 0.,
            'reg_lambda': 0.,
            'random_state': 1,
            'tree_learner': 'data',
            'silent': False,
            # optional settings, currently disabled:
            #   'categorical_feature': ['UniqueCarrier', 'Origin', 'Dest'],
            #   'is_pre_partition': True,
        },
    },
)

# Run the task on the function and keep the returned run object.
rn = clf_lgbm.run(train_clf_lgbm_task)

## tests

* artifact exists
* ...
* model can be loaded
* model has correct properties
* model can predict