In [None]:
from hkube_notebook import AlgorithmBuilder, PipelineBuilder, PipelineExecutor, TrackerType
import hkube_notebook
print(f'hkube_notebook {hkube_notebook.__version__}')
import time
# Base URL of the HKube api-server REST API; change it to match your deployment.
api_server = 'http://localhost:3000/api/v1'
# Alternative endpoints kept for reference (uncomment the one that applies):
#api_server = 'http://localhost:3035/api/v1'
#api_server = 'https://10.32.10.19/hkube/api-server/api/v1'

# Create an algorithm from a function (all imports and helper functions must be nested inside it)

In [None]:
# Algorithm builder bound to the configured api-server; later cells reuse this instance.
alg_bldr = AlgorithmBuilder(api_server_base_url=api_server)
# Fetch the algorithms currently known to the server
# (presumably get_all() also displays them — TODO confirm).
algs = alg_bldr.get_all()

In [None]:
def my_start(args):
    """Example algorithm entry point: sort a sequence in the requested order.

    Every import is nested inside the function because this notebook requires
    functions that are packaged as algorithms to be self-contained.

    Args:
        args: dict whose "input" entry is a two-element sequence
            ``[values, order]``; ``values`` is an iterable of mutually
            comparable items and ``order`` is ``'asc'`` or ``'desc'``.

    Returns:
        A new list with the items of ``values`` sorted ascending (``'asc'``)
        or descending (``'desc'``). The caller's sequence is left untouched.

    Raises:
        ValueError: if ``order`` is neither ``'asc'`` nor ``'desc'``.
    """
    import numpy as np
    import pandas as pd
    import time

    # Throwaway frame; it appears to exist only to show that third-party
    # packages can be used inside the packaged algorithm.
    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n), 'y': np.random.normal(size=n)})
    print(df.columns)

    # Named alg_input (not `input`) so the builtin is not shadowed.
    alg_input = args["input"]
    print(f'algorithm: start, input: {alg_input}')

    values = alg_input[0]
    order = alg_input[1]
    # Validate before the simulated work so a bad request fails immediately.
    if order not in ('asc', 'desc'):
        raise ValueError('order not supported')

    print('working...')
    time.sleep(5)  # simulated workload

    # sorted() accepts any iterable (lists, tuples, ...) and does not mutate
    # the caller's data, unlike list.sort().
    return sorted(values, reverse=(order == 'desc'))

# Package my_start into an algorithm archive; the call returns the entry-point
# name and the archive file name, both reused by later cells.
entry, tarfilename = alg_bldr.create_algfile_by_functions(my_start)
# Build the algorithm configuration: algorithm name, entry point and version.
config = alg_bldr.create_config('testfunc-alg', entry, version='1.0.1')
# Submit archive + configuration to the api-server
# (presumably blocks until the build finishes, in contrast to apply_async — TODO confirm).
alg_bldr.apply(compressed_alg_file=tarfilename, config=config)
# Re-read the algorithm list so the new algorithm can be checked.
algs = alg_bldr.get_all()

In [None]:
# create pipeline and store it
# The pipeline has a single node that runs 'testfunc-alg' (the algorithm name
# registered via create_config in this notebook).
fBuilder = PipelineBuilder(name='testfunc_pipe', api_server_base_url=api_server)
# Node input follows my_start's contract: [list to sort, order].
fBuilder.add_node(node_name='testfunc_node', alg_name='testfunc-alg', input=[[23, 4, 12, 18, 7, 13, 40, 20], "desc"])
fBuilder.store()

In [None]:
# execute stored pipeline
# The executor looks the pipeline up by the same name it was stored under ('testfunc_pipe').
fExec = PipelineExecutor(name='testfunc_pipe', api_server_base_url=api_server)
# Keep the return value of exec() in `results` for inspection.
results = fExec.exec()

In [None]:
# delete stored pipeline
# Uses the builder created earlier in this notebook, i.e. the 'testfunc_pipe' pipeline.
fBuilder.delete()

In [None]:
# build async
# Register the archive built earlier (entry / tarfilename) under a second
# algorithm name, using apply_async instead of apply.
config = alg_bldr.create_config('other-alg', entry, version='1.0.2')
state = alg_bldr.apply_async(compressed_alg_file=tarfilename, config=config)
# Only query the build state when the response carries a build id.
# Membership is tested on the mapping itself; `.keys()` is redundant.
if 'buildId' in state:
    alg_bldr.get_build_state(state['buildId'])

# Create Titanic algorithms (from the ds-alg-example project)

In [None]:
# Create the algorithm tar.gz file from a GitHub project.
# Uses alg_bldr, the AlgorithmBuilder created earlier in this notebook; the
# previously used name `alg_mgr` is not defined anywhere and raised NameError.
# NOTE(review): the second argument ('algorithm') is presumably the folder
# inside the repository that holds the algorithm code — confirm.
tarfilename = alg_bldr.create_algfile_by_github('git@github.com:kube-HPC/ds-alg-example.git', 'algorithm')
# Alternatively, create the algorithm tar.gz file from a local project folder:
#tarfilename = alg_bldr.create_algfile_by_folder('my/project/folder')
# Show the resulting archive file name as the cell output.
tarfilename

In [None]:
# create titanicpp-alg
# NOTE: machine-specific absolute path — point it at your local checkout of
# ds-alg-example/algorithm. The following Titanic cells reuse `folder`.
folder = '/home/amiryi/dev/hkube/ds-alg-example/algorithm'
# Uses alg_bldr (the AlgorithmBuilder created earlier); `alg_mgr` was never defined.
tarfilename = alg_bldr.create_algfile_by_folder(folder)
# The entry point is the preprocessing script inside the project folder.
config = alg_bldr.create_config('titanicpp-alg', 'preprocess_entry.py')
alg_bldr.apply(compressed_alg_file=tarfilename, config=config)
algs = alg_bldr.get_all()

In [None]:
# create titanicsplit-alg
# Uses alg_bldr (the AlgorithmBuilder created earlier); `alg_mgr` was never defined.
# `folder` is the project folder set in the titanicpp-alg cell.
tarfilename = alg_bldr.create_algfile_by_folder(folder)
config = alg_bldr.create_config('titanicsplit-alg', 'split_entry.py')
alg_bldr.apply(compressed_alg_file=tarfilename, config=config)
algs = alg_bldr.get_all()

In [None]:
# create titanicparams-alg
# Uses alg_bldr (the AlgorithmBuilder created earlier); `alg_mgr` was never defined.
# `folder` is the project folder set in the titanicpp-alg cell.
tarfilename = alg_bldr.create_algfile_by_folder(folder)
config = alg_bldr.create_config('titanicparams-alg', 'params_entry.py')
alg_bldr.apply(compressed_alg_file=tarfilename, config=config)
algs = alg_bldr.get_all()

In [None]:
# create titanicrf-alg
# Uses alg_bldr (the AlgorithmBuilder created earlier); `alg_mgr` was never defined.
# `folder` is the project folder set in the titanicpp-alg cell.
tarfilename = alg_bldr.create_algfile_by_folder(folder)
config = alg_bldr.create_config('titanicrf-alg', 'randomforest_entry.py')
alg_bldr.apply(compressed_alg_file=tarfilename, config=config)
algs = alg_bldr.get_all()

In [None]:
# create titanicbestmodel-alg
# Uses alg_bldr (the AlgorithmBuilder created earlier); `alg_mgr` was never defined.
# `folder` is the project folder set in the titanicpp-alg cell.
tarfilename = alg_bldr.create_algfile_by_folder(folder)
config = alg_bldr.create_config('titanicbestmodel-alg', 'bestmodel_entry.py')
alg_bldr.apply(compressed_alg_file=tarfilename, config=config)
algs = alg_bldr.get_all()
# NOTE: make sure all algorithms are included and docker images were created for them!

# Create Titanic Train Pipeline

In [None]:
# Define the Titanic training pipeline: five nodes, each running one of the
# Titanic algorithms created in the cells above.
tBuilder = PipelineBuilder(name='titanic-train2', api_server_base_url=api_server)
# preprocess: takes 'df_key' from the pipeline's flow input
# (the execute cell passes input={'df_key': 'train.csv'}).
tBuilder.add_node(node_name='preprocess', alg_name='titanicpp-alg', input=["@flowInput.df_key"])
# split: consumes the preprocess node's df_key with a 0.25 test size.
tBuilder.add_node(node_name='split', alg_name='titanicsplit-alg', 
                  input=[{ "df_key": "@preprocess.df_key", "test_size": 0.25 }])
# model-params: fixed params plus one parameter name with a list of values
# (presumably expanded into one combination per value — TODO confirm).
tBuilder.add_node(node_name='model-params', alg_name='titanicparams-alg', 
                  input=[{
                      "param_and_range": ["min_samples_split", [2, 3, 10]],
                      "params": {
                          "n_estimators": 10,
                          "max_depth": 3
                      }
                  }])
# random-forest: consumes the split node's train/test outputs and the
# model-params output. NOTE(review): the '#@' prefix presumably makes the node
# run as a batch over the model-params result — confirm against HKube docs.
tBuilder.add_node(node_name='random-forest', alg_name='titanicrf-alg', 
                  input=[{
                      "params_combinations": "#@model-params",
                      "x_train": "@split.x_train",
                      "x_test": "@split.x_test",
                      "y_train": "@split.y_train",
                      "y_test": "@split.y_test"
                      }
                  ])
# best-model: receives the preprocess df_key and the random-forest output.
tBuilder.add_node(node_name='best-model', alg_name='titanicbestmodel-alg', 
                  input=[{
                      "df_key": "@preprocess.df_key",
                      "models_results": "@random-forest"
                  }])
# NOTE(review): this is not the cell's last expression, so the returned raw
# definition is discarded here; it only has an effect if get_raw() itself prints.
tBuilder.get_raw()
#time.sleep(1)
# Store the pipeline definition under the name 'titanic-train2'.
tBuilder.store()

# Execute Titanic Train Pipeline

In [None]:
# Execute the pipeline from its raw definition (taken from tBuilder) rather
# than by stored name.
tRawExec = PipelineExecutor(raw=tBuilder.get_raw(), api_server_base_url=api_server)
# 'df_key' is the value the 'preprocess' node reads via "@flowInput.df_key".
results = tRawExec.exec(input={'df_key': 'train.csv'})