### Install MLRun

In [1]:
# !pip install mlrun==0.5.4

In [2]:
# !pip install --upgrade v3io

### Load Config

In [3]:
import yaml
# Parse the demo settings once; later cells read keys such as
# config["project"]["name"] and config['docker']['s3_image'] from this dict.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

### Setup Project

In [4]:
import os
from os import path, getenv
import nuclio
from mlrun import new_project, code_to_function, run_local, NewTask, mlconf, import_function, mount_v3io, new_function

# Project identity and working directories (both resolved against the current directory).
project_name = config["project"]["name"]
project_path = path.abspath('project')
project = new_project(name=project_name, context=project_path)
artifact_path = path.abspath('pipeline')

# Keep an MLRun DB address that is already configured; otherwise use the fallback URL.
if not mlconf.dbpath:
    mlconf.dbpath = 'http://mlrun-api:8080'
mlconf.log_level = "DEBUG"

# Echo the resolved settings: one call, newline-separated, same four output lines.
print(
    f'Project name: {project_name}\nProject path: {project_path}',
    f'Artifacts path: {artifact_path}\nMLRun DB path: {mlconf.dbpath}',
    sep='\n',
)

Project name: dogs-vs-cats-demo
Project path: /User/igz_repos/igz-dogs-vs-cats-pipeline/project
Artifacts path: /User/igz_repos/igz-dogs-vs-cats-pipeline/pipeline
MLRun DB path: http://mlrun-api:8080


### Build Docker Image

In [5]:
# Fully-qualified name of the image in the cluster-local Docker registry
# ("docker-registry.<domain>:80/<image name from config>").
# Fail fast when the registry domain is unknown: os.getenv() returns None for a
# missing variable, which would silently produce the host "docker-registry.None:80".
registry_domain = os.getenv('IGZ_NAMESPACE_DOMAIN')
if not registry_domain:
    raise RuntimeError(
        "IGZ_NAMESPACE_DOMAIN is not set; cannot resolve the Docker registry host"
    )
image = f"docker-registry.{registry_domain}:80/{config['docker']['s3_image']}"

In [6]:
# # Build Docker Image (only needs to be run once)
# build_image = new_function(name="build-image", kind="job")
# build_image.build_config(
#     image=image, base_image="mlrun/mlrun", commands=["pip install boto3"]
# )
# build_image.deploy(with_mlrun=False)

### Import Functions

In [7]:
# Both notebooks are registered as `job`-kind functions that share the same image.
job_settings = {'kind': 'job', 'image': image}
project.set_function(func='DownloadS3.ipynb', name='download-s3', **job_settings)
project.set_function(func='PrepData.ipynb', name='prep-data', **job_settings)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fd1cc1da190>

### Pipeline

In [10]:
%%writefile {path.join(project_path, 'workflow.py')}

import os
from kfp import dsl
from mlrun import mount_v3io
import yaml

# This cell is written out as a standalone workflow.py, so it loads the settings itself.
# The relative path is resolved against the working directory of the loading process.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

# Alternative images kept for reference (currently commented out):
# image = f"docker-registry.{os.getenv('IGZ_NAMESPACE_DOMAIN')}:80/inference-benchmarking-demo"
# image = "mlrun/mlrun"

# Looked up by function name inside kfpipeline (funcs['download-s3']).
# NOTE(review): presumably filled in by MLRun before the pipeline is built - confirm.
funcs = {}

# Configure function resources and local settings
def init_functions(functions: dict, project=None, secrets=None):
    """Attach V3IO mounts to every function and AWS settings to the S3 ones.

    Args:
        functions: mapping of function name to function object. Every value
            receives the mounts; the entry named 'download-s3' must exist and
            additionally receives the AWS environment variables.
        project: accepted for interface compatibility; not used here.
        secrets: accepted for interface compatibility; not used here.

    Raises:
        KeyError: if 'download-s3' is missing from ``functions`` or a required
            key is missing from the module-level ``config``.
    """
    # (volume name / config section, key of the remote path, key of the mount path)
    extra_mounts = (
        ("csv", "s3_images_csv_remote_path", "s3_images_csv_mount_path"),
        ("data", "remote_download_path", "mount_download_path"),
    )
    for runtime in functions.values():
        # Default V3IO mount first, then the two named volumes from the config.
        runtime.apply(mount_v3io())
        for volume, remote_key, mount_key in extra_mounts:
            runtime.apply(mount_v3io(name=volume,
                                     remote=config[volume][remote_key],
                                     mount_path=config[volume][mount_key]))

    # Environment variable configuration for the functions that talk to S3.
    # NOTE(review): the credentials come from config.yaml and are set as plain
    # environment variables on the function; the `secrets` argument is unused.
    aws_env = (
        ('AWS_ACCESS_KEY_ID', 'aws_access_key_id'),
        ('AWS_SECRET_ACCESS_KEY', 'aws_secret_access_key'),
        ('AWS_DEFAULT_REGION', 'aws_default_region'),
    )
    for fn_name in ('download-s3',):
        for env_name, config_key in aws_env:
            functions[fn_name].set_env(env_name, config['aws'][config_key])

# Create a Kubeflow Pipelines pipeline
#
# Pipeline parameters (defaults are read from `config` when this file is loaded):
#   bucket_name        - config['aws']['bucket_name']
#   s3_images_csv      - "<csv mount path>/<csv file name>", both taken from config["csv"]
#   data_download_path - config['data']['mount_download_path']
#   download_data      - selects which of the two Condition blocks below applies
#   debug_logs         - forwarded as `verbose` to the download step
#
# Documentation is kept in comments (no docstring) so that the function object
# handed to @dsl.pipeline is unchanged.
@dsl.pipeline(
    name="Dogs vs Cats Pipeline",
    description="Kubeflow Pipeline Demo with PyTorch on Dogs vs Cats Dataset"
)
def kfpipeline(bucket_name:str = config['aws']['bucket_name'],
               s3_images_csv:str = f'{config["csv"]["s3_images_csv_mount_path"]}/{config["csv"]["s3_images_csv"]}',
               data_download_path:str = config['data']['mount_download_path'],
               download_data:bool=True,
               debug_logs:bool=True):    
    
    # Download branch: run the "handler" entry point of the 'download-s3' function
    # as a pipeline step, passing the three pipeline parameters as its inputs.
    # NOTE(review): the explicit `== True` is the expression given to dsl.Condition;
    # presumably download_data is a pipeline placeholder at build time rather than a
    # plain bool, so do not simplify it to a truthiness test - confirm.
    with dsl.Condition(download_data==True):
        inputs = {"bucket_name" : bucket_name,
                  "s3_images_csv" : s3_images_csv,
                  "data_download_path" : data_download_path}
        download_s3 = funcs['download-s3'].as_step(handler="handler",
                                                   inputs=inputs,
                                                   outputs=["s3_image_csv_local"],
                                                   verbose=debug_logs)
        # Output of the download step, addressed by the name declared in `outputs`.
        data_local = download_s3.outputs['s3_image_csv_local']
    
    # No-download branch: fall back to the CSV path given as a pipeline parameter.
    # NOTE(review): the body of a `with` block always executes, so this assignment
    # replaces the one made in the block above whenever this function runs, and
    # `data_local` is not read anywhere in the function as shown. Confirm the
    # intended wiring once a downstream step consumes it.
    with dsl.Condition(download_data==False):
        data_local = s3_images_csv

Overwriting /User/igz_repos/igz-dogs-vs-cats-pipeline/project/workflow.py


### Save Pipeline

In [11]:
# Register the workflow file under the name 'main'; the run cell refers to it by that name.
# NOTE(review): 'workflow.py' is a relative path - presumably resolved against the project
# context directory, which is where the %%writefile cell wrote the file - confirm.
project.set_workflow('main', 'workflow.py')
# Persist the project definition (the destination is not shown in this notebook).
project.save()

### Run Pipeline

In [12]:
# Launch the workflow registered as 'main' and keep its run id for the wait cell.
# Artifacts go into a per-run sub-directory of `artifact_path`, the root that the
# setup cell computed and printed, so the path shown there and the path used here
# cannot drift apart.
# '{{workflow.uid}}' is passed through literally at this point.
# NOTE(review): presumably it is substituted with the run's id downstream - confirm.
run_id = project.run(
    'main',
    arguments={},  # empty: no pipeline parameter values are supplied from here
    artifact_path=path.join(artifact_path, '{{workflow.uid}}'),
    dirty=True)  # NOTE(review): presumably allows running with uncommitted changes - confirm

> 2020-12-10 19:39:13,305 [info] using in-cluster config.


> 2020-12-10 19:39:13,512 [info] Pipeline run id=6432d5cc-c5af-48ac-a9ab-886efc98be42, check UI or DB for progress


In [13]:
# NOTE(review): get_run_db is imported here but not used in the cells shown.
from mlrun import wait_for_pipeline_completion, get_run_db
# Wait on the run started by the previous cell (identified by `run_id`).
# NOTE(review): presumably blocks until the pipeline finishes - confirm timeout behavior.
# The trailing ';' keeps the call's return value out of the cell output.
wait_for_pipeline_completion(run_id);