## Define pipeline to train a model

## Create entry point using fairing

In [1]:
import os
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire

In [2]:
# Turn the Issues_Loader notebook into a script entry point (the cell output
# shows Issues_Loader.py being produced) and list the helper modules that
# must be packaged next to it.
preprocessor = ConvertNotebookPreprocessorWithFire('IssuesLoader', notebook_file='Issues_Loader.ipynb')

# input_files is replaced wholesale here, so no prior "is it empty?" guard is
# needed before the assignment.
input_files = ['embeddings.py', 'inference.py', 'config.py']
preprocessor.input_files = {os.path.normpath(path) for path in input_files}

# Last expression of the cell: displays the list returned by preprocess().
preprocessor.preprocess()

[PosixPath('Issues_Loader.py'), 'embeddings.py', 'config.py', 'inference.py']

In [3]:
# Turn the Repo_MLP notebook into a script entry point (the cell output shows
# Repo_MLP.py being produced) and list the helper modules it ships with.
preprocessor = ConvertNotebookPreprocessorWithFire('RepoMLP', notebook_file='Repo_MLP.ipynb')

# input_files is replaced wholesale here, so no prior "is it empty?" guard is
# needed before the assignment.
input_files = ['mlp.py', 'config.py']
preprocessor.input_files = {os.path.normpath(path) for path in input_files}

# Last expression of the cell: displays the list returned by preprocess().
preprocessor.preprocess()

[PosixPath('Repo_MLP.py'), 'config.py', 'mlp.py']

## Use Fairing to build docker image

In [4]:
import os
import sys
import fairing
from fairing.builders import append
from fairing.builders import cluster
from fairing.deployers import job
from fairing.preprocessors.converted_notebook import ConvertNotebookPreprocessorWithFire

In [5]:
# Google Container Registry (GCR) location for the images built below.
# Any Docker container registry can be used instead of GCR.
GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
print(GCP_PROJECT)
DOCKER_REGISTRY = f'gcr.io/{GCP_PROJECT}/training'
print(DOCKER_REGISTRY)

# Stock Python image tag matching the interpreter that runs this notebook.
# NOTE(review): BASE_IMAGE is not referenced by the cells visible here; the
# builder below is given `base_image` instead.
PY_VERSION = '.'.join(map(str, sys.version_info[:3]))
BASE_IMAGE = f'python:{PY_VERSION}'

# You can use the Dockerfile in this repo to build and use this base image.
base_image = 'gcr.io/issue-label-bot-dev/ml-gpu-lite-py3.6'

issue-label-bot-dev
gcr.io/issue-label-bot-dev/training


### Build Docker image

In [6]:
# Re-create the RepoMLP preprocessor with every module used by either pipeline
# step, since both steps run from the same image. Issues_Loader.py is the
# script produced by the earlier Issues_Loader conversion cell, so that cell
# must have been run first.
preprocessor = ConvertNotebookPreprocessorWithFire('RepoMLP', notebook_file='Repo_MLP.ipynb')

# Each file is listed once; input_files is replaced wholesale, so no prior
# "is it empty?" guard is needed before the assignment.
input_files = ['mlp.py', 'config.py', 'embeddings.py', 'inference.py', 'Issues_Loader.py']
preprocessor.input_files = {os.path.normpath(path) for path in input_files}

# Last expression of the cell: displays the list returned by preprocess().
preprocessor.preprocess()

[PosixPath('Repo_MLP.py'),
 'Issues_Loader.py',
 'embeddings.py',
 'config.py',
 'mlp.py',
 'inference.py']

In [7]:
# Build the image in-cluster on top of `base_image`; the build context is
# handed over through a GCS context source.
# NOTE(review): the namespace is hard-coded to one user's namespace — change
# it to your own before running.
cluster_builder = cluster.cluster.ClusterBuilder(
    registry=DOCKER_REGISTRY,
    base_image=base_image,
    namespace='chunhsiang',
    preprocessor=preprocessor,
    # Mutator named add_gcp_credentials_if_exists — presumably attaches GCP
    # credentials to the builder pod when they are available; confirm.
    pod_spec_mutators=[fairing.cloud.gcp.add_gcp_credentials_if_exists],
    context_source=cluster.gcs_context.GCSContextSource(),
)
cluster_builder.build()

Building image using cluster builder.
Creating docker context: /tmp/fairing_context_jxyoa4um
Waiting for fairing-builder-h9ztc to start...
Waiting for fairing-builder-h9ztc to start...
Waiting for fairing-builder-h9ztc to start...
Waiting for fairing-builder-h9ztc to start...
Pod started running True


[36mINFO[0m[0006] Downloading base image gcr.io/issue-label-bot-dev/ml-gpu-lite-py3.6
[36mINFO[0m[0006] Downloading base image gcr.io/issue-label-bot-dev/ml-gpu-lite-py3.6
[33mWARN[0m[0006] Error while retrieving image from cache: getting image from path: open /cache/sha256:007490fe99543d64363755e69ed47047c45406ae163a30fab2a7a55ec3710ceb: no such file or directory
[36mINFO[0m[0007] Checking for cached layer gcr.io/issue-label-bot-dev/training/fairing-job/cache:5abd94715d7d7183ae347a324cf72f4902c76c64083014668bdc256c7df55814...
[36mINFO[0m[0007] No cached layer found for cmd RUN if [ -e requirements.txt ];then pip install --no-cache -r requirements.txt; fi
[36mINFO[0m[0007] Unpacking rootfs as cmd RUN if [ -e requirements.txt ];then pip install --no-cache -r requirements.txt; fi requires it.
[36mINFO[0m[0170] Taking snapshot of full filesystem...
[36mINFO[0m[0185] Skipping paths under /dev, as it is a whitelisted directory
[36mINFO[0m[0185] Skipping paths under /etc/se

In [8]:
# Second build stage: start from the image tag produced by the cluster builder
# and run the Append builder with the same preprocessor, so code-only changes
# do not need a full in-cluster rebuild (the log below reports sub-second
# build time).
builder = append.append.AppendBuilder(
    registry=DOCKER_REGISTRY,
    base_image=cluster_builder.image_tag,
    preprocessor=preprocessor,
)
builder.build()

Building image using Append builder...
Creating docker context: /tmp/fairing_context_mfh2fxiq
Repo_MLP.py already exists in Fairing context, skipping...
Loading Docker credentials for repository 'gcr.io/issue-label-bot-dev/training/fairing-job:36752B5C'
Invoking 'docker-credential-gcloud' to obtain Docker credentials.
Successfully obtained Docker credentials.
Image successfully built in 0.93903358199168s.
Pushing image gcr.io/issue-label-bot-dev/training/fairing-job:9530A4D2...
Loading Docker credentials for repository 'gcr.io/issue-label-bot-dev/training/fairing-job:9530A4D2'
Invoking 'docker-credential-gcloud' to obtain Docker credentials.
Successfully obtained Docker credentials.
Uploading gcr.io/issue-label-bot-dev/training/fairing-job:9530A4D2
Layer sha256:da348912572d5e12d8c7e54b6a26417c5a2a168238c3a1c961be1ae2363eb2b8 exists, skipping
Layer sha256:c011ade78a64c3bef40c7a4cd87d4274a4808f7826c0b70f6679205931593cc1 exists, skipping
Layer sha256:5b6ac7f35d3dce1cd93b9740c7342fe317b179

## Build pipeline

In [9]:
import kfp
import kfp.components as comp
import kfp.gcp as gcp
import kfp.dsl as dsl
import kfp.compiler as compiler

In [10]:
# Hard-coded image tags. `updated_image` matches the tag pushed by the Append
# builder run shown above; `non_updated_image` is an older tag kept for
# comparison.
# NOTE(review): these must be edited by hand after every rebuild.
updated_image = 'gcr.io/issue-label-bot-dev/training/fairing-job:9530A4D2'
non_updated_image = 'gcr.io/issue-label-bot-dev/training/fairing-job:EFD117EE'

# Image used by every step of the pipeline defined below.
target_image = updated_image

In [12]:
def _fairing_step(name, script, entry_point, owner, repo):
    """Create one pipeline step that runs `python <script> <entry_point>` in `target_image`.

    Args:
        name: Display name of the step.
        script: Python file to run, relative to the container working dir.
        entry_point: Sub-command passed to the script as its first argument.
        owner: GitHub owner, forwarded as `--owner=<owner>`.
        repo: GitHub repository name, forwarded as `--repo=<repo>`.

    Returns:
        The configured `dsl.ContainerOp`.
    """
    step = dsl.ContainerOp(
        name=name,
        image=target_image,
        command=['python', script, entry_point, f'--owner={owner}', f'--repo={repo}'],
    ).apply(
        # Applies the 'user-gcp-sa' secret to the step — presumably so the
        # container can authenticate to GCP; confirm the secret exists.
        gcp.use_gcp_secret('user-gcp-sa'),
    )
    # The scripts are invoked by bare file name, so the step must start in the
    # directory that holds them (assumed to be /app in the built image).
    step.container.working_dir = '/app'
    return step


@dsl.pipeline(
   name='Training pipeline',
   description='A pipeline that loads embeddings and trains a model for a github repo.'
)
def train_pipeline(owner, repo):
    """Scrape issue embeddings for a repo, then train the repo model.

    Args:
        owner: GitHub owner (user or organisation) of the repository.
        repo: Name of the repository.
    """
    scrape_op = _fairing_step('scrape issues', 'Issues_Loader.py', 'save_issue_embeddings', owner, repo)
    train_op = _fairing_step('train', 'Repo_MLP.py', 'train', owner, repo)

    # Training only starts once the scrape step has finished.
    train_op.after(scrape_op)

### Compile the pipeline

In [13]:
# Compile the pipeline function into an archive named after the function
# (train_pipeline.pipeline.zip); the submission cell below uploads this file.
pipeline_func = train_pipeline
pipeline_filename = f'{pipeline_func.__name__}.pipeline.zip'
compiler.Compiler().compile(pipeline_func, pipeline_filename)

### Submit the pipeline for execution

In [14]:
# Name of the experiment under which the run below is submitted.
EXPERIMENT_NAME = 'MockupModel'

# Client is constructed with no arguments — presumably it relies on default
# (in-cluster) connection settings; TODO confirm for your environment.
client = kfp.Client()
# `experiment.id` is used by the run submission cell below.
experiment = client.create_experiment(EXPERIMENT_NAME)

In [17]:
# Values for the pipeline's `owner` and `repo` parameters.
arguments = dict(owner='kubeflow', repo='examples')

# Submit a run of the compiled pipeline package under the experiment created
# above; the run is named after the pipeline function.
run_name = f'{pipeline_func.__name__} run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)