In [88]:
# Install Pipeline SDK
!pip3 install pip3 install https://storage.googleapis.com/ml-pipeline/release/0.1.3-rc.3/kfp.tar.gz --upgrade
!pip3 install kubernetes

Processing ./kfps.tar.gz
Building wheels for collected packages: kfp
  Running setup.py bdist_wheel for kfp ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-be3a0m_j/wheels/e3/e4/29/e93c75113ecb9f98ce09678b96d45e6b9e629197cda795aead
Successfully built kfp
Installing collected packages: kfp
  Found existing installation: kfp 0.1
    Uninstalling kfp-0.1:
      Successfully uninstalled kfp-0.1
Successfully installed kfp-0.1


In [113]:
import uuid
from typing import Dict

import kfp
import kfp.dsl as dsl
import kfp.notebook
from kfp import compiler
from kubernetes import client as k8s_client

In [170]:
def default_gcp_op(name: str, image: str, command: str = None,
           arguments: str = None, file_inputs: Dict[dsl.PipelineParam, str] = None,
           file_outputs: Dict[str, str] = None, is_exit_handler=False):
  """Create a ContainerOp with the default GCP service account attached.

  All arguments are forwarded positionally, in order, to dsl.ContainerOp.
  The resulting op then gets:

    * a volume named 'gcp-credentials' backed by the 'user-gcp-sa' secret,
    * a mount of that volume at /secret/gcp-credentials,
    * GOOGLE_APPLICATION_CREDENTIALS pointing at
      /secret/gcp-credentials/user-gcp-sa.json.

  The user-gcp-sa secret is created as part of the kubeflow deployment and
  stores the access token of the kubeflow user service account, which is
  itself created automatically by that deployment. With it, the container
  can call a range of GCP APIs; the list is at
  https://github.com/kubeflow/kubeflow/blob/7b0db0d92d65c0746ac52b000cbc290dac7c62b1/deployment/gke/deployment_manager_configs/iam_bindings_template.yaml#L18

  To call GCP APIs in a different project, grant the kf-user service account
  access permission in that project.

  Returns:
    Whatever the final add_env_variable call returns (the configured op).
  """
  op = dsl.ContainerOp(
    name, image, command, arguments, file_inputs, file_outputs,
    is_exit_handler)

  # Expose the service-account secret to the pod as a volume.
  secret_source = k8s_client.V1SecretVolumeSource(secret_name='user-gcp-sa')
  credentials_volume = k8s_client.V1Volume(
    name='gcp-credentials', secret=secret_source)
  op = op.add_volume(credentials_volume)

  # Mount that volume inside the container.
  credentials_mount = k8s_client.V1VolumeMount(
    mount_path='/secret/gcp-credentials', name='gcp-credentials')
  op = op.add_volume_mount(credentials_mount)

  # Point the credentials environment variable at the mounted key file.
  credentials_env = k8s_client.V1EnvVar(
    name='GOOGLE_APPLICATION_CREDENTIALS',
    value='/secret/gcp-credentials/user-gcp-sa.json')
  return op.add_env_variable(credentials_env)


In [214]:
# NOTE: run this notebook from JupyterHub inside the same cluster as the
# pipeline system; otherwise the client will fail to talk to the pipeline system.
# The client is constructed with no arguments.
client = kfp.Client()
# Create the experiment; `exp` is kept so its id can be used when submitting a run.
exp = client.create_experiment(name='code-search-21')

In [227]:

def dataflow_function_embedding_op(
        cluster_name: str,
        function_embeddings_bq_table: str,
        function_embeddings_dir: str,
        namespace: str,
        num_workers: int,
        project: 'GcpProject',
        saved_model_dir: 'GcsUri',
        worker_machine_type: str,
        workflow_id: str,
        working_dir: str,
        data_dir: str = 'gs://code-search-demo/20181104/data'):
  """Build the pipeline step that submits the code-embeddings job.

  The step runs /usr/local/src/submit_code_embeddings_job.sh in the
  code-search ks image and is created through default_gcp_op. Every argument
  is rendered with %s formatting into one `--flag=value` string.

  Args:
    cluster_name: Value for --cluster.
    function_embeddings_bq_table: Value for --functionEmbeddingsBQTable.
    function_embeddings_dir: Value for --functionEmbeddingsDir.
    namespace: Value for --namespace.
    num_workers: Value for --numWorkers.
    project: Value for --project.
    saved_model_dir: Value for --modelDir.
    worker_machine_type: Value for --workerMachineType.
    workflow_id: Value for --workflowId.
    working_dir: Value for --workingDir.
    data_dir: Value for --dataDir. Defaults to the location that used to be
      hard-coded, so existing positional callers see no change.

  Returns:
    The op returned by default_gcp_op.
  """
  return default_gcp_op(
    name='dataflow_function_embedding',
    image='gcr.io/kubeflow-examples/code-search/ks:v20181204-ee47a49-dirty-fa8aa3',
    command=['/usr/local/src/submit_code_embeddings_job.sh'],
    arguments=[
      "--cluster=%s" % cluster_name,
      "--dataDir=%s" % data_dir,
      "--functionEmbeddingsDir=%s" % function_embeddings_dir,
      "--functionEmbeddingsBQTable=%s" % function_embeddings_bq_table,
      "--modelDir=%s" % saved_model_dir,
      "--namespace=%s" % namespace,
      "--numWorkers=%s" % num_workers,
      "--project=%s" % project,
      "--workerMachineType=%s" % worker_machine_type,
      "--workflowId=%s" % workflow_id,
      "--workingDir=%s" % working_dir,
    ]
  )


In [228]:

def search_index_creator_op(
        cluster_name: str,
        function_embeddings_dir: str,
        index_file: str,
        lookup_file: str,
        namespace: str,
        workflow_id: str):
  """Build the pipeline step that launches the search index creator job.

  The step is a plain dsl.ContainerOp running
  /usr/local/src/launch_search_index_creator_job.sh in the code-search ks
  image; no volumes or environment variables are attached here.

  Args:
    cluster_name: Value for --cluster.
    function_embeddings_dir: Value for --functionEmbeddingsDir.
    index_file: Value for --indexFile.
    lookup_file: Value for --lookupFile.
    namespace: Value for --namespace.
    workflow_id: Value for --workflowId.

  Returns:
    The constructed dsl.ContainerOp.
  """
  launcher = ['/usr/local/src/launch_search_index_creator_job.sh']
  launcher_flags = [
    '--cluster=%s' % cluster_name,
    '--functionEmbeddingsDir=%s' % function_embeddings_dir,
    '--indexFile=%s' % index_file,
    '--lookupFile=%s' % lookup_file,
    '--namespace=%s' % namespace,
    '--workflowId=%s' % workflow_id,
  ]
  # The step name doubles as the component name.
  return dsl.ContainerOp(
    name='search_index_creator',
    image='gcr.io/kubeflow-examples/code-search/ks:v20181204-ee47a49-dirty-fa8aa3',
    command=launcher,
    arguments=launcher_flags,
  )


In [229]:
def update_index_op(
        app_dir: str,
        base_branch: str,
        base_git_repo: str,
        bot_email: str,
        fork_git_repo: str,
        index_file: str,
        lookup_file: str,
        workflow_id: str):
  """Build the pipeline step that runs /usr/local/src/update_index.sh.

  Besides the command-line flags, the op receives a volume backed by the
  'github-access-token' secret and a GITHUB_TOKEN environment variable read
  from the 'token' key of that same secret.

  NOTE(review): the volume is added but no volume mount is attached in this
  function; the token reaches the container through the env var only.

  Args:
    app_dir: Value for --appDir.
    base_branch: Value for --baseBranch.
    base_git_repo: Value for --baseGitRepo.
    bot_email: Value for --botEmail.
    fork_git_repo: Value for --forkGitRepo.
    index_file: Value for --indexFile.
    lookup_file: Value for --lookupFile.
    workflow_id: Value for --workflowId.

  Returns:
    Whatever the final add_env_variable call returns (the configured op).
  """
  op = dsl.ContainerOp(
    name='update_index',
    image='gcr.io/kubeflow-examples/code-search/ks:v20181204-ee47a49-dirty-fa8aa3',
    command=['/usr/local/src/update_index.sh'],
    arguments=[
      '--appDir=%s' % app_dir,
      '--baseBranch=%s' % base_branch,
      '--baseGitRepo=%s' % base_git_repo,
      '--botEmail=%s' % bot_email,
      '--forkGitRepo=%s' % fork_git_repo,
      '--indexFile=%s' % index_file,
      '--lookupFile=%s' % lookup_file,
      '--workflowId=%s' % workflow_id,
    ],
  )

  # Volume backed by the GitHub token secret.
  token_secret = k8s_client.V1SecretVolumeSource(
    secret_name='github-access-token')
  token_volume = k8s_client.V1Volume(
    name='github-access-token', secret=token_secret)
  op = op.add_volume(token_volume)

  # GITHUB_TOKEN is sourced from the secret's 'token' key.
  token_key = k8s_client.V1SecretKeySelector(
    name='github-access-token', key='token')
  token_env = k8s_client.V1EnvVar(
    name='GITHUB_TOKEN',
    value_from=k8s_client.V1EnvVarSource(secret_key_ref=token_key))
  return op.add_env_variable(token_env)


In [230]:
@dsl.pipeline(
  name='function_embedding',
  description='Example function embedding pipeline'
)
def function_embedding_update(
    project='code-search-demo',
    cluster_name='cs-demo-1103',
    namespace='kubeflow',
    working_dir='gs://code-search-demo/pipeline',
    saved_model_dir='gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/',
    target_dataset='code_search',
    worker_machine_type='n1-highcpu-32',
    num_workers=5,
    base_git_repo='kubeflow/examples',
    base_branch='master',
    app_dir='code_search/ks-web-app',
    fork_git_repo='IronPan/examples',
    bot_email='kf.sample.bot@gmail.com'):
  """Wire the three steps of the function-embedding update pipeline.

  Steps, in order:
    1. dataflow_function_embedding_op
    2. search_index_creator_op (after step 1)
    3. update_index_op (after step 2)

  All output paths are derived from `working_dir` plus the workflow name
  placeholder; the BigQuery table name is derived from `project`,
  `target_dataset` and a random 6-character suffix.

  Args:
    project: Forwarded as the embedding step's project; also the first
      component of the BigQuery table name.
    cluster_name: Forwarded to the embedding and index-creator steps.
    namespace: Forwarded to the embedding and index-creator steps.
    working_dir: Base path under which this workflow's outputs are placed.
    saved_model_dir: Forwarded to the embedding step.
    target_dataset: Dataset component of the BigQuery table name.
    worker_machine_type: Forwarded to the embedding step.
    num_workers: Forwarded to the embedding step.
    base_git_repo: Forwarded to the update-index step.
    base_branch: Forwarded to the update-index step.
    app_dir: Forwarded to the update-index step.
    fork_git_repo: Forwarded to the update-index step.
    bot_email: Forwarded to the update-index step.
  """
  # Passed through verbatim as the workflow id; per the note below, the actual
  # workflow name is only assigned at runtime.
  workflow_name = '{{workflow.name}}'
  # Can't use workflow name as bq_suffix since BQ table doesn't accept '-' and
  # workflow name is assigned at runtime. Pipeline might need to support
  # replacing characters in workflow name.
  # NOTE(review): the suffix is drawn when this function body executes --
  # presumably at compile time, so every run of one compiled package would
  # share it. Confirm.
  bq_suffix = uuid.uuid4().hex[:6].upper()
  working_dir = '%s/%s' % (working_dir, workflow_name)
  lookup_file = '%s/code-embeddings-index/embedding-to-info.csv' % working_dir
  index_file = '%s/code-embeddings-index/embeddings.index' % working_dir
  function_embeddings_dir = '%s/%s' % (working_dir, "code_embeddings")
  function_embeddings_bq_table = (
    '%s:%s.function_embeddings_%s' % (project, target_dataset, bq_suffix))

  function_embedding = dataflow_function_embedding_op(
    cluster_name,
    function_embeddings_bq_table,
    function_embeddings_dir,
    namespace,
    num_workers,
    project,
    saved_model_dir,
    worker_machine_type,
    workflow_name,
    working_dir)

  search_index_creator = search_index_creator_op(
    cluster_name,
    function_embeddings_dir,
    index_file,
    lookup_file,
    namespace,
    workflow_name)
  search_index_creator.after(function_embedding)

  update_index_op(
    app_dir,
    base_branch,
    base_git_repo,
    bot_email,
    fork_git_repo,
    index_file,
    lookup_file,
    workflow_name).after(search_index_creator)



In [231]:
# Package the pipeline function into a tarball.
pipeline_package = 'function_embedding_update.tar.gz'
compiler.Compiler().compile(function_embedding_update, pipeline_package)

# Launch a run from that tarball.
# Arguments: experiment id, run name, path to the pipeline tarball.
run = client.run_pipeline(exp.id, 'code-search-function-embedding', pipeline_package)