In [13]:
# Install Pipeline SDK
# Installs the kfp package from the 0.1.3-rc.1 release tarball, upgrading any copy already present.
!pip3 install https://storage.googleapis.com/ml-pipeline/release/0.1.3-rc.1/kfp.tar.gz --upgrade
# Kubernetes Python client; a later cell imports `kubernetes.client` to attach
# volumes and environment variables to pipeline steps.
!pip3 install kubernetes

Collecting https://storage.googleapis.com/ml-pipeline/release/0.1.3-rc.1/kfp.tar.gz
  Using cached https://storage.googleapis.com/ml-pipeline/release/0.1.3-rc.1/kfp.tar.gz
Building wheels for collected packages: kfp
  Running setup.py bdist_wheel for kfp ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-r50tt9g3/wheels/e2/11/ba/e849b14be51d60267f8c28afdb57716a7245c7892676273889
Successfully built kfp
Installing collected packages: kfp
  Found existing installation: kfp 0.1
    Uninstalling kfp-0.1:
      Successfully uninstalled kfp-0.1
Successfully installed kfp-0.1




In [14]:
import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.notebook


In [16]:
# Connect to the pipeline system and create the experiment whose id (`exp.id`)
# the run-submission cells below pass to `client.run_pipeline`.
# Note that this notebook should be running in JupyterHub in the same cluster as the pipeline system.
# Otherwise it will fail to talk to the pipeline system.
client = kfp.Client()
exp = client.create_experiment(name='code-search-3')

In [None]:
import kfp.dsl as dsl
from kubernetes import client as k8s_client

def dataflow_preprocess_op(project: 'GcpProject', runner: str, target_dataset: str,
                           data_dir: 'GcsUri', temp_location: 'GcsUri', staging_location: 'GcsUri',
                           job_name: str, worker_machine_type: str,
                           num_workers: int, step_name='dataflow_preprocess'):
    """Create the pipeline step that runs the GitHub dataset preprocessing module.

    The step runs ``python -m code_search.dataflow.cli.preprocess_github_dataset``
    inside the code-search-dataflow image and forwards every argument as the
    command-line flag of the same name.

    Args:
        project: Value for ``--project``.
        runner: Value for ``--runner``.
        target_dataset: Value for ``--target_dataset``.
        data_dir: Value for ``--data_dir``.
        temp_location: Value for ``--temp_location``.
        staging_location: Value for ``--staging_location``.
        job_name: Value for ``--job_name``.
        worker_machine_type: Value for ``--worker_machine_type``.
        num_workers: Value for ``--num_workers``.
        step_name: Name given to the step in the pipeline.

    Returns:
        The ``dsl.ContainerOp`` describing the step.
    """
    return dsl.ContainerOp(
        name=step_name,
        image='gcr.io/yang-codesearch/code-search-dataflow:v20181111-15e89a3-dirty-2bd7e7',
        command=[
            'python',
            '-m',
            'code_search.dataflow.cli.preprocess_github_dataset',
        ],  # this comma was missing, which made the whole cell a SyntaxError
        arguments=[
            '--project', project,
            '--runner', runner,
            '--target_dataset', target_dataset,
            '--data_dir', data_dir,
            '--job_name', job_name,
            '--temp_location', temp_location,
            '--staging_location', staging_location,
            '--worker_machine_type', worker_machine_type,
            '--num_workers', num_workers,
            # Bare flag with no value.
            '--wait_until_finished',
        ],
    )


In [None]:
# The pipeline definition
@dsl.pipeline(
  name='training',
  description='Example training pipeline'
)
def training(
    project,
    working_dir,
    # NOTE(review): 'runnder' looks like a typo for 'runner'. It is kept as-is because
    # it is presumably the parameter name exposed to run submitters -- confirm before renaming.
    runner=dsl.PipelineParam(name='runnder', value='DataflowRunner'),
    target_dataset=dsl.PipelineParam(name='target-dataset', value='code_search'),
    worker_machine_type=dsl.PipelineParam(name='worker-machine-type', value='n1-highcpu-32'),
    num_workers=dsl.PipelineParam(name='num-workers', value=16)):
    """Pipeline with a single step: the Dataflow preprocessing job.

    The data, temp and staging locations are all derived from ``working_dir``;
    the remaining parameters are passed straight to ``dataflow_preprocess_op``.
    """
    # Everything Dataflow-specific lives under <working_dir>/dataflow.
    dataflow_root = '{}/dataflow'.format(working_dir)

    preprocess = dataflow_preprocess_op(
        project=project,
        runner=runner,
        target_dataset=target_dataset,
        data_dir='{}/data'.format(working_dir),
        temp_location=dataflow_root + '/temp',
        staging_location=dataflow_root + '/staging',
        # Presumably a workflow-engine placeholder filled in at run time -- confirm.
        job_name='{{workflow.name}}',
        worker_machine_type=worker_machine_type,
        num_workers=num_workers,
    )

In [None]:
# Values supplied for the pipeline's required parameters.
PROJECT = 'yang-codesearch'
WORKING_DIR = 'gs://yang-codesearch/code-search'

# Package the pipeline definition as a tarball.
compiler.Compiler().compile(training, 'training.tar.gz')

# Launch a run of the packaged pipeline inside the experiment created earlier.
run = client.run_pipeline(
    exp.id,
    'code-search-training',
    'training.tar.gz',
    params={
        'project': PROJECT,
        'working-dir': WORKING_DIR,
    },
)

In [17]:
def dataflow_function_embedding_op(project: 'GcpProject', runner: str, target_dataset: str, problem: str,
                                   data_dir: 'GcsUri', saved_model_dir: 'GcsUri',
                                   temp_location: 'GcsUri', staging_location: 'GcsUri',
                                   job_name: str, worker_machine_type: str,
                                   num_workers: int, step_name='dataflow_function_embedding'):
    """Create the pipeline step that runs the function-embedding module.

    The step runs ``python -m code_search.dataflow.cli.create_function_embeddings``
    in the code-search-dataflow image. Each argument except ``step_name`` is
    forwarded as the command-line flag of the same name; ``step_name`` becomes
    the step's name in the pipeline.

    Returns:
        The ``dsl.ContainerOp`` describing the step.
    """
    entrypoint = ['python', '-m', 'code_search.dataflow.cli.create_function_embeddings']

    # Flag order is kept exactly as the container receives it.
    cli_flags = [
        '--project', project,
        '--runner', runner,
        '--target_dataset', target_dataset,
        '--problem', problem,
        '--data_dir', data_dir,
        '--saved_model_dir', saved_model_dir,
        '--job_name', job_name,
        '--temp_location', temp_location,
        '--staging_location', staging_location,
        '--worker_machine_type', worker_machine_type,
        '--num_workers', num_workers,
    ]
    cli_flags.append('--wait_until_finished')  # bare flag, takes no value

    return dsl.ContainerOp(
        name=step_name,
        image='gcr.io/yang-codesearch/code-search-dataflow:v20181111-15e89a3-dirty-2bd7e7',
        command=entrypoint,
        arguments=cli_flags,
    )


In [18]:
def ksonnet_op(working_dir: str, component: str):
    """Create a pipeline step that runs the code-search-ks image for one component.

    The component name doubles as the step name. Both values are forwarded to
    the container as ``--working_dir`` and ``--component``.

    Returns:
        The ``dsl.ContainerOp`` describing the step.
    """
    container_args = ['--working_dir', working_dir, '--component', component]
    return dsl.ContainerOp(
        name=component,
        image='gcr.io/yang-codesearch/code-search-ks:v20181112-e793a78-dirty-b2b269',
        arguments=container_args,
    )

In [None]:
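# One-time setup: create the `gittoken` secret (key `token`) that the git step reads as GIT_TOKEN.
# Never commit a real token to the notebook; revoke any token that was previously pasted here.
# kubectl create secret generic gittoken --from-literal=token=<YOUR_GIT_TOKEN>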

In [19]:
def git_op(gcs_file: 'GcsUri', git_repo: str, user_email: str, step_name='git_op'):
    """Create a pipeline step that runs the code-search-push-to-git image.

    ``gcs_file``, ``git_repo`` and ``user_email`` are forwarded to the container
    as the flags of the same name; ``step_name`` names the step in the pipeline.

    Returns:
        The ``dsl.ContainerOp`` describing the step.
    """
    container_args = []
    for flag, value in (('--gcs_file', gcs_file),
                        ('--git_repo', git_repo),
                        ('--user_email', user_email)):
        container_args += [flag, value]

    return dsl.ContainerOp(
        name=step_name,
        image='gcr.io/yang-codesearch/code-search-push-to-git:v20181118-cd85716-dirty-f9548e',
        arguments=container_args,
    )

In [28]:
# The pipeline definition
@dsl.pipeline(
  name='function_embedding',
  description='Example function embedding pipeline'
)
def function_embedding_update(
    project,
    working_dir,
    saved_model_dir,
    problem=dsl.PipelineParam(name='problem', value='github_function_docstring_extended'),
    # NOTE(review): 'runnder' looks like a typo for 'runner'. It is kept as-is because
    # it is presumably the parameter name exposed to run submitters -- confirm before renaming.
    runner=dsl.PipelineParam(name='runnder', value='DataflowRunner'),
    target_dataset=dsl.PipelineParam(name='target-dataset', value='code_search'),
    worker_machine_type=dsl.PipelineParam(name='worker-machine-type', value='n1-highcpu-32'),
    num_workers=dsl.PipelineParam(name='num-workers', value=16)):
    """Pipeline that computes function embeddings, creates the search index and pushes to git.

    Steps defined here:
      1. ``dataflow_function_embedding_op``, with the ``user-gcp-sa`` secret
         mounted and ``GOOGLE_APPLICATION_CREDENTIALS`` pointing at its JSON key.
      2. ``ksonnet_op`` for the ``search-index-creator`` component, ordered
         after step 1.
      3. ``git_op``, with ``GIT_TOKEN`` read from the ``gittoken`` secret. It has
         no explicit ordering dependency on the other steps.
    """
    # These four values were commented out while still being passed to the
    # embedding step below, so compiling the pipeline raised NameError.
    job_name = '{{workflow.name}}'  # presumably a workflow-engine placeholder filled in at run time -- confirm
    data_dir = '%s/data' % working_dir
    temp_location = '%s/dataflow/temp' % working_dir
    staging_location = '%s/dataflow/staging' % working_dir

    function_embedding = dataflow_function_embedding_op(
        project, runner, target_dataset, problem, data_dir, saved_model_dir,
        temp_location, staging_location, job_name, worker_machine_type, num_workers)

    # Expose the service-account key from the 'user-gcp-sa' secret inside the
    # container and point GOOGLE_APPLICATION_CREDENTIALS at it.
    function_embedding.set_volumes([k8s_client.V1Volume(name='gcp-credentials',
                                       secret=k8s_client.V1SecretVolumeSource(
                                           secret_name='user-gcp-sa'))])
    function_embedding.set_volume_mounts([k8s_client.V1VolumeMount(
      mount_path='/secret/gcp-credentials', name='gcp-credentials')])
    function_embedding.set_env_variables([k8s_client.V1EnvVar(
      name='GOOGLE_APPLICATION_CREDENTIALS',
      value='/secret/gcp-credentials/user-gcp-sa.json')])

    index_creator = ksonnet_op(working_dir, 'search-index-creator').after(function_embedding)

    # NOTE(review): the `.after(index_creator)` dependency is disabled in the original
    # and the inputs are hard-coded; both are left unchanged -- confirm whether that is intended.
    sync_index = git_op('gs://yang-codesearch/code-search/data/func-index-00095-of-00100.csv','IronPan/test','panyang06231989@gmail.com')#.after(index_creator)
    # The token comes from the cluster secret 'gittoken' (key 'token') instead
    # of being written into the pipeline definition.
    sync_index.set_env_variables([k8s_client.V1EnvVar(
       name='GIT_TOKEN',
       value_from=k8s_client.V1EnvVarSource(
           secret_key_ref=k8s_client.V1SecretKeySelector(
               name='gittoken',
               key='token',
       )))])
    

In [29]:
# Values supplied for the pipeline's required parameters.
PROJECT = 'yang-codesearch'
WORKING_DIR = 'gs://yang-codesearch/code-search'
SAVED_MODEL_DIR = 'gs://yang-codesearch/code-search/model'

# Package the pipeline definition as a tarball.
compiler.Compiler().compile(function_embedding_update, 'function_embedding_update.tar.gz')

# Launch a run of the packaged pipeline inside the experiment created earlier.
run = client.run_pipeline(
    exp.id,
    'code-search-function-embedding',
    'function_embedding_update.tar.gz',
    params={
        'project': PROJECT,
        'working-dir': WORKING_DIR,
        'saved-model-dir': SAVED_MODEL_DIR,
    },
)