In [None]:
# Install Pipeline SDK
!pip3 install https://storage.googleapis.com/ml-pipeline/release/0.1.1/kfp.tar.gz --upgrade

In [47]:
import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.notebook


In [49]:
# This notebook is expected to run in JupyterHub inside the same cluster as the
# pipeline system; from anywhere else the client cannot reach the pipeline system.
client = kfp.Client()

# Experiment that the runs submitted further down are attached to (via exp.id).
exp = client.create_experiment(
    name='code-search-v2',
)

In [65]:
import kfp.dsl as dsl

def dataflow_preprocess_op(project: 'GcpProject', runner: str, target_dataset: str,
                           data_dir: 'GcsUri', temp_location: 'GcsUri', staging_location: 'GcsUri',
                           job_name: str, worker_machine_type: str,
                           num_workers: int, step_name='dataflow_preprocess',
                           image='gcr.io/yang-codesearch/code-search-dataflow:v20181111-15e89a3-dirty-2bd7e7'):
    """Build the pipeline step that runs the GitHub dataset preprocessing CLI.

    The step executes ``python -m code_search.dataflow.cli.preprocess_github_dataset``
    inside ``image``. Every argument except ``step_name`` and ``image`` is forwarded
    unchanged as the value of the command-line flag with the same name.

    Args:
        project: Value for ``--project``.
        runner: Value for ``--runner``.
        target_dataset: Value for ``--target_dataset``.
        data_dir: Value for ``--data_dir``.
        temp_location: Value for ``--temp_location``.
        staging_location: Value for ``--staging_location``.
        job_name: Value for ``--job_name``.
        worker_machine_type: Value for ``--worker_machine_type``.
        num_workers: Value for ``--num_workers``.
        step_name: Name given to the resulting ``dsl.ContainerOp``.
        image: Container image the step runs in. Defaults to the image this
            notebook was originally written against; pass another tag to use a
            different build without editing this function.

    Returns:
        The ``dsl.ContainerOp`` describing the step.
    """
    command = ['python', '-m', 'code_search.dataflow.cli.preprocess_github_dataset']
    arguments = [
        '--project', project,
        '--runner', runner,
        '--target_dataset', target_dataset,
        '--data_dir', data_dir,
        '--job_name', job_name,
        '--temp_location', temp_location,
        '--staging_location', staging_location,
        '--worker_machine_type', worker_machine_type,
        '--num_workers', num_workers,
        # Bare switch: passed without a value.
        '--wait_until_finished',
    ]
    return dsl.ContainerOp(
        name=step_name,
        image=image,
        command=command,
        arguments=arguments,
    )


In [66]:
def dataflow_function_embedding_op(project: 'GcpProject', runner: str, target_dataset: str, problem: str,
                           data_dir: 'GcsUri', saved_model_dir: 'GcsUri', temp_location: 'GcsUri', staging_location: 'GcsUri',
                           job_name: str, worker_machine_type: str,
                           num_workers: int, step_name='dataflow_function_embedding',
                           image='gcr.io/yang-codesearch/code-search-dataflow:v20181111-15e89a3-dirty-2bd7e7'):
    """Build the pipeline step that runs the function-embedding CLI.

    The step executes ``python -m code_search.dataflow.cli.create_function_embeddings``
    inside ``image``. Every argument except ``step_name`` and ``image`` is forwarded
    unchanged as the value of the command-line flag with the same name.

    Args:
        project: Value for ``--project``.
        runner: Value for ``--runner``.
        target_dataset: Value for ``--target_dataset``.
        problem: Value for ``--problem``.
        data_dir: Value for ``--data_dir``.
        saved_model_dir: Value for ``--saved_model_dir``.
        temp_location: Value for ``--temp_location``.
        staging_location: Value for ``--staging_location``.
        job_name: Value for ``--job_name``.
        worker_machine_type: Value for ``--worker_machine_type``.
        num_workers: Value for ``--num_workers``.
        step_name: Name given to the resulting ``dsl.ContainerOp``.
        image: Container image the step runs in. Defaults to the image this
            notebook was originally written against; pass another tag to use a
            different build without editing this function.

    Returns:
        The ``dsl.ContainerOp`` describing the step.
    """
    command = ['python', '-m', 'code_search.dataflow.cli.create_function_embeddings']
    arguments = [
        '--project', project,
        '--runner', runner,
        '--target_dataset', target_dataset,
        '--problem', problem,
        '--data_dir', data_dir,
        '--saved_model_dir', saved_model_dir,
        '--job_name', job_name,
        '--temp_location', temp_location,
        '--staging_location', staging_location,
        '--worker_machine_type', worker_machine_type,
        '--num_workers', num_workers,
        # Bare switch: passed without a value.
        '--wait_until_finished',
    ]
    return dsl.ContainerOp(
        name=step_name,
        image=image,
        command=command,
        arguments=arguments,
    )


In [67]:
# The pipeline definition
@dsl.pipeline(
  name='training',
  description='Example training pipeline'
)
def training(
    project,
    working_dir,
    runner=dsl.PipelineParam(name='runner', value='DataflowRunner'),
    target_dataset=dsl.PipelineParam(name='target-dataset', value='code_search'),
    worker_machine_type=dsl.PipelineParam(name='worker-machine-type', value='n1-highcpu-32'),
    num_workers=dsl.PipelineParam(name='num-workers', value=16)):
    """Single-step pipeline that runs `dataflow_preprocess_op`.

    Args:
        project: Forwarded to the step as its ``project`` argument.
        working_dir: Base location; the data, temp and staging locations passed
            to the step are all derived from it.
        runner: Forwarded to the step; defaults to ``'DataflowRunner'``.
        target_dataset: Forwarded to the step; defaults to ``'code_search'``.
        worker_machine_type: Forwarded to the step; defaults to ``'n1-highcpu-32'``.
        num_workers: Forwarded to the step; defaults to ``16``.
    """
    # Placeholder string, not a literal job name; presumably expanded by the
    # workflow engine to the name of the running workflow.
    job_name = '{{workflow.name}}'

    # All step locations live under working_dir.
    data_dir = '%s/data' % working_dir
    temp_location = '%s/dataflow/temp' % working_dir
    staging_location = '%s/dataflow/staging' % working_dir

    # Keyword arguments guard against mixing up the many same-typed positional
    # parameters. The returned op is not needed here, so it is not bound to a name.
    dataflow_preprocess_op(
        project=project,
        runner=runner,
        target_dataset=target_dataset,
        data_dir=data_dir,
        temp_location=temp_location,
        staging_location=staging_location,
        job_name=job_name,
        worker_machine_type=worker_machine_type,
        num_workers=num_workers,
    )

In [68]:
# The pipeline definition
@dsl.pipeline(
  name='function-embedding-update',
  description='Example function embedding update pipeline'
)
def function_embedding_update(
    project,
    working_dir,
    saved_model_dir,
    problem=dsl.PipelineParam(name='problem', value='github_function_docstring_extended'),
    runner=dsl.PipelineParam(name='runner', value='DataflowRunner'),
    target_dataset=dsl.PipelineParam(name='target-dataset', value='code_search'),
    worker_machine_type=dsl.PipelineParam(name='worker-machine-type', value='n1-highcpu-32'),
    num_workers=dsl.PipelineParam(name='num-workers', value=16)):
    """Single-step pipeline that runs `dataflow_function_embedding_op`.

    Args:
        project: Forwarded to the step as its ``project`` argument.
        working_dir: Base location; the data, temp and staging locations passed
            to the step are all derived from it.
        saved_model_dir: Forwarded to the step as its ``saved_model_dir`` argument.
        problem: Forwarded to the step; defaults to
            ``'github_function_docstring_extended'``.
        runner: Forwarded to the step; defaults to ``'DataflowRunner'``.
        target_dataset: Forwarded to the step; defaults to ``'code_search'``.
        worker_machine_type: Forwarded to the step; defaults to ``'n1-highcpu-32'``.
        num_workers: Forwarded to the step; defaults to ``16``.
    """
    # Placeholder string, not a literal job name; presumably expanded by the
    # workflow engine to the name of the running workflow.
    job_name = '{{workflow.name}}'

    # All step locations live under working_dir.
    data_dir = '%s/data' % working_dir
    temp_location = '%s/dataflow/temp' % working_dir
    staging_location = '%s/dataflow/staging' % working_dir

    # Keyword arguments guard against mixing up the many same-typed positional
    # parameters. The returned op is not needed here, so it is not bound to a name.
    dataflow_function_embedding_op(
        project=project,
        runner=runner,
        target_dataset=target_dataset,
        problem=problem,
        data_dir=data_dir,
        saved_model_dir=saved_model_dir,
        temp_location=temp_location,
        staging_location=staging_location,
        job_name=job_name,
        worker_machine_type=worker_machine_type,
        num_workers=num_workers,
    )

In [70]:
# Values supplied for the pipeline's `project` and `working-dir` parameters.
PROJECT = 'yang-codesearch'
WORKING_DIR = 'gs://yang-codesearch/code-search'

# Package the `training` pipeline as a tar archive.
compiler.Compiler().compile(training, 'training.tar.gz')

# Submit the packaged pipeline as a run in the experiment created earlier.
run = client.run_pipeline(
    exp.id,
    'code-search-training',
    'training.tar.gz',
    params={
        'project': PROJECT,
        'working-dir': WORKING_DIR,
    },
)

In [72]:
# Values supplied for the pipeline's `project`, `working-dir` and `saved-model-dir` parameters.
PROJECT = 'yang-codesearch'
WORKING_DIR = 'gs://yang-codesearch/code-search'
SAVED_MODEL_DIR = 'gs://yang-codesearch/code-search/model'

# Package the `function_embedding_update` pipeline as a tar archive.
compiler.Compiler().compile(function_embedding_update, 'function_embedding_update.tar.gz')

# Submit the packaged pipeline as a run in the experiment created earlier.
run = client.run_pipeline(
    exp.id,
    'code-search-function-embedding',
    'function_embedding_update.tar.gz',
    params={
        'project': PROJECT,
        'working-dir': WORKING_DIR,
        'saved-model-dir': SAVED_MODEL_DIR,
    },
)