In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex Pipelines: Vertex AI Custom Job OP training wrapper

## Overview
This notebook shows how to use the `custom_training_job_op` wrapper to convert any component so that it runs as a Vertex AI custom job. This allows users to take advantage of vertical and horizontal scaling for computation-heavy tasks on Vertex AI. This requires that the underlying component has built-in support for distributed computation. To learn more about Vertex AI custom jobs, see [Vertex AI Custom Training](https://cloud.google.com/vertex-ai/docs/training/custom-training).


For the `custom_training_job_op` interface, please see the [API Docs here](https://google-cloud-pipeline-components.readthedocs.io/).

### Install additional packages

In [None]:
!pip3 install  -U google-cloud-pipeline-components -q

## Before you begin
### Set your Project ID and Location


In [None]:
# Google Cloud project passed as the `project` argument of the custom job tasks.
# NOTE(review): this looks like a sample/test project ID — replace with your own.
PROJECT_ID = "python-docs-samples-tests"
# Region passed as the `location` argument of the custom job task.
LOCATION = "us-central1"

### Import libraries

In [None]:
from google_cloud_pipeline_components.experimental.custom_job import custom_training_job_op
from kfp.v2 import dsl
import kfp

## Create a component, convert it to a custom job to use in a pipeline.
Create a simple component that takes an input and produces an output

In [None]:
# Build a container component from an inline component spec.
# The component has one String input (`input_text`) and one String output
# (`output_value`). Its shell script receives the input value as `$0` and the
# output file path as `$1`, then pipes "<input_text>, this is an output
# parameter" through `gsutil cp` into that output path.
producer_op = kfp.components.load_component_from_text(
    """
name: Producer
inputs:
- {name: input_text, type: String, description: 'Represents an input parameter.'}
outputs:
- {name: output_value, type: String, description: 'Represents an output paramter.'}
implementation:
  container:
    image: google/cloud-sdk:latest
    command:
    - sh
    - -c
    - |
      set -e -x
      echo "$0, this is an output parameter" | gsutil cp - "$1"
    - {inputValue: input_text}
    - {outputPath: output_value}
"""
)

## Define the pipeline:
Use `custom_training_job_op` to convert the `producer_op` to `custom_training_producer_op` and use it to construct the pipeline.

In [None]:
@dsl.pipeline(pipeline_root='', name='custom-job-sample-pipeline')
def pipeline(text: str = 'message'):
    # Wrap the producer component so that it executes as a Vertex AI custom
    # job. Use the `custom_training_job_op` name imported at the top of the
    # notebook; no `gcpc` module alias is imported anywhere.
    custom_training_producer_op = custom_training_job_op(producer_op)

    # The wrapped op is called with the component's own input plus the project
    # and location for the custom job. `LOCATION` is the constant defined in
    # the configuration cell (there is no `REGION` variable).
    custom_producer_task = custom_training_producer_op(
        input_text=text, project=PROJECT_ID, location=LOCATION
    )

You can proceed to compile and run the pipeline from here as usual.

# Vertex Pipelines: Distributed Training with Vertex AI Custom Job OP

## Overview
This sample shows how to use the `custom_training_job_op` wrapper to create a distributed training job on Vertex AI. This allows users to take advantage of vertical and horizontal scaling for computation-heavy tasks on Vertex AI. The underlying component must support distributed computation; in this example we use the [TensorFlow distribution strategy](https://www.tensorflow.org/guide/distributed_training).


## Create a component with distributed strategy support
In this example we use `tf.distribute.Strategy` to create a component that distributes training across multiple machines. For additional distribution strategies, such as using multiple GPUs or TPUs, please see the [TensorFlow distribution strategy guide](https://www.tensorflow.org/guide/distributed_training).

In [None]:
@dsl.component
def distributed_train_mnist(num_epochs: int = 5):
    """Train a small MNIST classifier under a multi-worker distribution strategy.

    Args:
        num_epochs: Optional, number of epochs to run the training.
    """
    # Installing Tensorflow in the default image.
    # Alternatively you can use a custom base image.
    import subprocess
    subprocess.call(['pip3', 'install', 'tensorflow'])

    import tensorflow as tf

    # Set up a distribution strategy. The component must be able to support
    # the distribution strategy set by the custom job wrapper.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    with strategy.scope():
        # Load the MNIST training split and scale pixel values to [0, 1].
        # The test split is not used by this sample, so it is discarded.
        mnist = tf.keras.datasets.mnist
        (x_train, y_train), _ = mnist.load_data()
        x_train = x_train / 255.0

        # Define a simple model
        model = tf.keras.models.Sequential([
          tf.keras.layers.Flatten(input_shape=(28, 28)),
          tf.keras.layers.Dense(128, activation='relu'),
          tf.keras.layers.Dropout(0.2),
          tf.keras.layers.Dense(10)
        ])

        # Choose a loss function for training. The final Dense layer has no
        # activation, so the loss is computed from logits.
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

        # Compile the model
        model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])

        # Train for the requested number of epochs (previously hard-coded to
        # 5, which silently ignored the `num_epochs` argument).
        model.fit(x_train, y_train, epochs=num_epochs)


## Define the pipeline
Convert the component to a Vertex AI Custom Job and define the cluster configuration. In this case we use three CPU instances to run the training as a `MultiWorkerMirroredStrategy` job. We then use the resulting training op in the pipeline definition.

In [None]:
@dsl.pipeline(name='distributed-custom-job-sample-pipeline')
def pipeline(num_epochs: int = 5):
    # Wrap the training component as a Vertex AI custom job that runs on
    # three replicas.
    distributed_training_op = custom_training_job_op(
        distributed_train_mnist,
        replica_count=3,
    )

    # Instantiate the wrapped op as the pipeline's only task.
    training_task = distributed_training_op(
        num_epochs=num_epochs,
        project=PROJECT_ID,
        location=LOCATION,
    )

You can proceed to compile and run the pipeline from here as usual.