In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex Pipelines: Vertex AI Custom Job training wrapper

## Overview
This notebook shows how to use the `run_as_vertex_ai_custom_job` wrapper to convert any component so that it runs as a Vertex AI custom job. This lets users take advantage of vertical and horizontal scaling for computation-heavy tasks on Vertex AI. It requires that the underlying component has built-in support for distributed computation. To learn more about Vertex AI custom jobs, see [Vertex AI Custom Training](https://cloud.google.com/vertex-ai/docs/training/custom-training).


## Vertex AI Custom Job Wrapper Interface

`google_cloud_pipeline_components.experimental.custom_job.custom_job`

```python
def run_as_vertex_ai_custom_job(
    component_spec: Callable,
    display_name: Optional[str] = None,
    replica_count: Optional[int] = None,
    machine_type: Optional[str] = None,
    accelerator_type: Optional[str] = None,
    accelerator_count: Optional[int] = None,
    boot_disk_type: Optional[str] = None,
    boot_disk_size_gb: Optional[int] = None,
    timeout: Optional[str] = None,
    restart_job_on_worker_restart: Optional[bool] = None,
    service_account: Optional[str] = None,
    network: Optional[str] = None,
    worker_pool_specs: Optional[List[Mapping[str, Any]]] = None,
) -> Callable:
```

<dt><b>Args:</b></dt><dd><p>component_spec: The component spec (factory function) to run as aiplatform custom job.
display_name: Optional. The name of the custom job. If not provided the</p>
<div><p>component_spec.name will be used instead.</p>
</div>
<dl class="simple">
<dt>replica_count: Optional. The number of replicas to be split between master</dt><dd><p>workerPoolSpec and worker workerPoolSpec. (master always has 1 replica).</p>
</dd>
<dt>machine_type: Optional. The type of the machine to run the custom job. The</dt><dd><p>default value is “n1-standard-4”.</p>
</dd>
<dt>accelerator_type: Optional. The type of accelerator(s) that may be attached</dt><dd><p>to the machine as per accelerator_count.</p>
</dd>
<dt>accelerator_count: Optional. The number of accelerators to attach to the</dt><dd><p>machine.</p>
</dd>
<dt>boot_disk_type: Optional. Type of the boot disk (default is “pd-ssd”). Valid</dt><dd><dl class="simple">
<dt>values: “pd-ssd” (Persistent Disk Solid State Drive) or “pd-standard”</dt><dd><p>(Persistent Disk Hard Disk Drive).</p>
</dd>
</dl>
</dd>
</dl>
<p>boot_disk_size_gb: Optional. Size in GB of the boot disk (default is 100GB).
timeout: Optional. The maximum job running time. The default is 7 days. A</p>
<div><p>duration in seconds with up to nine fractional digits, terminated by ‘s’.
Example: “3.5s”</p>
</div>
<dl class="simple">
<dt>restart_job_on_worker_restart: Optional. Restarts the entire CustomJob if a</dt><dd><p>worker gets restarted. This feature can be used by distributed training
jobs that are not resilient to workers leaving and joining a job.</p>
</dd>
<dt>service_account: Optional. Specifies the service account for workload run-as</dt><dd><p>account.</p>
</dd>
<dt>network: Optional. The full name of the Compute Engine network to which the</dt><dd><p>job should be peered. For example, projects/12345/global/networks/myVPC.</p>
</dd>
<dt>worker_pool_specs: Optional. worker_pool_specs for distributed training. This</dt><dd><p>will overwrite all other cluster configurations. For details, please see:
<a class="reference external" href="https://cloud.google.com/ai-platform-unified/docs/training/distributed-training">https://cloud.google.com/ai-platform-unified/docs/training/distributed-training</a></p>
</dd>
</dl>
</dd>
<dt><b>Returns:</b></dt><dd><p>A Custom Job component corresponding to the input component.</p>
</dd>
</dl>
</dd></dl>

### Install additional packages

In [None]:
!pip3 install  -U "git+https://github.com/kubeflow/pipelines.git#egg=kfp&subdirectory=sdk/python" -q
!pip3 install  -U google-cloud-pipeline-components -q
!pip3 install  -U google-cloud-aiplatform -q

### Restart the kernel

After you install the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
# Restart the kernel so that the packages installed above can be imported.
import IPython

app = IPython.Application.instance()
# Passing restart=True asks for the kernel to be brought back up after shutdown.
app.kernel.do_shutdown(restart=True)

## Before you begin
### Set your Project ID and Pipeline Root


In [None]:
# Values to customise before running the remaining cells.
PROJECT_ID = "python-docs-samples-tests"  # @param {type:"string"}
BUCKET_NAME = "gs://[your-bucket-name]"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}

# Root path for the pipeline, located inside the bucket named above.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root"
PIPELINE_ROOT

### Import libraries and define constants

In [None]:
import google_cloud_pipeline_components as gcpc
from kfp.v2 import components, compiler, dsl

## Create a component, convert it to a custom job to use in a pipeline.
Create a simple component that takes an input and produces an output.

In [None]:
# Inline component definition: a container that builds a short message from
# `input_text` and copies it with gsutil to the path given for `output_value`.
PRODUCER_COMPONENT_SPEC = """
name: Producer
inputs:
- {name: input_text, type: String, description: 'Represents an input parameter.'}
outputs:
- {name: output_value, type: String, description: 'Represents an output paramter.'}
implementation:
  container:
    image: google/cloud-sdk:latest
    command:
    - sh
    - -c
    - |
      set -e -x
      echo "$0, this is an output parameter" | gsutil cp - "$1"
    - {inputValue: input_text}
    - {outputPath: output_value}
"""

# Turn the textual spec into a component factory usable inside a pipeline.
producer_op = components.load_component_from_text(PRODUCER_COMPONENT_SPEC)

Convert the component to a Vertex AI Custom Job

In [None]:
# Bind the custom-job module once; it is both patched and called below.
custom_job_module = gcpc.experimental.custom_job.custom_job

# Temporary workaround for v0.1.4: point the wrapper at the correct base image
# for custom_job by overriding the module-level default.
custom_job_module._DEFAULT_CUSTOM_JOB_CONTAINER_IMAGE = (
    "gcr.io/ml-pipeline/gcp-launcher:0.1.4"
)

# Wrap the producer component so that it runs as a Vertex AI custom job.
custom_job_producer_op = custom_job_module.run_as_vertex_ai_custom_job(
    producer_op
)

Define the pipeline:

In [None]:
@dsl.pipeline(pipeline_root=PIPELINE_ROOT, name='custom-job-sample-pipeline')
def pipeline(text: str = 'message'):
    """Run the producer component as a Vertex AI custom job.

    Args:
        text: Value forwarded to the producer's `input_text` input.
    """
    # Calling the op inside the pipeline function adds the step; the returned
    # task is not needed because nothing consumes its output here.
    custom_job_producer_op(
        input_text=text, gcp_project=PROJECT_ID, gcp_region=REGION
    )

## Compile and run the pipeline

Compile the pipeline:

In [None]:
# Compile the pipeline function into a job spec file. `compiler` is already
# imported in the "Import libraries and define constants" cell, so it is not
# imported again here.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="simple_custom_job_spec.json",
)

The pipeline compilation generates the `simple_custom_job_spec.json` job spec file.

Next, instantiate an API client object and run the defined pipeline like this: 

In [None]:
from kfp.v2.google.client import AIPlatformClient

# Client bound to the project and region configured earlier in the notebook.
api_client = AIPlatformClient(project_id=PROJECT_ID, region=REGION)

# Submit the compiled job spec with caching switched off.
run_options = {
    "job_spec_path": "simple_custom_job_spec.json",
    "pipeline_root": PIPELINE_ROOT,
    "enable_caching": False,
}
response = api_client.create_run_from_job_spec(**run_options)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:
- Delete Cloud Storage objects that were created.  Uncomment and run the command in the cell below **only if you are not using the `PIPELINE_ROOT` path for any other purpose**.
- If you deployed a model while experimenting (this notebook itself does not deploy one), delete it: first, undeploy it from its *endpoint*, then delete the model and endpoint.


In [None]:
# Warning: this command will delete ALL Cloud Storage objects under the PIPELINE_ROOT path.
# ! gsutil -m rm -r $PIPELINE_ROOT