In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex Pipelines: Dataflow Python Job OP

## Overview
This notebook shows how to use the `DataflowPythonJobOp` to create a Python Dataflow Job component. `DataflowPythonJobOp` creates a pipeline component that prepares data by submitting an Apache Beam job (authored in Python) to Cloud Dataflow for execution. The Python Beam code is run with the Cloud Dataflow Runner. Learn more in the [Google Cloud Dataflow Runner](https://beam.apache.org/documentation/runners/dataflow/) documentation.


For more details on the `DataflowPythonJobOp` interface, please see the [API doc](https://google-cloud-pipeline-components.readthedocs.io/).

### Install required packages

In [None]:
!pip3 install  -U google-cloud-pipeline-components -q

## Before you begin
Set your Project ID, Location, Pipeline Root, and a few parameters required for the Dataflow sample.

In [None]:
# --- Google Cloud settings: replace the placeholders before running ---
PROJECT_ID = 'YOUR_PROJECT_ID'
LOCATION = 'us-central1'
# Cloud Storage root. Must NOT end with a slash, because paths below are
# built by appending '/...' to it.
PIPELINE_ROOT = 'gs://YOUR_BUCKET_NAME'

# --- Dataflow sample settings ---
PIPELINE_NAME = 'dataflow-pipeline-sample'
# Output path for the sample job, derived from PIPELINE_ROOT.
OUTPUT_FILE = f'{PIPELINE_ROOT}/wc/wordcount.out'

### Import libraries

In [None]:
from google_cloud_pipeline_components.experimental.dataflow import DataflowPythonJobOp
from google_cloud_pipeline_components.experimental.wait_gcp_resources import WaitGcpResourcesOp

## Create a pipeline using DataflowPythonJobOp and WaitGcpResourcesOp
In this section we create a pipeline using the `DataflowPythonJobOp` and the [Apache Beam WordCount Examples](https://beam.apache.org/get-started/wordcount-example/). Then we use the `WaitGcpResourcesOp` to poll the resource status and wait for it to finish.
To use the `WaitGcpResourcesOp` component, first create the `DataflowPythonJobOp` component, which outputs a JSON-formatted [gcp_resources proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/experimental/proto), then pass that output to the wait component.

In [None]:
import kfp.dsl as dsl
import json

# Two-step pipeline: launch a Python Beam job on Dataflow, then wait on the
# resources reported by the launcher.
#
# Parameters (all overridable at submission time):
#   python_file_path        - Cloud Storage path of the Beam program to run.
#   project_id              - project passed to the Dataflow launcher.
#   location                - location passed to the Dataflow launcher.
#   staging_dir             - used as the job's temp_location.
#   requirements_file_path  - Cloud Storage path of the job's requirements file.
@dsl.pipeline(name=PIPELINE_NAME, description='Dataflow launch python pipeline')
def pipeline(
    python_file_path: str = 'gs://ml-pipeline-playground/samples/dataflow/wc/wc.py',
    project_id: str = PROJECT_ID,
    location: str = LOCATION,
    staging_dir: str = PIPELINE_ROOT,
    requirements_file_path: str = 'gs://ml-pipeline-playground/samples/dataflow/wc/requirements.txt',
):
    # Command-line arguments forwarded to the Beam program. OUTPUT_FILE is a
    # notebook-level constant, so it is fixed when the pipeline is defined.
    beam_args = ['--output', OUTPUT_FILE]

    # Step 1: submit the Beam job through the Dataflow launcher component.
    launch_task = DataflowPythonJobOp(
        project=project_id,
        location=location,
        python_module_path=python_file_path,
        temp_location=staging_dir,
        requirements_file_path=requirements_file_path,
        args=beam_args,
    )

    # Step 2: hand the launcher's `gcp_resources` output to the wait component.
    launched_resources = launch_task.outputs['gcp_resources']
    WaitGcpResourcesOp(gcp_resources=launched_resources)

You can proceed to compile and run the pipeline from here as usual.