# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_submit_spark_job
description: >-
  Submits a Cloud Dataproc job for running Apache Spark applications on YARN.
metadata:
  labels:
    add-pod-env: 'true'
inputs:
- name: project_id
  description: >-
    Required. The ID of the Google Cloud Platform project that the cluster
    belongs to.
  type: GCPProjectID
- name: region
  description: >-
    Required. The Cloud Dataproc region in which to handle the request.
  type: GCPRegion
- name: cluster_name
  description: 'Required. The name of the cluster to run the job on.'
  type: String
- name: main_jar_file_uri
  default: ''
  description: >-
    The HCFS URI of the jar file that contains the main class.
  type: GCSPath
- name: main_class
  default: ''
  description: >-
    The name of the driver's main class. The jar file that
    contains the class must be in the default CLASSPATH or specified in
    jarFileUris.
  type: String
- name: args
  default: ''
  description: >-
    Optional. The arguments to pass to the driver. Do not include
    arguments, such as --conf, that can be set as job properties, since a
    collision may occur that causes an incorrect job submission.
  type: List
- name: spark_job
  default: ''
  description: >-
    Optional. The full payload of a
    [SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).
    A commented sketch of this payload follows the inputs list.
  type: Dict
- name: job
  default: ''
  description: >-
    Optional. The full payload of a
    [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
  type: Dict
- name: wait_interval
  default: '30'
  description: >-
    Optional. The number of seconds to wait between polls of the job status.
    Defaults to 30.
  type: Integer
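# A hedged sketch (not part of the component spec): the spark_job Dict above
# is typically supplied as a JSON-serialized SparkJob payload, for example:
#
#   {
#     "mainClass": "org.apache.spark.examples.SparkPi",
#     "jarFileUris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
#   }
#
# Field names follow the Dataproc SparkJob REST message linked above; the
# jar path is illustrative.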
outputs:
- name: job_id
  description: 'The ID of the created job.'
  type: String
implementation:
  container:
    image: gcr.io/ml-pipeline/ml-pipeline-gcp:0517114dc2b365a4a6d95424af6157ead774eff3
    args: [
      kfp_component.google.dataproc, submit_spark_job,
      --project_id, {inputValue: project_id},
      --region, {inputValue: region},
      --cluster_name, {inputValue: cluster_name},
      --main_jar_file_uri, {inputValue: main_jar_file_uri},
      --main_class, {inputValue: main_class},
      --args, {inputValue: args},
      --spark_job, {inputValue: spark_job},
      --job, {inputValue: job},
      --wait_interval, {inputValue: wait_interval}
    ]
    env:
      KFP_POD_NAME: "{{pod.name}}"
    fileOutputs:
      job_id: /tmp/kfp/output/dataproc/job_id.txt
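# ---------------------------------------------------------------------------
# Usage sketch (not part of the component spec): a minimal way to load and
# call this component from a pipeline, assuming the KFP v1 SDK. The project,
# region, and cluster values below are illustrative placeholders; List and
# Dict inputs are passed as JSON strings.
#
#   import kfp
#   from kfp import components
#
#   dataproc_submit_spark_job_op = components.load_component_from_file(
#       'component.yaml')
#
#   @kfp.dsl.pipeline(name='dataproc-submit-spark-job-demo')
#   def pipeline(
#       project_id: str = 'my-project',
#       region: str = 'us-central1',
#       cluster_name: str = 'my-cluster',
#   ):
#       dataproc_submit_spark_job_op(
#           project_id=project_id,
#           region=region,
#           cluster_name=cluster_name,
#           main_class='org.apache.spark.examples.SparkPi',
#           spark_job='{"jarFileUris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]}',
#           args='["1000"]',
#           wait_interval='30')
# ---------------------------------------------------------------------------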