# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: dataproc_submit_spark_job
description: >-
  Submits a Cloud Dataproc job for running Apache Spark applications on YARN.
metadata:
  labels:
    add-pod-env: 'true'
inputs:
- name: project_id
  description: >-
    Required. The ID of the Google Cloud Platform project that the cluster
    belongs to.
  type: GCPProjectID
- name: region
  description: >-
    Required. The Cloud Dataproc region in which to handle the request.
  type: GCPRegion
- name: cluster_name
  description: 'Required. The name of the cluster to run the job on.'
  type: String
- name: main_jar_file_uri
  default: ''
  description: >-
    The HCFS URI of the jar file that contains the main class.
  type: GCSPath
- name: main_class
  default: ''
  description: >-
    The name of the driver's main class. The jar file that
    contains the class must be in the default CLASSPATH or specified in
    jarFileUris.
  type: String
- name: args
  default: ''
  description: >-
    Optional. The arguments to pass to the driver. Do not include
    arguments, such as --conf, that can be set as job properties, since a
    collision may occur that causes an incorrect job submission.
  type: List
- name: spark_job
  default: ''
  description: >-
    Optional. The full payload of a
    [SparkJob](https://cloud.google.com/dataproc/docs/reference/rest/v1/SparkJob).
    A commented sketch of this payload follows the inputs list.
  type: Dict
- name: job
  default: ''
  description: >-
    Optional. The full payload of a
    [Dataproc job](https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.jobs).
  type: Dict
- name: wait_interval
  default: '30'
  description: >-
    Optional. The number of seconds to wait between polls of the job status.
    Defaults to 30.
  type: Integer
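# A hedged sketch (not part of the component spec): the spark_job Dict above
# is typically supplied as a JSON-serialized SparkJob payload, for example:
#
#   {
#     "mainClass": "org.apache.spark.examples.SparkPi",
#     "jarFileUris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
#   }
#
# Field names follow the Dataproc SparkJob REST message linked above; the
# jar path is illustrative.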
outputs:
- name: job_id
  description: 'The ID of the created job.'
  type: String
implementation:
  container:
    image: gcr.io/ml-pipeline/ml-pipeline-gcp:0517114dc2b365a4a6d95424af6157ead774eff3
    args: [
      kfp_component.google.dataproc, submit_spark_job,
      --project_id, {inputValue: project_id},
      --region, {inputValue: region},
      --cluster_name, {inputValue: cluster_name},
      --main_jar_file_uri, {inputValue: main_jar_file_uri},
      --main_class, {inputValue: main_class},
      --args, {inputValue: args},
      --spark_job, {inputValue: spark_job},
      --job, {inputValue: job},
      --wait_interval, {inputValue: wait_interval}
    ]
    env:
      KFP_POD_NAME: "{{pod.name}}"
    fileOutputs:
      job_id: /tmp/kfp/output/dataproc/job_id.txt
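# ---------------------------------------------------------------------------
# Usage sketch (not part of the component spec): a minimal way to load and
# call this component from a pipeline, assuming the KFP v1 SDK. The project,
# region, and cluster values below are illustrative placeholders; List and
# Dict inputs are passed as JSON strings.
#
#   import kfp
#   from kfp import components
#
#   dataproc_submit_spark_job_op = components.load_component_from_file(
#       'component.yaml')
#
#   @kfp.dsl.pipeline(name='dataproc-submit-spark-job-demo')
#   def pipeline(
#       project_id: str = 'my-project',
#       region: str = 'us-central1',
#       cluster_name: str = 'my-cluster',
#   ):
#       dataproc_submit_spark_job_op(
#           project_id=project_id,
#           region=region,
#           cluster_name=cluster_name,
#           main_class='org.apache.spark.examples.SparkPi',
#           spark_job='{"jarFileUris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]}',
#           args='["1000"]',
#           wait_interval='30')
# ---------------------------------------------------------------------------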