# Run a pyspark job on Dataproc

1. You should use the component ```load_spark_to_GCS``` to upload ```transform_run.py``` and ```sparkicson-0.1-dependencies.jar```
2. You should use the standard components [create_cluster](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/create_cluster), [submit_pyspark_job](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/submit_pyspark_job) and [delete_cluster](https://github.com/kubeflow/pipelines/tree/master/components/gcp/dataproc/delete_cluster). ```kfp.components.ComponentStore``` could help.
3. Use a templated name for the cluster (for example the Argo placeholder ```{{workflow.uid}}```), so that every run gets its own cluster
4. Check ```kfp.dsl.ExitHandler```: use it to make sure the cluster is deleted even when a step of the pipeline fails

In [None]:
import kfp.compiler as compiler
import kfp.components as comp
import kfp.dsl as dsl
import kfp.gcp as gcp

from kfp import Client as KfpClient

import os

In [None]:
# Component store that resolves component names against the local
# `components` directory (relative to the notebook's working directory).
component_store = comp.ComponentStore(
  local_search_paths=['components'])

In [None]:
# Factory for the local `load_spark_to_GCS` component; the pipeline below uses
# it to upload the PySpark script and its jar and reads the resulting paths
# from its `transform_run_path` and `jar_path` outputs.
upload_files_to_GCS_op = component_store.load_component('load_spark_to_GCS')

In [None]:
# Component store that resolves component names against the GCP components of
# the kubeflow/pipelines GitHub repository. The URL points at the `master`
# branch, so the loaded component definitions are not pinned to a release.
remote_component_store = comp.ComponentStore(
    url_search_prefixes=['https://raw.githubusercontent.com/kubeflow/pipelines/master/components/gcp/'])

In [None]:
# Factory for the standard Dataproc "create cluster" component.
dataproc_create_cluster_op = remote_component_store.load_component('dataproc/create_cluster')

In [None]:
# Factory for the standard Dataproc "submit PySpark job" component.
dataproc_submit_pyspark_job_op = remote_component_store.load_component('dataproc/submit_pyspark_job')

In [None]:
# Factory for the standard Dataproc "delete cluster" component.
dataproc_delete_cluster_op = remote_component_store.load_component('dataproc/delete_cluster')

In [None]:
# TODO : remove before push
# Developer-only override: replace the Dataproc components loaded from GitHub
# with the copies from a local checkout of kubeflow/pipelines.
#
# The checkout location defaults to the original author's path and can be
# changed with the KFP_LOCAL_GCP_COMPONENTS_DIR environment variable. When the
# directory does not exist (i.e. on any other machine) the override is skipped
# and the remotely loaded components stay in place, instead of the cell
# failing on a missing file.
_LOCAL_GCP_COMPONENTS_DIR = os.environ.get(
    'KFP_LOCAL_GCP_COMPONENTS_DIR',
    '/Users/lfloretta/IdeaProjects/pipelines/components/gcp')

if os.path.isdir(_LOCAL_GCP_COMPONENTS_DIR):
    dataproc_create_cluster_op = comp.load_component_from_file(
        os.path.join(_LOCAL_GCP_COMPONENTS_DIR,
                     'dataproc', 'create_cluster', 'component.yaml'))
    dataproc_submit_pyspark_job_op = comp.load_component_from_file(
        os.path.join(_LOCAL_GCP_COMPONENTS_DIR,
                     'dataproc', 'submit_pyspark_job', 'component.yaml'))
    dataproc_delete_cluster_op = comp.load_component_from_file(
        os.path.join(_LOCAL_GCP_COMPONENTS_DIR,
                     'dataproc', 'delete_cluster', 'component.yaml'))

In [None]:
# Despite the name, this is a bucket name followed by an object prefix; it is
# inserted right after `gs://` when the pipeline's default GCS paths are built.
BUCKET_NAME = 'lf-ml-demo-eu-w1/kfp_primer/test/dataproc'

In [None]:
@dsl.pipeline(
    name='Test',
    description='Simple pipeline to exeperiment with KFP'
)
def end_to_end_pyspark(
    cluster_project_id='lf-ml-demo',
    cluster_region='europe-west1',
    cluster_name='{{workflow.uid}}',
    # The GCS defaults are built by concatenation on purpose: `str.format`
    # would collapse the `{{...}}` workflow placeholders to single braces,
    # and without any formatting the bucket would never be filled in.
    gcs_pkgs_path='gs://' + BUCKET_NAME + '/output/{{workflow.uid}}/{{pod.name}}/pkgs',
    bq_project_id='lf-ml-demo',
    bq_dataset='spark_demo',
    bq_table='short_data',
    output_path='gs://' + BUCKET_NAME + '/output/{{workflow.uid}}/{{pod.name}}/test.csv',
    args='',
    job='{}',
    wait_interval='30'
    ):
    """Create a Dataproc cluster, run a PySpark job on it, then delete it.

    Steps:
      1. Create the cluster.
      2. Upload the job files to GCS.
      3. Submit the PySpark job, using the outputs of the two steps above.
      4. Delete the cluster; this is registered as the exit handler so that
         the cluster is cleaned up however the other steps end.

    Args:
        cluster_project_id: Project passed to the create/submit/delete steps.
        cluster_region: Region passed to the create/submit/delete steps.
        cluster_name: Name of the cluster to create and delete.
        gcs_pkgs_path: GCS location given to the upload step as its output
            path.
        bq_project_id: Forwarded to the job as `--tableProjectID`.
        bq_dataset: Forwarded to the job as `--dataset`.
        bq_table: Forwarded to the job as `--table`.
        output_path: Forwarded to the job as `--output`.
        args: Passed through as the `args` input of the submit component.
        job: Passed through as the `job` input of the submit component.
        wait_interval: Passed through as the `wait_interval` input of the
            submit component.
    """
    # Every step authenticates with the same Kubernetes secret.
    gcp_secret_name = 'user-gcp-sa'

    # The delete step is created outside the ExitHandler block because it is
    # the exit operation itself.
    delete_cluster_task = dataproc_delete_cluster_op(
        cluster_project_id,
        cluster_region,
        cluster_name
    ).apply(gcp.use_gcp_secret(gcp_secret_name))
    delete_cluster_task.set_display_name('delete cluster')

    with dsl.ExitHandler(exit_op=delete_cluster_task):
        # Create the cluster.
        create_cluster_task = dataproc_create_cluster_op(
            project_id=cluster_project_id,
            region=cluster_region,
            name=cluster_name
        ).apply(gcp.use_gcp_secret(gcp_secret_name))
        create_cluster_task.set_display_name('create cluster')

        # Upload the job files to GCS.
        upload_files_to_GCS_task = upload_files_to_GCS_op(
            output_gcs_path=gcs_pkgs_path
        ).apply(gcp.use_gcp_secret(gcp_secret_name))
        upload_files_to_GCS_task.set_display_name('upload files')

        # Submit the job to the Dataproc cluster. Consuming the outputs of
        # the create and upload tasks makes this step depend on both of them.
        transform_run_uri = upload_files_to_GCS_task.outputs['transform_run_path']
        # NOTE(review): the Dataproc REST payload for a PySpark job uses
        # camelCase field names and a list for the jar URIs - confirm that
        # the component in use accepts the keys/values below as written.
        pyspark_job = {
            'main_python_file_uri': transform_run_uri,
            'jar_file_uris': upload_files_to_GCS_task.outputs['jar_path'],
            'args': ['--tableProjectID', bq_project_id,
                     '--dataset', bq_dataset,
                     '--table', bq_table,
                     '--output', output_path]
        }
        dataproc_submit_pyspark_job_task = dataproc_submit_pyspark_job_op(
            project_id=cluster_project_id,
            region=cluster_region,
            cluster_name=create_cluster_task.outputs['cluster_name'],
            main_python_file_uri=transform_run_uri,
            args=args,
            pyspark_job=pyspark_job,
            job=job,
            wait_interval=wait_interval
        ).apply(gcp.use_gcp_secret(gcp_secret_name))
        dataproc_submit_pyspark_job_task.set_display_name('run pyspark job')

### Compile pipeline to check for errors

In [None]:
# Compile the pipeline function into a package named after the function
# (`end_to_end_pyspark.pipeline.zip`), written to the working directory.
compiler.Compiler().compile(end_to_end_pyspark, end_to_end_pyspark.__name__ + '.pipeline.zip')

### Upload the pipeline to Kubeflow Pipelines

Set `GOOGLE_APPLICATION_CREDENTIALS` to the path of a service-account key file to handle authorisation. The service account must have the role `IAP-secured Web App User`.

In [None]:
# Machine-specific: this path points at a key file in the original author's
# home directory - replace it with the path to your own key file.
# The assignment overwrites any value already set for this process.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/lfloretta/.secrets/lf-ml-demo-20819be29240.json'

In [None]:
# Client for the Kubeflow Pipelines API of the demo deployment.
# `host`, `client_id` and `namespace` are all specific to that deployment -
# replace them with the values of your own cluster.
# NOTE(review): Kubernetes namespace names cannot contain `_`; confirm that
# 'kubeflow_lfloretta' is the intended value.
client = KfpClient(
    host='https://demo-kubeflow.endpoints.lf-ml-demo.cloud.goog/pipeline',
    client_id='49311432881-9u2qfhilqci5fdthfsh8t0njpuugkj18.apps.googleusercontent.com',
    namespace='kubeflow_lfloretta'
    
)

In [None]:
# Upload the package produced by the compile step (same file name, derived
# from the pipeline function's name) and register it as `e2e_pyspark_run`.
client.upload_pipeline(
    pipeline_package_path=end_to_end_pyspark.__name__ + '.pipeline.zip', 
    pipeline_name='e2e_pyspark_run') # make the name unique, e.g. by adding your username

### Run the pipeline from the UI