# Run a pyspark job on Dataproc - Part 1

In [None]:
import kfp.compiler as compiler
import kfp.components as comp
import kfp.dsl as dsl
import kfp.gcp as gcp

from kfp import Client as KfpClient

import os

In [None]:
# Component store that searches the local `components` directory.
component_store = comp.ComponentStore(local_search_paths=['components'])

In [None]:
# Load the `load_spark_to_GCS` component through the store; the result is
# called like a function when the pipeline is defined.
upload_files_to_GCS_op = component_store.load_component(
    'load_spark_to_GCS',
)

In [None]:
# Print the component text generated for the loaded upload op.
# NOTE(review): func_to_component_text presumably expects a plain Python
# function, while this op was obtained via load_component — confirm that
# this call behaves as intended.
print(comp.func_to_component_text(upload_files_to_GCS_op))

In [None]:
def print_value(x: str) -> None:
    """Print the value of a parameter.

    Args:
        x: The value to write to standard output.
    """
    print(x)

In [None]:
# Turn the helper function into an op that can be used as a pipeline step.
print_value_op = comp.func_to_container_op(func=print_value)

In [None]:
@dsl.pipeline(
    name='Test',
    description='Simple pipeline to experiment with KFP',
)
def write_to_GCS(output_uri_template):
    """Upload files to GCS, then print the upload step's run path.

    Args:
        output_uri_template: Passed to the upload step as its
            `output_gcs_path` input.
    """
    # Apply the `user-gcp-sa` GCP secret to the upload step.
    upload_task = upload_files_to_GCS_op(
        output_gcs_path=output_uri_template,
    ).apply(gcp.use_gcp_secret('user-gcp-sa'))

    # The upload step's `transform_run_path` output is formatted into a
    # string before being handed to the print step.
    print_value_op('%s' % upload_task.outputs['transform_run_path'])

In [None]:
# Compile the pipeline into a package named after the pipeline function.
compiler.Compiler().compile(
    pipeline_func=write_to_GCS,
    package_path=write_to_GCS.__name__ + '.pipeline.zip',
)

Set `GOOGLE_APPLICATION_CREDENTIALS` to handle authorisation. The service account has the role `IAP-secured Web App User`.

In [None]:
# Export the location of the service-account key file.
# NOTE(review): this is an absolute path on one user's machine — adjust it
# before running elsewhere.
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS='/Users/lfloretta/.secrets/lf-ml-demo-20819be29240.json',
)

In [None]:
# Client for the Kubeflow Pipelines endpoint.
# NOTE(review): `client_id` is presumably the OAuth client ID of the
# IAP-protected endpoint — confirm.
# NOTE(review): Kubernetes namespace names normally cannot contain
# underscores — confirm the `namespace` value.
client = KfpClient(
    host='https://demo-kubeflow.endpoints.lf-ml-demo.cloud.goog/pipeline',
    client_id='49311432881-9u2qfhilqci5fdthfsh8t0njpuugkj18.apps.googleusercontent.com',
    namespace='kubeflow_lfloretta',
)

Run the pipeline

In [None]:
# Submit a run of the pipeline function under the given experiment.
# The `{{workflow.uid}}` and `{{pod.name}}` placeholders are passed through
# verbatim here; presumably they are resolved by the workflow engine at run
# time — confirm.
client.create_run_from_pipeline_func(
    pipeline_func=write_to_GCS,
    arguments={
        'output_uri_template': 'gs://lf-ml-demo-eu-w1/kfp/primer/{{workflow.uid}}/{{pod.name}}/data',
    },
    run_name='002',
    experiment_name='01_pyspark_on_dataproc',
)

Once you have the value of `output_uri_template`, you can check the file uploaded to GCS using `gsutil ls` and `gsutil cat`.