In [None]:
# Import necessary packages
import kfp
import kfp.dsl as dsl
from kfp.components import load_component_from_file
import kfp.compiler as compiler

In [None]:
# Write the data_loader component manifest to a YAML file on disk.
# The manifest declares a container component: the image to run and the
# command executed inside it.

data_loader_manifest = """\
name: Load 10k Files
description: Download 10k files to Azure Datalake Storage

implementation:
  container:
    image: docaiacr.azurecr.io/data-loader:v1
    command: [ 'python3', '-m', 'data_loader.download_10k_raw' ]
"""

data_loader_path = './data_loader_comp.yaml'
# Pin the encoding so the file content does not depend on the platform locale.
with open(data_loader_path, 'w', encoding='utf-8') as manifest_file:
    manifest_file.write(data_loader_manifest)

In [None]:
# Write the spark_runner component manifest to a YAML file on disk.
# The manifest declares a container component: the image to run and the
# command executed inside it.

spark_runner_manifest = """\
name: Run Spark Job
description: Create docai spark app in the same k8s cluster for Spark-operator
implementation:
  container:
    image: docaiacr.azurecr.io/spark-job:v1  # Update with the image from spark-job dir
    command: ['python', 'create_spark_app.py']
"""

spark_runner_path = './spark_runner_comp.yaml'
# Pin the encoding so the file content does not depend on the platform locale.
with open(spark_runner_path, 'w', encoding='utf-8') as manifest_file:
    manifest_file.write(spark_runner_manifest)

In [None]:
# Load pipeline components from the manifest files written earlier in this
# notebook. The path variables are reused (rather than repeating the file
# names) so the write and load steps cannot drift apart.

create_spark_app_op = load_component_from_file(spark_runner_path)

create_data_loader_op = load_component_from_file(data_loader_path)

In [None]:
# Define the pipeline: the data-loader step runs first, the Spark step second.

@dsl.pipeline(
    name='docai data pipeline',
    description='Load 10k files to Azure storage, run spark job to pick the data and load back to Azure Storage in delta format'
)
def docai_data_pipeline():
    # First task: instantiate the data-loader component.
    load_task = create_data_loader_op()

    # Second task: instantiate the Spark-runner component and order it
    # explicitly after the loader task.
    spark_task = create_spark_app_op()
    spark_task.after(load_task)

# Alias used by the compile and submit cells.
pipeline_func = docai_data_pipeline

In [None]:
# Compile the pipeline into a package file named after the pipeline function.
pipeline_filename = f'{pipeline_func.__name__}.yaml'

compiler.Compiler().compile(
    pipeline_func=pipeline_func,
    package_path=pipeline_filename,
)

In [None]:
# Submit the compiled package and start a run under the "test" experiment.
client = kfp.Client()
experiment = client.create_experiment("test")

# Empty arguments dict passed to the run.
arguments = {}
run_name = f'{pipeline_func.__name__} run'

run_result = client.run_pipeline(
    experiment.id,
    run_name,
    pipeline_filename,
    arguments,
)