In [1]:
import kfp
from kfp import dsl
from kfp.components import load_component_from_url, create_component_from_func
from kfp.components import InputPath, OutputPath

import sys

sys.path.insert(0, "..")
from constants import NAMESPACE, HOST, NAMESPACE
from utils import get_session_cookie

In [2]:
# Name of the experiment under which the runs of this pipeline are grouped
EXPERIMENT_NAME = "mle-3-data-passing"

## Define pipeline components

In [3]:
# Load a pre-built op that downloads a file from a URL.
# NOTE(review): the component spec is fetched from the `master` branch, so its
# definition can change between runs — consider pinning the URL to a commit or tag.
web_downloader_op = load_component_from_url(
    "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/contrib/web/Download/component.yaml"
)

In [4]:
# Another op to merge all CSVs
def merge_csv(
    file_path: InputPath("Tarball"), output_csv: OutputPath("CSV")
):  # "Tarball" and "CSV" are free-form type labels; rename them as needed
    """Unpack a gzip-compressed tarball and merge the CSV files in it into one CSV.

    The archive is extracted into a local ``data`` directory; every
    ``data/*.csv`` file is read without a header row and the rows are
    concatenated in sorted file-name order.

    Args:
        file_path: Local path of the gzip-compressed tarball to unpack.
        output_csv: Local path the merged CSV (no header, no index) is written to.

    Raises:
        ValueError: If extraction yields no ``data/*.csv`` file to merge.
    """
    # Imports live inside the body: the function is turned into a component
    # from its own source, so it must not rely on notebook-level imports.
    import glob
    import pandas as pd
    import tarfile

    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only feed this component tarballs from a trusted source.
    with tarfile.open(name=file_path, mode="r|gz") as archive:
        archive.extractall("data")

    # glob returns files in arbitrary order; sort so the merged output is
    # reproducible from run to run.
    csv_files = sorted(glob.glob("data/*.csv"))
    if not csv_files:
        raise ValueError(f"No CSV files found in archive: {file_path}")

    df = pd.concat([pd.read_csv(csv_file, header=None) for csv_file in csv_files])
    df.to_csv(output_csv, index=False, header=False)


# Wrap `merge_csv` as a pipeline op that runs in its own container image.
# `output_component_file` is optional: it saves the component spec for future use.
merge_csv_op = create_component_from_func(
    merge_csv,
    base_image="python:3.7",
    packages_to_install=["pandas==1.1.4"],
    output_component_file="../../components/merge_csv/component.yaml",
)

In [5]:
# Final op to read CSV metadata
def read_csv_metadata(input_csv: InputPath("CSV")) -> tuple:
    """Load a header-less CSV and report its dimensions.

    Args:
        input_csv: Local path of the CSV file to inspect.

    Returns:
        The ``shape`` of the loaded DataFrame, i.e. ``(rows, columns)``.
    """
    # Imported locally so the function body stands on its own as a component.
    import pandas as pd

    shape = pd.read_csv(input_csv, header=None).shape
    print(f"[DEBUG] df.shape: {shape}")
    return shape


# Wrap `read_csv_metadata` as a pipeline op that runs in its own container image.
# `output_component_file` is optional: it saves the component spec for future use.
read_csv_metadata_op = create_component_from_func(
    read_csv_metadata,
    base_image="python:3.7",
    packages_to_install=["pandas==1.1.4"],
    output_component_file="../../components/get_csv_info/component.yaml",
)

## Define the pipeline

In [9]:
@dsl.pipeline(name="Data Passing", description="Pass data between components.")
def data_passing(url):
    """Sample pipeline showing how to pass (small) data between components.

    Chains three ops: download the file at ``url``, merge the CSVs it
    contains, then read the shape of the merged CSV.

    Args:
        url: Value forwarded to the downloader op's ``url`` input.
    """
    download_task = web_downloader_op(url=url)
    merge_task = merge_csv_op(file=download_task.outputs["Data"])
    # Last step of the chain: nothing consumes its output, so the task handle
    # is not kept.
    read_csv_metadata_op(input_csv=merge_task.outputs["output_csv"])

## Run the pipeline

In [10]:
# Session token used to authenticate to the `ml-pipeline` service
session_cookie = get_session_cookie()

# Client for the pipeline endpoint under HOST; the token is sent as the
# `authservice_session` cookie and NAMESPACE is the namespace it operates in.
client = kfp.Client(
    namespace=NAMESPACE,
    cookies=f"authservice_session={session_cookie}",
    host=f"{HOST}/pipeline",
)

In [11]:
# Submit `data_passing` as a run under EXPERIMENT_NAME, feeding it the sample
# tarball URL. Kept as the cell's last expression so the run result is displayed.
client.create_run_from_pipeline_func(
    data_passing,
    experiment_name=EXPERIMENT_NAME,
    namespace=NAMESPACE,
    arguments={
        "url": "https://storage.googleapis.com/ml-pipeline-playground/iris-csv-files.tar.gz"
    },
)

{'output_csv': {{pipelineparam:op=merge-csv;name=output_csv}}}


RunPipelineResult(run_id=198cf199-3afc-4fba-8c46-379d4cd79754)